Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile

* git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: (27 commits)
  arch/tile: support newer binutils assembler shift semantics
  arch/tile: fix deadlock bugs in rwlock implementation
  drivers/edac: provide support for tile architecture
  tile on-chip network driver: sync up with latest fixes
  arch/tile: support 4KB page size as well as 64KB
  arch/tile: add some more VMSPLIT options and use consistent naming
  arch/tile: fix some comments and whitespace
  arch/tile: export some additional module symbols
  arch/tile: enhance existing finv_buffer_remote() routine
  arch/tile: fix two bugs in the backtracer code
  arch/tile: use extended assembly to inline __mb_incoherent()
  arch/tile: use a cleaner technique to enable interrupt for cpu_idle()
  arch/tile: sync up with <arch/sim.h> and <arch/sim_def.h> changes
  arch/tile: fix reversed test of strict_strtol() return value
  arch/tile: avoid a simulator warning during bootup
  arch/tile: export <asm/hardwall.h> to userspace
  arch/tile: warn and retry if an IPI is not accepted by the target cpu
  arch/tile: stop disabling INTCTRL_1 interrupts during hypervisor downcalls
  arch/tile: fix __ndelay etc to work better
  arch/tile: bug fix: exec'ed task thought it was still single-stepping
  ...

Fix up trivial conflict in arch/tile/kernel/vmlinux.lds.S (percpu
alignment vs section naming convention fix)
Linus Torvalds committed 14 years ago
commit 08351fc6a7
58 changed files with 1648 additions and 1007 deletions
  1. MAINTAINERS (+1 -0)
  2. README (+1 -1)
  3. arch/tile/Kconfig (+20 -19)
  4. arch/tile/include/arch/interrupts_32.h (+6 -3)
  5. arch/tile/include/arch/sim.h (+36 -12)
  6. arch/tile/include/arch/sim_def.h (+3 -0)
  7. arch/tile/include/asm/Kbuild (+1 -0)
  8. arch/tile/include/asm/atomic.h (+1 -1)
  9. arch/tile/include/asm/bitops_32.h (+1 -1)
  10. arch/tile/include/asm/cache.h (+1 -1)
  11. arch/tile/include/asm/cacheflush.h (+6 -49)
  12. arch/tile/include/asm/edac.h (+29 -0)
  13. arch/tile/include/asm/hugetlb.h (+1 -1)
  14. arch/tile/include/asm/irqflags.h (+16 -2)
  15. arch/tile/include/asm/page.h (+11 -23)
  16. arch/tile/include/asm/pgalloc.h (+5 -2)
  17. arch/tile/include/asm/pgtable.h (+12 -19)
  18. arch/tile/include/asm/pgtable_32.h (+7 -1)
  19. arch/tile/include/asm/processor.h (+0 -1)
  20. arch/tile/include/asm/ptrace.h (+3 -0)
  21. arch/tile/include/asm/spinlock_32.h (+7 -76)
  22. arch/tile/include/asm/stack.h (+2 -1)
  23. arch/tile/include/asm/system.h (+18 -1)
  24. arch/tile/include/asm/thread_info.h (+1 -0)
  25. arch/tile/include/asm/timex.h (+3 -0)
  26. arch/tile/include/hv/drv_mshim_intf.h (+50 -0)
  27. arch/tile/include/hv/hypervisor.h (+45 -1)
  28. arch/tile/kernel/entry.S (+5 -17)
  29. arch/tile/kernel/head_32.S (+9 -6)
  30. arch/tile/kernel/intvec_32.S (+20 -54)
  31. arch/tile/kernel/irq.c (+20 -18)
  32. arch/tile/kernel/machine_kexec.c (+5 -2)
  33. arch/tile/kernel/pci-dma.c (+19 -19)
  34. arch/tile/kernel/process.c (+5 -1)
  35. arch/tile/kernel/setup.c (+12 -8)
  36. arch/tile/kernel/single_step.c (+19 -2)
  37. arch/tile/kernel/smp.c (+19 -14)
  38. arch/tile/kernel/stack.c (+19 -9)
  39. arch/tile/kernel/time.c (+10 -0)
  40. arch/tile/kernel/vmlinux.lds.S (+1 -4)
  41. arch/tile/lib/Makefile (+2 -3)
  42. arch/tile/lib/atomic_32.c (+2 -3)
  43. arch/tile/lib/atomic_asm_32.S (+1 -1)
  44. arch/tile/lib/cacheflush.c (+102 -0)
  45. arch/tile/lib/delay.c (+16 -5)
  46. arch/tile/lib/exports.c (+7 -3)
  47. arch/tile/lib/mb_incoherent.S (+0 -34)
  48. arch/tile/lib/memcpy_tile64.c (+2 -2)
  49. arch/tile/lib/spinlock_32.c (+96 -65)
  50. arch/tile/mm/fault.c (+0 -8)
  51. arch/tile/mm/homecache.c (+32 -6)
  52. arch/tile/mm/init.c (+15 -19)
  53. arch/tile/mm/migrate_32.S (+1 -0)
  54. arch/tile/mm/pgtable.c (+141 -40)
  55. drivers/edac/Kconfig (+9 -1)
  56. drivers/edac/Makefile (+1 -0)
  57. drivers/edac/tile_edac.c (+254 -0)
  58. drivers/net/tile/tilepro.c (+517 -448)
+ 1 - 0
MAINTAINERS

@@ -6127,6 +6127,7 @@ S:	Supported
 F:	arch/tile/
 F:	drivers/tty/hvc/hvc_tile.c
 F:	drivers/net/tile/
+F:	drivers/edac/tile_edac.c
 
 TLAN NETWORK DRIVER
 M:	Samuel Chessman <chessman@tux.org>

+ 1 - 1
README

@@ -24,7 +24,7 @@ ON WHAT HARDWARE DOES IT RUN?
   today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and
   UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell,
   IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS,
-  Xtensa, AVR32 and Renesas M32R architectures.
+  Xtensa, Tilera TILE, AVR32 and Renesas M32R architectures.
 
   Linux is easily portable to most general-purpose 32- or 64-bit architectures
   as long as they have a paged memory management unit (PMMU) and a port of the

+ 20 - 19
arch/tile/Kconfig

@@ -1,5 +1,5 @@
 # For a description of the syntax of this configuration file,
-# see Documentation/kbuild/config-language.txt.
+# see Documentation/kbuild/kconfig-language.txt.
 
 config TILE
 	def_bool y
@@ -11,17 +11,18 @@ config TILE
 	select HAVE_GENERIC_HARDIRQS
 	select GENERIC_IRQ_PROBE
 	select GENERIC_PENDING_IRQ if SMP
+	select GENERIC_HARDIRQS_NO_DEPRECATED
 
 # FIXME: investigate whether we need/want these options.
 #	select HAVE_IOREMAP_PROT
-#       select HAVE_OPTPROBES
-#       select HAVE_REGS_AND_STACK_ACCESS_API
-#       select HAVE_HW_BREAKPOINT
-#       select PERF_EVENTS
-#       select HAVE_USER_RETURN_NOTIFIER
-#       config NO_BOOTMEM
-#       config ARCH_SUPPORTS_DEBUG_PAGEALLOC
-#       config HUGETLB_PAGE_SIZE_VARIABLE
+#	select HAVE_OPTPROBES
+#	select HAVE_REGS_AND_STACK_ACCESS_API
+#	select HAVE_HW_BREAKPOINT
+#	select PERF_EVENTS
+#	select HAVE_USER_RETURN_NOTIFIER
+#	config NO_BOOTMEM
+#	config ARCH_SUPPORTS_DEBUG_PAGEALLOC
+#	config HUGETLB_PAGE_SIZE_VARIABLE
 
 config MMU
 	def_bool y
@@ -39,7 +40,7 @@ config HAVE_SETUP_PER_CPU_AREA
 	def_bool y
 
 config NEED_PER_CPU_PAGE_FIRST_CHUNK
-        def_bool y
+	def_bool y
 
 config SYS_SUPPORTS_HUGETLBFS
 	def_bool y
@@ -201,12 +202,6 @@ config NODES_SHIFT
 	  By default, 2, i.e. 2^2 == 4 DDR2 controllers.
 	  In a system with more controllers, this value should be raised.
 
-# Need 16MB areas to enable hugetlb
-# See build-time check in arch/tile/mm/init.c.
-config FORCE_MAX_ZONEORDER
-	int
-	default 9
-
 choice
 	depends on !TILEGX
 	prompt "Memory split" if EXPERT
@@ -233,8 +228,12 @@ choice
 		bool "3.5G/0.5G user/kernel split"
 	config VMSPLIT_3G
 		bool "3G/1G user/kernel split"
-	config VMSPLIT_3G_OPT
-		bool "3G/1G user/kernel split (for full 1G low memory)"
+	config VMSPLIT_2_75G
+		bool "2.75G/1.25G user/kernel split (for full 1G low memory)"
+	config VMSPLIT_2_5G
+		bool "2.5G/1.5G user/kernel split"
+	config VMSPLIT_2_25G
+		bool "2.25G/1.75G user/kernel split"
 	config VMSPLIT_2G
 		bool "2G/2G user/kernel split"
 	config VMSPLIT_1G
@@ -245,7 +244,9 @@ config PAGE_OFFSET
 	hex
 	default 0xF0000000 if VMSPLIT_3_75G
 	default 0xE0000000 if VMSPLIT_3_5G
-	default 0xB0000000 if VMSPLIT_3G_OPT
+	default 0xB0000000 if VMSPLIT_2_75G
+	default 0xA0000000 if VMSPLIT_2_5G
+	default 0x90000000 if VMSPLIT_2_25G
 	default 0x80000000 if VMSPLIT_2G
 	default 0x40000000 if VMSPLIT_1G
 	default 0xC0000000

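Note on the new VMSPLIT options above: each PAGE_OFFSET value is simply the size of the user portion of the 32-bit address space, so the defaults line up exactly with the option names. A quick stand-alone check (illustrative only, not part of the patch):

    #include <assert.h>
    int main(void)
    {
        /* PAGE_OFFSET == size of user space for each split */
        assert(0xB0000000u == 11u * (1u << 28));  /* 2.75G user / 1.25G kernel */
        assert(0xA0000000u == 10u * (1u << 28));  /* 2.5G  user / 1.5G  kernel */
        assert(0x90000000u ==  9u * (1u << 28));  /* 2.25G user / 1.75G kernel */
        return 0;
    }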
+ 6 - 3
arch/tile/include/arch/interrupts_32.h

@@ -16,10 +16,11 @@
 #define __ARCH_INTERRUPTS_H__
 
 /** Mask for an interrupt. */
-#ifdef __ASSEMBLER__
 /* Note: must handle breaking interrupts into high and low words manually. */
-#define INT_MASK(intno) (1 << (intno))
-#else
+#define INT_MASK_LO(intno) (1 << (intno))
+#define INT_MASK_HI(intno) (1 << ((intno) - 32))
+
+#ifndef __ASSEMBLER__
 #define INT_MASK(intno) (1ULL << (intno))
 #endif
 
@@ -89,6 +90,7 @@
 
 #define NUM_INTERRUPTS 49
 
+#ifndef __ASSEMBLER__
 #define QUEUED_INTERRUPTS ( \
     INT_MASK(INT_MEM_ERROR) | \
     INT_MASK(INT_DMATLB_MISS) | \
@@ -301,4 +303,5 @@
     INT_MASK(INT_DOUBLE_FAULT) | \
     INT_MASK(INT_AUX_PERF_COUNT) | \
     0)
+#endif /* !__ASSEMBLER__ */
 #endif /* !__ARCH_INTERRUPTS_H__ */

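The INT_MASK_LO()/INT_MASK_HI() pair above exists because 32-bit assembly cannot manipulate a 64-bit interrupt mask directly, while C code keeps using the 64-bit INT_MASK(). A small user-space sketch (not from the patch) of how the two views relate:

    #include <assert.h>
    #include <stdint.h>

    #define INT_MASK(intno)    (1ULL << (intno))
    #define INT_MASK_HI(intno) (1u << ((intno) - 32))

    int main(void)
    {
        int intno = 40;  /* any interrupt number >= 32 */
        /* the HI macro is just the upper 32-bit word of the full 64-bit mask */
        assert(INT_MASK_HI(intno) == (uint32_t)(INT_MASK(intno) >> 32));
        return 0;
    }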
+ 36 - 12
arch/tile/include/arch/sim.h

@@ -152,16 +152,33 @@ sim_dump(unsigned int mask)
 /**
  * Print a string to the simulator stdout.
  *
- * @param str The string to be written; a newline is automatically added.
+ * @param str The string to be written.
+ */
+static __inline void
+sim_print(const char* str)
+{
+  for ( ; *str != '\0'; str++)
+  {
+    __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+                 (*str << _SIM_CONTROL_OPERATOR_BITS));
+  }
+  __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
+               (SIM_PUTC_FLUSH_BINARY << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+
+/**
+ * Print a string to the simulator stdout.
+ *
+ * @param str The string to be written (a newline is automatically added).
  */
 static __inline void
 sim_print_string(const char* str)
 {
-  int i;
-  for (i = 0; str[i] != 0; i++)
+  for ( ; *str != '\0'; str++)
   {
     __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
-                 (str[i] << _SIM_CONTROL_OPERATOR_BITS));
+                 (*str << _SIM_CONTROL_OPERATOR_BITS));
   }
   __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_PUTC |
               (SIM_PUTC_FLUSH_STRING << _SIM_CONTROL_OPERATOR_BITS));
@@ -203,7 +220,7 @@ sim_command(const char* str)
  * we are passing to the simulator are actually valid in the registers
  * (i.e. returned from memory) prior to the SIM_CONTROL spr.
  */
-static __inline int _sim_syscall0(int val)
+static __inline long _sim_syscall0(int val)
 {
   long result;
   __asm__ __volatile__ ("mtspr SIM_CONTROL, r0"
@@ -211,7 +228,7 @@ static __inline int _sim_syscall0(int val)
   return result;
 }
 
-static __inline int _sim_syscall1(int val, long arg1)
+static __inline long _sim_syscall1(int val, long arg1)
 {
   long result;
   __asm__ __volatile__ ("{ and zero, r1, r1; mtspr SIM_CONTROL, r0 }"
@@ -219,7 +236,7 @@ static __inline int _sim_syscall1(int val, long arg1)
   return result;
 }
 
-static __inline int _sim_syscall2(int val, long arg1, long arg2)
+static __inline long _sim_syscall2(int val, long arg1, long arg2)
 {
   long result;
   __asm__ __volatile__ ("{ and zero, r1, r2; mtspr SIM_CONTROL, r0 }"
@@ -233,7 +250,7 @@ static __inline int _sim_syscall2(int val, long arg1, long arg2)
    the register values for arguments 3 and up may still be in flight
    to the core from a stack frame reload. */
 
-static __inline int _sim_syscall3(int val, long arg1, long arg2, long arg3)
+static __inline long _sim_syscall3(int val, long arg1, long arg2, long arg3)
 {
   long result;
   __asm__ __volatile__ ("{ and zero, r3, r3 };"
@@ -244,7 +261,7 @@ static __inline int _sim_syscall3(int val, long arg1, long arg2, long arg3)
   return result;
 }
 
-static __inline int _sim_syscall4(int val, long arg1, long arg2, long arg3,
+static __inline long _sim_syscall4(int val, long arg1, long arg2, long arg3,
                                   long arg4)
 {
   long result;
@@ -256,7 +273,7 @@ static __inline int _sim_syscall4(int val, long arg1, long arg2, long arg3,
   return result;
 }
 
-static __inline int _sim_syscall5(int val, long arg1, long arg2, long arg3,
+static __inline long _sim_syscall5(int val, long arg1, long arg2, long arg3,
                                   long arg4, long arg5)
 {
   long result;
@@ -268,7 +285,6 @@ static __inline int _sim_syscall5(int val, long arg1, long arg2, long arg3,
   return result;
 }
 
-
 /**
  * Make a special syscall to the simulator itself, if running under
  * simulation. This is used as the implementation of other functions
@@ -281,7 +297,8 @@ static __inline int _sim_syscall5(int val, long arg1, long arg2, long arg3,
  */
 #define _sim_syscall(syscall_num, nr, args...) \
   _sim_syscall##nr( \
-    ((syscall_num) << _SIM_CONTROL_OPERATOR_BITS) | SIM_CONTROL_SYSCALL, args)
+    ((syscall_num) << _SIM_CONTROL_OPERATOR_BITS) | SIM_CONTROL_SYSCALL, \
+    ##args)
 
 
 /* Values for the "access_mask" parameters below. */
@@ -365,6 +382,13 @@ sim_validate_lines_evicted(unsigned long long pa, unsigned long length)
 }
 
 
+/* Return the current CPU speed in cycles per second. */
+static __inline long
+sim_query_cpu_speed(void)
+{
+  return _sim_syscall(SIM_SYSCALL_QUERY_CPU_SPEED, 0);
+}
+
 #endif /* !__DOXYGEN__ */
 
 

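The change from "args" to "##args" in _sim_syscall() matters for the zero-argument case used by the new sim_query_cpu_speed(): with the GNU "##" extension the preprocessor drops the trailing comma when no extra arguments are passed. A preprocessor-only sketch of the behavior (simplified from the macro above):

    #define CALL(num, nr, args...)  _sim_syscall##nr((num), ##args)

    /* CALL(6, 0)        expands to  _sim_syscall0((6))        -- comma dropped */
    /* CALL(6, 2, a, b)  expands to  _sim_syscall2((6), a, b)                   */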
+ 3 - 0
arch/tile/include/arch/sim_def.h

@@ -243,6 +243,9 @@
  */
 #define SIM_SYSCALL_VALIDATE_LINES_EVICTED 5
 
+/** Syscall number for sim_query_cpu_speed(). */
+#define SIM_SYSCALL_QUERY_CPU_SPEED 6
+
 
 /*
  * Bit masks which can be shifted by 8, combined with

+ 1 - 0
arch/tile/include/asm/Kbuild

@@ -1,3 +1,4 @@
 include include/asm-generic/Kbuild.asm
 
 header-y += ucontext.h
+header-y += hardwall.h

+ 1 - 1
arch/tile/include/asm/atomic.h

@@ -32,7 +32,7 @@
  */
 static inline int atomic_read(const atomic_t *v)
 {
-       return v->counter;
+	return ACCESS_ONCE(v->counter);
 }
 
 /**

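ACCESS_ONCE() (from <linux/compiler.h>) reads the counter through a volatile lvalue, so the compiler must emit a real load each time atomic_read() is evaluated instead of reusing a value cached in a register. A user-space analogue of the idea (sketch, not kernel code):

    #define READ_ONCE_ISH(x) (*(volatile __typeof__(x) *)&(x))

    static int counter;

    static int read_counter(void)
    {
        /* forces a fresh load of "counter" on every evaluation */
        return READ_ONCE_ISH(counter);
    }

    int main(void) { return read_counter(); }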
+ 1 - 1
arch/tile/include/asm/bitops_32.h

@@ -122,7 +122,7 @@ static inline int test_and_change_bit(unsigned nr,
 	return (_atomic_xor(addr, mask) & mask) != 0;
 }
 
-/* See discussion at smp_mb__before_atomic_dec() in <asm/atomic.h>. */
+/* See discussion at smp_mb__before_atomic_dec() in <asm/atomic_32.h>. */
 #define smp_mb__before_clear_bit()	smp_mb()
 #define smp_mb__after_clear_bit()	do {} while (0)
 

+ 1 - 1
arch/tile/include/asm/cache.h

@@ -40,7 +40,7 @@
 #define INTERNODE_CACHE_BYTES   L2_CACHE_BYTES
 
 /* Group together read-mostly things to avoid cache false sharing */
-#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
 /*
  * Attribute for data that is kept read/write coherent until the end of

+ 6 - 49
arch/tile/include/asm/cacheflush.h

@@ -138,55 +138,12 @@ static inline void finv_buffer(void *buffer, size_t size)
 }
 
 /*
- * Flush & invalidate a VA range that is homed remotely on a single core,
- * waiting until the memory controller holds the flushed values.
+ * Flush and invalidate a VA range that is homed remotely, waiting
+ * until the memory controller holds the flushed values.  If "hfh" is
+ * true, we will do a more expensive flush involving additional loads
+ * to make sure we have touched all the possible home cpus of a buffer
+ * that is homed with "hash for home".
  */
-static inline void finv_buffer_remote(void *buffer, size_t size)
-{
-	char *p;
-	int i;
-
-	/*
-	 * Flush and invalidate the buffer out of the local L1/L2
-	 * and request the home cache to flush and invalidate as well.
-	 */
-	__finv_buffer(buffer, size);
-
-	/*
-	 * Wait for the home cache to acknowledge that it has processed
-	 * all the flush-and-invalidate requests.  This does not mean
-	 * that the flushed data has reached the memory controller yet,
-	 * but it does mean the home cache is processing the flushes.
-	 */
-	__insn_mf();
-
-	/*
-	 * Issue a load to the last cache line, which can't complete
-	 * until all the previously-issued flushes to the same memory
-	 * controller have also completed.  If we weren't striping
-	 * memory, that one load would be sufficient, but since we may
-	 * be, we also need to back up to the last load issued to
-	 * another memory controller, which would be the point where
-	 * we crossed an 8KB boundary (the granularity of striping
-	 * across memory controllers).  Keep backing up and doing this
-	 * until we are before the beginning of the buffer, or have
-	 * hit all the controllers.
-	 */
-	for (i = 0, p = (char *)buffer + size - 1;
-	     i < (1 << CHIP_LOG_NUM_MSHIMS()) && p >= (char *)buffer;
-	     ++i) {
-		const unsigned long STRIPE_WIDTH = 8192;
-
-		/* Force a load instruction to issue. */
-		*(volatile char *)p;
-
-		/* Jump to end of previous stripe. */
-		p -= STRIPE_WIDTH;
-		p = (char *)((unsigned long)p | (STRIPE_WIDTH - 1));
-	}
-
-	/* Wait for the loads (and thus flushes) to have completed. */
-	__insn_mf();
-}
+void finv_buffer_remote(void *buffer, size_t size, int hfh);
 
 #endif /* _ASM_TILE_CACHEFLUSH_H */

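The removed inline body above is what the new comment's "more expensive flush" refers to: it issued one load per 8KB stripe so that every memory controller saw at least one load before the final fence. A user-space sketch of just that address walk (example values; the out-of-line replacement presumably lives in the new arch/tile/lib/cacheflush.c added by this merge):

    #include <stdio.h>

    int main(void)
    {
        const unsigned long STRIPE_WIDTH = 8192;     /* striping granularity */
        unsigned long buffer = 0x10000;              /* example buffer start */
        unsigned long size = 3 * STRIPE_WIDTH + 100; /* example buffer size */
        int controllers = 4;                         /* example controller count */
        unsigned long p = buffer + size - 1;

        for (int i = 0; i < controllers && p >= buffer; i++) {
            printf("would load from %#lx\n", p);     /* one load per stripe */
            p -= STRIPE_WIDTH;
            p |= STRIPE_WIDTH - 1;                   /* end of previous stripe */
        }
        return 0;
    }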
+ 29 - 0
arch/tile/include/asm/edac.h

@@ -0,0 +1,29 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_EDAC_H
+#define _ASM_TILE_EDAC_H
+
+/* ECC atomic, DMA, SMP and interrupt safe scrub function */
+
+static inline void atomic_scrub(void *va, u32 size)
+{
+	/*
+	 * These is nothing to be done here because CE is
+	 * corrected by the mshim.
+	 */
+	return;
+}
+
+#endif /* _ASM_TILE_EDAC_H */

+ 1 - 1
arch/tile/include/asm/hugetlb.h

@@ -54,7 +54,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 				   pte_t *ptep, pte_t pte)
 {
-	set_pte_order(ptep, pte, HUGETLB_PAGE_ORDER);
+	set_pte(ptep, pte);
 }
 
 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,

+ 16 - 2
arch/tile/include/asm/irqflags.h

@@ -18,12 +18,24 @@
 #include <arch/interrupts.h>
 #include <arch/chip.h>
 
+#if !defined(__tilegx__) && defined(__ASSEMBLY__)
+
 /*
  * The set of interrupts we want to allow when interrupts are nominally
  * disabled.  The remainder are effectively "NMI" interrupts from
  * the point of view of the generic Linux code.  Note that synchronous
  * interrupts (aka "non-queued") are not blocked by the mask in any case.
  */
+#if CHIP_HAS_AUX_PERF_COUNTERS()
+#define LINUX_MASKABLE_INTERRUPTS_HI \
+       (~(INT_MASK_HI(INT_PERF_COUNT) | INT_MASK_HI(INT_AUX_PERF_COUNT)))
+#else
+#define LINUX_MASKABLE_INTERRUPTS_HI \
+       (~(INT_MASK_HI(INT_PERF_COUNT)))
+#endif
+
+#else
+
 #if CHIP_HAS_AUX_PERF_COUNTERS()
 #define LINUX_MASKABLE_INTERRUPTS \
 	(~(INT_MASK(INT_PERF_COUNT) | INT_MASK(INT_AUX_PERF_COUNT)))
@@ -32,6 +44,8 @@
 	(~(INT_MASK(INT_PERF_COUNT)))
 #endif
 
+#endif
+
 #ifndef __ASSEMBLY__
 
 /* NOTE: we can't include <linux/percpu.h> due to #include dependencies. */
@@ -224,11 +238,11 @@ DECLARE_PER_CPU(unsigned long long, interrupts_enabled_mask);
 #define IRQ_DISABLE(tmp0, tmp1)					\
 	{							\
 	 movei  tmp0, -1;					\
-	 moveli tmp1, lo16(LINUX_MASKABLE_INTERRUPTS)		\
+	 moveli tmp1, lo16(LINUX_MASKABLE_INTERRUPTS_HI)	\
 	};							\
 	{							\
 	 mtspr  SPR_INTERRUPT_MASK_SET_K_0, tmp0;		\
-	 auli   tmp1, tmp1, ha16(LINUX_MASKABLE_INTERRUPTS)	\
+	 auli   tmp1, tmp1, ha16(LINUX_MASKABLE_INTERRUPTS_HI)	\
 	};							\
 	mtspr   SPR_INTERRUPT_MASK_SET_K_1, tmp1
 

+ 11 - 23
arch/tile/include/asm/page.h

@@ -16,10 +16,11 @@
 #define _ASM_TILE_PAGE_H
 
 #include <linux/const.h>
+#include <hv/pagesize.h>
 
 /* PAGE_SHIFT and HPAGE_SHIFT determine the page sizes. */
-#define PAGE_SHIFT	16
-#define HPAGE_SHIFT	24
+#define PAGE_SHIFT	HV_LOG2_PAGE_SIZE_SMALL
+#define HPAGE_SHIFT	HV_LOG2_PAGE_SIZE_LARGE
 
 #define PAGE_SIZE	(_AC(1, UL) << PAGE_SHIFT)
 #define HPAGE_SIZE	(_AC(1, UL) << HPAGE_SHIFT)
@@ -29,25 +30,18 @@
 
 #ifdef __KERNEL__
 
-#include <hv/hypervisor.h>
-#include <arch/chip.h>
-
 /*
- * The {,H}PAGE_SHIFT values must match the HV_LOG2_PAGE_SIZE_xxx
- * definitions in <hv/hypervisor.h>.  We validate this at build time
- * here, and again at runtime during early boot.  We provide a
- * separate definition since userspace doesn't have <hv/hypervisor.h>.
- *
- * Be careful to distinguish PAGE_SHIFT from HV_PTE_INDEX_PFN, since
- * they are the same on i386 but not TILE.
+ * If the Kconfig doesn't specify, set a maximum zone order that
+ * is enough so that we can create huge pages from small pages given
+ * the respective sizes of the two page types.  See <linux/mmzone.h>.
  */
-#if HV_LOG2_PAGE_SIZE_SMALL != PAGE_SHIFT
-# error Small page size mismatch in Linux
-#endif
-#if HV_LOG2_PAGE_SIZE_LARGE != HPAGE_SHIFT
-# error Huge page size mismatch in Linux
+#ifndef CONFIG_FORCE_MAX_ZONEORDER
+#define CONFIG_FORCE_MAX_ZONEORDER (HPAGE_SHIFT - PAGE_SHIFT + 1)
 #endif
 
+#include <hv/hypervisor.h>
+#include <arch/chip.h>
+
 #ifndef __ASSEMBLY__
 
 #include <linux/types.h>
@@ -81,12 +75,6 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
  * Hypervisor page tables are made of the same basic structure.
  */
 
-typedef __u64 pteval_t;
-typedef __u64 pmdval_t;
-typedef __u64 pudval_t;
-typedef __u64 pgdval_t;
-typedef __u64 pgprotval_t;
-
 typedef HV_PTE pte_t;
 typedef HV_PTE pgd_t;
 typedef HV_PTE pgprot_t;

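The CONFIG_FORCE_MAX_ZONEORDER fallback above replaces the hard-coded Kconfig value removed earlier in this merge: with the traditional 64KB pages it still evaluates to 9, and with the new 4KB page option it grows so a 16MB huge page can still be assembled from small pages. Worked out (illustrative only):

    #include <stdio.h>

    int main(void)
    {
        int hpage_shift = 24;            /* 16MB huge pages */
        int page_shifts[] = { 16, 12 };  /* 64KB and 4KB small pages */

        for (int i = 0; i < 2; i++)
            printf("PAGE_SHIFT=%d -> FORCE_MAX_ZONEORDER=%d\n",
                   page_shifts[i], hpage_shift - page_shifts[i] + 1);
        return 0;   /* prints 9, then 13 */
    }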
+ 5 - 2
arch/tile/include/asm/pgalloc.h

@@ -41,9 +41,9 @@
 static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 {
 #ifdef CONFIG_64BIT
-	set_pte_order(pmdp, pmd, L2_USER_PGTABLE_ORDER);
+	set_pte(pmdp, pmd);
 #else
-	set_pte_order(&pmdp->pud.pgd, pmd.pud.pgd, L2_USER_PGTABLE_ORDER);
+	set_pte(&pmdp->pud.pgd, pmd.pud.pgd);
 #endif
 }
 
@@ -100,6 +100,9 @@ pte_t *get_prealloc_pte(unsigned long pfn);
 /* During init, we can shatter kernel huge pages if needed. */
 void shatter_pmd(pmd_t *pmd);
 
+/* After init, a more complex technique is required. */
+void shatter_huge_page(unsigned long addr);
+
 #ifdef __tilegx__
 /* We share a single page allocator for both L1 and L2 page tables. */
 #if HV_L1_SIZE != HV_L2_SIZE

+ 12 - 19
arch/tile/include/asm/pgtable.h

@@ -233,15 +233,23 @@ static inline void __pte_clear(pte_t *ptep)
 #define pgd_ERROR(e) \
 	pr_err("%s:%d: bad pgd 0x%016llx.\n", __FILE__, __LINE__, pgd_val(e))
 
+/* Return PA and protection info for a given kernel VA. */
+int va_to_cpa_and_pte(void *va, phys_addr_t *cpa, pte_t *pte);
+
+/*
+ * __set_pte() ensures we write the 64-bit PTE with 32-bit words in
+ * the right order on 32-bit platforms and also allows us to write
+ * hooks to check valid PTEs, etc., if we want.
+ */
+void __set_pte(pte_t *ptep, pte_t pte);
+
 /*
- * set_pte_order() sets the given PTE and also sanity-checks the
+ * set_pte() sets the given PTE and also sanity-checks the
  * requested PTE against the page homecaching.  Unspecified parts
  * of the PTE are filled in when it is written to memory, i.e. all
  * caching attributes if "!forcecache", or the home cpu if "anyhome".
  */
-extern void set_pte_order(pte_t *ptep, pte_t pte, int order);
-
-#define set_pte(ptep, pteval) set_pte_order(ptep, pteval, 0)
+extern void set_pte(pte_t *ptep, pte_t pte);
 #define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval)
 #define set_pte_atomic(pteptr, pteval) set_pte(pteptr, pteval)
 
@@ -292,21 +300,6 @@ extern void check_mm_caching(struct mm_struct *prev, struct mm_struct *next);
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { (pte).val >> 32 })
 #define __swp_entry_to_pte(swp)	((pte_t) { (((long long) ((swp).val)) << 32) })
 
-/*
- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
- *
- *  dst - pointer to pgd range anwhere on a pgd page
- *  src - ""
- *  count - the number of pgds to copy.
- *
- * dst and src can be on the same page, but the range must not overlap,
- * and must not cross a page boundary.
- */
-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
-{
-       memcpy(dst, src, count * sizeof(pgd_t));
-}
-
 /*
  * Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.

+ 7 - 1
arch/tile/include/asm/pgtable_32.h

@@ -24,6 +24,7 @@
 #define PGDIR_SIZE	HV_PAGE_SIZE_LARGE
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 #define PTRS_PER_PGD	(1 << (32 - PGDIR_SHIFT))
+#define SIZEOF_PGD	(PTRS_PER_PGD * sizeof(pgd_t))
 
 /*
  * The level-2 index is defined by the difference between the huge
@@ -33,6 +34,7 @@
  * this nomenclature is somewhat confusing.
  */
 #define PTRS_PER_PTE (1 << (HV_LOG2_PAGE_SIZE_LARGE - HV_LOG2_PAGE_SIZE_SMALL))
+#define SIZEOF_PTE	(PTRS_PER_PTE * sizeof(pte_t))
 
 #ifndef __ASSEMBLY__
 
@@ -94,7 +96,6 @@ static inline int pgd_addr_invalid(unsigned long addr)
  */
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 
 extern int ptep_test_and_clear_young(struct vm_area_struct *,
 				     unsigned long addr, pte_t *);
@@ -110,6 +111,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return pte;
 }
 
+static inline void __set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+	set_pte(&pmdp->pud.pgd, pmdval.pud.pgd);
+}
+
 /* Create a pmd from a PTFN. */
 static inline pmd_t ptfn_pmd(unsigned long ptfn, pgprot_t prot)
 {

+ 0 - 1
arch/tile/include/asm/processor.h

@@ -269,7 +269,6 @@ extern char chip_model[64];
 /* Data on which physical memory controller corresponds to which NUMA node. */
 extern int node_controller[];
 
-
 /* Do we dump information to the console when a user application crashes? */
 extern int show_crashinfo;
 

+ 3 - 0
arch/tile/include/asm/ptrace.h

@@ -141,6 +141,9 @@ struct single_step_state {
 /* Single-step the instruction at regs->pc */
 extern void single_step_once(struct pt_regs *regs);
 
+/* Clean up after execve(). */
+extern void single_step_execve(void);
+
 struct task_struct;
 
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,

+ 7 - 76
arch/tile/include/asm/spinlock_32.h

@@ -78,13 +78,6 @@ void arch_spin_unlock_wait(arch_spinlock_t *lock);
 #define _RD_COUNT_SHIFT 24
 #define _RD_COUNT_WIDTH 8
 
-/* Internal functions; do not use. */
-void arch_read_lock_slow(arch_rwlock_t *, u32);
-int arch_read_trylock_slow(arch_rwlock_t *);
-void arch_read_unlock_slow(arch_rwlock_t *);
-void arch_write_lock_slow(arch_rwlock_t *, u32);
-void arch_write_unlock_slow(arch_rwlock_t *, u32);
-
 /**
  * arch_read_can_lock() - would read_trylock() succeed?
  */
@@ -104,94 +97,32 @@ static inline int arch_write_can_lock(arch_rwlock_t *rwlock)
 /**
  * arch_read_lock() - acquire a read lock.
  */
-static inline void arch_read_lock(arch_rwlock_t *rwlock)
-{
-	u32 val = __insn_tns((int *)&rwlock->lock);
-	if (unlikely(val << _RD_COUNT_WIDTH)) {
-		arch_read_lock_slow(rwlock, val);
-		return;
-	}
-	rwlock->lock = val + (1 << _RD_COUNT_SHIFT);
-}
+void arch_read_lock(arch_rwlock_t *rwlock);
 
 /**
- * arch_read_lock() - acquire a write lock.
+ * arch_write_lock() - acquire a write lock.
  */
-static inline void arch_write_lock(arch_rwlock_t *rwlock)
-{
-	u32 val = __insn_tns((int *)&rwlock->lock);
-	if (unlikely(val != 0)) {
-		arch_write_lock_slow(rwlock, val);
-		return;
-	}
-	rwlock->lock = 1 << _WR_NEXT_SHIFT;
-}
+void arch_write_lock(arch_rwlock_t *rwlock);
 
 /**
  * arch_read_trylock() - try to acquire a read lock.
  */
-static inline int arch_read_trylock(arch_rwlock_t *rwlock)
-{
-	int locked;
-	u32 val = __insn_tns((int *)&rwlock->lock);
-	if (unlikely(val & 1))
-		return arch_read_trylock_slow(rwlock);
-	locked = (val << _RD_COUNT_WIDTH) == 0;
-	rwlock->lock = val + (locked << _RD_COUNT_SHIFT);
-	return locked;
-}
+int arch_read_trylock(arch_rwlock_t *rwlock);
 
 /**
  * arch_write_trylock() - try to acquire a write lock.
  */
-static inline int arch_write_trylock(arch_rwlock_t *rwlock)
-{
-	u32 val = __insn_tns((int *)&rwlock->lock);
-
-	/*
-	 * If a tns is in progress, or there's a waiting or active locker,
-	 * or active readers, we can't take the lock, so give up.
-	 */
-	if (unlikely(val != 0)) {
-		if (!(val & 1))
-			rwlock->lock = val;
-		return 0;
-	}
-
-	/* Set the "next" field to mark it locked. */
-	rwlock->lock = 1 << _WR_NEXT_SHIFT;
-	return 1;
-}
+int arch_write_trylock(arch_rwlock_t *rwlock);
 
 /**
  * arch_read_unlock() - release a read lock.
  */
-static inline void arch_read_unlock(arch_rwlock_t *rwlock)
-{
-	u32 val;
-	mb();  /* guarantee anything modified under the lock is visible */
-	val = __insn_tns((int *)&rwlock->lock);
-	if (unlikely(val & 1)) {
-		arch_read_unlock_slow(rwlock);
-		return;
-	}
-	rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
-}
+void arch_read_unlock(arch_rwlock_t *rwlock);
 
 /**
  * arch_write_unlock() - release a write lock.
  */
-static inline void arch_write_unlock(arch_rwlock_t *rwlock)
-{
-	u32 val;
-	mb();  /* guarantee anything modified under the lock is visible */
-	val = __insn_tns((int *)&rwlock->lock);
-	if (unlikely(val != (1 << _WR_NEXT_SHIFT))) {
-		arch_write_unlock_slow(rwlock, val);
-		return;
-	}
-	rwlock->lock = 0;
-}
+void arch_write_unlock(arch_rwlock_t *rwlock);
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)

+ 2 - 1
arch/tile/include/asm/stack.h

@@ -18,13 +18,14 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <asm/backtrace.h>
+#include <asm/page.h>
 #include <hv/hypervisor.h>
 
 /* Everything we need to keep track of a backtrace iteration */
 struct KBacktraceIterator {
 	BacktraceIterator it;
 	struct task_struct *task;     /* task we are backtracing */
-	HV_PTE *pgtable;	      /* page table for user space access */
+	pte_t *pgtable;		      /* page table for user space access */
 	int end;		      /* iteration complete. */
 	int new_context;              /* new context is starting */
 	int profile;                  /* profiling, so stop on async intrpt */

+ 18 - 1
arch/tile/include/asm/system.h

@@ -90,7 +90,24 @@
 #endif
 
 #if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-int __mb_incoherent(void);  /* Helper routine for mb_incoherent(). */
+#include <hv/syscall_public.h>
+/*
+ * Issue an uncacheable load to each memory controller, then
+ * wait until those loads have completed.
+ */
+static inline void __mb_incoherent(void)
+{
+	long clobber_r10;
+	asm volatile("swint2"
+		     : "=R10" (clobber_r10)
+		     : "R10" (HV_SYS_fence_incoherent)
+		     : "r0", "r1", "r2", "r3", "r4",
+		       "r5", "r6", "r7", "r8", "r9",
+		       "r11", "r12", "r13", "r14",
+		       "r15", "r16", "r17", "r18", "r19",
+		       "r20", "r21", "r22", "r23", "r24",
+		       "r25", "r26", "r27", "r28", "r29");
+}
 #endif
 
 /* Fence to guarantee visibility of stores to incoherent memory. */

+ 1 - 0
arch/tile/include/asm/thread_info.h

@@ -68,6 +68,7 @@ struct thread_info {
 #else
 #define THREAD_SIZE_ORDER (0)
 #endif
+#define THREAD_SIZE_PAGES (1 << THREAD_SIZE_ORDER)
 
 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define LOG2_THREAD_SIZE (PAGE_SHIFT + THREAD_SIZE_ORDER)

+ 3 - 0
arch/tile/include/asm/timex.h

@@ -38,6 +38,9 @@ static inline cycles_t get_cycles(void)
 
 
 cycles_t get_clock_rate(void);
 
+/* Convert nanoseconds to core clock cycles. */
+cycles_t ns2cycles(unsigned long nsecs);
+
 /* Called at cpu initialization to set some low-level constants. */
 void setup_clock(void);
 

+ 50 - 0
arch/tile/include/hv/drv_mshim_intf.h

@@ -0,0 +1,50 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/**
+ * @file drv_mshim_intf.h
+ * Interface definitions for the Linux EDAC memory controller driver.
+ */
+
+#ifndef _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H
+#define _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H
+
+/** Number of memory controllers in the public API. */
+#define TILE_MAX_MSHIMS 4
+
+/** Memory info under each memory controller. */
+struct mshim_mem_info
+{
+  uint64_t mem_size;     /**< Total memory size in bytes. */
+  uint8_t mem_type;      /**< Memory type, DDR2 or DDR3. */
+  uint8_t mem_ecc;       /**< Memory supports ECC. */
+};
+
+/**
+ * DIMM error structure.
+ * For now, only correctable errors are counted and the mshim doesn't record
+ * the error PA. HV takes panic upon uncorrectable errors.
+ */
+struct mshim_mem_error
+{
+  uint32_t sbe_count;     /**< Number of single-bit errors. */
+};
+
+/** Read this offset to get the memory info per mshim. */
+#define MSHIM_MEM_INFO_OFF 0x100
+
+/** Read this offset to check DIMM error. */
+#define MSHIM_MEM_ERROR_OFF 0x200
+
+#endif /* _SYS_HV_INCLUDE_DRV_MSHIM_INTF_H */

+ 45 - 1
arch/tile/include/hv/hypervisor.h

@@ -338,9 +338,10 @@ typedef int HV_Errno;
 #define HV_ENOTREADY   -812  /**< Device not ready */
 #define HV_EIO         -813  /**< I/O error */
 #define HV_ENOMEM      -814  /**< Out of memory */
+#define HV_EAGAIN      -815  /**< Try again */
 
 #define HV_ERR_MAX     -801  /**< Largest HV error code */
-#define HV_ERR_MIN     -814  /**< Smallest HV error code */
+#define HV_ERR_MIN     -815  /**< Smallest HV error code */
 
 #ifndef __ASSEMBLER__
 
@@ -867,6 +868,43 @@ typedef struct
  */
 HV_PhysAddrRange hv_inquire_physical(int idx);
 
+/** Possible DIMM types. */
+typedef enum
+{
+  NO_DIMM                    = 0,  /**< No DIMM */
+  DDR2                       = 1,  /**< DDR2 */
+  DDR3                       = 2   /**< DDR3 */
+} HV_DIMM_Type;
+
+#ifdef __tilegx__
+
+/** Log2 of minimum DIMM bytes supported by the memory controller. */
+#define HV_MSH_MIN_DIMM_SIZE_SHIFT 29
+
+/** Max number of DIMMs contained by one memory controller. */
+#define HV_MSH_MAX_DIMMS 8
+
+#else
+
+/** Log2 of minimum DIMM bytes supported by the memory controller. */
+#define HV_MSH_MIN_DIMM_SIZE_SHIFT 26
+
+/** Max number of DIMMs contained by one memory controller. */
+#define HV_MSH_MAX_DIMMS 2
+
+#endif
+
+/** Number of bits to right-shift to get the DIMM type. */
+#define HV_DIMM_TYPE_SHIFT 0
+
+/** Bits to mask to get the DIMM type. */
+#define HV_DIMM_TYPE_MASK 0xf
+
+/** Number of bits to right-shift to get the DIMM size. */
+#define HV_DIMM_SIZE_SHIFT 4
+
+/** Bits to mask to get the DIMM size. */
+#define HV_DIMM_SIZE_MASK 0xf
 
 /** Memory controller information. */
 typedef struct
@@ -963,6 +1001,11 @@ HV_ASIDRange hv_inquire_asid(int idx);
 
 
 
 
 /** Waits for at least the specified number of nanoseconds then returns.
+ *
+ * NOTE: this deprecated function currently assumes a 750 MHz clock,
+ * and is thus not generally suitable for use.  New code should call
+ * hv_sysconf(HV_SYSCONF_CPU_SPEED), compute a cycle count to wait for,
+ * and delay by looping while checking the cycle counter SPR.
  *
  * @param nanosecs The number of nanoseconds to sleep.
  */
@@ -1038,6 +1081,7 @@ int hv_console_write(HV_VirtAddr bytes, int len);
 *  downcall:
  *
  *  INT_MESSAGE_RCV_DWNCL   (hypervisor message available)
+ *  INT_DEV_INTR_DWNCL      (device interrupt)
  *  INT_DMATLB_MISS_DWNCL   (DMA TLB miss)
  *  INT_SNITLB_MISS_DWNCL   (SNI TLB miss)
  *  INT_DMATLB_ACCESS_DWNCL (DMA TLB access violation)

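Regarding the hv_nanosleep() deprecation note above: the recommended replacement is to convert nanoseconds to cycles and spin on the cycle counter, which is what the ns2cycles() helper added elsewhere in this merge makes possible. A rough sketch of that pattern (illustrative only, not the actual arch/tile/lib/delay.c code; assumes <asm/timex.h> for get_cycles()/ns2cycles()):

	static void spin_for_ns(unsigned long nsecs)
	{
		cycles_t target = get_cycles() + ns2cycles(nsecs);

		while (get_cycles() < target)
			cpu_relax();	/* poll the cycle counter */
	}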
+ 5 - 17
arch/tile/kernel/entry.S

@@ -38,12 +38,6 @@ STD_ENTRY(kernel_execve)
 	jrp lr
 	STD_ENDPROC(kernel_execve)
 
-/* Delay a fixed number of cycles. */
-STD_ENTRY(__delay)
-	{ addi r0, r0, -1; bnzt r0, . }
-	jrp lr
-	STD_ENDPROC(__delay)
-
 /*
  * We don't run this function directly, but instead copy it to a page
  * we map into every user process.  See vdso_setup().
@@ -97,23 +91,17 @@ STD_ENTRY(smp_nap)
 
 
 /*
  * Enable interrupts racelessly and then nap until interrupted.
+ * Architecturally, we are guaranteed that enabling interrupts via
+ * mtspr to INTERRUPT_CRITICAL_SECTION only interrupts at the next PC.
  * This function's _cpu_idle_nap address is special; see intvec.S.
  * When interrupted at _cpu_idle_nap, we bump the PC forward 8, and
  * as a result return to the function that called _cpu_idle().
  */
 STD_ENTRY(_cpu_idle)
-	{
-	 lnk r0
-	 movei r1, KERNEL_PL
-	}
-	{
-	 addli r0, r0, _cpu_idle_nap - .
-	 mtspr INTERRUPT_CRITICAL_SECTION, r1
-	}
+	movei r1, 1
+	mtspr INTERRUPT_CRITICAL_SECTION, r1
 	IRQ_ENABLE(r2, r3)             /* unmask, but still with ICS set */
-	mtspr SPR_EX_CONTEXT_K_1, r1   /* Kernel PL, ICS clear */
-	mtspr SPR_EX_CONTEXT_K_0, r0
-	iret
+	mtspr INTERRUPT_CRITICAL_SECTION, zero
 	.global _cpu_idle_nap
 _cpu_idle_nap:
 	nap

+ 9 - 6
arch/tile/kernel/head_32.S

@@ -133,7 +133,7 @@ ENTRY(_start)
 	}
 	ENDPROC(_start)
 
-.section ".bss.page_aligned","w"
+__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
 	.fill PAGE_SIZE,1,0
@@ -145,10 +145,10 @@ ENTRY(empty_zero_page)
 	.endif
 	.word HV_PTE_PAGE | HV_PTE_DIRTY | HV_PTE_PRESENT | HV_PTE_ACCESSED | \
 	      (HV_PTE_MODE_CACHE_NO_L3 << HV_PTE_INDEX_MODE)
-	.word (\bits1) | (HV_CPA_TO_PFN(\cpa) << HV_PTE_INDEX_PFN)
+	.word (\bits1) | (HV_CPA_TO_PFN(\cpa) << (HV_PTE_INDEX_PFN - 32))
 	.endm
 
-.section ".data.page_aligned","wa"
+__PAGE_ALIGNED_DATA
 	.align PAGE_SIZE
 ENTRY(swapper_pg_dir)
 	/*
@@ -158,12 +158,14 @@ ENTRY(swapper_pg_dir)
 	 */
 	.set addr, 0
 	.rept (MEM_USER_INTRPT - PAGE_OFFSET) >> PGDIR_SHIFT
-	PTE addr + PAGE_OFFSET, addr, HV_PTE_READABLE | HV_PTE_WRITABLE
+	PTE addr + PAGE_OFFSET, addr, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+				      (1 << (HV_PTE_INDEX_WRITABLE - 32))
 	.set addr, addr + PGDIR_SIZE
 	.endr
 
 	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
-	PTE MEM_SV_INTRPT, 0, HV_PTE_READABLE | HV_PTE_EXECUTABLE
+	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
 	.org swapper_pg_dir + HV_L1_SIZE
 	END(swapper_pg_dir)
 
@@ -176,6 +178,7 @@ ENTRY(swapper_pg_dir)
 	__INITDATA
 	.align CHIP_L2_LINE_SIZE()
 ENTRY(swapper_pgprot)
-	PTE	0, 0, HV_PTE_READABLE | HV_PTE_WRITABLE, 1
+	PTE	0, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+		      (1 << (HV_PTE_INDEX_WRITABLE - 32)), 1
 	.align CHIP_L2_LINE_SIZE()
 	END(swapper_pgprot)

+ 20 - 54
arch/tile/kernel/intvec_32.S

@@ -32,10 +32,6 @@
 # error "No support for kernel preemption currently"
 #endif
 
-#if INT_INTCTRL_K < 32 || INT_INTCTRL_K >= 48
-# error INT_INTCTRL_K coded to set high interrupt mask
-#endif
-
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
@@ -1198,46 +1194,6 @@ STD_ENTRY(interrupt_return)
 
 
 	STD_ENDPROC(interrupt_return)
 
-	/*
-	 * This interrupt variant clears the INT_INTCTRL_K interrupt mask bit
-	 * before returning, so we can properly get more downcalls.
-	 */
-	.pushsection .text.handle_interrupt_downcall,"ax"
-handle_interrupt_downcall:
-	finish_interrupt_save handle_interrupt_downcall
-	check_single_stepping normal, .Ldispatch_downcall
-.Ldispatch_downcall:
-
-	/* Clear INTCTRL_K from the set of interrupts we ever enable. */
-	GET_INTERRUPTS_ENABLED_MASK_PTR(r30)
-	{
-	 addi   r30, r30, 4
-	 movei  r31, INT_MASK(INT_INTCTRL_K)
-	}
-	{
-	 lw     r20, r30
-	 nor    r21, r31, zero
-	}
-	and     r20, r20, r21
-	sw      r30, r20
-
-	{
-	 jalr   r0
-	 PTREGS_PTR(r0, PTREGS_OFFSET_BASE)
-	}
-	FEEDBACK_REENTER(handle_interrupt_downcall)
-
-	/* Allow INTCTRL_K to be enabled next time we enable interrupts. */
-	lw      r20, r30
-	or      r20, r20, r31
-	sw      r30, r20
-
-	{
-	 movei  r30, 0   /* not an NMI */
-	 j      interrupt_return
-	}
-	STD_ENDPROC(handle_interrupt_downcall)
-
 	/*
 	 * Some interrupts don't check for single stepping
 	 */
@@ -1600,7 +1556,10 @@ STD_ENTRY(_sys_clone)
 	.align 64
 	/* Align much later jump on the start of a cache line. */
 #if !ATOMIC_LOCKS_FOUND_VIA_TABLE()
-	nop; nop
+	nop
+#if PAGE_SIZE >= 0x10000
+	nop
+#endif
 #endif
 ENTRY(sys_cmpxchg)
 
@@ -1628,9 +1587,13 @@ ENTRY(sys_cmpxchg)
 	 * about aliasing among multiple mappings of the same physical page,
 	 * and we ignore the low 3 bits so we have one lock that covers
 	 * both a cmpxchg64() and a cmpxchg() on either its low or high word.
-	 * NOTE: this code must match __atomic_hashed_lock() in lib/atomic.c.
+	 * NOTE: this must match __atomic_hashed_lock() in lib/atomic_32.c.
 	 */
 
+#if (PAGE_OFFSET & 0xffff) != 0
+# error Code here assumes PAGE_OFFSET can be loaded with just hi16()
+#endif
+
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	{
 	 /* Check for unaligned input. */
@@ -1723,11 +1686,14 @@ ENTRY(sys_cmpxchg)
 	 lw	r26, r0
 	}
 	{
-	 /* atomic_locks is page aligned so this suffices to get its addr. */
-	 auli	r21, zero, hi16(atomic_locks)
+	 auli	r21, zero, ha16(atomic_locks)
 
 
 	 bbns   r23, .Lcmpxchg_badaddr
 	}
+#if PAGE_SIZE < 0x10000
+	/* atomic_locks is page-aligned so for big pages we don't need this. */
+	addli   r21, r21, lo16(atomic_locks)
+#endif
 	{
 	 /*
 	  * Insert the hash bits into the page-aligned pointer.
@@ -1762,7 +1728,7 @@ ENTRY(sys_cmpxchg)
 
 
 	/*
 	 * Perform the actual cmpxchg or atomic_update.
-	 * Note that __futex_mark_unlocked() in uClibc relies on
+	 * Note that the system <arch/atomic.h> header relies on
 	 * atomic_update() to always perform an "mf", so don't make
 	 * it optional or conditional without modifying that code.
 	 */
@@ -2014,17 +1980,17 @@ int_unalign:
 #endif
 	int_hand     INT_INTCTRL_0, INTCTRL_0, bad_intr
 	int_hand     INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
-		     hv_message_intr, handle_interrupt_downcall
+		     hv_message_intr
 	int_hand     INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, \
-		     tile_dev_intr, handle_interrupt_downcall
+		     tile_dev_intr
 	int_hand     INT_I_ASID, I_ASID, bad_intr
 	int_hand     INT_D_ASID, D_ASID, bad_intr
 	int_hand     INT_DMATLB_MISS_DWNCL, DMATLB_MISS_DWNCL, \
-		     do_page_fault, handle_interrupt_downcall
+		     do_page_fault
 	int_hand     INT_SNITLB_MISS_DWNCL, SNITLB_MISS_DWNCL, \
-		     do_page_fault, handle_interrupt_downcall
+		     do_page_fault
 	int_hand     INT_DMATLB_ACCESS_DWNCL, DMATLB_ACCESS_DWNCL, \
-		     do_page_fault, handle_interrupt_downcall
+		     do_page_fault
 	int_hand     INT_SN_CPL, SN_CPL, bad_intr
 	int_hand     INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
 #if CHIP_HAS_AUX_PERF_COUNTERS()

+ 20 - 18
arch/tile/kernel/irq.c

@@ -176,43 +176,43 @@ void disable_percpu_irq(unsigned int irq)
 EXPORT_SYMBOL(disable_percpu_irq);
 
 /* Mask an interrupt. */
-static void tile_irq_chip_mask(unsigned int irq)
+static void tile_irq_chip_mask(struct irq_data *d)
 {
-	mask_irqs(1UL << irq);
+	mask_irqs(1UL << d->irq);
 }
 
 /* Unmask an interrupt. */
-static void tile_irq_chip_unmask(unsigned int irq)
+static void tile_irq_chip_unmask(struct irq_data *d)
 {
-	unmask_irqs(1UL << irq);
+	unmask_irqs(1UL << d->irq);
 }
 
 /*
  * Clear an interrupt before processing it so that any new assertions
  * will trigger another irq.
  */
-static void tile_irq_chip_ack(unsigned int irq)
+static void tile_irq_chip_ack(struct irq_data *d)
 {
-	if ((unsigned long)get_irq_chip_data(irq) != IS_HW_CLEARED)
-		clear_irqs(1UL << irq);
+	if ((unsigned long)irq_data_get_irq_chip_data(d) != IS_HW_CLEARED)
+		clear_irqs(1UL << d->irq);
 }
 
 /*
  * For per-cpu interrupts, we need to avoid unmasking any interrupts
  * that we disabled via disable_percpu_irq().
  */
-static void tile_irq_chip_eoi(unsigned int irq)
+static void tile_irq_chip_eoi(struct irq_data *d)
 {
-	if (!(__get_cpu_var(irq_disable_mask) & (1UL << irq)))
-		unmask_irqs(1UL << irq);
+	if (!(__get_cpu_var(irq_disable_mask) & (1UL << d->irq)))
+		unmask_irqs(1UL << d->irq);
 }
 
 static struct irq_chip tile_irq_chip = {
 	.name = "tile_irq_chip",
-	.ack = tile_irq_chip_ack,
-	.eoi = tile_irq_chip_eoi,
-	.mask = tile_irq_chip_mask,
-	.unmask = tile_irq_chip_unmask,
+	.irq_ack = tile_irq_chip_ack,
+	.irq_eoi = tile_irq_chip_eoi,
+	.irq_mask = tile_irq_chip_mask,
+	.irq_unmask = tile_irq_chip_unmask,
 };
 
 void __init init_IRQ(void)
@@ -277,8 +277,10 @@ int show_interrupts(struct seq_file *p, void *v)
 	}
 
 	if (i < NR_IRQS) {
-		raw_spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		action = desc->action;
 		if (!action)
 			goto skip;
 		seq_printf(p, "%3d: ", i);
@@ -288,7 +290,7 @@ int show_interrupts(struct seq_file *p, void *v)
 		for_each_online_cpu(j)
 			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
 #endif
-		seq_printf(p, " %14s", irq_desc[i].chip->name);
+		seq_printf(p, " %14s", get_irq_desc_chip(desc)->name);
 		seq_printf(p, "  %s", action->name);
 
 		for (action = action->next; action; action = action->next)
@@ -296,7 +298,7 @@ int show_interrupts(struct seq_file *p, void *v)
 
 		seq_putc(p, '\n');
 skip:
-		raw_spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
 	}
 	return 0;
 }

+ 5 - 2
arch/tile/kernel/machine_kexec.c

@@ -240,8 +240,11 @@ static void setup_quasi_va_is_pa(void)
 	pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE);
 	pte = hv_pte(_PAGE_KERNEL | _PAGE_HUGE_PAGE);
 	pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
 	pte = hv_pte_set_mode(pte, HV_PTE_MODE_CACHE_NO_L3);
 
 
-	for (i = 0; i < pgd_index(PAGE_OFFSET); i++)
-		pgtable[i] = pfn_pte(i << (HPAGE_SHIFT - PAGE_SHIFT), pte);
+	for (i = 0; i < pgd_index(PAGE_OFFSET); i++) {
+		unsigned long pfn = i << (HPAGE_SHIFT - PAGE_SHIFT);
+		if (pfn_valid(pfn))
+			__set_pte(&pgtable[i], pfn_pte(pfn, pte));
+	}
 }
 }
 
 
 
 

+ 19 - 19
arch/tile/kernel/pci-dma.c

@@ -86,6 +86,21 @@ EXPORT_SYMBOL(dma_free_coherent);
  * can count on nothing having been touched.
  * can count on nothing having been touched.
  */
  */
 
 
+/* Flush a PA range from cache page by page. */
+static void __dma_map_pa_range(dma_addr_t dma_addr, size_t size)
+{
+	struct page *page = pfn_to_page(PFN_DOWN(dma_addr));
+	size_t bytesleft = PAGE_SIZE - (dma_addr & (PAGE_SIZE - 1));
+
+	while ((ssize_t)size > 0) {
+		/* Flush the page. */
+		homecache_flush_cache(page++, 0);
+
+		/* Figure out if we need to continue on the next page. */
+		size -= bytesleft;
+		bytesleft = PAGE_SIZE;
+	}
+}
 
 
 /*
 /*
  * dma_map_single can be passed any memory address, and there appear
  * dma_map_single can be passed any memory address, and there appear
@@ -97,26 +112,12 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 	       enum dma_data_direction direction)
 	       enum dma_data_direction direction)
 {
 {
-	struct page *page;
-	dma_addr_t dma_addr;
-	int thispage;
+	dma_addr_t dma_addr = __pa(ptr);
 
 
 	BUG_ON(!valid_dma_direction(direction));
 	BUG_ON(!valid_dma_direction(direction));
 	WARN_ON(size == 0);
 	WARN_ON(size == 0);
 
 
-	dma_addr = __pa(ptr);
-
-	/* We might have been handed a buffer that wraps a page boundary */
-	while ((int)size > 0) {
-		/* The amount to flush that's on this page */
-		thispage = PAGE_SIZE - ((unsigned long)ptr & (PAGE_SIZE - 1));
-		thispage = min((int)thispage, (int)size);
-		/* Is this valid for any page we could be handed? */
-		page = pfn_to_page(kaddr_to_pfn(ptr));
-		homecache_flush_cache(page, 0);
-		ptr += thispage;
-		size -= thispage;
-	}
+	__dma_map_pa_range(dma_addr, size);
 
 
 	return dma_addr;
 	return dma_addr;
 }
 }
@@ -140,10 +141,8 @@ int dma_map_sg(struct device *dev, struct scatterlist *sglist, int nents,
 	WARN_ON(nents == 0 || sglist->length == 0);
 	WARN_ON(nents == 0 || sglist->length == 0);
 
 
 	for_each_sg(sglist, sg, nents, i) {
 	for_each_sg(sglist, sg, nents, i) {
-		struct page *page;
 		sg->dma_address = sg_phys(sg);
 		sg->dma_address = sg_phys(sg);
-		page = pfn_to_page(sg->dma_address >> PAGE_SHIFT);
-		homecache_flush_cache(page, 0);
+		__dma_map_pa_range(sg->dma_address, sg->length);
 	}
 	}
 
 
 	return nents;
 	return nents;
@@ -163,6 +162,7 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page,
 {
 {
 	BUG_ON(!valid_dma_direction(direction));
 	BUG_ON(!valid_dma_direction(direction));
 
 
+	BUG_ON(offset + size > PAGE_SIZE);
 	homecache_flush_cache(page, 0);
 	homecache_flush_cache(page, 0);
 
 
 	return page_to_pa(page) + offset;
 	return page_to_pa(page) + offset;

+ 5 - 1
arch/tile/kernel/process.c

@@ -165,7 +165,7 @@ void free_thread_info(struct thread_info *info)
 		kfree(step_state);
 		kfree(step_state);
 	}
 	}
 
 
-	free_page((unsigned long)info);
+	free_pages((unsigned long)info, THREAD_SIZE_ORDER);
 }
 }
 
 
 static void save_arch_state(struct thread_struct *t);
 static void save_arch_state(struct thread_struct *t);
@@ -574,6 +574,8 @@ SYSCALL_DEFINE4(execve, const char __user *, path,
 		goto out;
 		goto out;
 	error = do_execve(filename, argv, envp, regs);
 	error = do_execve(filename, argv, envp, regs);
 	putname(filename);
 	putname(filename);
+	if (error == 0)
+		single_step_execve();
 out:
 out:
 	return error;
 	return error;
 }
 }
@@ -593,6 +595,8 @@ long compat_sys_execve(const char __user *path,
 		goto out;
 		goto out;
 	error = compat_do_execve(filename, argv, envp, regs);
 	error = compat_do_execve(filename, argv, envp, regs);
 	putname(filename);
 	putname(filename);
+	if (error == 0)
+		single_step_execve();
 out:
 out:
 	return error;
 	return error;
 }
 }

+ 12 - 8
arch/tile/kernel/setup.c

@@ -59,6 +59,8 @@ unsigned long __initdata node_memmap_pfn[MAX_NUMNODES];
 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
 unsigned long __initdata node_percpu_pfn[MAX_NUMNODES];
 unsigned long __initdata node_free_pfn[MAX_NUMNODES];
 unsigned long __initdata node_free_pfn[MAX_NUMNODES];
 
 
+static unsigned long __initdata node_percpu[MAX_NUMNODES];
+
 #ifdef CONFIG_HIGHMEM
 #ifdef CONFIG_HIGHMEM
 /* Page frame index of end of lowmem on each controller. */
 /* Page frame index of end of lowmem on each controller. */
 unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
 unsigned long __cpuinitdata node_lowmem_end_pfn[MAX_NUMNODES];
@@ -554,7 +556,6 @@ static void __init setup_bootmem_allocator(void)
 		reserve_bootmem(crashk_res.start,
 		reserve_bootmem(crashk_res.start,
 			crashk_res.end - crashk_res.start + 1, 0);
 			crashk_res.end - crashk_res.start + 1, 0);
 #endif
 #endif
-
 }
 }
 
 
 void *__init alloc_remap(int nid, unsigned long size)
 void *__init alloc_remap(int nid, unsigned long size)
@@ -568,11 +569,13 @@ void *__init alloc_remap(int nid, unsigned long size)
 
 
 static int __init percpu_size(void)
 static int __init percpu_size(void)
 {
 {
-	int size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
-#ifdef CONFIG_MODULES
-	if (size < PERCPU_ENOUGH_ROOM)
-		size = PERCPU_ENOUGH_ROOM;
-#endif
+	int size = __per_cpu_end - __per_cpu_start;
+	size += PERCPU_MODULE_RESERVE;
+	size += PERCPU_DYNAMIC_EARLY_SIZE;
+	if (size < PCPU_MIN_UNIT_SIZE)
+		size = PCPU_MIN_UNIT_SIZE;
+	size = roundup(size, PAGE_SIZE);
+
 	/* In several places we assume the per-cpu data fits on a huge page. */
 	/* In several places we assume the per-cpu data fits on a huge page. */
 	BUG_ON(kdata_huge && size > HPAGE_SIZE);
 	BUG_ON(kdata_huge && size > HPAGE_SIZE);
 	return size;
 	return size;
@@ -589,7 +592,6 @@ static inline unsigned long alloc_bootmem_pfn(int size, unsigned long goal)
 static void __init zone_sizes_init(void)
 static void __init zone_sizes_init(void)
 {
 {
 	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
 	unsigned long zones_size[MAX_NR_ZONES] = { 0 };
-	unsigned long node_percpu[MAX_NUMNODES] = { 0 };
 	int size = percpu_size();
 	int size = percpu_size();
 	int num_cpus = smp_height * smp_width;
 	int num_cpus = smp_height * smp_width;
 	int i;
 	int i;
@@ -674,7 +676,7 @@ static void __init zone_sizes_init(void)
 		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
 		NODE_DATA(i)->bdata = NODE_DATA(0)->bdata;
 
 
 		free_area_init_node(i, zones_size, start, NULL);
 		free_area_init_node(i, zones_size, start, NULL);
-		printk(KERN_DEBUG "  DMA zone: %ld per-cpu pages\n",
+		printk(KERN_DEBUG "  Normal zone: %ld per-cpu pages\n",
 		       PFN_UP(node_percpu[i]));
 		       PFN_UP(node_percpu[i]));
 
 
 		/* Track the type of memory on each node */
 		/* Track the type of memory on each node */
@@ -1312,6 +1314,8 @@ static void *__init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
 
 
 	BUG_ON(size % PAGE_SIZE != 0);
 	BUG_ON(size % PAGE_SIZE != 0);
 	pfn_offset[nid] += size / PAGE_SIZE;
 	pfn_offset[nid] += size / PAGE_SIZE;
+	BUG_ON(node_percpu[nid] < size);
+	node_percpu[nid] -= size;
 	if (percpu_pfn[cpu] == 0)
 	if (percpu_pfn[cpu] == 0)
 		percpu_pfn[cpu] = pfn;
 		percpu_pfn[cpu] = pfn;
 	return pfn_to_kaddr(pfn);
 	return pfn_to_kaddr(pfn);

+ 19 - 2
arch/tile/kernel/single_step.c

@@ -56,7 +56,7 @@ enum mem_op {
 	MEMOP_STORE_POSTINCR
 	MEMOP_STORE_POSTINCR
 };
 };
 
 
-static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, int32_t offset)
+static inline tile_bundle_bits set_BrOff_X1(tile_bundle_bits n, s32 offset)
 {
 {
 	tile_bundle_bits result;
 	tile_bundle_bits result;
 
 
@@ -254,6 +254,18 @@ P("\n");
 	return bundle;
 	return bundle;
 }
 }
 
 
+/*
+ * Called after execve() has started the new image.  This allows us
+ * to reset the info state.  Note that the the mmap'ed memory, if there
+ * was any, has already been unmapped by the exec.
+ */
+void single_step_execve(void)
+{
+	struct thread_info *ti = current_thread_info();
+	kfree(ti->step_state);
+	ti->step_state = NULL;
+}
+
 /**
 /**
  * single_step_once() - entry point when single stepping has been triggered.
  * single_step_once() - entry point when single stepping has been triggered.
  * @regs: The machine register state
  * @regs: The machine register state
@@ -373,7 +385,7 @@ void single_step_once(struct pt_regs *regs)
 		/* branches */
 		/* branches */
 		case BRANCH_OPCODE_X1:
 		case BRANCH_OPCODE_X1:
 		{
 		{
-			int32_t offset = signExtend17(get_BrOff_X1(bundle));
+			s32 offset = signExtend17(get_BrOff_X1(bundle));
 
 
 			/*
 			/*
 			 * For branches, we use a rewriting trick to let the
 			 * For branches, we use a rewriting trick to let the
@@ -731,4 +743,9 @@ void single_step_once(struct pt_regs *regs)
 	__insn_mtspr(SPR_SINGLE_STEP_EN_K_K, 1 << USER_PL);
 	__insn_mtspr(SPR_SINGLE_STEP_EN_K_K, 1 << USER_PL);
 }
 }
 
 
+void single_step_execve(void)
+{
+	/* Nothing */
+}
+
 #endif /* !__tilegx__ */
 #endif /* !__tilegx__ */

+ 19 - 14
arch/tile/kernel/smp.c

@@ -36,6 +36,22 @@ static unsigned long __iomem *ipi_mappings[NR_CPUS];
 /* Set by smp_send_stop() to avoid recursive panics. */
 /* Set by smp_send_stop() to avoid recursive panics. */
 static int stopping_cpus;
 static int stopping_cpus;
 
 
+static void __send_IPI_many(HV_Recipient *recip, int nrecip, int tag)
+{
+	int sent = 0;
+	while (sent < nrecip) {
+		int rc = hv_send_message(recip, nrecip,
+					 (HV_VirtAddr)&tag, sizeof(tag));
+		if (rc < 0) {
+			if (!stopping_cpus)  /* avoid recursive panic */
+				panic("hv_send_message returned %d", rc);
+			break;
+		}
+		WARN_ONCE(rc == 0, "hv_send_message() returned zero\n");
+		sent += rc;
+	}
+}
+
 void send_IPI_single(int cpu, int tag)
 void send_IPI_single(int cpu, int tag)
 {
 {
 	HV_Recipient recip = {
 	HV_Recipient recip = {
@@ -43,14 +59,13 @@ void send_IPI_single(int cpu, int tag)
 		.x = cpu % smp_width,
 		.x = cpu % smp_width,
 		.state = HV_TO_BE_SENT
 		.state = HV_TO_BE_SENT
 	};
 	};
-	int rc = hv_send_message(&recip, 1, (HV_VirtAddr)&tag, sizeof(tag));
-	BUG_ON(rc <= 0);
+	__send_IPI_many(&recip, 1, tag);
 }
 }
 
 
 void send_IPI_many(const struct cpumask *mask, int tag)
 void send_IPI_many(const struct cpumask *mask, int tag)
 {
 {
 	HV_Recipient recip[NR_CPUS];
 	HV_Recipient recip[NR_CPUS];
-	int cpu, sent;
+	int cpu;
 	int nrecip = 0;
 	int nrecip = 0;
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	for_each_cpu(cpu, mask) {
 	for_each_cpu(cpu, mask) {
@@ -61,17 +76,7 @@ void send_IPI_many(const struct cpumask *mask, int tag)
 		r->x = cpu % smp_width;
 		r->x = cpu % smp_width;
 		r->state = HV_TO_BE_SENT;
 		r->state = HV_TO_BE_SENT;
 	}
 	}
-	sent = 0;
-	while (sent < nrecip) {
-		int rc = hv_send_message(recip, nrecip,
-					 (HV_VirtAddr)&tag, sizeof(tag));
-		if (rc <= 0) {
-			if (!stopping_cpus)  /* avoid recursive panic */
-				panic("hv_send_message returned %d", rc);
-			break;
-		}
-		sent += rc;
-	}
+	__send_IPI_many(recip, nrecip, tag);
 }
 }
 
 
 void send_IPI_allbutself(int tag)
 void send_IPI_allbutself(int tag)

+ 19 - 9
arch/tile/kernel/stack.c

@@ -44,13 +44,6 @@ static int in_kernel_stack(struct KBacktraceIterator *kbt, VirtualAddress sp)
 	return sp >= kstack_base && sp < kstack_base + THREAD_SIZE;
 	return sp >= kstack_base && sp < kstack_base + THREAD_SIZE;
 }
 }
 
 
-/* Is address in the specified kernel code? */
-static int in_kernel_text(VirtualAddress address)
-{
-	return (address >= MEM_SV_INTRPT &&
-		address < MEM_SV_INTRPT + HPAGE_SIZE);
-}
-
 /* Is address valid for reading? */
 /* Is address valid for reading? */
 static int valid_address(struct KBacktraceIterator *kbt, VirtualAddress address)
 static int valid_address(struct KBacktraceIterator *kbt, VirtualAddress address)
 {
 {
@@ -63,6 +56,23 @@ static int valid_address(struct KBacktraceIterator *kbt, VirtualAddress address)
 	if (l1_pgtable == NULL)
 	if (l1_pgtable == NULL)
 		return 0;	/* can't read user space in other tasks */
 		return 0;	/* can't read user space in other tasks */
 
 
+#ifdef CONFIG_64BIT
+	/* Find the real l1_pgtable by looking in the l0_pgtable. */
+	pte = l1_pgtable[HV_L0_INDEX(address)];
+	if (!hv_pte_get_present(pte))
+		return 0;
+	pfn = hv_pte_get_pfn(pte);
+	if (pte_huge(pte)) {
+		if (!pfn_valid(pfn)) {
+			pr_err("L0 huge page has bad pfn %#lx\n", pfn);
+			return 0;
+		}
+		return hv_pte_get_present(pte) && hv_pte_get_readable(pte);
+	}
+	page = pfn_to_page(pfn);
+	BUG_ON(PageHighMem(page));  /* No HIGHMEM on 64-bit. */
+	l1_pgtable = (HV_PTE *)pfn_to_kaddr(pfn);
+#endif
 	pte = l1_pgtable[HV_L1_INDEX(address)];
 	pte = l1_pgtable[HV_L1_INDEX(address)];
 	if (!hv_pte_get_present(pte))
 	if (!hv_pte_get_present(pte))
 		return 0;
 		return 0;
@@ -92,7 +102,7 @@ static bool read_memory_func(void *result, VirtualAddress address,
 {
 {
 	int retval;
 	int retval;
 	struct KBacktraceIterator *kbt = (struct KBacktraceIterator *)vkbt;
 	struct KBacktraceIterator *kbt = (struct KBacktraceIterator *)vkbt;
-	if (in_kernel_text(address)) {
+	if (__kernel_text_address(address)) {
 		/* OK to read kernel code. */
 		/* OK to read kernel code. */
 	} else if (address >= PAGE_OFFSET) {
 	} else if (address >= PAGE_OFFSET) {
 		/* We only tolerate kernel-space reads of this task's stack */
 		/* We only tolerate kernel-space reads of this task's stack */
@@ -132,7 +142,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 		}
 		}
 	}
 	}
 	if (EX1_PL(p->ex1) == KERNEL_PL &&
 	if (EX1_PL(p->ex1) == KERNEL_PL &&
-	    in_kernel_text(p->pc) &&
+	    __kernel_text_address(p->pc) &&
 	    in_kernel_stack(kbt, p->sp) &&
 	    in_kernel_stack(kbt, p->sp) &&
 	    p->sp >= sp) {
 	    p->sp >= sp) {
 		if (kbt->verbose)
 		if (kbt->verbose)

+ 10 - 0
arch/tile/kernel/time.c

@@ -224,3 +224,13 @@ int setup_profiling_timer(unsigned int multiplier)
 {
 {
 	return -EINVAL;
 	return -EINVAL;
 }
 }
+
+/*
+ * Use the tile timer to convert nsecs to core clock cycles, relying
+ * on it having the same frequency as SPR_CYCLE.
+ */
+cycles_t ns2cycles(unsigned long nsecs)
+{
+	struct clock_event_device *dev = &__get_cpu_var(tile_timer);
+	return ((u64)nsecs * dev->mult) >> dev->shift;
+}

+ 1 - 4
arch/tile/kernel/vmlinux.lds.S

@@ -59,10 +59,7 @@ SECTIONS
 
 
   . = ALIGN(PAGE_SIZE);
   . = ALIGN(PAGE_SIZE);
   VMLINUX_SYMBOL(_sinitdata) = .;
   VMLINUX_SYMBOL(_sinitdata) = .;
-  .init.page : AT (ADDR(.init.page) - LOAD_OFFSET) {
-    *(.init.page)
-  } :data =0
-  INIT_DATA_SECTION(16)
+  INIT_DATA_SECTION(16) :data =0
   PERCPU(L2_CACHE_BYTES, PAGE_SIZE)
   PERCPU(L2_CACHE_BYTES, PAGE_SIZE)
   . = ALIGN(PAGE_SIZE);
   . = ALIGN(PAGE_SIZE);
   VMLINUX_SYMBOL(_einitdata) = .;
   VMLINUX_SYMBOL(_einitdata) = .;

+ 2 - 3
arch/tile/lib/Makefile

@@ -2,9 +2,8 @@
 # Makefile for TILE-specific library files..
 # Makefile for TILE-specific library files..
 #
 #
 
 
-lib-y = cacheflush.o checksum.o cpumask.o delay.o \
-	mb_incoherent.o uaccess.o memmove.o \
-	memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
+lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \
+	memmove.o memcpy_$(BITS).o memchr_$(BITS).o memset_$(BITS).o \
 	strchr_$(BITS).o strlen_$(BITS).o
 	strchr_$(BITS).o strlen_$(BITS).o
 
 
 ifeq ($(CONFIG_TILEGX),y)
 ifeq ($(CONFIG_TILEGX),y)

+ 2 - 3
arch/tile/lib/atomic_32.c

@@ -46,14 +46,13 @@ struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE]
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 #else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 
 /* This page is remapped on startup to be hash-for-home. */
 /* This page is remapped on startup to be hash-for-home. */
-int atomic_locks[PAGE_SIZE / sizeof(int) /* Only ATOMIC_HASH_SIZE is used */]
-  __attribute__((aligned(PAGE_SIZE), section(".bss.page_aligned")));
+int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss;
 
 
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 #endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */
 
 
 static inline int *__atomic_hashed_lock(volatile void *v)
 static inline int *__atomic_hashed_lock(volatile void *v)
 {
 {
-	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec.S */
+	/* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
 #if ATOMIC_LOCKS_FOUND_VIA_TABLE()
 	unsigned long i =
 	unsigned long i =
 		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));
 		(unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long));

+ 1 - 1
arch/tile/lib/atomic_asm_32.S

@@ -14,7 +14,7 @@
  * Support routines for atomic operations.  Each function takes:
  * Support routines for atomic operations.  Each function takes:
  *
  *
  * r0: address to manipulate
  * r0: address to manipulate
- * r1: pointer to atomic lock guarding this operation (for FUTEX_LOCK_REG)
+ * r1: pointer to atomic lock guarding this operation (for ATOMIC_LOCK_REG)
  * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  * r2: new value to write, or for cmpxchg/add_unless, value to compare against
  * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  * r3: (cmpxchg/xchg_add_unless) new value to write or add;
  *     (atomic64 ops) high word of value to write
  *     (atomic64 ops) high word of value to write

+ 102 - 0
arch/tile/lib/cacheflush.c

@@ -21,3 +21,105 @@ void __flush_icache_range(unsigned long start, unsigned long end)
 {
 {
 	invalidate_icache((const void *)start, end - start, PAGE_SIZE);
 	invalidate_icache((const void *)start, end - start, PAGE_SIZE);
 }
 }
+
+
+/* Force a load instruction to issue. */
+static inline void force_load(char *p)
+{
+	*(volatile char *)p;
+}
+
+/*
+ * Flush and invalidate a VA range that is homed remotely on a single
+ * core (if "!hfh") or homed via hash-for-home (if "hfh"), waiting
+ * until the memory controller holds the flushed values.
+ */
+void finv_buffer_remote(void *buffer, size_t size, int hfh)
+{
+	char *p, *base;
+	size_t step_size, load_count;
+	const unsigned long STRIPE_WIDTH = 8192;
+
+	/*
+	 * Flush and invalidate the buffer out of the local L1/L2
+	 * and request the home cache to flush and invalidate as well.
+	 */
+	__finv_buffer(buffer, size);
+
+	/*
+	 * Wait for the home cache to acknowledge that it has processed
+	 * all the flush-and-invalidate requests.  This does not mean
+	 * that the flushed data has reached the memory controller yet,
+	 * but it does mean the home cache is processing the flushes.
+	 */
+	__insn_mf();
+
+	/*
+	 * Issue a load to the last cache line, which can't complete
+	 * until all the previously-issued flushes to the same memory
+	 * controller have also completed.  If we weren't striping
+	 * memory, that one load would be sufficient, but since we may
+	 * be, we also need to back up to the last load issued to
+	 * another memory controller, which would be the point where
+	 * we crossed an 8KB boundary (the granularity of striping
+	 * across memory controllers).  Keep backing up and doing this
+	 * until we are before the beginning of the buffer, or have
+	 * hit all the controllers.
+	 *
+	 * If we are flushing a hash-for-home buffer, it's even worse.
+	 * Each line may be homed on a different tile, and each tile
+	 * may have up to four lines that are on different
+	 * controllers.  So as we walk backwards, we have to touch
+	 * enough cache lines to satisfy these constraints.  In
+	 * practice this ends up being close enough to "load from
+	 * every cache line on a full memory stripe on each
+	 * controller" that we simply do that, to simplify the logic.
+	 *
+	 * FIXME: See bug 9535 for some issues with this code.
+	 */
+	if (hfh) {
+		step_size = L2_CACHE_BYTES;
+		load_count = (STRIPE_WIDTH / L2_CACHE_BYTES) *
+			      (1 << CHIP_LOG_NUM_MSHIMS());
+	} else {
+		step_size = STRIPE_WIDTH;
+		load_count = (1 << CHIP_LOG_NUM_MSHIMS());
+	}
+
+	/* Load the last byte of the buffer. */
+	p = (char *)buffer + size - 1;
+	force_load(p);
+
+	/* Bump down to the end of the previous stripe or cache line. */
+	p -= step_size;
+	p = (char *)((unsigned long)p | (step_size - 1));
+
+	/* Figure out how far back we need to go. */
+	base = p - (step_size * (load_count - 2));
+	if ((long)base < (long)buffer)
+		base = buffer;
+
+	/*
+	 * Fire all the loads we need.  The MAF only has eight entries
+	 * so we can have at most eight outstanding loads, so we
+	 * unroll by that amount.
+	 */
+#pragma unroll 8
+	for (; p >= base; p -= step_size)
+		force_load(p);
+
+	/*
+	 * Repeat, but with inv's instead of loads, to get rid of the
+	 * data we just loaded into our own cache and the old home L3.
+	 * No need to unroll since inv's don't target a register.
+	 */
+	p = (char *)buffer + size - 1;
+	__insn_inv(p);
+	p -= step_size;
+	p = (char *)((unsigned long)p | (step_size - 1));
+	for (; p >= base; p -= step_size)
+		__insn_inv(p);
+
+	/* Wait for the load+inv's (and thus finvs) to have completed. */
+	__insn_mf();
+}

+ 16 - 5
arch/tile/lib/delay.c

@@ -15,20 +15,31 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
 #include <linux/thread_info.h>
 #include <linux/thread_info.h>
-#include <asm/fixmap.h>
-#include <hv/hypervisor.h>
+#include <asm/timex.h>
 
 
 void __udelay(unsigned long usecs)
 void __udelay(unsigned long usecs)
 {
 {
-	hv_nanosleep(usecs * 1000);
+	if (usecs > ULONG_MAX / 1000) {
+		WARN_ON_ONCE(usecs > ULONG_MAX / 1000);
+		usecs = ULONG_MAX / 1000;
+	}
+	__ndelay(usecs * 1000);
 }
 }
 EXPORT_SYMBOL(__udelay);
 EXPORT_SYMBOL(__udelay);
 
 
 void __ndelay(unsigned long nsecs)
 void __ndelay(unsigned long nsecs)
 {
 {
-	hv_nanosleep(nsecs);
+	cycles_t target = get_cycles();
+	target += ns2cycles(nsecs);
+	while (get_cycles() < target)
+		cpu_relax();
 }
 }
 EXPORT_SYMBOL(__ndelay);
 EXPORT_SYMBOL(__ndelay);
 
 
-/* FIXME: should be declared in a header somewhere. */
+void __delay(unsigned long cycles)
+{
+	cycles_t target = get_cycles() + cycles;
+	while (get_cycles() < target)
+		cpu_relax();
+}
 EXPORT_SYMBOL(__delay);
 EXPORT_SYMBOL(__delay);

+ 7 - 3
arch/tile/lib/exports.c

@@ -29,6 +29,9 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strnlen_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(strncpy_from_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
 EXPORT_SYMBOL(clear_user_asm);
+EXPORT_SYMBOL(flush_user_asm);
+EXPORT_SYMBOL(inv_user_asm);
+EXPORT_SYMBOL(finv_user_asm);
 
 
 /* arch/tile/kernel/entry.S */
 /* arch/tile/kernel/entry.S */
 #include <linux/kernel.h>
 #include <linux/kernel.h>
@@ -45,9 +48,6 @@ EXPORT_SYMBOL(__copy_from_user_zeroing);
 EXPORT_SYMBOL(__copy_in_user_inatomic);
 EXPORT_SYMBOL(__copy_in_user_inatomic);
 #endif
 #endif
 
 
-/* arch/tile/lib/mb_incoherent.S */
-EXPORT_SYMBOL(__mb_incoherent);
-
 /* hypervisor glue */
 /* hypervisor glue */
 #include <hv/hypervisor.h>
 #include <hv/hypervisor.h>
 EXPORT_SYMBOL(hv_dev_open);
 EXPORT_SYMBOL(hv_dev_open);
@@ -85,4 +85,8 @@ int64_t __muldi3(int64_t, int64_t);
 EXPORT_SYMBOL(__muldi3);
 EXPORT_SYMBOL(__muldi3);
 uint64_t __lshrdi3(uint64_t, unsigned int);
 uint64_t __lshrdi3(uint64_t, unsigned int);
 EXPORT_SYMBOL(__lshrdi3);
 EXPORT_SYMBOL(__lshrdi3);
+uint64_t __ashrdi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashrdi3);
+uint64_t __ashldi3(uint64_t, unsigned int);
+EXPORT_SYMBOL(__ashldi3);
 #endif
 #endif

+ 0 - 34
arch/tile/lib/mb_incoherent.S

@@ -1,34 +0,0 @@
-/*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
- *
- *   This program is free software; you can redistribute it and/or
- *   modify it under the terms of the GNU General Public License
- *   as published by the Free Software Foundation, version 2.
- *
- *   This program is distributed in the hope that it will be useful, but
- *   WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- *   NON INFRINGEMENT.  See the GNU General Public License for
- *   more details.
- *
- * Assembly code for invoking the HV's fence_incoherent syscall.
- */
-
-#include <linux/linkage.h>
-#include <hv/syscall_public.h>
-#include <arch/abi.h>
-#include <arch/chip.h>
-
-#if !CHIP_HAS_MF_WAITS_FOR_VICTIMS()
-
-/*
- * Invoke the hypervisor's fence_incoherent syscall, which guarantees
- * that all victims for cachelines homed on this tile have reached memory.
- */
-STD_ENTRY(__mb_incoherent)
-	moveli TREG_SYSCALL_NR_NAME, HV_SYS_fence_incoherent
-	swint2
-	jrp lr
-	STD_ENDPROC(__mb_incoherent)
-
-#endif

+ 2 - 2
arch/tile/lib/memcpy_tile64.c

@@ -96,7 +96,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
 	newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1));
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
 	pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc);
 	ptep = pte_offset_kernel(pmdp, newsrc);
 	ptep = pte_offset_kernel(pmdp, newsrc);
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 
 	/* Actually move the data. */
 	/* Actually move the data. */
@@ -109,7 +109,7 @@ static void memcpy_multicache(void *dest, const void *source,
 	 */
 	 */
 	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
 	src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3);
 	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
 	src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */
-	*ptep = src_pte;   /* set_pte() would be confused by this */
+	__set_pte(ptep, src_pte);   /* set_pte() would be confused by this */
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 	local_flush_tlb_page(NULL, newsrc, PAGE_SIZE);
 
 
 	/*
 	/*

+ 96 - 65
arch/tile/lib/spinlock_32.c

@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/module.h>
 #include <asm/processor.h>
 #include <asm/processor.h>
+#include <arch/spr_def.h>
 
 
 #include "spinlock_common.h"
 #include "spinlock_common.h"
 
 
@@ -91,75 +92,75 @@ EXPORT_SYMBOL(arch_spin_unlock_wait);
 #define RD_COUNT_MASK   ((1 << RD_COUNT_WIDTH) - 1)
 #define RD_COUNT_MASK   ((1 << RD_COUNT_WIDTH) - 1)
 
 
 
 
-/* Lock the word, spinning until there are no tns-ers. */
-static inline u32 get_rwlock(arch_rwlock_t *rwlock)
-{
-	u32 iterations = 0;
-	for (;;) {
-		u32 val = __insn_tns((int *)&rwlock->lock);
-		if (unlikely(val & 1)) {
-			delay_backoff(iterations++);
-			continue;
-		}
-		return val;
-	}
-}
-
-int arch_read_trylock_slow(arch_rwlock_t *rwlock)
-{
-	u32 val = get_rwlock(rwlock);
-	int locked = (val << RD_COUNT_WIDTH) == 0;
-	rwlock->lock = val + (locked << RD_COUNT_SHIFT);
-	return locked;
-}
-EXPORT_SYMBOL(arch_read_trylock_slow);
-
-void arch_read_unlock_slow(arch_rwlock_t *rwlock)
-{
-	u32 val = get_rwlock(rwlock);
-	rwlock->lock = val - (1 << RD_COUNT_SHIFT);
-}
-EXPORT_SYMBOL(arch_read_unlock_slow);
-
-void arch_write_unlock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We can get the read lock if everything but the reader bits (which
+ * are in the high part of the word) is zero, i.e. no active or
+ * waiting writers, no tns.
+ *
+ * We guard the tns/store-back with an interrupt critical section to
+ * preserve the semantic that the same read lock can be acquired in an
+ * interrupt context.
+ */
+inline int arch_read_trylock(arch_rwlock_t *rwlock)
 {
 {
-	u32 eq, mask = 1 << WR_CURR_SHIFT;
-	while (unlikely(val & 1)) {
-		/* Limited backoff since we are the highest-priority task. */
-		relax(4);
-		val = __insn_tns((int *)&rwlock->lock);
+	u32 val;
+	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+	val = __insn_tns((int *)&rwlock->lock);
+	if (likely((val << _RD_COUNT_WIDTH) == 0)) {
+		val += 1 << RD_COUNT_SHIFT;
+		rwlock->lock = val;
+		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+		BUG_ON(val == 0);  /* we don't expect wraparound */
+		return 1;
 	}
 	}
-	val = __insn_addb(val, mask);
-	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
-	val = __insn_mz(eq & mask, val);
-	rwlock->lock = val;
+	if ((val & 1) == 0)
+		rwlock->lock = val;
+	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+	return 0;
 }
 }
-EXPORT_SYMBOL(arch_write_unlock_slow);
+EXPORT_SYMBOL(arch_read_trylock);
 
 
 /*
 /*
- * We spin until everything but the reader bits (which are in the high
- * part of the word) are zero, i.e. no active or waiting writers, no tns.
- *
+ * Spin doing arch_read_trylock() until we acquire the lock.
  * ISSUE: This approach can permanently starve readers.  A reader who sees
  * ISSUE: This approach can permanently starve readers.  A reader who sees
  * a writer could instead take a ticket lock (just like a writer would),
  * a writer could instead take a ticket lock (just like a writer would),
  * and atomically enter read mode (with 1 reader) when it gets the ticket.
  * and atomically enter read mode (with 1 reader) when it gets the ticket.
- * This way both readers and writers will always make forward progress
+ * This way both readers and writers would always make forward progress
  * in a finite time.
  * in a finite time.
  */
  */
-void arch_read_lock_slow(arch_rwlock_t *rwlock, u32 val)
+void arch_read_lock(arch_rwlock_t *rwlock)
 {
 {
 	u32 iterations = 0;
 	u32 iterations = 0;
-	do {
-		if (!(val & 1))
-			rwlock->lock = val;
+	while (unlikely(!arch_read_trylock(rwlock)))
 		delay_backoff(iterations++);
 		delay_backoff(iterations++);
+}
+EXPORT_SYMBOL(arch_read_lock);
+
+void arch_read_unlock(arch_rwlock_t *rwlock)
+{
+	u32 val, iterations = 0;
+
+	mb();  /* guarantee anything modified under the lock is visible */
+	for (;;) {
+		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
 		val = __insn_tns((int *)&rwlock->lock);
 		val = __insn_tns((int *)&rwlock->lock);
-	} while ((val << RD_COUNT_WIDTH) != 0);
-	rwlock->lock = val + (1 << RD_COUNT_SHIFT);
+		if (likely(val & 1) == 0) {
+			rwlock->lock = val - (1 << _RD_COUNT_SHIFT);
+			__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+			break;
+		}
+		__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+		delay_backoff(iterations++);
+	}
 }
 }
-EXPORT_SYMBOL(arch_read_lock_slow);
+EXPORT_SYMBOL(arch_read_unlock);
 
 
-void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
+/*
+ * We don't need an interrupt critical section here (unlike for
+ * arch_read_lock) since we should never use a bare write lock where
+ * it could be interrupted by code that could try to re-acquire it.
+ */
+void arch_write_lock(arch_rwlock_t *rwlock)
 {
 {
 	/*
 	/*
 	 * The trailing underscore on this variable (and curr_ below)
 	 * The trailing underscore on this variable (and curr_ below)
@@ -168,6 +169,12 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
 	 */
 	 */
 	u32 my_ticket_;
 	u32 my_ticket_;
 	u32 iterations = 0;
 	u32 iterations = 0;
+	u32 val = __insn_tns((int *)&rwlock->lock);
+
+	if (likely(val == 0)) {
+		rwlock->lock = 1 << _WR_NEXT_SHIFT;
+		return;
+	}
 
 
 	/*
 	/*
 	 * Wait until there are no readers, then bump up the next
 	 * Wait until there are no readers, then bump up the next
@@ -206,23 +213,47 @@ void arch_write_lock_slow(arch_rwlock_t *rwlock, u32 val)
 			relax(4);
 			relax(4);
 	}
 	}
 }
 }
-EXPORT_SYMBOL(arch_write_lock_slow);
+EXPORT_SYMBOL(arch_write_lock);
 
 
-int __tns_atomic_acquire(atomic_t *lock)
+int arch_write_trylock(arch_rwlock_t *rwlock)
 {
 {
-	int ret;
-	u32 iterations = 0;
+	u32 val = __insn_tns((int *)&rwlock->lock);
 
 
-	BUG_ON(__insn_mfspr(SPR_INTERRUPT_CRITICAL_SECTION));
-	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 1);
+	/*
+	 * If a tns is in progress, or there's a waiting or active locker,
+	 * or active readers, we can't take the lock, so give up.
+	 */
+	if (unlikely(val != 0)) {
+		if (!(val & 1))
+			rwlock->lock = val;
+		return 0;
+	}
 
 
-	while ((ret = __insn_tns((void *)&lock->counter)) == 1)
-		delay_backoff(iterations++);
-	return ret;
+	/* Set the "next" field to mark it locked. */
+	rwlock->lock = 1 << _WR_NEXT_SHIFT;
+	return 1;
 }
 }
+EXPORT_SYMBOL(arch_write_trylock);
 
 
-void __tns_atomic_release(atomic_t *p, int v)
+void arch_write_unlock(arch_rwlock_t *rwlock)
 {
 {
-	p->counter = v;
-	__insn_mtspr(SPR_INTERRUPT_CRITICAL_SECTION, 0);
+	u32 val, eq, mask;
+
+	mb();  /* guarantee anything modified under the lock is visible */
+	val = __insn_tns((int *)&rwlock->lock);
+	if (likely(val == (1 << _WR_NEXT_SHIFT))) {
+		rwlock->lock = 0;
+		return;
+	}
+	while (unlikely(val & 1)) {
+		/* Limited backoff since we are the highest-priority task. */
+		relax(4);
+		val = __insn_tns((int *)&rwlock->lock);
+	}
+	mask = 1 << WR_CURR_SHIFT;
+	val = __insn_addb(val, mask);
+	eq = __insn_seqb(val, val << (WR_CURR_SHIFT - WR_NEXT_SHIFT));
+	val = __insn_mz(eq & mask, val);
+	rwlock->lock = val;
 }
 }
+EXPORT_SYMBOL(arch_write_unlock);

+ 0 - 8
arch/tile/mm/fault.c

@@ -654,14 +654,6 @@ struct intvec_state do_page_fault_ics(struct pt_regs *regs, int fault_num,
 		regs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
 		regs->ex1 = PL_ICS_EX1(KERNEL_PL, 0);
 	}
 	}
 
 
-	/*
-	 * NOTE: the one other type of access that might bring us here
-	 * are the memory ops in __tns_atomic_acquire/__tns_atomic_release,
-	 * but we don't have to check specially for them since we can
-	 * always safely return to the address of the fault and retry,
-	 * since no separate atomic locks are involved.
-	 */
-
 	/*
 	/*
 	 * Now that we have released the atomic lock (if necessary),
 	 * Now that we have released the atomic lock (if necessary),
 	 * it's safe to spin if the PTE that caused the fault was migrating.
 	 * it's safe to spin if the PTE that caused the fault was migrating.

+ 32 - 6
arch/tile/mm/homecache.c

@@ -179,23 +179,46 @@ void flush_remote(unsigned long cache_pfn, unsigned long cache_control,
 	panic("Unsafe to continue.");
 	panic("Unsafe to continue.");
 }
 }
 
 
+void flush_remote_page(struct page *page, int order)
+{
+	int i, pages = (1 << order);
+	for (i = 0; i < pages; ++i, ++page) {
+		void *p = kmap_atomic(page);
+		int hfh = 0;
+		int home = page_home(page);
+#if CHIP_HAS_CBOX_HOME_MAP()
+		if (home == PAGE_HOME_HASH)
+			hfh = 1;
+		else
+#endif
+			BUG_ON(home < 0 || home >= NR_CPUS);
+		finv_buffer_remote(p, PAGE_SIZE, hfh);
+		kunmap_atomic(p);
+	}
+}
+
 void homecache_evict(const struct cpumask *mask)
 void homecache_evict(const struct cpumask *mask)
 {
 {
 	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
 	flush_remote(0, HV_FLUSH_EVICT_L2, mask, 0, 0, 0, NULL, NULL, 0);
 }
 }
 
 
-/* Return a mask of the cpus whose caches currently own these pages. */
-static void homecache_mask(struct page *page, int pages,
-			   struct cpumask *home_mask)
+/*
+ * Return a mask of the cpus whose caches currently own these pages.
+ * The return value is whether the pages are all coherently cached
+ * (i.e. none are immutable, incoherent, or uncached).
+ */
+static int homecache_mask(struct page *page, int pages,
+			  struct cpumask *home_mask)
 {
 {
 	int i;
 	int i;
+	int cached_coherently = 1;
 	cpumask_clear(home_mask);
 	cpumask_clear(home_mask);
 	for (i = 0; i < pages; ++i) {
 	for (i = 0; i < pages; ++i) {
 		int home = page_home(&page[i]);
 		int home = page_home(&page[i]);
 		if (home == PAGE_HOME_IMMUTABLE ||
 		if (home == PAGE_HOME_IMMUTABLE ||
 		    home == PAGE_HOME_INCOHERENT) {
 		    home == PAGE_HOME_INCOHERENT) {
 			cpumask_copy(home_mask, cpu_possible_mask);
 			cpumask_copy(home_mask, cpu_possible_mask);
-			return;
+			return 0;
 		}
 		}
 #if CHIP_HAS_CBOX_HOME_MAP()
 #if CHIP_HAS_CBOX_HOME_MAP()
 		if (home == PAGE_HOME_HASH) {
 		if (home == PAGE_HOME_HASH) {
@@ -203,11 +226,14 @@ static void homecache_mask(struct page *page, int pages,
 			continue;
 			continue;
 		}
 		}
 #endif
 #endif
-		if (home == PAGE_HOME_UNCACHED)
+		if (home == PAGE_HOME_UNCACHED) {
+			cached_coherently = 0;
 			continue;
 			continue;
+		}
 		BUG_ON(home < 0 || home >= NR_CPUS);
 		BUG_ON(home < 0 || home >= NR_CPUS);
 		cpumask_set_cpu(home, home_mask);
 		cpumask_set_cpu(home, home_mask);
 	}
 	}
+	return cached_coherently;
 }
 }
 
 
 /*
 /*
@@ -386,7 +412,7 @@ void homecache_change_page_home(struct page *page, int order, int home)
 		pte_t *ptep = virt_to_pte(NULL, kva);
 		pte_t *ptep = virt_to_pte(NULL, kva);
 		pte_t pteval = *ptep;
 		pte_t pteval = *ptep;
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
 		BUG_ON(!pte_present(pteval) || pte_huge(pteval));
-		*ptep = pte_set_home(pteval, home);
+		__set_pte(ptep, pte_set_home(pteval, home));
 	}
 	}
 }
 }
 
 

+ 15 - 19
arch/tile/mm/init.c

@@ -53,22 +53,11 @@
 
 
 #include "migrate.h"
 #include "migrate.h"
 
 
-/*
- * We could set FORCE_MAX_ZONEORDER to "(HPAGE_SHIFT - PAGE_SHIFT + 1)"
- * in the Tile Kconfig, but this generates configure warnings.
- * Do it here and force people to get it right to compile this file.
- * The problem is that with 4KB small pages and 16MB huge pages,
- * the default value doesn't allow us to group enough small pages
- * together to make up a huge page.
- */
-#if CONFIG_FORCE_MAX_ZONEORDER < HPAGE_SHIFT - PAGE_SHIFT + 1
-# error "Change FORCE_MAX_ZONEORDER in arch/tile/Kconfig to match page size"
-#endif
-
 #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
 #define clear_pgd(pmdptr) (*(pmdptr) = hv_pte(0))
 
 
 #ifndef __tilegx__
 #ifndef __tilegx__
 unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
 unsigned long VMALLOC_RESERVE = CONFIG_VMALLOC_RESERVE;
+EXPORT_SYMBOL(VMALLOC_RESERVE);
 #endif
 #endif
 
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -445,7 +434,7 @@ static pmd_t *__init get_pmd(pgd_t pgtables[], unsigned long va)
 
 
 /* Temporary page table we use for staging. */
 /* Temporary page table we use for staging. */
 static pgd_t pgtables[PTRS_PER_PGD]
 static pgd_t pgtables[PTRS_PER_PGD]
- __attribute__((section(".init.page")));
+ __attribute__((aligned(HV_PAGE_TABLE_ALIGN)));
 
 
 /*
 /*
  * This maps the physical memory to kernel virtual address space, a total
  * This maps the physical memory to kernel virtual address space, a total
@@ -653,6 +642,17 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 	memcpy(pgd_base, pgtables, sizeof(pgtables));
 	memcpy(pgd_base, pgtables, sizeof(pgtables));
 	__install_page_table(pgd_base, __get_cpu_var(current_asid),
 	__install_page_table(pgd_base, __get_cpu_var(current_asid),
 			     swapper_pgprot);
 			     swapper_pgprot);
+
+	/*
+	 * We just read swapper_pgprot and thus brought it into the cache,
+	 * with its new home & caching mode.  When we start the other CPUs,
+	 * they're going to reference swapper_pgprot via their initial fake
+	 * VA-is-PA mappings, which cache everything locally.  At that
+	 * time, if it's in our cache with a conflicting home, the
+	 * simulator's coherence checker will complain.  So, flush it out
+	 * of our cache; we're not going to ever use it again anyway.
+	 */
+	__insn_finv(&swapper_pgprot);
 }
 }
 
 
 /*
 /*
@@ -950,11 +950,7 @@ struct kmem_cache *pgd_cache;
 
 
 void __init pgtable_cache_init(void)
 void __init pgtable_cache_init(void)
 {
 {
-	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
-				0,
-				NULL);
+	pgd_cache = kmem_cache_create("pgd", SIZEOF_PGD, SIZEOF_PGD, 0, NULL);
 	if (!pgd_cache)
 	if (!pgd_cache)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
 }
@@ -989,7 +985,7 @@ static long __write_once initfree = 1;
 static int __init set_initfree(char *str)
 static int __init set_initfree(char *str)
 {
 {
 	long val;
 	long val;
-	if (strict_strtol(str, 0, &val)) {
+	if (strict_strtol(str, 0, &val) == 0) {
 		initfree = val;
 		initfree = val;
 		pr_info("initfree: %s free init pages\n",
 		pr_info("initfree: %s free init pages\n",
 			initfree ? "will" : "won't");
 			initfree ? "will" : "won't");

+ 1 - 0
arch/tile/mm/migrate_32.S

@@ -18,6 +18,7 @@
 #include <linux/linkage.h>
 #include <linux/linkage.h>
 #include <linux/threads.h>
 #include <linux/threads.h>
 #include <asm/page.h>
 #include <asm/page.h>
+#include <asm/thread_info.h>
 #include <asm/types.h>
 #include <asm/types.h>
 #include <asm/asm-offsets.h>
 #include <asm/asm-offsets.h>
 #include <hv/hypervisor.h>
 #include <hv/hypervisor.h>

+ 141 - 40
arch/tile/mm/pgtable.c

@@ -142,6 +142,76 @@ pte_t *_pte_offset_map(pmd_t *dir, unsigned long address)
 }
 }
 #endif
 #endif
 
 
+/**
+ * shatter_huge_page() - ensure a given address is mapped by a small page.
+ *
+ * This function converts a huge PTE mapping kernel LOWMEM into a bunch
+ * of small PTEs with the same caching.  No cache flush required, but we
+ * must do a global TLB flush.
+ *
+ * Any caller that wishes to modify a kernel mapping that might
+ * have been made with a huge page should call this function,
+ * since doing so properly avoids race conditions with installing the
+ * newly-shattered page and then flushing all the TLB entries.
+ *
+ * @addr: Address at which to shatter any existing huge page.
+ */
+void shatter_huge_page(unsigned long addr)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long flags = 0;  /* happy compiler */
+#ifdef __PAGETABLE_PMD_FOLDED
+	struct list_head *pos;
+#endif
+
+	/* Get a pointer to the pmd entry that we need to change. */
+	addr &= HPAGE_MASK;
+	BUG_ON(pgd_addr_invalid(addr));
+	BUG_ON(addr < PAGE_OFFSET);  /* only for kernel LOWMEM */
+	pgd = swapper_pg_dir + pgd_index(addr);
+	pud = pud_offset(pgd, addr);
+	BUG_ON(!pud_present(*pud));
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(!pmd_present(*pmd));
+	if (!pmd_huge_page(*pmd))
+		return;
+
+	/*
+	 * Grab the pgd_lock, since we may need it to walk the pgd_list,
+	 * and since we need some kind of lock here to avoid races.
+	 */
+	spin_lock_irqsave(&pgd_lock, flags);
+	if (!pmd_huge_page(*pmd)) {
+		/* Lost the race to convert the huge page. */
+		spin_unlock_irqrestore(&pgd_lock, flags);
+		return;
+	}
+
+	/* Shatter the huge page into the preallocated L2 page table. */
+	pmd_populate_kernel(&init_mm, pmd,
+			    get_prealloc_pte(pte_pfn(*(pte_t *)pmd)));
+
+#ifdef __PAGETABLE_PMD_FOLDED
+	/* Walk every pgd on the system and update the pmd there. */
+	list_for_each(pos, &pgd_list) {
+		pmd_t *copy_pmd;
+		pgd = list_to_pgd(pos) + pgd_index(addr);
+		pud = pud_offset(pgd, addr);
+		copy_pmd = pmd_offset(pud, addr);
+		__set_pmd(copy_pmd, *pmd);
+	}
+#endif
+
+	/* Tell every cpu to notice the change. */
+	flush_remote(0, 0, NULL, addr, HPAGE_SIZE, HPAGE_SIZE,
+		     cpu_possible_mask, NULL, 0);
+
+	/* Hold the lock until the TLB flush is finished to avoid races. */
+	spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
 /*
 /*
  * List of all pgd's needed so it can invalidate entries in both cached
  * List of all pgd's needed so it can invalidate entries in both cached
  * and uncached pgd's. This is essentially codepath-based locking
  * and uncached pgd's. This is essentially codepath-based locking
@@ -184,9 +254,9 @@ static void pgd_ctor(pgd_t *pgd)
 	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
 	BUG_ON(((u64 *)swapper_pg_dir)[pgd_index(MEM_USER_INTRPT)] != 0);
 #endif
 #endif
 
 
-	clone_pgd_range(pgd + KERNEL_PGD_INDEX_START,
-			swapper_pg_dir + KERNEL_PGD_INDEX_START,
-			KERNEL_PGD_PTRS);
+	memcpy(pgd + KERNEL_PGD_INDEX_START,
+	       swapper_pg_dir + KERNEL_PGD_INDEX_START,
+	       KERNEL_PGD_PTRS * sizeof(pgd_t));
 
 
 	pgd_list_add(pgd);
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
 	spin_unlock_irqrestore(&pgd_lock, flags);
@@ -220,8 +290,11 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
 {
-	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO|__GFP_COMP;
+	gfp_t flags = GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO;
 	struct page *p;
 	struct page *p;
+#if L2_USER_PGTABLE_ORDER > 0
+	int i;
+#endif
 
 
 #ifdef CONFIG_HIGHPTE
 #ifdef CONFIG_HIGHPTE
 	flags |= __GFP_HIGHMEM;
 	flags |= __GFP_HIGHMEM;
@@ -231,6 +304,18 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	if (p == NULL)
 	if (p == NULL)
 		return NULL;
 		return NULL;
 
 
+#if L2_USER_PGTABLE_ORDER > 0
+	/*
+	 * Make every page have a page_count() of one, not just the first.
+	 * We don't use __GFP_COMP since it doesn't look like it works
+	 * correctly with tlb_remove_page().
+	 */
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		init_page_count(p+i);
+		inc_zone_page_state(p+i, NR_PAGETABLE);
+	}
+#endif
+
 	pgtable_page_ctor(p);
 	pgtable_page_ctor(p);
 	return p;
 	return p;
 }
 }
@@ -242,8 +327,15 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
  */
  */
 void pte_free(struct mm_struct *mm, struct page *p)
 void pte_free(struct mm_struct *mm, struct page *p)
 {
 {
+	int i;
+
 	pgtable_page_dtor(p);
 	pgtable_page_dtor(p);
-	__free_pages(p, L2_USER_PGTABLE_ORDER);
+	__free_page(p);
+
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		__free_page(p+i);
+		dec_zone_page_state(p+i, NR_PAGETABLE);
+	}
 }
 }
 
 
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
 void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
@@ -252,18 +344,11 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte,
 	int i;
 	int i;
 
 
 	pgtable_page_dtor(pte);
 	pgtable_page_dtor(pte);
-	tlb->need_flush = 1;
-	if (tlb_fast_mode(tlb)) {
-		struct page *pte_pages[L2_USER_PGTABLE_PAGES];
-		for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i)
-			pte_pages[i] = pte + i;
-		free_pages_and_swap_cache(pte_pages, L2_USER_PGTABLE_PAGES);
-		return;
-	}
-	for (i = 0; i < L2_USER_PGTABLE_PAGES; ++i) {
-		tlb->pages[tlb->nr++] = pte + i;
-		if (tlb->nr >= FREE_PTE_NR)
-			tlb_flush_mmu(tlb, 0, 0);
+	tlb_remove_page(tlb, pte);
+
+	for (i = 1; i < L2_USER_PGTABLE_PAGES; ++i) {
+		tlb_remove_page(tlb, pte + i);
+		dec_zone_page_state(pte + i, NR_PAGETABLE);
 	}
 	}
 }
 }
 
 
@@ -346,35 +431,51 @@ int get_remote_cache_cpu(pgprot_t prot)
 	return x + y * smp_width;
 	return x + y * smp_width;
 }
 }
 
 
-void set_pte_order(pte_t *ptep, pte_t pte, int order)
+/*
+ * Convert a kernel VA to a PA and homing information.
+ */
+int va_to_cpa_and_pte(void *va, unsigned long long *cpa, pte_t *pte)
 {
 {
-	unsigned long pfn = pte_pfn(pte);
-	struct page *page = pfn_to_page(pfn);
+	struct page *page = virt_to_page(va);
+	pte_t null_pte = { 0 };
 
 
-	/* Update the home of a PTE if necessary */
-	pte = pte_set_home(pte, page_home(page));
+	*cpa = __pa(va);
 
 
+	/* Note that this is not writing a page table, just returning a pte. */
+	*pte = pte_set_home(null_pte, page_home(page));
+
+	return 0; /* return non-zero if not hfh? */
+}
+EXPORT_SYMBOL(va_to_cpa_and_pte);
+
+void __set_pte(pte_t *ptep, pte_t pte)
+{
 #ifdef __tilegx__
 #ifdef __tilegx__
 	*ptep = pte;
 	*ptep = pte;
 #else
 #else
-	/*
-	 * When setting a PTE, write the high bits first, then write
-	 * the low bits.  This sets the "present" bit only after the
-	 * other bits are in place.  If a particular PTE update
-	 * involves transitioning from one valid PTE to another, it
-	 * may be necessary to call set_pte_order() more than once,
-	 * transitioning via a suitable intermediate state.
-	 * Note that this sequence also means that if we are transitioning
-	 * from any migrating PTE to a non-migrating one, we will not
-	 * see a half-updated PTE with the migrating bit off.
-	 */
-#if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
-# error Must write the present and migrating bits last
-#endif
-	((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
-	barrier();
-	((u32 *)ptep)[0] = (u32)(pte_val(pte));
-#endif
+# if HV_PTE_INDEX_PRESENT >= 32 || HV_PTE_INDEX_MIGRATING >= 32
+#  error Must write the present and migrating bits last
+# endif
+	if (pte_present(pte)) {
+		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+		barrier();
+		((u32 *)ptep)[0] = (u32)(pte_val(pte));
+	} else {
+		((u32 *)ptep)[0] = (u32)(pte_val(pte));
+		barrier();
+		((u32 *)ptep)[1] = (u32)(pte_val(pte) >> 32);
+	}
+#endif /* __tilegx__ */
+}
+
+void set_pte(pte_t *ptep, pte_t pte)
+{
+	struct page *page = pfn_to_page(pte_pfn(pte));
+
+	/* Update the home of a PTE if necessary */
+	pte = pte_set_home(pte, page_home(page));
+
+	__set_pte(ptep, pte);
 }
 }
 
 
 /* Can this mm load a PTE with cached_priority set? */
 /* Can this mm load a PTE with cached_priority set? */

+ 9 - 1
drivers/edac/Kconfig

@@ -7,7 +7,7 @@
 menuconfig EDAC
 menuconfig EDAC
 	bool "EDAC (Error Detection And Correction) reporting"
 	bool "EDAC (Error Detection And Correction) reporting"
 	depends on HAS_IOMEM
 	depends on HAS_IOMEM
-	depends on X86 || PPC
+	depends on X86 || PPC || TILE
 	help
 	help
 	  EDAC is designed to report errors in the core system.
 	  EDAC is designed to report errors in the core system.
 	  These are low-level errors that are reported in the CPU or
 	  These are low-level errors that are reported in the CPU or
@@ -282,4 +282,12 @@ config EDAC_CPC925
 	  a companion chip to the PowerPC 970 family of
 	  a companion chip to the PowerPC 970 family of
 	  processors.
 	  processors.
 
 
+config EDAC_TILE
+	tristate "Tilera Memory Controller"
+	depends on EDAC_MM_EDAC && TILE
+	default y
+	help
+	  Support for error detection and correction on the
+	  Tilera memory controller.
+
 endif # EDAC
 endif # EDAC

+ 1 - 0
drivers/edac/Makefile

@@ -54,3 +54,4 @@ obj-$(CONFIG_EDAC_PPC4XX)		+= ppc4xx_edac.o
 obj-$(CONFIG_EDAC_AMD8111)		+= amd8111_edac.o
 obj-$(CONFIG_EDAC_AMD8111)		+= amd8111_edac.o
 obj-$(CONFIG_EDAC_AMD8131)		+= amd8131_edac.o
 obj-$(CONFIG_EDAC_AMD8131)		+= amd8131_edac.o
 
 
+obj-$(CONFIG_EDAC_TILE)			+= tile_edac.o

+ 254 - 0
drivers/edac/tile_edac.c

@@ -0,0 +1,254 @@
+/*
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ * Tilera-specific EDAC driver.
+ *
+ * This source code is derived from the following driver:
+ *
+ * Cell MIC driver for ECC counting
+ *
+ * Copyright 2007 Benjamin Herrenschmidt, IBM Corp.
+ *                <benh@kernel.crashing.org>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/edac.h>
+#include <hv/hypervisor.h>
+#include <hv/drv_mshim_intf.h>
+
+#include "edac_core.h"
+
+#define DRV_NAME	"tile-edac"
+
+/* Number of cs_rows needed per memory controller on TILEPro. */
+#define TILE_EDAC_NR_CSROWS	1
+
+/* Number of channels per memory controller on TILEPro. */
+#define TILE_EDAC_NR_CHANS	1
+
+/* Granularity of reported error in bytes on TILEPro. */
+#define TILE_EDAC_ERROR_GRAIN	8
+
+/* TILE processor has multiple independent memory controllers. */
+struct platform_device *mshim_pdev[TILE_MAX_MSHIMS];
+
+struct tile_edac_priv {
+	int		hv_devhdl;	/* Hypervisor device handle. */
+	int		node;		/* Memory controller instance #. */
+	unsigned int	ce_count;	/*
+					 * Correctable-error counter
+					 * kept by the driver.
+					 */
+};
+
+static void tile_edac_check(struct mem_ctl_info *mci)
+{
+	struct tile_edac_priv	*priv = mci->pvt_info;
+	struct mshim_mem_error	mem_error;
+
+	if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_error,
+		sizeof(struct mshim_mem_error), MSHIM_MEM_ERROR_OFF) !=
+		sizeof(struct mshim_mem_error)) {
+		pr_err(DRV_NAME ": MSHIM_MEM_ERROR_OFF pread failure.\n");
+		return;
+	}
+
+	/* Check if the current error count is different from the saved one. */
+	if (mem_error.sbe_count != priv->ce_count) {
+		dev_dbg(mci->dev, "ECC CE err on node %d\n", priv->node);
+		priv->ce_count = mem_error.sbe_count;
+		edac_mc_handle_ce(mci, 0, 0, 0, 0, 0, mci->ctl_name);
+	}
+}
+
+/*
+ * Initialize the 'csrows' table within the mci control structure with the
+ * addressing of memory.
+ */
+static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci)
+{
+	struct csrow_info	*csrow = &mci->csrows[0];
+	struct tile_edac_priv	*priv = mci->pvt_info;
+	struct mshim_mem_info	mem_info;
+
+	if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_info,
+		sizeof(struct mshim_mem_info), MSHIM_MEM_INFO_OFF) !=
+		sizeof(struct mshim_mem_info)) {
+		pr_err(DRV_NAME ": MSHIM_MEM_INFO_OFF pread failure.\n");
+		return -1;
+	}
+
+	if (mem_info.mem_ecc)
+		csrow->edac_mode = EDAC_SECDED;
+	else
+		csrow->edac_mode = EDAC_NONE;
+	switch (mem_info.mem_type) {
+	case DDR2:
+		csrow->mtype = MEM_DDR2;
+		break;
+
+	case DDR3:
+		csrow->mtype = MEM_DDR3;
+		break;
+
+	default:
+		return -1;
+	}
+
+	csrow->first_page = 0;
+	csrow->nr_pages = mem_info.mem_size >> PAGE_SHIFT;
+	csrow->last_page = csrow->first_page + csrow->nr_pages - 1;
+	csrow->grain = TILE_EDAC_ERROR_GRAIN;
+	csrow->dtype = DEV_UNKNOWN;
+
+	return 0;
+}
+
+static int __devinit tile_edac_mc_probe(struct platform_device *pdev)
+{
+	char			hv_file[32];
+	int			hv_devhdl;
+	struct mem_ctl_info	*mci;
+	struct tile_edac_priv	*priv;
+	int			rc;
+
+	sprintf(hv_file, "mshim/%d", pdev->id);
+	hv_devhdl = hv_dev_open((HV_VirtAddr)hv_file, 0);
+	if (hv_devhdl < 0)
+		return -EINVAL;
+
+	/* A TILE MC has a single channel and one chip-select row. */
+	mci = edac_mc_alloc(sizeof(struct tile_edac_priv),
+		TILE_EDAC_NR_CSROWS, TILE_EDAC_NR_CHANS, pdev->id);
+	if (mci == NULL)
+		return -ENOMEM;
+	priv = mci->pvt_info;
+	priv->node = pdev->id;
+	priv->hv_devhdl = hv_devhdl;
+
+	mci->dev = &pdev->dev;
+	mci->mtype_cap = MEM_FLAG_DDR2;
+	mci->edac_ctl_cap = EDAC_FLAG_SECDED;
+
+	mci->mod_name = DRV_NAME;
+	mci->ctl_name = "TILEPro_Memory_Controller";
+	mci->dev_name = dev_name(&pdev->dev);
+	mci->edac_check = tile_edac_check;
+
+	/*
+	 * Initialize the MC control structure 'csrows' table
+	 * with the mapping and control information.
+	 */
+	if (tile_edac_init_csrows(mci)) {
+		/* No csrows found. */
+		mci->edac_cap = EDAC_FLAG_NONE;
+	} else {
+		mci->edac_cap = EDAC_FLAG_SECDED;
+	}
+
+	platform_set_drvdata(pdev, mci);
+
+	/* Register with EDAC core */
+	rc = edac_mc_add_mc(mci);
+	if (rc) {
+		dev_err(&pdev->dev, "failed to register with EDAC core\n");
+		edac_mc_free(mci);
+		return rc;
+	}
+
+	return 0;
+}
+
+static int __devexit tile_edac_mc_remove(struct platform_device *pdev)
+{
+	struct mem_ctl_info *mci = platform_get_drvdata(pdev);
+
+	edac_mc_del_mc(&pdev->dev);
+	if (mci)
+		edac_mc_free(mci);
+	return 0;
+}
+
+static struct platform_driver tile_edac_mc_driver = {
+	.driver		= {
+		.name	= DRV_NAME,
+		.owner	= THIS_MODULE,
+	},
+	.probe		= tile_edac_mc_probe,
+	.remove		= __devexit_p(tile_edac_mc_remove),
+};
+
+/*
+ * Driver init routine.
+ */
+static int __init tile_edac_init(void)
+{
+	char	hv_file[32];
+	struct platform_device *pdev;
+	int i, err, num = 0;
+
+	/* Only support POLL mode. */
+	edac_op_state = EDAC_OPSTATE_POLL;
+
+	err = platform_driver_register(&tile_edac_mc_driver);
+	if (err)
+		return err;
+
+	for (i = 0; i < TILE_MAX_MSHIMS; i++) {
+		/*
+		 * Not all memory controllers are configured such as in the
+		 * case of a simulator. So we register only those mshims
+		 * that are configured by the hypervisor.
+		 */
+		sprintf(hv_file, "mshim/%d", i);
+		if (hv_dev_open((HV_VirtAddr)hv_file, 0) < 0)
+			continue;
+
+		pdev = platform_device_register_simple(DRV_NAME, i, NULL, 0);
+		if (IS_ERR(pdev))
+			continue;
+		mshim_pdev[i] = pdev;
+		num++;
+	}
+
+	if (num == 0) {
+		platform_driver_unregister(&tile_edac_mc_driver);
+		return -ENODEV;
+	}
+	return 0;
+}
+
+/*
+ * Driver cleanup routine.
+ */
+static void __exit tile_edac_exit(void)
+{
+	int i;
+
+	for (i = 0; i < TILE_MAX_MSHIMS; i++) {
+		struct platform_device *pdev = mshim_pdev[i];
+		if (!pdev)
+			continue;
+
+		platform_set_drvdata(pdev, NULL);
+		platform_device_unregister(pdev);
+	}
+	platform_driver_unregister(&tile_edac_mc_driver);
+}
+
+module_init(tile_edac_init);
+module_exit(tile_edac_exit);
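
Once tile_edac_mc_probe() has registered the controller, the count that tile_edac_check() feeds to edac_mc_handle_ce() becomes visible through the standard EDAC sysfs tree. A minimal user-space sketch for reading it back; the single-controller "mc0" instance and the path below are assumptions based on the conventional EDAC sysfs layout, not something defined by this commit:

#include <stdio.h>

int main(void)
{
	/* Conventional EDAC sysfs attribute; "mc0" is assumed here. */
	FILE *f = fopen("/sys/devices/system/edac/mc/mc0/ce_count", "r");
	unsigned long ce;

	if (f == NULL) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%lu", &ce) == 1)
		printf("correctable errors reported so far: %lu\n", ce);
	fclose(f);
	return 0;
}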

+ 517 - 448
drivers/net/tile/tilepro.c

@@ -1,5 +1,5 @@
 /*
- * Copyright 2010 Tilera Corporation. All Rights Reserved.
+ * Copyright 2011 Tilera Corporation. All Rights Reserved.
  *
  *   This program is free software; you can redistribute it and/or
  *   modify it under the terms of the GNU General Public License
@@ -44,10 +44,6 @@
 #include <linux/tcp.h>


-/* There is no singlethread_cpu, so schedule work on the current cpu. */
-#define singlethread_cpu -1
-
-
 /*
  * First, "tile_net_init_module()" initializes all four "devices" which
  * can be used by linux.
@@ -73,15 +69,16 @@
  * return, knowing we will be called again later.  Otherwise, we
  * reenable the ingress interrupt, and call "napi_complete()".
  *
+ * HACK: Since disabling the ingress interrupt is not reliable, we
+ * ignore the interrupt if the global "active" flag is false.
+ *
  *
  * NOTE: The use of "native_driver" ensures that EPP exists, and that
- * "epp_sendv" is legal, and that "LIPP" is being used.
+ * we are using "LIPP" and "LEPP".
  *
  * NOTE: Failing to free completions for an arbitrarily long time
  * (which is defined to be illegal) does in fact cause bizarre
  * problems.  The "egress_timer" helps prevent this from happening.
- *
- * NOTE: The egress code can be interrupted by the interrupt handler.
  */


@@ -142,6 +139,7 @@
 MODULE_AUTHOR("Tilera");
 MODULE_LICENSE("GPL");

+
 /*
  * Queue of incoming packets for a specific cpu and device.
  *
@@ -177,7 +175,7 @@ struct tile_net_cpu {
 	struct tile_netio_queue queue;
 	/* Statistics. */
 	struct tile_net_stats_t stats;
-	/* ISSUE: Is this needed? */
+	/* True iff NAPI is enabled. */
 	bool napi_enabled;
 	/* True if this tile has succcessfully registered with the IPP. */
 	bool registered;
@@ -200,20 +198,20 @@ struct tile_net_cpu {
 struct tile_net_priv {
 	/* Our network device. */
 	struct net_device *dev;
-	/* The actual egress queue. */
-	lepp_queue_t *epp_queue;
-	/* Protects "epp_queue->cmd_tail" and "epp_queue->comp_tail" */
-	spinlock_t cmd_lock;
-	/* Protects "epp_queue->comp_head". */
-	spinlock_t comp_lock;
+	/* Pages making up the egress queue. */
+	struct page *eq_pages;
+	/* Address of the actual egress queue. */
+	lepp_queue_t *eq;
+	/* Protects "eq". */
+	spinlock_t eq_lock;
 	/* The hypervisor handle for this interface. */
 	int hv_devhdl;
 	/* The intr bit mask that IDs this device. */
 	u32 intr_id;
 	/* True iff "tile_net_open_aux()" has succeeded. */
-	int partly_opened;
-	/* True iff "tile_net_open_inner()" has succeeded. */
-	int fully_opened;
+	bool partly_opened;
+	/* True iff the device is "active". */
+	bool active;
 	/* Effective network cpus. */
 	struct cpumask network_cpus_map;
 	/* Number of network cpus. */
@@ -228,6 +226,10 @@ struct tile_net_priv {
 	struct tile_net_cpu *cpu[NR_CPUS];
 };

+/* Log2 of the number of small pages needed for the egress queue. */
+#define EQ_ORDER  get_order(sizeof(lepp_queue_t))
+/* Size of the egress queue's pages. */
+#define EQ_SIZE   (1 << (PAGE_SHIFT + EQ_ORDER))
 
 
 /*
  * The actual devices (xgbe0, xgbe1, gbe0, gbe1).
@@ -284,7 +286,11 @@ static void net_printk(char *fmt, ...)
  */
 static void dump_packet(unsigned char *data, unsigned long length, char *s)
 {
+	int my_cpu = smp_processor_id();
+
 	unsigned long i;
+	char buf[128];
+
 	static unsigned int count;

 	pr_info("dump_packet(data %p, length 0x%lx s %s count 0x%x)\n",
@@ -294,10 +300,12 @@ static void dump_packet(unsigned char *data, unsigned long length, char *s)
 
 
 	for (i = 0; i < length; i++) {
 		if ((i & 0xf) == 0)
-			sprintf(buf, "%8.8lx:", i);
+			sprintf(buf, "[%02d] %8.8lx:", my_cpu, i);
 		sprintf(buf + strlen(buf), " %2.2x", data[i]);
-		if ((i & 0xf) == 0xf || i == length - 1)
-			pr_info("%s\n", buf);
+		if ((i & 0xf) == 0xf || i == length - 1) {
+			strcat(buf, "\n");
+			pr_info("%s", buf);
+		}
 	}
 }
 #endif
@@ -351,60 +359,109 @@ static void tile_net_provide_linux_buffer(struct tile_net_cpu *info,
 
 
 /*
  * Provide a linux buffer for LIPP.
+ *
+ * Note that the ACTUAL allocation for each buffer is a "struct sk_buff",
+ * plus a chunk of memory that includes not only the requested bytes, but
+ * also NET_SKB_PAD bytes of initial padding, and a "struct skb_shared_info".
+ *
+ * Note that "struct skb_shared_info" is 88 bytes with 64K pages and
+ * 268 bytes with 4K pages (since the frags[] array needs 18 entries).
+ *
+ * Without jumbo packets, the maximum packet size will be 1536 bytes,
+ * and we use 2 bytes (NET_IP_ALIGN) of padding.  ISSUE: If we told
+ * the hardware to clip at 1518 bytes instead of 1536 bytes, then we
+ * could save an entire cache line, but in practice, we don't need it.
+ *
+ * Since CPAs are 38 bits, and we can only encode the high 31 bits in
+ * a "linux_buffer_t", the low 7 bits must be zero, and thus, we must
+ * align the actual "va" mod 128.
+ *
+ * We assume that the underlying "head" will be aligned mod 64.  Note
+ * that in practice, we have seen "head" NOT aligned mod 128 even when
+ * using 2048 byte allocations, which is surprising.
+ *
+ * If "head" WAS always aligned mod 128, we could change LIPP to
+ * assume that the low SIX bits are zero, and the 7th bit is one, that
+ * is, align the actual "va" mod 128 plus 64, which would be "free".
+ *
+ * For now, the actual "head" pointer points at NET_SKB_PAD bytes of
+ * padding, plus 28 or 92 bytes of extra padding, plus the sk_buff
+ * pointer, plus the NET_IP_ALIGN padding, plus 126 or 1536 bytes for
+ * the actual packet, plus 62 bytes of empty padding, plus some
+ * padding and the "struct skb_shared_info".
+ *
+ * With 64K pages, a large buffer thus needs 32+92+4+2+1536+62+88
+ * bytes, or 1816 bytes, which fits comfortably into 2048 bytes.
+ *
+ * With 64K pages, a small buffer thus needs 32+92+4+2+126+88
+ * bytes, or 344 bytes, which means we are wasting 64+ bytes, and
+ * could presumably increase the size of small buffers.
+ *
+ * With 4K pages, a large buffer thus needs 32+92+4+2+1536+62+268
+ * bytes, or 1996 bytes, which fits comfortably into 2048 bytes.
+ *
+ * With 4K pages, a small buffer thus needs 32+92+4+2+126+268
+ * bytes, or 524 bytes, which is annoyingly wasteful.
+ *
+ * Maybe we should increase LIPP_SMALL_PACKET_SIZE to 192?
+ *
+ * ISSUE: Maybe we should increase "NET_SKB_PAD" to 64?
  */
  */
 static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info,
 static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info,
 					   bool small)
 					   bool small)
 {
 {
-	/* ISSUE: What should we use here? */
+#if TILE_NET_MTU <= 1536
+	/* Without "jumbo", 2 + 1536 should be sufficient. */
+	unsigned int large_size = NET_IP_ALIGN + 1536;
+#else
+	/* ISSUE: This has not been tested. */
 	unsigned int large_size = NET_IP_ALIGN + TILE_NET_MTU + 100;
 	unsigned int large_size = NET_IP_ALIGN + TILE_NET_MTU + 100;
+#endif
 
 
-	/* Round up to ensure to avoid "false sharing" with last cache line. */
-	unsigned int buffer_size =
+	/* Avoid "false sharing" with last cache line. */
+	/* ISSUE: This is already done by "dev_alloc_skb()". */
+	unsigned int len =
 		 (((small ? LIPP_SMALL_PACKET_SIZE : large_size) +
 		 (((small ? LIPP_SMALL_PACKET_SIZE : large_size) +
 		   CHIP_L2_LINE_SIZE() - 1) & -CHIP_L2_LINE_SIZE());
 		   CHIP_L2_LINE_SIZE() - 1) & -CHIP_L2_LINE_SIZE());
 
 
-	/*
-	 * ISSUE: Since CPAs are 38 bits, and we can only encode the
-	 * high 31 bits in a "linux_buffer_t", the low 7 bits must be
-	 * zero, and thus, we must align the actual "va" mod 128.
-	 */
-	const unsigned long align = 128;
+	unsigned int padding = 128 - NET_SKB_PAD;
+	unsigned int align;
 
 
 	struct sk_buff *skb;
 	struct sk_buff *skb;
 	void *va;
 	void *va;
 
 
 	struct sk_buff **skb_ptr;
 	struct sk_buff **skb_ptr;
 
 
-	/* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes, */
-	/* and also "reserves" that many bytes. */
-	/* ISSUE: Can we "share" the NET_SKB_PAD bytes with "skb_ptr"? */
-	int len = sizeof(*skb_ptr) + align + buffer_size;
-
-	while (1) {
-
-		/* Allocate (or fail). */
-		skb = dev_alloc_skb(len);
-		if (skb == NULL)
-			return false;
-
-		/* Make room for a back-pointer to 'skb'. */
-		skb_reserve(skb, sizeof(*skb_ptr));
+	/* Request 96 extra bytes for alignment purposes. */
+	skb = dev_alloc_skb(len + padding);
+	if (skb == NULL)
+		return false;
 
 
-		/* Make sure we are aligned. */
-		skb_reserve(skb, -(long)skb->data & (align - 1));
+	/* Skip 32 or 96 bytes to align "data" mod 128. */
+	align = -(long)skb->data & (128 - 1);
+	BUG_ON(align > padding);
+	skb_reserve(skb, align);
 
 
-		/* This address is given to IPP. */
-		va = skb->data;
+	/* This address is given to IPP. */
+	va = skb->data;
 
 
-		if (small)
-			break;
+	/* Buffers must not span a huge page. */
+	BUG_ON(((((long)va & ~HPAGE_MASK) + len) & HPAGE_MASK) != 0);
 
 
-		/* ISSUE: This has never been observed! */
-		/* Large buffers must not span a huge page. */
-		if (((((long)va & ~HPAGE_MASK) + 1535) & HPAGE_MASK) == 0)
-			break;
-		pr_err("Leaking unaligned linux buffer at %p.\n", va);
+#ifdef TILE_NET_PARANOIA
+#if CHIP_HAS_CBOX_HOME_MAP()
+	if (hash_default) {
+		HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va);
+		if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3)
+			panic("Non-HFH ingress buffer! VA=%p Mode=%d PTE=%llx",
+			      va, hv_pte_get_mode(pte), hv_pte_val(pte));
 	}
 	}
+#endif
+#endif
+
+	/* Invalidate the packet buffer. */
+	if (!hash_default)
+		__inv_buffer(va, len);
 
 
 	/* Skip two bytes to satisfy LIPP assumptions. */
 	/* Skip two bytes to satisfy LIPP assumptions. */
 	/* Note that this aligns IP on a 16 byte boundary. */
 	/* Note that this aligns IP on a 16 byte boundary. */
@@ -415,23 +472,9 @@ static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info,
 	skb_ptr = va - sizeof(*skb_ptr);
 	skb_ptr = va - sizeof(*skb_ptr);
 	*skb_ptr = skb;
 	*skb_ptr = skb;
 
 
-	/* Invalidate the packet buffer. */
-	if (!hash_default)
-		__inv_buffer(skb->data, buffer_size);
-
 	/* Make sure "skb_ptr" has been flushed. */
 	/* Make sure "skb_ptr" has been flushed. */
 	__insn_mf();
 	__insn_mf();
 
 
-#ifdef TILE_NET_PARANOIA
-#if CHIP_HAS_CBOX_HOME_MAP()
-	if (hash_default) {
-		HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va);
-		if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3)
-			panic("Non-coherent ingress buffer!");
-	}
-#endif
-#endif
-
 	/* Provide the new buffer. */
 	/* Provide the new buffer. */
 	tile_net_provide_linux_buffer(info, va, small);
 	tile_net_provide_linux_buffer(info, va, small);
 
 
@@ -469,47 +512,63 @@ oops:
  * Grab some LEPP completions, and store them in "comps", of size
  * Grab some LEPP completions, and store them in "comps", of size
  * "comps_size", and return the number of completions which were
  * "comps_size", and return the number of completions which were
  * stored, so the caller can free them.
  * stored, so the caller can free them.
- *
- * If "pending" is not NULL, it will be set to true if there might
- * still be some pending completions caused by this tile, else false.
  */
  */
-static unsigned int tile_net_lepp_grab_comps(struct net_device *dev,
+static unsigned int tile_net_lepp_grab_comps(lepp_queue_t *eq,
 					     struct sk_buff *comps[],
 					     struct sk_buff *comps[],
 					     unsigned int comps_size,
 					     unsigned int comps_size,
-					     bool *pending)
+					     unsigned int min_size)
 {
 {
-	struct tile_net_priv *priv = netdev_priv(dev);
-
-	lepp_queue_t *eq = priv->epp_queue;
-
 	unsigned int n = 0;
 	unsigned int n = 0;
 
 
-	unsigned int comp_head;
-	unsigned int comp_busy;
-	unsigned int comp_tail;
-
-	spin_lock(&priv->comp_lock);
-
-	comp_head = eq->comp_head;
-	comp_busy = eq->comp_busy;
-	comp_tail = eq->comp_tail;
+	unsigned int comp_head = eq->comp_head;
+	unsigned int comp_busy = eq->comp_busy;
 
 
 	while (comp_head != comp_busy && n < comps_size) {
 	while (comp_head != comp_busy && n < comps_size) {
 		comps[n++] = eq->comps[comp_head];
 		comps[n++] = eq->comps[comp_head];
 		LEPP_QINC(comp_head);
 		LEPP_QINC(comp_head);
 	}
 	}
 
 
-	if (pending != NULL)
-		*pending = (comp_head != comp_tail);
+	if (n < min_size)
+		return 0;
 
 
 	eq->comp_head = comp_head;
 	eq->comp_head = comp_head;
 
 
-	spin_unlock(&priv->comp_lock);
-
 	return n;
 	return n;
 }
 }
 
 
 
 
+/*
+ * Free some comps, and return true iff there are still some pending.
+ */
+static bool tile_net_lepp_free_comps(struct net_device *dev, bool all)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	lepp_queue_t *eq = priv->eq;
+
+	struct sk_buff *olds[64];
+	unsigned int wanted = 64;
+	unsigned int i, n;
+	bool pending;
+
+	spin_lock(&priv->eq_lock);
+
+	if (all)
+		eq->comp_busy = eq->comp_tail;
+
+	n = tile_net_lepp_grab_comps(eq, olds, wanted, 0);
+
+	pending = (eq->comp_head != eq->comp_tail);
+
+	spin_unlock(&priv->eq_lock);
+
+	for (i = 0; i < n; i++)
+		kfree_skb(olds[i]);
+
+	return pending;
+}
+
+
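
tile_net_lepp_grab_comps() and tile_net_lepp_free_comps() above walk a three-index ring: entries between comp_head and comp_busy are finished and may be freed, while entries between comp_busy and comp_tail are still owned by the shim. A stand-alone sketch of that harvesting logic; the ring size and void* payload are illustrative assumptions, whereas the driver stores sk_buff pointers and wraps with LEPP_QINC():

#include <stdio.h>

#define RING_SIZE 8			/* assumed, power of two */
#define QINC(i)   ((i) = ((i) + 1) & (RING_SIZE - 1))

struct comp_ring {
	void *comps[RING_SIZE];
	unsigned int comp_head;		/* next entry to free        */
	unsigned int comp_busy;		/* first entry still pending */
	unsigned int comp_tail;		/* next entry to be filled   */
};

/* Mirror of the grab logic above: harvest finished entries. */
static unsigned int grab_comps(struct comp_ring *q, void *out[],
			       unsigned int out_size, unsigned int min)
{
	unsigned int n = 0;
	unsigned int head = q->comp_head;

	while (head != q->comp_busy && n < out_size) {
		out[n++] = q->comps[head];
		QINC(head);
	}
	if (n < min)
		return 0;		/* not worth advancing yet */
	q->comp_head = head;
	return n;
}

int main(void)
{
	struct comp_ring q = { .comp_head = 0, .comp_busy = 3, .comp_tail = 5 };
	void *done[RING_SIZE];
	unsigned int n = grab_comps(&q, done, RING_SIZE, 0);

	printf("freed %u, still pending: %s\n", n,
	       q.comp_head != q.comp_tail ? "yes" : "no");
	return 0;
}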
 /*
 /*
  * Make sure the egress timer is scheduled.
  * Make sure the egress timer is scheduled.
  *
  *
@@ -544,21 +603,11 @@ static void tile_net_handle_egress_timer(unsigned long arg)
 	struct tile_net_cpu *info = (struct tile_net_cpu *)arg;
 	struct tile_net_cpu *info = (struct tile_net_cpu *)arg;
 	struct net_device *dev = info->napi.dev;
 	struct net_device *dev = info->napi.dev;
 
 
-	struct sk_buff *olds[32];
-	unsigned int wanted = 32;
-	unsigned int i, nolds = 0;
-	bool pending;
-
 	/* The timer is no longer scheduled. */
 	/* The timer is no longer scheduled. */
 	info->egress_timer_scheduled = false;
 	info->egress_timer_scheduled = false;
 
 
-	nolds = tile_net_lepp_grab_comps(dev, olds, wanted, &pending);
-
-	for (i = 0; i < nolds; i++)
-		kfree_skb(olds[i]);
-
-	/* Reschedule timer if needed. */
-	if (pending)
+	/* Free comps, and reschedule timer if more are pending. */
+	if (tile_net_lepp_free_comps(dev, false))
 		tile_net_schedule_egress_timer(info);
 		tile_net_schedule_egress_timer(info);
 }
 }
 
 
@@ -636,8 +685,39 @@ static bool is_dup_ack(char *s1, char *s2, unsigned int len)
 
 
 
 
 
 
+static void tile_net_discard_aux(struct tile_net_cpu *info, int index)
+{
+	struct tile_netio_queue *queue = &info->queue;
+	netio_queue_impl_t *qsp = queue->__system_part;
+	netio_queue_user_impl_t *qup = &queue->__user_part;
+
+	int index2_aux = index + sizeof(netio_pkt_t);
+	int index2 =
+		((index2_aux ==
+		  qsp->__packet_receive_queue.__last_packet_plus_one) ?
+		 0 : index2_aux);
+
+	netio_pkt_t *pkt = (netio_pkt_t *)((unsigned long) &qsp[1] + index);
+
+	/* Extract the "linux_buffer_t". */
+	unsigned int buffer = pkt->__packet.word;
+
+	/* Convert "linux_buffer_t" to "va". */
+	void *va = __va((phys_addr_t)(buffer >> 1) << 7);
+
+	/* Acquire the associated "skb". */
+	struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
+	struct sk_buff *skb = *skb_ptr;
+
+	kfree_skb(skb);
+
+	/* Consume this packet. */
+	qup->__packet_receive_read = index2;
+}
+
+
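
tile_net_discard_aux() above recovers the virtual address with __va((buffer >> 1) << 7), matching the earlier comment that only the high 31 bits of a 38-bit CPA fit in a "linux_buffer_t", which forces 128-byte alignment. A stand-alone round-trip sketch; treating bit 0 as the small/large flag is an assumption for illustration, since only the decode appears in this hunk:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pack(uint64_t cpa, int small)
{
	assert((cpa & 0x7f) == 0);	/* must be 128-byte aligned */
	assert(cpa < (1ULL << 38));	/* CPAs are 38 bits wide    */
	return (uint32_t)((cpa >> 7) << 1) | (small ? 1 : 0);
}

static uint64_t unpack(uint32_t buffer)
{
	return (uint64_t)(buffer >> 1) << 7;	/* mirrors the driver */
}

int main(void)
{
	uint64_t cpa = 0x3f0001000ULL;		/* arbitrary aligned example */
	uint32_t word = pack(cpa, 1);

	assert(unpack(word) == cpa);
	printf("cpa 0x%llx -> word 0x%x\n",
	       (unsigned long long)cpa, (unsigned)word);
	return 0;
}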
 /*
 /*
- * Like "tile_net_handle_packets()", but just discard packets.
+ * Like "tile_net_poll()", but just discard packets.
  */
  */
 static void tile_net_discard_packets(struct net_device *dev)
 static void tile_net_discard_packets(struct net_device *dev)
 {
 {
@@ -650,32 +730,8 @@ static void tile_net_discard_packets(struct net_device *dev)
 
 
 	while (qup->__packet_receive_read !=
 	while (qup->__packet_receive_read !=
 	       qsp->__packet_receive_queue.__packet_write) {
 	       qsp->__packet_receive_queue.__packet_write) {
-
 		int index = qup->__packet_receive_read;
 		int index = qup->__packet_receive_read;
-
-		int index2_aux = index + sizeof(netio_pkt_t);
-		int index2 =
-			((index2_aux ==
-			  qsp->__packet_receive_queue.__last_packet_plus_one) ?
-			 0 : index2_aux);
-
-		netio_pkt_t *pkt = (netio_pkt_t *)
-			((unsigned long) &qsp[1] + index);
-
-		/* Extract the "linux_buffer_t". */
-		unsigned int buffer = pkt->__packet.word;
-
-		/* Convert "linux_buffer_t" to "va". */
-		void *va = __va((phys_addr_t)(buffer >> 1) << 7);
-
-		/* Acquire the associated "skb". */
-		struct sk_buff **skb_ptr = va - sizeof(*skb_ptr);
-		struct sk_buff *skb = *skb_ptr;
-
-		kfree_skb(skb);
-
-		/* Consume this packet. */
-		qup->__packet_receive_read = index2;
+		tile_net_discard_aux(info, index);
 	}
 	}
 }
 }
 
 
@@ -704,7 +760,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 
 
 	netio_pkt_metadata_t *metadata = NETIO_PKT_METADATA(pkt);
 	netio_pkt_metadata_t *metadata = NETIO_PKT_METADATA(pkt);
 
 
-	/* Extract the packet size. */
+	/* Extract the packet size.  FIXME: Shouldn't the second line */
+	/* get subtracted?  Mostly moot, since it should be "zero". */
 	unsigned long len =
 	unsigned long len =
 		(NETIO_PKT_CUSTOM_LENGTH(pkt) +
 		(NETIO_PKT_CUSTOM_LENGTH(pkt) +
 		 NET_IP_ALIGN - NETIO_PACKET_PADDING);
 		 NET_IP_ALIGN - NETIO_PACKET_PADDING);
@@ -722,15 +779,6 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 	/* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */
 	/* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */
 	unsigned char *buf = va + NET_IP_ALIGN;
 	unsigned char *buf = va + NET_IP_ALIGN;
 
 
-#ifdef IGNORE_DUP_ACKS
-
-	static int other;
-	static int final;
-	static int keep;
-	static int skip;
-
-#endif
-
 	/* Invalidate the packet buffer. */
 	/* Invalidate the packet buffer. */
 	if (!hash_default)
 	if (!hash_default)
 		__inv_buffer(buf, len);
 		__inv_buffer(buf, len);
@@ -745,16 +793,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 #ifdef TILE_NET_VERIFY_INGRESS
 #ifdef TILE_NET_VERIFY_INGRESS
 	if (!NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt) &&
 	if (!NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt) &&
 	    NETIO_PKT_L4_CSUM_CALCULATED_M(metadata, pkt)) {
 	    NETIO_PKT_L4_CSUM_CALCULATED_M(metadata, pkt)) {
-		/*
-		 * FIXME: This complains about UDP packets
-		 * with a "zero" checksum (bug 6624).
-		 */
-#ifdef TILE_NET_PANIC_ON_BAD
-		dump_packet(buf, len, "rx");
-		panic("Bad L4 checksum.");
-#else
+		/* Bug 6624: Includes UDP packets with a "zero" checksum. */
 		pr_warning("Bad L4 checksum on %d byte packet.\n", len);
 		pr_warning("Bad L4 checksum on %d byte packet.\n", len);
-#endif
 	}
 	}
 	if (!NETIO_PKT_L3_CSUM_CORRECT_M(metadata, pkt) &&
 	if (!NETIO_PKT_L3_CSUM_CORRECT_M(metadata, pkt) &&
 	    NETIO_PKT_L3_CSUM_CALCULATED_M(metadata, pkt)) {
 	    NETIO_PKT_L3_CSUM_CALCULATED_M(metadata, pkt)) {
@@ -769,90 +809,29 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 		}
 		}
 		break;
 		break;
 	case NETIO_PKT_STATUS_BAD:
 	case NETIO_PKT_STATUS_BAD:
-#ifdef TILE_NET_PANIC_ON_BAD
-		dump_packet(buf, len, "rx");
-		panic("Unexpected BAD packet.");
-#else
-		pr_warning("Unexpected BAD %d byte packet.\n", len);
-#endif
+		pr_warning("Unexpected BAD %ld byte packet.\n", len);
 	}
 	}
 #endif
 #endif
 
 
 	filter = 0;
 	filter = 0;
 
 
+	/* ISSUE: Filter TCP packets with "bad" checksums? */
+
 	if (!(dev->flags & IFF_UP)) {
 	if (!(dev->flags & IFF_UP)) {
 		/* Filter packets received before we're up. */
 		/* Filter packets received before we're up. */
 		filter = 1;
 		filter = 1;
+	} else if (NETIO_PKT_STATUS_M(metadata, pkt) == NETIO_PKT_STATUS_BAD) {
+		/* Filter "truncated" packets. */
+		filter = 1;
 	} else if (!(dev->flags & IFF_PROMISC)) {
 	} else if (!(dev->flags & IFF_PROMISC)) {
-		/*
-		 * FIXME: Implement HW multicast filter.
-		 */
-		if (is_unicast_ether_addr(buf)) {
+		/* FIXME: Implement HW multicast filter. */
+		if (!is_multicast_ether_addr(buf)) {
 			/* Filter packets not for our address. */
 			/* Filter packets not for our address. */
 			const u8 *mine = dev->dev_addr;
 			const u8 *mine = dev->dev_addr;
 			filter = compare_ether_addr(mine, buf);
 			filter = compare_ether_addr(mine, buf);
 		}
 		}
 	}
 	}
 
 
-#ifdef IGNORE_DUP_ACKS
-
-	if (len != 66) {
-		/* FIXME: Must check "is_tcp_ack(buf, len)" somehow. */
-
-		other++;
-
-	} else if (index2 ==
-		   qsp->__packet_receive_queue.__packet_write) {
-
-		final++;
-
-	} else {
-
-		netio_pkt_t *pkt2 = (netio_pkt_t *)
-			((unsigned long) &qsp[1] + index2);
-
-		netio_pkt_metadata_t *metadata2 =
-			NETIO_PKT_METADATA(pkt2);
-
-		/* Extract the packet size. */
-		unsigned long len2 =
-			(NETIO_PKT_CUSTOM_LENGTH(pkt2) +
-			 NET_IP_ALIGN - NETIO_PACKET_PADDING);
-
-		if (len2 == 66 &&
-		    NETIO_PKT_FLOW_HASH_M(metadata, pkt) ==
-		    NETIO_PKT_FLOW_HASH_M(metadata2, pkt2)) {
-
-			/* Extract the "linux_buffer_t". */
-			unsigned int buffer2 = pkt2->__packet.word;
-
-			/* Convert "linux_buffer_t" to "va". */
-			void *va2 =
-				__va((phys_addr_t)(buffer2 >> 1) << 7);
-
-			/* Extract the packet data pointer. */
-			/* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */
-			unsigned char *buf2 = va2 + NET_IP_ALIGN;
-
-			/* Invalidate the packet buffer. */
-			if (!hash_default)
-				__inv_buffer(buf2, len2);
-
-			if (is_dup_ack(buf, buf2, len)) {
-				skip++;
-				filter = 1;
-			} else {
-				keep++;
-			}
-		}
-	}
-
-	if (net_ratelimit())
-		pr_info("Other %d Final %d Keep %d Skip %d.\n",
-			other, final, keep, skip);
-
-#endif
-
 	if (filter) {
 	if (filter) {
 
 
 		/* ISSUE: Update "drop" statistics? */
 		/* ISSUE: Update "drop" statistics? */
@@ -877,10 +856,7 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 		/* NOTE: This call also sets "skb->dev = dev". */
 		/* NOTE: This call also sets "skb->dev = dev". */
 		skb->protocol = eth_type_trans(skb, dev);
 		skb->protocol = eth_type_trans(skb, dev);
 
 
-		/* ISSUE: Discard corrupt packets? */
-		/* ISSUE: Discard packets with bad checksums? */
-
-		/* Avoid recomputing TCP/UDP checksums. */
+		/* Avoid recomputing "good" TCP/UDP checksums. */
 		if (NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt))
 		if (NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt))
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 			skb->ip_summed = CHECKSUM_UNNECESSARY;
 
 
@@ -912,9 +888,14 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index)
 /*
 /*
  * Handle some packets for the given device on the current CPU.
  * Handle some packets for the given device on the current CPU.
  *
  *
- * ISSUE: The "rotting packet" race condition occurs if a packet
- * arrives after the queue appears to be empty, and before the
- * hypervisor interrupt is re-enabled.
+ * If "tile_net_stop()" is called on some other tile while this
+ * function is running, we will return, hopefully before that
+ * other tile asks us to call "napi_disable()".
+ *
+ * The "rotting packet" race condition occurs if a packet arrives
+ * during the extremely narrow window between the queue appearing to
+ * be empty, and the ingress interrupt being re-enabled.  This happens
+ * a LOT under heavy network load.
  */
  */
 static int tile_net_poll(struct napi_struct *napi, int budget)
 static int tile_net_poll(struct napi_struct *napi, int budget)
 {
 {
@@ -928,7 +909,7 @@ static int tile_net_poll(struct napi_struct *napi, int budget)
 
 
 	unsigned int work = 0;
 	unsigned int work = 0;
 
 
-	while (1) {
+	while (priv->active) {
 		int index = qup->__packet_receive_read;
 		int index = qup->__packet_receive_read;
 		if (index == qsp->__packet_receive_queue.__packet_write)
 		if (index == qsp->__packet_receive_queue.__packet_write)
 			break;
 			break;
@@ -941,19 +922,24 @@ static int tile_net_poll(struct napi_struct *napi, int budget)
 
 
 	napi_complete(&info->napi);
 	napi_complete(&info->napi);
 
 
-	/* Re-enable hypervisor interrupts. */
+	if (!priv->active)
+		goto done;
+
+	/* Re-enable the ingress interrupt. */
 	enable_percpu_irq(priv->intr_id);
 	enable_percpu_irq(priv->intr_id);
 
 
-	/* HACK: Avoid the "rotting packet" problem. */
+	/* HACK: Avoid the "rotting packet" problem (see above). */
 	if (qup->__packet_receive_read !=
 	if (qup->__packet_receive_read !=
-	    qsp->__packet_receive_queue.__packet_write)
-		napi_schedule(&info->napi);
-
-	/* ISSUE: Handle completions? */
+	    qsp->__packet_receive_queue.__packet_write) {
+		/* ISSUE: Sometimes this returns zero, presumably */
+		/* because an interrupt was handled for this tile. */
+		(void)napi_reschedule(&info->napi);
+	}
 
 
 done:
 done:
 
 
-	tile_net_provide_needed_buffers(info);
+	if (priv->active)
+		tile_net_provide_needed_buffers(info);
 
 
 	return work;
 	return work;
 }
 }
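
The "rotting packet" handling in tile_net_poll() above boils down to an ordering rule: complete, re-enable the interrupt, then look at the queue one more time and reschedule if anything slipped in during that window. A user-space sketch of just that ordering; every helper below is a stub standing in for the real NAPI and queue primitives, not a kernel API:

#include <stdbool.h>
#include <stdio.h>

static int queue_depth;			/* packets waiting in the queue */

static bool queue_is_empty(void)	{ return queue_depth == 0; }
static void handle_one_packet(void)	{ queue_depth--; }

/* Stub: pretend a packet arrives in the window after completion. */
static void reenable_ingress_irq(void)	{ queue_depth++; }

static int poll(int budget)
{
	int work = 0;

	while (!queue_is_empty() && work < budget) {
		handle_one_packet();
		work++;
	}
	if (work >= budget)
		return work;		/* stay scheduled; do not complete */

	printf("napi_complete()\n");
	reenable_ingress_irq();

	/* Re-check: a packet may have landed after the last empty test. */
	if (!queue_is_empty())
		printf("napi_reschedule() -- rescued a rotting packet\n");

	return work;
}

int main(void)
{
	queue_depth = 3;
	printf("processed %d packets\n", poll(64));
	return 0;
}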
@@ -961,6 +947,12 @@ done:
 
 
 /*
 /*
  * Handle an ingress interrupt for the given device on the current cpu.
  * Handle an ingress interrupt for the given device on the current cpu.
+ *
+ * ISSUE: Sometimes this gets called after "disable_percpu_irq()" has
+ * been called!  This is probably due to "pending hypervisor downcalls".
+ *
+ * ISSUE: Is there any race condition between the "napi_schedule()" here
+ * and the "napi_complete()" call above?
  */
  */
 static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr)
 static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr)
 {
 {
@@ -969,9 +961,15 @@ static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr)
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 
 
-	/* Disable hypervisor interrupt. */
+	/* Disable the ingress interrupt. */
 	disable_percpu_irq(priv->intr_id);
 	disable_percpu_irq(priv->intr_id);
 
 
+	/* Ignore unwanted interrupts. */
+	if (!priv->active)
+		return IRQ_HANDLED;
+
+	/* ISSUE: Sometimes "info->napi_enabled" is false here. */
+
 	napi_schedule(&info->napi);
 	napi_schedule(&info->napi);
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
@@ -1005,8 +1003,7 @@ static int tile_net_open_aux(struct net_device *dev)
 	 */
 	 */
 	{
 	{
 		int epp_home = hv_lotar_to_cpu(epp_lotar);
 		int epp_home = hv_lotar_to_cpu(epp_lotar);
-		struct page *page = virt_to_page(priv->epp_queue);
-		homecache_change_page_home(page, 0, epp_home);
+		homecache_change_page_home(priv->eq_pages, EQ_ORDER, epp_home);
 	}
 	}
 
 
 	/*
 	/*
@@ -1015,9 +1012,9 @@ static int tile_net_open_aux(struct net_device *dev)
 	{
 	{
 		netio_ipp_address_t ea = {
 		netio_ipp_address_t ea = {
 			.va = 0,
 			.va = 0,
-			.pa = __pa(priv->epp_queue),
+			.pa = __pa(priv->eq),
 			.pte = hv_pte(0),
 			.pte = hv_pte(0),
-			.size = PAGE_SIZE,
+			.size = EQ_SIZE,
 		};
 		};
 		ea.pte = hv_pte_set_lotar(ea.pte, epp_lotar);
 		ea.pte = hv_pte_set_lotar(ea.pte, epp_lotar);
 		ea.pte = hv_pte_set_mode(ea.pte, HV_PTE_MODE_CACHE_TILE_L3);
 		ea.pte = hv_pte_set_mode(ea.pte, HV_PTE_MODE_CACHE_TILE_L3);
@@ -1043,7 +1040,7 @@ static int tile_net_open_aux(struct net_device *dev)
 
 
 
 
 /*
 /*
- * Register with hypervisor on each CPU.
+ * Register with hypervisor on the current CPU.
  *
  *
  * Strangely, this function does important things even if it "fails",
  * Strangely, this function does important things even if it "fails",
  * which is especially common if the link is not up yet.  Hopefully
  * which is especially common if the link is not up yet.  Hopefully
@@ -1092,7 +1089,8 @@ static void tile_net_register(void *dev_ptr)
 	priv->cpu[my_cpu] = info;
 	priv->cpu[my_cpu] = info;
 
 
 	/*
 	/*
-	 * Register ourselves with the IPP.
+	 * Register ourselves with LIPP.  This does a lot of stuff,
+	 * including invoking the LIPP registration code.
 	 */
 	 */
 	ret = hv_dev_pwrite(priv->hv_devhdl, 0,
 	ret = hv_dev_pwrite(priv->hv_devhdl, 0,
 			    (HV_VirtAddr)&config,
 			    (HV_VirtAddr)&config,
@@ -1101,8 +1099,11 @@ static void tile_net_register(void *dev_ptr)
 	PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_REGISTER_OFF) returned %d\n",
 	PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_REGISTER_OFF) returned %d\n",
 	       ret);
 	       ret);
 	if (ret < 0) {
 	if (ret < 0) {
-		printk(KERN_DEBUG "hv_dev_pwrite NETIO_IPP_INPUT_REGISTER_OFF"
-		       " failure %d\n", ret);
+		if (ret != NETIO_LINK_DOWN) {
+			printk(KERN_DEBUG "hv_dev_pwrite "
+			       "NETIO_IPP_INPUT_REGISTER_OFF failure %d\n",
+			       ret);
+		}
 		info->link_down = (ret == NETIO_LINK_DOWN);
 		info->link_down = (ret == NETIO_LINK_DOWN);
 		return;
 		return;
 	}
 	}
@@ -1145,15 +1146,47 @@ static void tile_net_register(void *dev_ptr)
 			   NETIO_IPP_GET_FASTIO_OFF);
 			   NETIO_IPP_GET_FASTIO_OFF);
 	PDEBUG("hv_dev_pread(NETIO_IPP_GET_FASTIO_OFF) returned %d\n", ret);
 	PDEBUG("hv_dev_pread(NETIO_IPP_GET_FASTIO_OFF) returned %d\n", ret);
 
 
-	netif_napi_add(dev, &info->napi, tile_net_poll, 64);
-
 	/* Now we are registered. */
 	/* Now we are registered. */
 	info->registered = true;
 	info->registered = true;
 }
 }
 
 
 
 
 /*
 /*
- * Unregister with hypervisor on each CPU.
+ * Deregister with hypervisor on the current CPU.
+ *
+ * This simply discards all our credits, so no more packets will be
+ * delivered to this tile.  There may still be packets in our queue.
+ *
+ * Also, disable the ingress interrupt.
+ */
+static void tile_net_deregister(void *dev_ptr)
+{
+	struct net_device *dev = (struct net_device *)dev_ptr;
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int my_cpu = smp_processor_id();
+	struct tile_net_cpu *info = priv->cpu[my_cpu];
+
+	/* Disable the ingress interrupt. */
+	disable_percpu_irq(priv->intr_id);
+
+	/* Do nothing else if not registered. */
+	if (info == NULL || !info->registered)
+		return;
+
+	{
+		struct tile_netio_queue *queue = &info->queue;
+		netio_queue_user_impl_t *qup = &queue->__user_part;
+
+		/* Discard all our credits. */
+		__netio_fastio_return_credits(qup->__fastio_index, -1);
+	}
+}
+
+
+/*
+ * Unregister with hypervisor on the current CPU.
+ *
+ * Also, disable the ingress interrupt.
  */
  */
 static void tile_net_unregister(void *dev_ptr)
 static void tile_net_unregister(void *dev_ptr)
 {
 {
@@ -1162,35 +1195,23 @@ static void tile_net_unregister(void *dev_ptr)
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 
 
-	int ret = 0;
+	int ret;
 	int dummy = 0;
 	int dummy = 0;
 
 
-	/* Do nothing if never registered. */
-	if (info == NULL)
-		return;
+	/* Disable the ingress interrupt. */
+	disable_percpu_irq(priv->intr_id);
 
 
-	/* Do nothing if already unregistered. */
-	if (!info->registered)
+	/* Do nothing else if not registered. */
+	if (info == NULL || !info->registered)
 		return;
 		return;
 
 
-	/*
-	 * Unregister ourselves with LIPP.
-	 */
+	/* Unregister ourselves with LIPP/LEPP. */
 	ret = hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
 	ret = hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
 			    sizeof(dummy), NETIO_IPP_INPUT_UNREGISTER_OFF);
 			    sizeof(dummy), NETIO_IPP_INPUT_UNREGISTER_OFF);
-	PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_UNREGISTER_OFF) returned %d\n",
-	       ret);
-	if (ret < 0) {
-		/* FIXME: Just panic? */
-		pr_err("hv_dev_pwrite NETIO_IPP_INPUT_UNREGISTER_OFF"
-		       " failure %d\n", ret);
-	}
+	if (ret < 0)
+		panic("Failed to unregister with LIPP/LEPP!\n");
 
 
-	/*
-	 * Discard all packets still in our NetIO queue.  Hopefully,
-	 * once the unregister call is complete, there will be no
-	 * packets still in flight on the IDN.
-	 */
+	/* Discard all packets still in our NetIO queue. */
 	tile_net_discard_packets(dev);
 	tile_net_discard_packets(dev);
 
 
 	/* Reset state. */
 	/* Reset state. */
@@ -1200,11 +1221,6 @@ static void tile_net_unregister(void *dev_ptr)
 	/* Cancel egress timer. */
 	/* Cancel egress timer. */
 	del_timer(&info->egress_timer);
 	del_timer(&info->egress_timer);
 	info->egress_timer_scheduled = false;
 	info->egress_timer_scheduled = false;
-
-	netif_napi_del(&info->napi);
-
-	/* Now we are unregistered. */
-	info->registered = false;
 }
 }
 
 
 
 
@@ -1212,18 +1228,28 @@ static void tile_net_unregister(void *dev_ptr)
  * Helper function for "tile_net_stop()".
  * Helper function for "tile_net_stop()".
  *
  *
  * Also used to handle registration failure in "tile_net_open_inner()",
  * Also used to handle registration failure in "tile_net_open_inner()",
- * when "fully_opened" is known to be false, and the various extra
- * steps in "tile_net_stop()" are not necessary.  ISSUE: It might be
- * simpler if we could just call "tile_net_stop()" anyway.
+ * when the various extra steps in "tile_net_stop()" are not necessary.
  */
  */
 static void tile_net_stop_aux(struct net_device *dev)
 static void tile_net_stop_aux(struct net_device *dev)
 {
 {
 	struct tile_net_priv *priv = netdev_priv(dev);
 	struct tile_net_priv *priv = netdev_priv(dev);
+	int i;
 
 
 	int dummy = 0;
 	int dummy = 0;
 
 
-	/* Unregister all tiles, so LIPP will stop delivering packets. */
+	/*
+	 * Unregister all tiles, so LIPP will stop delivering packets.
+	 * Also, delete all the "napi" objects (sequentially, to protect
+	 * "dev->napi_list").
+	 */
 	on_each_cpu(tile_net_unregister, (void *)dev, 1);
 	on_each_cpu(tile_net_unregister, (void *)dev, 1);
+	for_each_online_cpu(i) {
+		struct tile_net_cpu *info = priv->cpu[i];
+		if (info != NULL && info->registered) {
+			netif_napi_del(&info->napi);
+			info->registered = false;
+		}
+	}
 
 
 	/* Stop LIPP/LEPP. */
 	/* Stop LIPP/LEPP. */
 	if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
 	if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
@@ -1235,18 +1261,15 @@ static void tile_net_stop_aux(struct net_device *dev)
 
 
 
 
 /*
 /*
- * Disable ingress interrupts for the given device on the current cpu.
+ * Disable NAPI for the given device on the current cpu.
  */
  */
-static void tile_net_disable_intr(void *dev_ptr)
+static void tile_net_stop_disable(void *dev_ptr)
 {
 {
 	struct net_device *dev = (struct net_device *)dev_ptr;
 	struct net_device *dev = (struct net_device *)dev_ptr;
 	struct tile_net_priv *priv = netdev_priv(dev);
 	struct tile_net_priv *priv = netdev_priv(dev);
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 
 
-	/* Disable hypervisor interrupt. */
-	disable_percpu_irq(priv->intr_id);
-
 	/* Disable NAPI if needed. */
 	/* Disable NAPI if needed. */
 	if (info != NULL && info->napi_enabled) {
 	if (info != NULL && info->napi_enabled) {
 		napi_disable(&info->napi);
 		napi_disable(&info->napi);
@@ -1256,21 +1279,24 @@ static void tile_net_disable_intr(void *dev_ptr)
 
 
 
 
 /*
 /*
- * Enable ingress interrupts for the given device on the current cpu.
+ * Enable NAPI and the ingress interrupt for the given device
+ * on the current cpu.
+ *
+ * ISSUE: Only do this for "network cpus"?
  */
  */
-static void tile_net_enable_intr(void *dev_ptr)
+static void tile_net_open_enable(void *dev_ptr)
 {
 {
 	struct net_device *dev = (struct net_device *)dev_ptr;
 	struct net_device *dev = (struct net_device *)dev_ptr;
 	struct tile_net_priv *priv = netdev_priv(dev);
 	struct tile_net_priv *priv = netdev_priv(dev);
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 	struct tile_net_cpu *info = priv->cpu[my_cpu];
 
 
-	/* Enable hypervisor interrupt. */
-	enable_percpu_irq(priv->intr_id);
-
 	/* Enable NAPI. */
 	/* Enable NAPI. */
 	napi_enable(&info->napi);
 	napi_enable(&info->napi);
 	info->napi_enabled = true;
 	info->napi_enabled = true;
+
+	/* Enable the ingress interrupt. */
+	enable_percpu_irq(priv->intr_id);
 }
 }
 
 
 
 
@@ -1288,8 +1314,9 @@ static int tile_net_open_inner(struct net_device *dev)
 	int my_cpu = smp_processor_id();
 	int my_cpu = smp_processor_id();
 	struct tile_net_cpu *info;
 	struct tile_net_cpu *info;
 	struct tile_netio_queue *queue;
 	struct tile_netio_queue *queue;
-	unsigned int irq;
+	int result = 0;
 	int i;
 	int i;
+	int dummy = 0;
 
 
 	/*
 	/*
 	 * First try to register just on the local CPU, and handle any
 	 * First try to register just on the local CPU, and handle any
@@ -1307,42 +1334,52 @@ static int tile_net_open_inner(struct net_device *dev)
 	/*
 	/*
 	 * Now register everywhere else.  If any registration fails,
 	 * Now register everywhere else.  If any registration fails,
 	 * even for "link down" (which might not be possible), we
 	 * even for "link down" (which might not be possible), we
-	 * clean up using "tile_net_stop_aux()".
+	 * clean up using "tile_net_stop_aux()".  Also, add all the
+	 * "napi" objects (sequentially, to protect "dev->napi_list").
+	 * ISSUE: Only use "netif_napi_add()" for "network cpus"?
 	 */
 	 */
 	smp_call_function(tile_net_register, (void *)dev, 1);
 	smp_call_function(tile_net_register, (void *)dev, 1);
 	for_each_online_cpu(i) {
 	for_each_online_cpu(i) {
-		if (!priv->cpu[i]->registered) {
-			tile_net_stop_aux(dev);
-			return -EAGAIN;
-		}
+		struct tile_net_cpu *info = priv->cpu[i];
+		if (info->registered)
+			netif_napi_add(dev, &info->napi, tile_net_poll, 64);
+		else
+			result = -EAGAIN;
+	}
+	if (result != 0) {
+		tile_net_stop_aux(dev);
+		return result;
 	}
 	}
 
 
 	queue = &info->queue;
 	queue = &info->queue;
 
 
-	/*
-	 * Set the device intr bit mask.
-	 * The tile_net_register above sets per tile __intr_id.
-	 */
-	priv->intr_id = queue->__system_part->__intr_id;
-	BUG_ON(!priv->intr_id);
-
-	/*
-	 * Register the device interrupt handler.
-	 * The __ffs() function returns the index into the interrupt handler
-	 * table from the interrupt bit mask which should have one bit
-	 * and one bit only set.
-	 */
-	irq = __ffs(priv->intr_id);
-	tile_irq_activate(irq, TILE_IRQ_PERCPU);
-	BUG_ON(request_irq(irq, tile_net_handle_ingress_interrupt,
-			   0, dev->name, (void *)dev) != 0);
+	if (priv->intr_id == 0) {
+		unsigned int irq;
 
 
-	/* ISSUE: How could "priv->fully_opened" ever be "true" here? */
-
-	if (!priv->fully_opened) {
+		/*
+		 * Acquire the irq allocated by the hypervisor.  Every
+		 * queue gets the same irq.  The "__intr_id" field is
+		 * "1 << irq", so we use "__ffs()" to extract "irq".
+		 */
+		priv->intr_id = queue->__system_part->__intr_id;
+		BUG_ON(priv->intr_id == 0);
+		irq = __ffs(priv->intr_id);
 
 
-		int dummy = 0;
+		/*
+		 * Register the ingress interrupt handler for this
+		 * device, permanently.
+		 *
+		 * We used to call "free_irq()" in "tile_net_stop()",
+		 * and then re-register the handler here every time,
+		 * but that caused DNP errors in "handle_IRQ_event()"
+		 * because "desc->action" was NULL.  See bug 9143.
+		 */
+		tile_irq_activate(irq, TILE_IRQ_PERCPU);
+		BUG_ON(request_irq(irq, tile_net_handle_ingress_interrupt,
+				   0, dev->name, (void *)dev) != 0);
+	}
 
 
+	{
 		/* Allocate initial buffers. */
 		/* Allocate initial buffers. */
 
 
 		int max_buffers =
 		int max_buffers =
@@ -1359,18 +1396,21 @@ static int tile_net_open_inner(struct net_device *dev)
 		if (info->num_needed_small_buffers != 0 ||
 		if (info->num_needed_small_buffers != 0 ||
 		    info->num_needed_large_buffers != 0)
 		    info->num_needed_large_buffers != 0)
 			panic("Insufficient memory for buffer stack!");
 			panic("Insufficient memory for buffer stack!");
+	}
 
 
-		/* Start LIPP/LEPP and activate "ingress" at the shim. */
-		if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
-				  sizeof(dummy), NETIO_IPP_INPUT_INIT_OFF) < 0)
-			panic("Failed to activate the LIPP Shim!\n");
+	/* We are about to be active. */
+	priv->active = true;
 
 
-		priv->fully_opened = 1;
-	}
+	/* Make sure "active" is visible to all tiles. */
+	mb();
 
 
-	/* On each tile, enable the hypervisor to trigger interrupts. */
-	/* ISSUE: Do this before starting LIPP/LEPP? */
-	on_each_cpu(tile_net_enable_intr, (void *)dev, 1);
+	/* On each tile, enable NAPI and the ingress interrupt. */
+	on_each_cpu(tile_net_open_enable, (void *)dev, 1);
+
+	/* Start LIPP/LEPP and activate "ingress" at the shim. */
+	if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy,
+			  sizeof(dummy), NETIO_IPP_INPUT_INIT_OFF) < 0)
+		panic("Failed to activate the LIPP Shim!\n");
 
 
 	/* Start our transmit queue. */
 	/* Start our transmit queue. */
 	netif_start_queue(dev);
 	netif_start_queue(dev);
@@ -1396,9 +1436,9 @@ static void tile_net_open_retry(struct work_struct *w)
 	 * ourselves to try again later; otherwise, tell Linux we now have
 	 * ourselves to try again later; otherwise, tell Linux we now have
 	 * a working link.  ISSUE: What if the return value is negative?
 	 * a working link.  ISSUE: What if the return value is negative?
 	 */
 	 */
-	if (tile_net_open_inner(priv->dev))
-		schedule_delayed_work_on(singlethread_cpu, &priv->retry_work,
-					 TILE_NET_RETRY_INTERVAL);
+	if (tile_net_open_inner(priv->dev) != 0)
+		schedule_delayed_work(&priv->retry_work,
+				      TILE_NET_RETRY_INTERVAL);
 	else
 	else
 		netif_carrier_on(priv->dev);
 		netif_carrier_on(priv->dev);
 }
 }
@@ -1412,8 +1452,8 @@ static void tile_net_open_retry(struct work_struct *w)
  * The open entry point is called when a network interface is made
  * The open entry point is called when a network interface is made
  * active by the system (IFF_UP).  At this point all resources needed
  * active by the system (IFF_UP).  At this point all resources needed
  * for transmit and receive operations are allocated, the interrupt
  * for transmit and receive operations are allocated, the interrupt
- * handler is registered with the OS, the watchdog timer is started,
- * and the stack is notified that the interface is ready.
+ * handler is registered with the OS (if needed), the watchdog timer
+ * is started, and the stack is notified that the interface is ready.
  *
  *
  * If the actual link is not available yet, then we tell Linux that
  * If the actual link is not available yet, then we tell Linux that
  * we have no carrier, and we keep checking until the link comes up.
  * we have no carrier, and we keep checking until the link comes up.
@@ -1468,6 +1508,10 @@ static int tile_net_open(struct net_device *dev)
 #endif
 #endif
 
 
 		priv->partly_opened = 1;
 		priv->partly_opened = 1;
+
+	} else {
+		/* FIXME: Is this possible? */
+		/* printk("Already partly opened.\n"); */
 	}
 	}
 
 
 	/*
 	/*
@@ -1487,57 +1531,17 @@ static int tile_net_open(struct net_device *dev)
 	 * and then remember to try again later.
 	 * and then remember to try again later.
 	 */
 	 */
 	netif_carrier_off(dev);
 	netif_carrier_off(dev);
-	schedule_delayed_work_on(singlethread_cpu, &priv->retry_work,
-				 TILE_NET_RETRY_INTERVAL);
+	schedule_delayed_work(&priv->retry_work, TILE_NET_RETRY_INTERVAL);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
 
 
-/*
- * Disables a network interface.
- *
- * Returns 0, this is not allowed to fail.
- *
- * The close entry point is called when an interface is de-activated
- * by the OS.  The hardware is still under the drivers control, but
- * needs to be disabled.  A global MAC reset is issued to stop the
- * hardware, and all transmit and receive resources are freed.
- *
- * ISSUE: Can this can be called while "tile_net_poll()" is running?
- */
-static int tile_net_stop(struct net_device *dev)
+static int tile_net_drain_lipp_buffers(struct tile_net_priv *priv)
 {
 {
-	struct tile_net_priv *priv = netdev_priv(dev);
-
-	bool pending = true;
-
-	PDEBUG("tile_net_stop()\n");
-
-	/* ISSUE: Only needed if not yet fully open. */
-	cancel_delayed_work_sync(&priv->retry_work);
-
-	/* Can't transmit any more. */
-	netif_stop_queue(dev);
-
-	/*
-	 * Disable hypervisor interrupts on each tile.
-	 */
-	on_each_cpu(tile_net_disable_intr, (void *)dev, 1);
-
-	/*
-	 * Unregister the interrupt handler.
-	 * The __ffs() function returns the index into the interrupt handler
-	 * table from the interrupt bit mask which should have one bit
-	 * and one bit only set.
-	 */
-	if (priv->intr_id)
-		free_irq(__ffs(priv->intr_id), dev);
-
-	/*
-	 * Drain all the LIPP buffers.
-	 */
+	int n = 0;
 
 
+	/* Drain all the LIPP buffers. */
 	while (true) {
 	while (true) {
 		int buffer;
 		int buffer;
 
 
@@ -1560,43 +1564,105 @@ static int tile_net_stop(struct net_device *dev)
 
 
 			kfree_skb(skb);
 			kfree_skb(skb);
 		}
 		}
+
+		n++;
 	}
 	}
 
 
-	/* Stop LIPP/LEPP. */
-	tile_net_stop_aux(dev);
+	return n;
+}
 
 
 
 
-	priv->fully_opened = 0;
+/*
+ * Disables a network interface.
+ *
+ * Returns 0, this is not allowed to fail.
+ *
+ * The close entry point is called when an interface is de-activated
+ * by the OS.  The hardware is still under the drivers control, but
+ * needs to be disabled.  A global MAC reset is issued to stop the
+ * hardware, and all transmit and receive resources are freed.
+ *
+ * ISSUE: How closely does "netif_running(dev)" mirror "priv->active"?
+ *
+ * Before we are called by "__dev_close()", "netif_running()" will
+ * have been cleared, so no NEW calls to "tile_net_poll()" will be
+ * made by "netpoll_poll_dev()".
+ *
+ * Often, this can cause some tiles to still have packets in their
+ * queues, so we must call "tile_net_discard_packets()" later.
+ *
+ * Note that some other tile may still be INSIDE "tile_net_poll()",
+ * and in fact, many will be, if there is heavy network load.
+ *
+ * Calling "on_each_cpu(tile_net_stop_disable, (void *)dev, 1)" when
+ * any tile is still "napi_schedule()"'d will induce a horrible crash
+ * when "msleep()" is called.  This includes tiles which are inside
+ * "tile_net_poll()" which have not yet called "napi_complete()".
+ *
+ * So, we must first try to wait long enough for other tiles to finish
+ * with any current "tile_net_poll()" call, and, hopefully, to clear
+ * the "scheduled" flag.  ISSUE: It is unclear what happens to tiles
+ * which have called "napi_schedule()" but which had not yet tried to
+ * call "tile_net_poll()", or which exhausted their budget inside
+ * "tile_net_poll()" just before this function was called.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	PDEBUG("tile_net_stop()\n");
 
 
+	/* Start discarding packets. */
+	priv->active = false;
+
+	/* Make sure "active" is visible to all tiles. */
+	mb();
 
 
 	/*
 	/*
-	 * XXX: ISSUE: It appears that, in practice anyway, by the
-	 * time we get here, there are no pending completions.
+	 * On each tile, make sure no NEW packets get delivered, and
+	 * disable the ingress interrupt.
+	 *
+	 * Note that the ingress interrupt can fire AFTER this,
+	 * presumably due to packets which were recently delivered,
+	 * but it will have no effect.
 	 */
 	 */
-	while (pending) {
+	on_each_cpu(tile_net_deregister, (void *)dev, 1);
 
 
-		struct sk_buff *olds[32];
-		unsigned int wanted = 32;
-		unsigned int i, nolds = 0;
+	/* Optimistically drain LIPP buffers. */
+	(void)tile_net_drain_lipp_buffers(priv);
 
 
-		nolds = tile_net_lepp_grab_comps(dev, olds,
-						 wanted, &pending);
+	/* ISSUE: Only needed if not yet fully open. */
+	cancel_delayed_work_sync(&priv->retry_work);
 
 
-		/* ISSUE: We have never actually seen this debug spew. */
-		if (nolds != 0)
-			pr_info("During tile_net_stop(), grabbed %d comps.\n",
-			       nolds);
+	/* Can't transmit any more. */
+	netif_stop_queue(dev);
 
 
-		for (i = 0; i < nolds; i++)
-			kfree_skb(olds[i]);
-	}
+	/* Disable NAPI on each tile. */
+	on_each_cpu(tile_net_stop_disable, (void *)dev, 1);
+
+	/*
+	 * Drain any remaining LIPP buffers.  NOTE: This "printk()"
+	 * has never been observed, but in theory it could happen.
+	 */
+	if (tile_net_drain_lipp_buffers(priv) != 0)
+		printk("Had to drain some extra LIPP buffers!\n");
 
 
+	/* Stop LIPP/LEPP. */
+	tile_net_stop_aux(dev);
+
+	/*
+	 * ISSUE: It appears that, in practice anyway, by the time we
+	 * get here, there are no pending completions, but just in case,
+	 * we free (all of) them anyway.
+	 */
+	while (tile_net_lepp_free_comps(dev, true))
+		/* loop */;
 
 
 	/* Wipe the EPP queue. */
 	/* Wipe the EPP queue. */
-	memset(priv->epp_queue, 0, sizeof(lepp_queue_t));
+	memset(priv->eq, 0, sizeof(lepp_queue_t));
 
 
 	/* Evict the EPP queue. */
 	/* Evict the EPP queue. */
-	finv_buffer(priv->epp_queue, PAGE_SIZE);
+	finv_buffer(priv->eq, EQ_SIZE);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1620,7 +1686,7 @@ static unsigned int tile_net_tx_frags(lepp_frag_t *frags,
 	if (b_len != 0) {
 	if (b_len != 0) {
 
 
 		if (!hash_default)
 		if (!hash_default)
-			finv_buffer_remote(b_data, b_len);
+			finv_buffer_remote(b_data, b_len, 0);
 
 
 		cpa = __pa(b_data);
 		cpa = __pa(b_data);
 		frags[n].cpa_lo = cpa;
 		frags[n].cpa_lo = cpa;
@@ -1643,7 +1709,7 @@ static unsigned int tile_net_tx_frags(lepp_frag_t *frags,
 		if (!hash_default) {
 		if (!hash_default) {
 			void *va = pfn_to_kaddr(pfn) + f->page_offset;
 			void *va = pfn_to_kaddr(pfn) + f->page_offset;
 			BUG_ON(PageHighMem(f->page));
 			BUG_ON(PageHighMem(f->page));
-			finv_buffer_remote(va, f->size);
+			finv_buffer_remote(va, f->size, 0);
 		}
 		}
 
 
 		cpa = ((phys_addr_t)pfn << PAGE_SHIFT) + f->page_offset;
 		cpa = ((phys_addr_t)pfn << PAGE_SHIFT) + f->page_offset;
@@ -1742,17 +1808,15 @@ static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
 
 
 	unsigned long irqflags;
 	unsigned long irqflags;
 
 
-	lepp_queue_t *eq = priv->epp_queue;
+	lepp_queue_t *eq = priv->eq;
 
 
-	struct sk_buff *olds[4];
-	unsigned int wanted = 4;
+	struct sk_buff *olds[8];
+	unsigned int wanted = 8;
 	unsigned int i, nolds = 0;
 	unsigned int i, nolds = 0;
 
 
 	unsigned int cmd_head, cmd_tail, cmd_next;
 	unsigned int cmd_head, cmd_tail, cmd_next;
 	unsigned int comp_tail;
 	unsigned int comp_tail;
 
 
-	unsigned int free_slots;
-
 
 
 	/* Paranoia. */
 	/* Paranoia. */
 	BUG_ON(skb->protocol != htons(ETH_P_IP));
 	BUG_ON(skb->protocol != htons(ETH_P_IP));
@@ -1780,34 +1844,32 @@ static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
 
 
 	/* Enqueue the command. */
 	/* Enqueue the command. */
 
 
-	spin_lock_irqsave(&priv->cmd_lock, irqflags);
+	spin_lock_irqsave(&priv->eq_lock, irqflags);
 
 
 	/*
 	/*
 	 * Handle completions if needed to make room.
 	 * Handle completions if needed to make room.
 	 * HACK: Spin until there is sufficient room.
 	 * HACK: Spin until there is sufficient room.
 	 */
 	 */
-	free_slots = lepp_num_free_comp_slots(eq);
-	if (free_slots < 1) {
-spin:
-		nolds += tile_net_lepp_grab_comps(dev, olds + nolds,
-						  wanted - nolds, NULL);
-		if (lepp_num_free_comp_slots(eq) < 1)
-			goto spin;
+	if (lepp_num_free_comp_slots(eq) == 0) {
+		nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 0);
+		if (nolds == 0) {
+busy:
+			spin_unlock_irqrestore(&priv->eq_lock, irqflags);
+			return NETDEV_TX_BUSY;
+		}
 	}
 	}
 
 
 	cmd_head = eq->cmd_head;
 	cmd_head = eq->cmd_head;
 	cmd_tail = eq->cmd_tail;
 	cmd_tail = eq->cmd_tail;
 
 
-	/* NOTE: The "gotos" below are untested. */
-
 	/* Prepare to advance, detecting full queue. */
 	/* Prepare to advance, detecting full queue. */
 	cmd_next = cmd_tail + cmd_size;
 	cmd_next = cmd_tail + cmd_size;
 	if (cmd_tail < cmd_head && cmd_next >= cmd_head)
 	if (cmd_tail < cmd_head && cmd_next >= cmd_head)
-		goto spin;
+		goto busy;
 	if (cmd_next > LEPP_CMD_LIMIT) {
 	if (cmd_next > LEPP_CMD_LIMIT) {
 		cmd_next = 0;
 		cmd_next = 0;
 		if (cmd_next == cmd_head)
 		if (cmd_next == cmd_head)
-			goto spin;
+			goto busy;
 	}
 	}
 
 
 	/* Copy the command. */
 	/* Copy the command. */
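
The full-queue test above treats cmd_head and cmd_tail as byte offsets into a circular command area that wraps at LEPP_CMD_LIMIT. A stand-alone sketch of the same test; the 512-byte limit and the fixed command size are illustrative assumptions:

#include <stdbool.h>
#include <stdio.h>

#define CMD_LIMIT 512			/* stand-in for LEPP_CMD_LIMIT */

/* Returns true (and the advanced tail) if one more command fits. */
static bool cmd_fits(unsigned int head, unsigned int tail,
		     unsigned int size, unsigned int *next_out)
{
	unsigned int next = tail + size;

	if (tail < head && next >= head)
		return false;		/* would overrun unconsumed cmds */
	if (next > CMD_LIMIT) {
		next = 0;		/* wrap around */
		if (next == head)
			return false;	/* head still parked at offset 0 */
	}
	*next_out = next;
	return true;
}

int main(void)
{
	unsigned int next;

	printf("empty ring:  %d\n", cmd_fits(0, 0, 64, &next));	/* 1 */
	printf("nearly full: %d\n", cmd_fits(128, 96, 64, &next));	/* 0 */
	printf("wrap needed: %d\n", cmd_fits(64, 480, 64, &next));	/* 1 */
	return 0;
}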
@@ -1823,14 +1885,18 @@ spin:
 	eq->comp_tail = comp_tail;
 	eq->comp_tail = comp_tail;
 
 
 	/* Flush before allowing LEPP to handle the command. */
 	/* Flush before allowing LEPP to handle the command. */
+	/* ISSUE: Is this the optimal location for the flush? */
 	__insn_mf();
 	__insn_mf();
 
 
 	eq->cmd_tail = cmd_tail;
 	eq->cmd_tail = cmd_tail;
 
 
-	spin_unlock_irqrestore(&priv->cmd_lock, irqflags);
-
+	/* NOTE: Using "4" here is more efficient than "0" or "2", */
+	/* and, strangely, more efficient than pre-checking the number */
+	/* of available completions, and comparing it to 4. */
 	if (nolds == 0)
 	if (nolds == 0)
-		nolds = tile_net_lepp_grab_comps(dev, olds, wanted, NULL);
+		nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 4);
+
+	spin_unlock_irqrestore(&priv->eq_lock, irqflags);
 
 
 	/* Handle completions. */
 	/* Handle completions. */
 	for (i = 0; i < nolds; i++)
 	for (i = 0; i < nolds; i++)
@@ -1870,10 +1936,10 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 
 
 	unsigned int num_frags;
 	unsigned int num_frags;
 
 
-	lepp_queue_t *eq = priv->epp_queue;
+	lepp_queue_t *eq = priv->eq;
 
 
-	struct sk_buff *olds[4];
-	unsigned int wanted = 4;
+	struct sk_buff *olds[8];
+	unsigned int wanted = 8;
 	unsigned int i, nolds = 0;
 	unsigned int i, nolds = 0;
 
 
 	unsigned int cmd_size = sizeof(lepp_cmd_t);
 	unsigned int cmd_size = sizeof(lepp_cmd_t);
@@ -1883,8 +1949,6 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 
 
 	lepp_cmd_t cmds[LEPP_MAX_FRAGS];
 	lepp_cmd_t cmds[LEPP_MAX_FRAGS];
 
 
-	unsigned int free_slots;
-
 
 
 	/*
 	/*
 	 * This is paranoia, since we think that if the link doesn't come
 	 * This is paranoia, since we think that if the link doesn't come
@@ -1905,7 +1969,8 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 	if (hash_default) {
 	if (hash_default) {
 		HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)data);
 		HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)data);
 		if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3)
 		if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3)
-			panic("Non-coherent egress buffer!");
+			panic("Non-HFH egress buffer! VA=%p Mode=%d PTE=%llx",
+			      data, hv_pte_get_mode(pte), hv_pte_val(pte));
 	}
 	}
 #endif
 #endif
 #endif
 #endif
@@ -1958,37 +2023,35 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
 
 	/* Enqueue the commands. */
 
-	spin_lock_irqsave(&priv->cmd_lock, irqflags);
+	spin_lock_irqsave(&priv->eq_lock, irqflags);
 
 	/*
 	 * Handle completions if needed to make room.
 	 * HACK: Spin until there is sufficient room.
 	 */
-	free_slots = lepp_num_free_comp_slots(eq);
-	if (free_slots < 1) {
-spin:
-		nolds += tile_net_lepp_grab_comps(dev, olds + nolds,
-						  wanted - nolds, NULL);
-		if (lepp_num_free_comp_slots(eq) < 1)
-			goto spin;
+	if (lepp_num_free_comp_slots(eq) == 0) {
+		nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 0);
+		if (nolds == 0) {
+busy:
+			spin_unlock_irqrestore(&priv->eq_lock, irqflags);
+			return NETDEV_TX_BUSY;
+		}
 	}
 
 	cmd_head = eq->cmd_head;
 	cmd_tail = eq->cmd_tail;
 
-	/* NOTE: The "gotos" below are untested. */
-
 	/* Copy the commands, or fail. */
 	for (i = 0; i < num_frags; i++) {
 
 		/* Prepare to advance, detecting full queue. */
 		cmd_next = cmd_tail + cmd_size;
 		if (cmd_tail < cmd_head && cmd_next >= cmd_head)
-			goto spin;
+			goto busy;
 		if (cmd_next > LEPP_CMD_LIMIT) {
 			cmd_next = 0;
 			if (cmd_next == cmd_head)
-				goto spin;
+				goto busy;
 		}
 
 		/* Copy the command. */
@@ -2005,14 +2068,18 @@ spin:
 	eq->comp_tail = comp_tail;
 
 	/* Flush before allowing LEPP to handle the command. */
+	/* ISSUE: Is this the optimal location for the flush? */
 	__insn_mf();
 
 	eq->cmd_tail = cmd_tail;
 
-	spin_unlock_irqrestore(&priv->cmd_lock, irqflags);
-
+	/* NOTE: Using "4" here is more efficient than "0" or "2", */
+	/* and, strangely, more efficient than pre-checking the number */
+	/* of available completions, and comparing it to 4. */
 	if (nolds == 0)
-		nolds = tile_net_lepp_grab_comps(dev, olds, wanted, NULL);
+		nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 4);
+
+	spin_unlock_irqrestore(&priv->eq_lock, irqflags);
 
 	/* Handle completions. */
 	for (i = 0; i < nolds; i++)
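
Both egress paths above now write the command bytes, issue __insn_mf(), and only then advance eq->cmd_tail, and they harvest up to four completions while still holding eq_lock (the in-diff NOTE records that a budget of 4 simply measured faster than 0 or 2). __insn_mf() is a tile-specific full memory fence; the sketch below is only a portable C11 analogue of the same "write data, fence, publish index" ordering, and struct ring, publish_cmd(), CMD_BYTES and RING_BYTES are hypothetical names, not the driver's.

#include <stdatomic.h>
#include <string.h>

#define CMD_BYTES  64
#define RING_BYTES 4096

struct ring {
	unsigned char buf[RING_BYTES];
	atomic_uint tail;		/* byte offset visible to the consumer */
};

static void publish_cmd(struct ring *r, const void *cmd, unsigned int tail)
{
	memcpy(&r->buf[tail], cmd, CMD_BYTES);	/* stage the command bytes */
	/* Release store: the memcpy cannot be reordered past the publish. */
	atomic_store_explicit(&r->tail, tail + CMD_BYTES, memory_order_release);
}

int main(void)
{
	static struct ring r;
	unsigned char cmd[CMD_BYTES] = { 0 };

	publish_cmd(&r, cmd, 0);	/* first command lands at offset 0 */
	return 0;
}
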
@@ -2261,7 +2328,6 @@ static struct net_device *tile_net_dev_init(const char *name)
 	int ret;
 	struct net_device *dev;
 	struct tile_net_priv *priv;
-	struct page *page;
 
 	/*
 	 * Allocate the device structure.  This allocates "priv", calls
@@ -2285,23 +2351,21 @@ static struct net_device *tile_net_dev_init(const char *name)
 
 	INIT_DELAYED_WORK(&priv->retry_work, tile_net_open_retry);
 
-	spin_lock_init(&priv->cmd_lock);
-	spin_lock_init(&priv->comp_lock);
+	spin_lock_init(&priv->eq_lock);
 
-	/* Allocate "epp_queue". */
-	BUG_ON(get_order(sizeof(lepp_queue_t)) != 0);
-	page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
-	if (!page) {
+	/* Allocate "eq". */
+	priv->eq_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, EQ_ORDER);
+	if (!priv->eq_pages) {
 		free_netdev(dev);
 		return NULL;
 	}
-	priv->epp_queue = page_address(page);
+	priv->eq = page_address(priv->eq_pages);
 
 	/* Register the network device. */
 	ret = register_netdev(dev);
 	if (ret) {
 		pr_err("register_netdev %s failed %d\n", dev->name, ret);
-		free_page((unsigned long)priv->epp_queue);
+		__free_pages(priv->eq_pages, EQ_ORDER);
 		free_netdev(dev);
 		return NULL;
 	}
@@ -2310,7 +2374,7 @@ static struct net_device *tile_net_dev_init(const char *name)
 	ret = tile_net_get_mac(dev);
 	if (ret < 0) {
 		unregister_netdev(dev);
-		free_page((unsigned long)priv->epp_queue);
+		__free_pages(priv->eq_pages, EQ_ORDER);
 		free_netdev(dev);
 		return NULL;
 	}
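
The allocation above grows from a single zeroed page (with a BUG_ON that lepp_queue_t fit in one page) to an EQ_ORDER block of pages, so every error path switches from free_page() to __free_pages() with the same order. The module sketch below shows only that allocate/map/free pairing under an assumed kernel-module context; SKETCH_EQ_ORDER and the eq_sketch_* names are illustrative, not the driver's.

#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/mm.h>

#define SKETCH_EQ_ORDER 2		/* 2^2 = 4 zeroed pages (illustrative) */

static struct page *eq_pages;
static void *eq_va;

static int __init eq_sketch_init(void)
{
	eq_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, SKETCH_EQ_ORDER);
	if (!eq_pages)
		return -ENOMEM;
	eq_va = page_address(eq_pages);	/* kernel mapping of the whole block */
	pr_info("eq sketch at %p\n", eq_va);
	return 0;
}

static void __exit eq_sketch_exit(void)
{
	__free_pages(eq_pages, SKETCH_EQ_ORDER);	/* order must match */
}

module_init(eq_sketch_init);
module_exit(eq_sketch_exit);
MODULE_LICENSE("GPL");
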
@@ -2321,6 +2385,9 @@ static struct net_device *tile_net_dev_init(const char *name)
 
 /*
  * Module cleanup.
+ *
+ * FIXME: If compiled as a module, this module cannot be "unloaded",
+ * because the "ingress interrupt handler" is registered permanently.
  */
 static void tile_net_cleanup(void)
 {
@@ -2331,8 +2398,8 @@ static void tile_net_cleanup(void)
 			struct net_device *dev = tile_net_devs[i];
 			struct tile_net_priv *priv = netdev_priv(dev);
 			unregister_netdev(dev);
-			finv_buffer(priv->epp_queue, PAGE_SIZE);
-			free_page((unsigned long)priv->epp_queue);
+			finv_buffer(priv->eq, EQ_SIZE);
+			__free_pages(priv->eq_pages, EQ_ORDER);
 			free_netdev(dev);
 		}
 	}
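
The cleanup above flushes and invalidates the cache over the egress queue (finv_buffer() now covers EQ_SIZE rather than PAGE_SIZE) before handing the EQ_ORDER pages back to the allocator, so the size and the order must describe the same region. One way to keep them in lockstep, sketched below with hypothetical SKETCH_* names (the driver's own header may define EQ_SIZE and EQ_ORDER differently), is to derive the order from the size with get_order(), the same helper the removed BUG_ON used.

#include <linux/mm.h>				/* get_order() */

/* Illustrative placeholder for the real lepp_queue_t layout. */
typedef struct { unsigned char pad[9000]; } lepp_queue_sketch_t;

#define SKETCH_EQ_SIZE  sizeof(lepp_queue_sketch_t)
#define SKETCH_EQ_ORDER get_order(SKETCH_EQ_SIZE)	/* smallest 2^n pages that fit */
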
@@ -2355,7 +2422,12 @@ static int tile_net_init_module(void)
 }
 
 
+module_init(tile_net_init_module);
+module_exit(tile_net_cleanup);
+
+
 #ifndef MODULE
+
 /*
  * The "network_cpus" boot argument specifies the cpus that are dedicated
  * The "network_cpus" boot argument specifies the cpus that are dedicated
  * to handle ingress packets.
@@ -2391,8 +2463,5 @@ static int __init network_cpus_setup(char *str)
 	return 0;
 }
 __setup("network_cpus=", network_cpus_setup);
-#endif
-
 
-module_init(tile_net_init_module);
-module_exit(tile_net_cleanup);
+#endif