Browse Source

Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-2.6

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-2.6: (23 commits)
  sparc: sunzilog uart order
  [SPARC64]: Detect trap frames in stack backtraces.
  [SPARC64]: %l6 trap return handling no longer necessary.
  [SPARC64]: Use trap type stored in pt_regs to handle syscall restart.
  [SPARC64]: Store magic cookie and trap type in pt_regs.
  [SPARC64]: PROM debug console can be CON_ANYTIME.
  sparc64: cleanup after SunOS/Solaris binary emulation removal
  sparc: cleanup after SunOS binary emulation removal
  [SPARC64]: Add NUMA support.
  [SPARC64]: Allocate TSB node-local.
  [SPARC64]: NUMA device infrastructure.
  [SPARC64]: Kill pci_iommu_table_init() declaration.
  [SPARC64]: Once we have the boot cmdline, call parse_early_param()
  [SPARC64]: Remove unused asm-sparc64/numnodes.h
  [SPARC64]: Decrease SECTION_SIZE_BITS to 30.
  [SPARC64]: Initialize MDESC earlier and use lmb_alloc()
  [SPARC64]: Use lmb_alloc() for PROM device tree.
  [SPARC64]: Call real_setup_per_cpu_areas() earlier and use lmb_alloc().
  [SPARC64]: Fully use LMB information in bootmem_init().
  [SPARC64]: Start using LMB information in bootmem_init().
  ...
Linus Torvalds 17 years ago
parent
commit
e270b51df6
49 changed files with 1160 additions and 484 deletions
  1. 1 1
      Makefile
  2. 0 1
      arch/sparc/kernel/entry.S
  3. 0 5
      arch/sparc/kernel/signal.c
  4. 20 0
      arch/sparc64/Kconfig
  5. 64 35
      arch/sparc64/defconfig
  6. 1 0
      arch/sparc64/kernel/ebus.c
  7. 18 19
      arch/sparc64/kernel/entry.S
  8. 0 1
      arch/sparc64/kernel/entry.h
  9. 4 0
      arch/sparc64/kernel/etrap.S
  10. 21 12
      arch/sparc64/kernel/iommu.c
  11. 1 0
      arch/sparc64/kernel/isa.c
  12. 16 12
      arch/sparc64/kernel/mdesc.c
  13. 11 1
      arch/sparc64/kernel/of_device.c
  14. 12 0
      arch/sparc64/kernel/pci.c
  15. 4 1
      arch/sparc64/kernel/pci_fire.c
  16. 2 2
      arch/sparc64/kernel/pci_impl.h
  17. 7 1
      arch/sparc64/kernel/pci_msi.c
  18. 4 1
      arch/sparc64/kernel/pci_psycho.c
  19. 3 1
      arch/sparc64/kernel/pci_sabre.c
  20. 4 1
      arch/sparc64/kernel/pci_schizo.c
  21. 10 3
      arch/sparc64/kernel/pci_sun4v.c
  22. 9 5
      arch/sparc64/kernel/prom.c
  23. 4 17
      arch/sparc64/kernel/rtrap.S
  24. 2 1
      arch/sparc64/kernel/sbus.c
  25. 2 1
      arch/sparc64/kernel/setup.c
  26. 14 11
      arch/sparc64/kernel/signal.c
  27. 8 12
      arch/sparc64/kernel/signal32.c
  28. 8 3
      arch/sparc64/kernel/smp.c
  29. 0 2
      arch/sparc64/kernel/sparc64_ksyms.c
  30. 13 3
      arch/sparc64/kernel/stacktrace.c
  31. 8 8
      arch/sparc64/kernel/sun4v_tlb_miss.S
  32. 12 0
      arch/sparc64/kernel/sysfs.c
  33. 15 4
      arch/sparc64/kernel/traps.c
  34. 1 1
      arch/sparc64/kernel/tsb.S
  35. 6 6
      arch/sparc64/kernel/winfixup.S
  36. 713 276
      arch/sparc64/mm/init.c
  37. 2 1
      arch/sparc64/mm/tsb.c
  38. 1 3
      arch/sparc64/mm/ultra.S
  39. 17 13
      drivers/serial/sunzilog.c
  40. 2 0
      include/asm-sparc/device.h
  41. 5 0
      include/asm-sparc/prom.h
  42. 2 1
      include/asm-sparc64/iommu.h
  43. 17 0
      include/asm-sparc64/mmzone.h
  44. 0 6
      include/asm-sparc64/numnodes.h
  45. 16 2
      include/asm-sparc64/ptrace.h
  46. 1 1
      include/asm-sparc64/sparsemem.h
  47. 71 2
      include/asm-sparc64/topology.h
  48. 7 7
      include/asm-sparc64/ttable.h
  49. 1 1
      lib/lmb.c

+ 1 - 1
Makefile

@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 25
-EXTRAVERSION =
+EXTRAVERSION = -numa
 NAME = Funky Weasel is Jiggy wit it
 
 # *DOCUMENTATION*

+ 0 - 1
arch/sparc/kernel/entry.S

@@ -1409,7 +1409,6 @@ syscall_is_too_hard:
 
 	st	%o0, [%sp + STACKFRAME_SZ + PT_I0]
 
-	.globl	ret_sys_call
 ret_sys_call:
 	ld	[%curptr + TI_FLAGS], %l6
 	cmp	%o0, -ERESTART_RESTARTBLOCK

+ 0 - 5
arch/sparc/kernel/signal.c

@@ -105,11 +105,6 @@ static int _sigpause_common(old_sigset_t set)
 	return -ERESTARTNOHAND;
 }
 
-asmlinkage int sys_sigpause(unsigned int set)
-{
-	return _sigpause_common(set);
-}
-
 asmlinkage int sys_sigsuspend(old_sigset_t set)
 {
 	return _sigpause_common(set);

+ 20 - 0
arch/sparc64/Kconfig

@@ -250,6 +250,26 @@ endchoice
 
 endmenu
 
+config NUMA
+	bool "NUMA support"
+
+config NODES_SHIFT
+	int
+	default "4"
+	depends on NEED_MULTIPLE_NODES
+
+# Some NUMA nodes have memory ranges that span
+# other nodes.  Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node.  See memmap_init_zone()
+# for details.
+config NODES_SPAN_OTHER_NODES
+	def_bool y
+	depends on NEED_MULTIPLE_NODES
+
+config ARCH_POPULATES_NODE_MAP
+	def_bool y
+
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 

+ 64 - 35
arch/sparc64/defconfig

@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.25
-# Sun Apr 20 01:33:21 2008
+# Linux kernel version: 2.6.25-numa
+# Wed Apr 23 04:49:08 2008
 #
 CONFIG_SPARC=y
 CONFIG_SPARC64=y
@@ -152,6 +152,8 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_HUGETLB_PAGE_SIZE_4MB=y
 # CONFIG_HUGETLB_PAGE_SIZE_512K is not set
 # CONFIG_HUGETLB_PAGE_SIZE_64K is not set
+# CONFIG_NUMA is not set
+CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_ARCH_SELECT_MEMORY_MODEL=y
 CONFIG_ARCH_SPARSEMEM_ENABLE=y
 CONFIG_ARCH_SPARSEMEM_DEFAULT=y
@@ -787,7 +789,6 @@ CONFIG_I2C_ALGOBIT=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCF8591 is not set
-# CONFIG_TPS65010 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -869,6 +870,7 @@ CONFIG_SSB_POSSIBLE=y
 # Multifunction device drivers
 #
 # CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
 
 #
 # Multimedia devices
@@ -1219,10 +1221,6 @@ CONFIG_USB_STORAGE=m
 # CONFIG_NEW_LEDS is not set
 # CONFIG_INFINIBAND is not set
 # CONFIG_RTC_CLASS is not set
-
-#
-# Userspace I/O
-#
 # CONFIG_UIO is not set
 
 #
@@ -1399,6 +1397,7 @@ CONFIG_SCHEDSTATS=y
 CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
 # CONFIG_DEBUG_VM is not set
+# CONFIG_DEBUG_WRITECOUNT is not set
 # CONFIG_DEBUG_LIST is not set
 # CONFIG_DEBUG_SG is not set
 # CONFIG_BOOT_PRINTK_DELAY is not set
@@ -1425,53 +1424,82 @@ CONFIG_ASYNC_CORE=m
 CONFIG_ASYNC_MEMCPY=m
 CONFIG_ASYNC_XOR=m
 CONFIG_CRYPTO=y
+
+#
+# Crypto core or helper
+#
 CONFIG_CRYPTO_ALGAPI=y
 CONFIG_CRYPTO_AEAD=y
 CONFIG_CRYPTO_BLKCIPHER=y
-# CONFIG_CRYPTO_SEQIV is not set
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_GF128MUL=m
+CONFIG_CRYPTO_NULL=m
+# CONFIG_CRYPTO_CRYPTD is not set
+CONFIG_CRYPTO_AUTHENC=y
+CONFIG_CRYPTO_TEST=m
+
+#
+# Authenticated Encryption with Associated Data
+#
+# CONFIG_CRYPTO_CCM is not set
+# CONFIG_CRYPTO_GCM is not set
+# CONFIG_CRYPTO_SEQIV is not set
+
+#
+# Block modes
+#
+CONFIG_CRYPTO_CBC=y
+# CONFIG_CRYPTO_CTR is not set
+# CONFIG_CRYPTO_CTS is not set
+CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_LRW=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_XTS=m
+
+#
+# Hash modes
+#
 CONFIG_CRYPTO_HMAC=y
 CONFIG_CRYPTO_XCBC=y
-CONFIG_CRYPTO_NULL=m
+
+#
+# Digest
+#
+CONFIG_CRYPTO_CRC32C=m
 CONFIG_CRYPTO_MD4=y
 CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
 CONFIG_CRYPTO_SHA1=y
 CONFIG_CRYPTO_SHA256=m
 CONFIG_CRYPTO_SHA512=m
-CONFIG_CRYPTO_WP512=m
 CONFIG_CRYPTO_TGR192=m
-CONFIG_CRYPTO_GF128MUL=m
-CONFIG_CRYPTO_ECB=m
-CONFIG_CRYPTO_CBC=y
-CONFIG_CRYPTO_PCBC=m
-CONFIG_CRYPTO_LRW=m
-CONFIG_CRYPTO_XTS=m
-# CONFIG_CRYPTO_CTR is not set
-# CONFIG_CRYPTO_GCM is not set
-# CONFIG_CRYPTO_CCM is not set
-# CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_DES=y
-CONFIG_CRYPTO_FCRYPT=m
-CONFIG_CRYPTO_BLOWFISH=m
-CONFIG_CRYPTO_TWOFISH=m
-CONFIG_CRYPTO_TWOFISH_COMMON=m
-CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_WP512=m
+
+#
+# Ciphers
+#
 CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
-CONFIG_CRYPTO_TEA=m
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_DES=y
+CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
-CONFIG_CRYPTO_ANUBIS=m
-CONFIG_CRYPTO_SEED=m
 # CONFIG_CRYPTO_SALSA20 is not set
+CONFIG_CRYPTO_SEED=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_TWOFISH_COMMON=m
+
+#
+# Compression
+#
 CONFIG_CRYPTO_DEFLATE=y
-CONFIG_CRYPTO_MICHAEL_MIC=m
-CONFIG_CRYPTO_CRC32C=m
-CONFIG_CRYPTO_CAMELLIA=m
-CONFIG_CRYPTO_TEST=m
-CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_LZO is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
@@ -1492,3 +1520,4 @@ CONFIG_PLIST=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_HAVE_LMB=y

+ 1 - 0
arch/sparc64/kernel/ebus.c

@@ -396,6 +396,7 @@ static void __init fill_ebus_device(struct device_node *dp, struct linux_ebus_de
 	sd->op = &dev->ofdev;
 	sd->iommu = dev->bus->ofdev.dev.parent->archdata.iommu;
 	sd->stc = dev->bus->ofdev.dev.parent->archdata.stc;
+	sd->numa_node = dev->bus->ofdev.dev.parent->archdata.numa_node;
 
 	dev->ofdev.node = dp;
 	dev->ofdev.dev.parent = &dev->bus->ofdev.dev;

+ 18 - 19
arch/sparc64/kernel/entry.S

@@ -47,7 +47,7 @@ do_fpdis:
 	ba,pt		%xcc, etrap
 109:	 or		%g7, %lo(109b), %g7
 	add		%g0, %g0, %g0
-	ba,a,pt		%xcc, rtrap_clr_l6
+	ba,a,pt		%xcc, rtrap
 
 1:	TRAP_LOAD_THREAD_REG(%g6, %g1)
 	ldub		[%g6 + TI_FPSAVED], %g5
@@ -226,7 +226,7 @@ fp_other_bounce:
 	call		do_fpother
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl		do_fpother_check_fitos
 	.align		32
@@ -489,7 +489,7 @@ utrap_trap:		/* %g3=handler,%g4=level */
         call		bad_trap
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 invoke_utrap:
 	sllx		%g3, 3, %g3
@@ -607,7 +607,7 @@ __spitfire_cee_trap_continue:
 	call		spitfire_access_error
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	/* This is the trap handler entry point for ECC correctable
 	 * errors.  They are corrected, but we listen for the trap
@@ -686,7 +686,7 @@ __spitfire_data_access_exception_tl1:
 	call		spitfire_data_access_exception_tl1
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 __spitfire_data_access_exception:
 	rdpr		%pstate, %g4
@@ -705,7 +705,7 @@ __spitfire_data_access_exception:
 	call		spitfire_data_access_exception
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl		__spitfire_insn_access_exception
 	.globl		__spitfire_insn_access_exception_tl1
@@ -725,7 +725,7 @@ __spitfire_insn_access_exception_tl1:
 	call		spitfire_insn_access_exception_tl1
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 __spitfire_insn_access_exception:
 	rdpr		%pstate, %g4
@@ -743,7 +743,7 @@ __spitfire_insn_access_exception:
 	call		spitfire_insn_access_exception
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	/* These get patched into the trap table at boot time
 	 * once we know we have a cheetah processor.
@@ -937,7 +937,7 @@ do_dcpe_tl1_fatal:
 	call		cheetah_plus_parity_error
 	 add		%sp, PTREGS_OFF, %o1
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 do_icpe_tl1:
 	rdpr		%tl, %g1		! Save original trap level
@@ -979,7 +979,7 @@ do_icpe_tl1_fatal:
 	call		cheetah_plus_parity_error
 	 add		%sp, PTREGS_OFF, %o1
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 	
 dcpe_icpe_tl1_common:
 	/* Flush D-cache, re-enable D/I caches in DCU and finally
@@ -1281,7 +1281,7 @@ __do_privact:
 	call		do_privact
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl		do_mna
 do_mna:
@@ -1308,7 +1308,7 @@ do_mna:
 	call		mem_address_unaligned
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl		do_lddfmna
 do_lddfmna:
@@ -1326,7 +1326,7 @@ do_lddfmna:
 	call		handle_lddfmna
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl		do_stdfmna
 do_stdfmna:
@@ -1344,7 +1344,7 @@ do_stdfmna:
 	call		handle_stdfmna
 	 add		%sp, PTREGS_OFF, %o0
 	ba,pt		%xcc, rtrap
-	 clr		%l6
+	 nop
 
 	.globl	breakpoint_trap
 breakpoint_trap:
@@ -1424,13 +1424,13 @@ sys32_rt_sigreturn:
 1:		ldx		[%curptr + TI_FLAGS], %l5
 		andcc		%l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %g0
 		be,pt		%icc, rtrap
-		 clr		%l6
+		 nop
 		add		%sp, PTREGS_OFF, %o0
 		call		syscall_trace
 		 mov		1, %o1
 
 		ba,pt		%xcc, rtrap
-		 clr		%l6
+		 nop
 
 	/* This is how fork() was meant to be done, 8 instruction entry.
 	 *
@@ -1559,7 +1559,7 @@ linux_sparc_syscall32:
 
 	/* Linux native system calls enter here... */
 	.align	32
-	.globl	linux_sparc_syscall, ret_sys_call
+	.globl	linux_sparc_syscall
 linux_sparc_syscall:
 	/* Direct access to user regs, much faster. */
 	cmp		%g1, NR_SYSCALLS			! IEU1	Group
@@ -1605,7 +1605,7 @@ ret_sys_call:
 	bne,pn		%icc, linux_syscall_trace2
 	 add		%l1, 0x4, %l2			! npc = npc+4
 	stx		%l1, [%sp + PTREGS_OFF + PT_V9_TPC]
-	ba,pt		%xcc, rtrap_clr_l6
+	ba,pt		%xcc, rtrap
 	 stx		%l2, [%sp + PTREGS_OFF + PT_V9_TNPC]
 
 1:
@@ -1616,7 +1616,6 @@ ret_sys_call:
 	sub		%g0, %o0, %o0
 	or		%g3, %g2, %g3
 	stx		%o0, [%sp + PTREGS_OFF + PT_V9_I0]
-	mov		1, %l6
 	stx		%g3, [%sp + PTREGS_OFF + PT_V9_TSTATE]
 	bne,pn		%icc, linux_syscall_trace2
 	 add		%l1, 0x4, %l2			! npc = npc+4

+ 0 - 1
arch/sparc64/kernel/entry.h

@@ -20,7 +20,6 @@ extern void timer_interrupt(int irq, struct pt_regs *regs);
 
 extern void do_notify_resume(struct pt_regs *regs,
 			     unsigned long orig_i0,
-			     int restart_syscall,
 			     unsigned long thread_info_flags);
 
 extern asmlinkage void syscall_trace(struct pt_regs *regs,

+ 4 - 0
arch/sparc64/kernel/etrap.S

@@ -53,7 +53,11 @@ etrap_irq:
 		stx	%g3, [%g2 + STACKFRAME_SZ + PT_V9_TPC]
 		rd	%y, %g3
 		stx	%g1, [%g2 + STACKFRAME_SZ + PT_V9_TNPC]
+		rdpr	%tt, %g1
 		st	%g3, [%g2 + STACKFRAME_SZ + PT_V9_Y]
+		sethi	%hi(PT_REGS_MAGIC), %g3
+		or	%g3, %g1, %g1
+		st	%g1, [%g2 + STACKFRAME_SZ + PT_V9_MAGIC]
 
 		rdpr	%cansave, %g1
 		brnz,pt %g1, etrap_save

+ 21 - 12
arch/sparc64/kernel/iommu.c

@@ -173,9 +173,11 @@ void iommu_range_free(struct iommu *iommu, dma_addr_t dma_addr, unsigned long np
 }
 
 int iommu_table_init(struct iommu *iommu, int tsbsize,
-		     u32 dma_offset, u32 dma_addr_mask)
+		     u32 dma_offset, u32 dma_addr_mask,
+		     int numa_node)
 {
-	unsigned long i, tsbbase, order, sz, num_tsb_entries;
+	unsigned long i, order, sz, num_tsb_entries;
+	struct page *page;
 
 	num_tsb_entries = tsbsize / sizeof(iopte_t);
 
@@ -188,11 +190,12 @@ int iommu_table_init(struct iommu *iommu, int tsbsize,
 	/* Allocate and initialize the free area map.  */
 	sz = num_tsb_entries / 8;
 	sz = (sz + 7UL) & ~7UL;
-	iommu->arena.map = kzalloc(sz, GFP_KERNEL);
+	iommu->arena.map = kmalloc_node(sz, GFP_KERNEL, numa_node);
 	if (!iommu->arena.map) {
 		printk(KERN_ERR "IOMMU: Error, kmalloc(arena.map) failed.\n");
 		return -ENOMEM;
 	}
+	memset(iommu->arena.map, 0, sz);
 	iommu->arena.limit = num_tsb_entries;
 
 	if (tlb_type != hypervisor)
@@ -201,21 +204,23 @@ int iommu_table_init(struct iommu *iommu, int tsbsize,
 	/* Allocate and initialize the dummy page which we
 	 * set inactive IO PTEs to point to.
 	 */
-	iommu->dummy_page = get_zeroed_page(GFP_KERNEL);
-	if (!iommu->dummy_page) {
+	page = alloc_pages_node(numa_node, GFP_KERNEL, 0);
+	if (!page) {
 		printk(KERN_ERR "IOMMU: Error, gfp(dummy_page) failed.\n");
 		goto out_free_map;
 	}
+	iommu->dummy_page = (unsigned long) page_address(page);
+	memset((void *)iommu->dummy_page, 0, PAGE_SIZE);
 	iommu->dummy_page_pa = (unsigned long) __pa(iommu->dummy_page);
 
 	/* Now allocate and setup the IOMMU page table itself.  */
 	order = get_order(tsbsize);
-	tsbbase = __get_free_pages(GFP_KERNEL, order);
-	if (!tsbbase) {
+	page = alloc_pages_node(numa_node, GFP_KERNEL, order);
+	if (!page) {
 		printk(KERN_ERR "IOMMU: Error, gfp(tsb) failed.\n");
 		goto out_free_dummy_page;
 	}
-	iommu->page_table = (iopte_t *)tsbbase;
+	iommu->page_table = (iopte_t *)page_address(page);
 
 	for (i = 0; i < num_tsb_entries; i++)
 		iopte_make_dummy(iommu, &iommu->page_table[i]);
@@ -276,20 +281,24 @@ static inline void iommu_free_ctx(struct iommu *iommu, int ctx)
 static void *dma_4u_alloc_coherent(struct device *dev, size_t size,
 				   dma_addr_t *dma_addrp, gfp_t gfp)
 {
+	unsigned long flags, order, first_page;
 	struct iommu *iommu;
+	struct page *page;
+	int npages, nid;
 	iopte_t *iopte;
-	unsigned long flags, order, first_page;
 	void *ret;
-	int npages;
 
 	size = IO_PAGE_ALIGN(size);
 	order = get_order(size);
 	if (order >= 10)
 		return NULL;
 
-	first_page = __get_free_pages(gfp, order);
-	if (first_page == 0UL)
+	nid = dev->archdata.numa_node;
+	page = alloc_pages_node(nid, gfp, order);
+	if (unlikely(!page))
 		return NULL;
+
+	first_page = (unsigned long) page_address(page);
 	memset((char *)first_page, 0, PAGE_SIZE << order);
 
 	iommu = dev->archdata.iommu;

+ 1 - 0
arch/sparc64/kernel/isa.c

@@ -92,6 +92,7 @@ static void __init isa_fill_devices(struct sparc_isa_bridge *isa_br)
 		sd->op = &isa_dev->ofdev;
 		sd->iommu = isa_br->ofdev.dev.parent->archdata.iommu;
 		sd->stc = isa_br->ofdev.dev.parent->archdata.stc;
+		sd->numa_node = isa_br->ofdev.dev.parent->archdata.numa_node;
 
 		isa_dev->ofdev.node = dp;
 		isa_dev->ofdev.dev.parent = &isa_br->ofdev.dev;

+ 16 - 12
arch/sparc64/kernel/mdesc.c

@@ -1,10 +1,10 @@
 /* mdesc.c: Sun4V machine description handling.
  *
- * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
  */
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/bootmem.h>
+#include <linux/lmb.h>
 #include <linux/log2.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -84,24 +84,28 @@ static void mdesc_handle_init(struct mdesc_handle *hp,
 	hp->handle_size = handle_size;
 }
 
-static struct mdesc_handle * __init mdesc_bootmem_alloc(unsigned int mdesc_size)
+static struct mdesc_handle * __init mdesc_lmb_alloc(unsigned int mdesc_size)
 {
-	struct mdesc_handle *hp;
 	unsigned int handle_size, alloc_size;
+	struct mdesc_handle *hp;
+	unsigned long paddr;
 
 	handle_size = (sizeof(struct mdesc_handle) -
 		       sizeof(struct mdesc_hdr) +
 		       mdesc_size);
 	alloc_size = PAGE_ALIGN(handle_size);
 
-	hp = __alloc_bootmem(alloc_size, PAGE_SIZE, 0UL);
-	if (hp)
-		mdesc_handle_init(hp, handle_size, hp);
+	paddr = lmb_alloc(alloc_size, PAGE_SIZE);
 
+	hp = NULL;
+	if (paddr) {
+		hp = __va(paddr);
+		mdesc_handle_init(hp, handle_size, hp);
+	}
 	return hp;
 }
 
-static void mdesc_bootmem_free(struct mdesc_handle *hp)
+static void mdesc_lmb_free(struct mdesc_handle *hp)
 {
 	unsigned int alloc_size, handle_size = hp->handle_size;
 	unsigned long start, end;
@@ -124,9 +128,9 @@ static void mdesc_bootmem_free(struct mdesc_handle *hp)
 	}
 }
 
-static struct mdesc_mem_ops bootmem_mdesc_ops = {
-	.alloc = mdesc_bootmem_alloc,
-	.free  = mdesc_bootmem_free,
+static struct mdesc_mem_ops lmb_mdesc_ops = {
+	.alloc = mdesc_lmb_alloc,
+	.free  = mdesc_lmb_free,
 };
 
 static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
@@ -888,7 +892,7 @@ void __init sun4v_mdesc_init(void)
 
 	printk("MDESC: Size is %lu bytes.\n", len);
 
-	hp = mdesc_alloc(len, &bootmem_mdesc_ops);
+	hp = mdesc_alloc(len, &lmb_mdesc_ops);
 	if (hp == NULL) {
 		prom_printf("MDESC: alloc of %lu bytes failed.\n", len);
 		prom_halt();

+ 11 - 1
arch/sparc64/kernel/of_device.c

@@ -6,6 +6,7 @@
 #include <linux/mod_devicetable.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
+#include <linux/irq.h>
 #include <linux/of_device.h>
 #include <linux/of_platform.h>
 
@@ -660,6 +661,7 @@ static unsigned int __init build_one_device_irq(struct of_device *op,
 	struct device_node *dp = op->node;
 	struct device_node *pp, *ip;
 	unsigned int orig_irq = irq;
+	int nid;
 
 	if (irq == 0xffffffff)
 		return irq;
@@ -672,7 +674,7 @@ static unsigned int __init build_one_device_irq(struct of_device *op,
 			printk("%s: direct translate %x --> %x\n",
 			       dp->full_name, orig_irq, irq);
 
-		return irq;
+		goto out;
 	}
 
 	/* Something more complicated.  Walk up to the root, applying
@@ -744,6 +746,14 @@ static unsigned int __init build_one_device_irq(struct of_device *op,
 		printk("%s: Apply IRQ trans [%s] %x --> %x\n",
 		       op->node->full_name, ip->full_name, orig_irq, irq);
 
+out:
+	nid = of_node_to_nid(dp);
+	if (nid != -1) {
+		cpumask_t numa_mask = node_to_cpumask(nid);
+
+		irq_set_affinity(irq, numa_mask);
+	}
+
 	return irq;
 }
 

+ 12 - 0
arch/sparc64/kernel/pci.c

@@ -369,10 +369,12 @@ struct pci_dev *of_create_pci_dev(struct pci_pbm_info *pbm,
 	sd->host_controller = pbm;
 	sd->prom_node = node;
 	sd->op = of_find_device_by_node(node);
+	sd->numa_node = pbm->numa_node;
 
 	sd = &sd->op->dev.archdata;
 	sd->iommu = pbm->iommu;
 	sd->stc = &pbm->stc;
+	sd->numa_node = pbm->numa_node;
 
 	type = of_get_property(node, "device_type", NULL);
 	if (type == NULL)
@@ -1159,6 +1161,16 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+int pcibus_to_node(struct pci_bus *pbus)
+{
+	struct pci_pbm_info *pbm = pbus->sysdata;
+
+	return pbm->numa_node;
+}
+EXPORT_SYMBOL(pcibus_to_node);
+#endif
+
 /* Return the domain nuber for this pci bus */
 
 int pci_domain_nr(struct pci_bus *pbus)

+ 4 - 1
arch/sparc64/kernel/pci_fire.c

@@ -71,7 +71,8 @@ static int pci_fire_pbm_iommu_init(struct pci_pbm_info *pbm)
 	 */
 	fire_write(iommu->iommu_flushinv, ~(u64)0);
 
-	err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask);
+	err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask,
+			       pbm->numa_node);
 	if (err)
 		return err;
 
@@ -449,6 +450,8 @@ static int __init pci_fire_pbm_init(struct pci_controller_info *p,
 	pbm->next = pci_pbm_root;
 	pci_pbm_root = pbm;
 
+	pbm->numa_node = -1;
+
 	pbm->scan_bus = pci_fire_scan_bus;
 	pbm->pci_ops = &sun4u_pci_ops;
 	pbm->config_space_reg_bits = 12;

+ 2 - 2
arch/sparc64/kernel/pci_impl.h

@@ -148,6 +148,8 @@ struct pci_pbm_info {
 	struct pci_bus			*pci_bus;
 	void (*scan_bus)(struct pci_pbm_info *);
 	struct pci_ops			*pci_ops;
+
+	int				numa_node;
 };
 
 struct pci_controller_info {
@@ -161,8 +163,6 @@ extern struct pci_pbm_info *pci_pbm_root;
 extern int pci_num_pbms;
 
 /* PCI bus scanning and fixup support. */
-extern void pci_iommu_table_init(struct iommu *iommu, int tsbsize,
-				 u32 dma_offset, u32 dma_addr_mask);
 extern void pci_get_pbm_props(struct pci_pbm_info *pbm);
 extern struct pci_bus *pci_scan_one_pbm(struct pci_pbm_info *pbm);
 extern void pci_determine_mem_io_space(struct pci_pbm_info *pbm);

+ 7 - 1
arch/sparc64/kernel/pci_msi.c

@@ -279,11 +279,17 @@ static int bringup_one_msi_queue(struct pci_pbm_info *pbm,
 				 unsigned long devino)
 {
 	int irq = ops->msiq_build_irq(pbm, msiqid, devino);
-	int err;
+	int err, nid;
 
 	if (irq < 0)
 		return irq;
 
+	nid = pbm->numa_node;
+	if (nid != -1) {
+		cpumask_t numa_mask = node_to_cpumask(nid);
+
+		irq_set_affinity(irq, numa_mask);
+	}
 	err = request_irq(irq, sparc64_msiq_interrupt, 0,
 			  "MSIQ",
 			  &pbm->msiq_irq_cookies[msiqid - pbm->msiq_first]);

+ 4 - 1
arch/sparc64/kernel/pci_psycho.c

@@ -848,7 +848,8 @@ static int psycho_iommu_init(struct pci_pbm_info *pbm)
 	/* Leave diag mode enabled for full-flushing done
 	 * in pci_iommu.c
 	 */
-	err = iommu_table_init(iommu, IO_TSB_SIZE, 0xc0000000, 0xffffffff);
+	err = iommu_table_init(iommu, IO_TSB_SIZE, 0xc0000000, 0xffffffff,
+			       pbm->numa_node);
 	if (err)
 		return err;
 
@@ -979,6 +980,8 @@ static void __init psycho_pbm_init(struct pci_controller_info *p,
 	pbm->next = pci_pbm_root;
 	pci_pbm_root = pbm;
 
+	pbm->numa_node = -1;
+
 	pbm->scan_bus = psycho_scan_bus;
 	pbm->pci_ops = &sun4u_pci_ops;
 	pbm->config_space_reg_bits = 8;

+ 3 - 1
arch/sparc64/kernel/pci_sabre.c

@@ -704,7 +704,7 @@ static int sabre_iommu_init(struct pci_pbm_info *pbm,
 	 * in pci_iommu.c
 	 */
 	err = iommu_table_init(iommu, tsbsize * 1024 * 8,
-			       dvma_offset, dma_mask);
+			       dvma_offset, dma_mask, pbm->numa_node);
 	if (err)
 		return err;
 
@@ -737,6 +737,8 @@ static void __init sabre_pbm_init(struct pci_controller_info *p,
 	pbm->name = dp->full_name;
 	printk("%s: SABRE PCI Bus Module\n", pbm->name);
 
+	pbm->numa_node = -1;
+
 	pbm->scan_bus = sabre_scan_bus;
 	pbm->pci_ops = &sun4u_pci_ops;
 	pbm->config_space_reg_bits = 8;

+ 4 - 1
arch/sparc64/kernel/pci_schizo.c

@@ -1220,7 +1220,8 @@ static int schizo_pbm_iommu_init(struct pci_pbm_info *pbm)
 	/* Leave diag mode enabled for full-flushing done
 	 * in pci_iommu.c
 	 */
-	err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask);
+	err = iommu_table_init(iommu, tsbsize * 8 * 1024, vdma[0], dma_mask,
+			       pbm->numa_node);
 	if (err)
 		return err;
 
@@ -1379,6 +1380,8 @@ static int __init schizo_pbm_init(struct pci_controller_info *p,
 	pbm->next = pci_pbm_root;
 	pci_pbm_root = pbm;
 
+	pbm->numa_node = -1;
+
 	pbm->scan_bus = schizo_scan_bus;
 	pbm->pci_ops = &sun4u_pci_ops;
 	pbm->config_space_reg_bits = 8;

+ 10 - 3
arch/sparc64/kernel/pci_sun4v.c

@@ -127,10 +127,12 @@ static inline long iommu_batch_end(void)
 static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 				   dma_addr_t *dma_addrp, gfp_t gfp)
 {
-	struct iommu *iommu;
 	unsigned long flags, order, first_page, npages, n;
+	struct iommu *iommu;
+	struct page *page;
 	void *ret;
 	long entry;
+	int nid;
 
 	size = IO_PAGE_ALIGN(size);
 	order = get_order(size);
@@ -139,10 +141,12 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size,
 
 	npages = size >> IO_PAGE_SHIFT;
 
-	first_page = __get_free_pages(gfp, order);
-	if (unlikely(first_page == 0UL))
+	nid = dev->archdata.numa_node;
+	page = alloc_pages_node(nid, gfp, order);
+	if (unlikely(!page))
 		return NULL;
 
+	first_page = (unsigned long) page_address(page);
 	memset((char *)first_page, 0, PAGE_SIZE << order);
 
 	iommu = dev->archdata.iommu;
@@ -899,6 +903,8 @@ static void __init pci_sun4v_pbm_init(struct pci_controller_info *p,
 	pbm->next = pci_pbm_root;
 	pci_pbm_root = pbm;
 
+	pbm->numa_node = of_node_to_nid(dp);
+
 	pbm->scan_bus = pci_sun4v_scan_bus;
 	pbm->pci_ops = &sun4v_pci_ops;
 	pbm->config_space_reg_bits = 12;
@@ -913,6 +919,7 @@ static void __init pci_sun4v_pbm_init(struct pci_controller_info *p,
 	pbm->name = dp->full_name;
 
 	printk("%s: SUN4V PCI Bus Module\n", pbm->name);
+	printk("%s: On NUMA node %d\n", pbm->name, pbm->numa_node);
 
 	pci_determine_mem_io_space(pbm);
 

+ 9 - 5
arch/sparc64/kernel/prom.c

@@ -19,8 +19,8 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/mm.h>
-#include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/lmb.h>
 
 #include <asm/prom.h>
 #include <asm/of_device.h>
@@ -122,16 +122,20 @@ int of_find_in_proplist(const char *list, const char *match, int len)
 }
 EXPORT_SYMBOL(of_find_in_proplist);
 
-static unsigned int prom_early_allocated;
+static unsigned int prom_early_allocated __initdata;
 
 static void * __init prom_early_alloc(unsigned long size)
 {
+	unsigned long paddr = lmb_alloc(size, SMP_CACHE_BYTES);
 	void *ret;
 
-	ret = __alloc_bootmem(size, SMP_CACHE_BYTES, 0UL);
-	if (ret != NULL)
-		memset(ret, 0, size);
+	if (!paddr) {
+		prom_printf("prom_early_alloc(%lu) failed\n");
+		prom_halt();
+	}
 
+	ret = __va(paddr);
+	memset(ret, 0, size);
 	prom_early_allocated += size;
 
 	return ret;

+ 4 - 17
arch/sparc64/kernel/rtrap.S

@@ -18,12 +18,6 @@
 #define		RTRAP_PSTATE_IRQOFF	(PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV)
 #define		RTRAP_PSTATE_AG_IRQOFF	(PSTATE_RMO|PSTATE_PEF|PSTATE_PRIV|PSTATE_AG)
 
-		/* Register %l6 keeps track of whether we are returning
-		 * from a system call or not.  It is cleared if we call
-		 * do_notify_resume, and it must not be otherwise modified
-		 * until we fully commit to returning to userspace.
-		 */
-
 		.text
 		.align			32
 __handle_softirq:
@@ -56,14 +50,12 @@ __handle_user_windows:
 		be,pt			%xcc, __handle_user_windows_continue
 		 nop
 		mov			%l5, %o1
-		mov			%l6, %o2
 		add			%sp, PTREGS_OFF, %o0
-		mov			%l0, %o3
+		mov			%l0, %o2
 
 		call			do_notify_resume
 		 wrpr			%g0, RTRAP_PSTATE, %pstate
 		wrpr			%g0, RTRAP_PSTATE_IRQOFF, %pstate
-		clr			%l6
 		/* Signal delivery can modify pt_regs tstate, so we must
 		 * reload it.
 		 */
@@ -99,14 +91,12 @@ __handle_perfctrs:
 		be,pt			%xcc, __handle_perfctrs_continue
 		 sethi			%hi(TSTATE_PEF), %o0
 		mov			%l5, %o1
-		mov			%l6, %o2
 		add			%sp, PTREGS_OFF, %o0
-		mov			%l0, %o3
+		mov			%l0, %o2
 		call			do_notify_resume
 
 		 wrpr			%g0, RTRAP_PSTATE, %pstate
 		wrpr			%g0, RTRAP_PSTATE_IRQOFF, %pstate
-		clr			%l6
 		/* Signal delivery can modify pt_regs tstate, so we must
 		 * reload it.
 		 */
@@ -127,13 +117,11 @@ __handle_userfpu:
 
 __handle_signal:
 		mov			%l5, %o1
-		mov			%l6, %o2
 		add			%sp, PTREGS_OFF, %o0
-		mov			%l0, %o3
+		mov			%l0, %o2
 		call			do_notify_resume
 		 wrpr			%g0, RTRAP_PSTATE, %pstate
 		wrpr			%g0, RTRAP_PSTATE_IRQOFF, %pstate
-		clr			%l6
 
 		/* Signal delivery can modify pt_regs tstate, so we must
 		 * reload it.
@@ -145,9 +133,8 @@ __handle_signal:
 		 andn			%l1, %l4, %l1
 
 		.align			64
-		.globl			rtrap_irq, rtrap_clr_l6, rtrap, irqsz_patchme, rtrap_xcall
+		.globl			rtrap_irq, rtrap, irqsz_patchme, rtrap_xcall
 rtrap_irq:
-rtrap_clr_l6:	clr			%l6
 rtrap:
 #ifndef CONFIG_SMP
 		sethi			%hi(per_cpu____cpu_data), %l0

+ 2 - 1
arch/sparc64/kernel/sbus.c

@@ -544,6 +544,7 @@ static void __init sbus_iommu_init(int __node, struct sbus_bus *sbus)
 
 	sbus->ofdev.dev.archdata.iommu = iommu;
 	sbus->ofdev.dev.archdata.stc = strbuf;
+	sbus->ofdev.dev.archdata.numa_node = -1;
 
 	reg_base = regs + SYSIO_IOMMUREG_BASE;
 	iommu->iommu_control = reg_base + IOMMU_CONTROL;
@@ -575,7 +576,7 @@ static void __init sbus_iommu_init(int __node, struct sbus_bus *sbus)
 	       sbus->portid, regs);
 
 	/* Setup for TSB_SIZE=7, TBW_SIZE=0, MMU_DE=1, MMU_EN=1 */
-	if (iommu_table_init(iommu, IO_TSB_SIZE, MAP_BASE, 0xffffffff))
+	if (iommu_table_init(iommu, IO_TSB_SIZE, MAP_BASE, 0xffffffff, -1))
 		goto fatal_memory_error;
 
 	control = upa_readq(iommu->iommu_control);

+ 2 - 1
arch/sparc64/kernel/setup.c

@@ -82,7 +82,7 @@ unsigned long cmdline_memory_size = 0;
 static struct console prom_early_console = {
 	.name =		"earlyprom",
 	.write =	prom_console_write,
-	.flags =	CON_PRINTBUFFER | CON_BOOT,
+	.flags =	CON_PRINTBUFFER | CON_BOOT | CON_ANYTIME,
 	.index =	-1,
 };
 
@@ -281,6 +281,7 @@ void __init setup_arch(char **cmdline_p)
 	/* Initialize PROM console and command line. */
 	*cmdline_p = prom_getbootargs();
 	strcpy(boot_command_line, *cmdline_p);
+	parse_early_param();
 
 	boot_flags_init(*cmdline_p);
 	register_console(&prom_early_console);

+ 14 - 11
arch/sparc64/kernel/signal.c

@@ -510,15 +510,20 @@ static inline void syscall_restart(unsigned long orig_i0, struct pt_regs *regs,
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void do_signal(struct pt_regs *regs, unsigned long orig_i0, int restart_syscall)
+static void do_signal(struct pt_regs *regs, unsigned long orig_i0)
 {
-	siginfo_t info;
 	struct signal_deliver_cookie cookie;
 	struct k_sigaction ka;
-	int signr;
 	sigset_t *oldset;
+	siginfo_t info;
+	int signr, tt;
 	
-	cookie.restart_syscall = restart_syscall;
+	tt = regs->magic & 0x1ff;
+	if (tt == 0x110 || tt == 0x111 || tt == 0x16d) {
+		regs->magic &= ~0x1ff;
+		cookie.restart_syscall = 1;
+	} else
+		cookie.restart_syscall = 0;
 	cookie.orig_i0 = orig_i0;
 
 	if (test_thread_flag(TIF_RESTORE_SIGMASK))
@@ -529,9 +534,8 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0, int restart_s
 #ifdef CONFIG_SPARC32_COMPAT
 	if (test_thread_flag(TIF_32BIT)) {
 		extern void do_signal32(sigset_t *, struct pt_regs *,
-					unsigned long, int);
-		do_signal32(oldset, regs, orig_i0,
-			    cookie.restart_syscall);
+					struct signal_deliver_cookie *);
+		do_signal32(oldset, regs, &cookie);
 		return;
 	}
 #endif	
@@ -539,7 +543,7 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0, int restart_s
 	signr = get_signal_to_deliver(&info, &ka, regs, &cookie);
 	if (signr > 0) {
 		if (cookie.restart_syscall)
-			syscall_restart(orig_i0, regs, &ka.sa);
+			syscall_restart(cookie.orig_i0, regs, &ka.sa);
 		handle_signal(signr, &ka, &info, oldset, regs);
 
 		/* a signal was successfully delivered; the saved
@@ -576,11 +580,10 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0, int restart_s
 	}
 }
 
-void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, int restart_syscall,
-		      unsigned long thread_info_flags)
+void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags)
 {
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
-		do_signal(regs, orig_i0, restart_syscall);
+		do_signal(regs, orig_i0);
 }
 
 void ptrace_signal_deliver(struct pt_regs *regs, void *cookie)

+ 8 - 12
arch/sparc64/kernel/signal32.c

@@ -982,20 +982,16 @@ static inline void syscall_restart32(unsigned long orig_i0, struct pt_regs *regs
  * mistake.
  */
 void do_signal32(sigset_t *oldset, struct pt_regs * regs,
-		 unsigned long orig_i0, int restart_syscall)
+		 struct signal_deliver_cookie *cookie)
 {
-	siginfo_t info;
-	struct signal_deliver_cookie cookie;
 	struct k_sigaction ka;
+	siginfo_t info;
 	int signr;
 	
-	cookie.restart_syscall = restart_syscall;
-	cookie.orig_i0 = orig_i0;
-
-	signr = get_signal_to_deliver(&info, &ka, regs, &cookie);
+	signr = get_signal_to_deliver(&info, &ka, regs, cookie);
 	if (signr > 0) {
-		if (cookie.restart_syscall)
-			syscall_restart32(orig_i0, regs, &ka.sa);
+		if (cookie->restart_syscall)
+			syscall_restart32(cookie->orig_i0, regs, &ka.sa);
 		handle_signal32(signr, &ka, &info, oldset, regs);
 
 		/* a signal was successfully delivered; the saved
@@ -1007,16 +1003,16 @@ void do_signal32(sigset_t *oldset, struct pt_regs * regs,
 			clear_thread_flag(TIF_RESTORE_SIGMASK);
 		return;
 	}
-	if (cookie.restart_syscall &&
+	if (cookie->restart_syscall &&
 	    (regs->u_regs[UREG_I0] == ERESTARTNOHAND ||
 	     regs->u_regs[UREG_I0] == ERESTARTSYS ||
 	     regs->u_regs[UREG_I0] == ERESTARTNOINTR)) {
 		/* replay the system call when we are done */
-		regs->u_regs[UREG_I0] = cookie.orig_i0;
+		regs->u_regs[UREG_I0] = cookie->orig_i0;
 		regs->tpc -= 4;
 		regs->tnpc -= 4;
 	}
-	if (cookie.restart_syscall &&
+	if (cookie->restart_syscall &&
 	    regs->u_regs[UREG_I0] == ERESTART_RESTARTBLOCK) {
 		regs->u_regs[UREG_G1] = __NR_restart_syscall;
 		regs->tpc -= 4;

+ 8 - 3
arch/sparc64/kernel/smp.c

@@ -20,7 +20,7 @@
 #include <linux/cache.h>
 #include <linux/jiffies.h>
 #include <linux/profile.h>
-#include <linux/bootmem.h>
+#include <linux/lmb.h>
 
 #include <asm/head.h>
 #include <asm/ptrace.h>
@@ -1431,7 +1431,7 @@ EXPORT_SYMBOL(__per_cpu_shift);
 
 void __init real_setup_per_cpu_areas(void)
 {
-	unsigned long goal, size, i;
+	unsigned long paddr, goal, size, i;
 	char *ptr;
 
 	/* Copy section for each CPU (we discard the original) */
@@ -1441,8 +1441,13 @@ void __init real_setup_per_cpu_areas(void)
 	for (size = PAGE_SIZE; size < goal; size <<= 1UL)
 		__per_cpu_shift++;
 
-	ptr = alloc_bootmem_pages(size * NR_CPUS);
+	paddr = lmb_alloc(size * NR_CPUS, PAGE_SIZE);
+	if (!paddr) {
+		prom_printf("Cannot allocate per-cpu memory.\n");
+		prom_halt();
+	}
 
+	ptr = __va(paddr);
 	__per_cpu_base = ptr - __per_cpu_start;
 
 	for (i = 0; i < NR_CPUS; i++, ptr += size)

+ 0 - 2
arch/sparc64/kernel/sparc64_ksyms.c

@@ -68,8 +68,6 @@ extern void *__memscan_zero(void *, size_t);
 extern void *__memscan_generic(void *, int, size_t);
 extern int __memcmp(const void *, const void *, __kernel_size_t);
 extern __kernel_size_t strlen(const char *);
-extern void linux_sparc_syscall(void);
-extern void rtrap(void);
 extern void show_regs(struct pt_regs *);
 extern void syscall_trace(struct pt_regs *, int);
 extern void sys_sigsuspend(void);

+ 13 - 3
arch/sparc64/kernel/stacktrace.c

@@ -20,6 +20,8 @@ void save_stack_trace(struct stack_trace *trace)
 	thread_base = (unsigned long) tp;
 	do {
 		struct reg_window *rw;
+		struct pt_regs *regs;
+		unsigned long pc;
 
 		/* Bogus frame pointer? */
 		if (fp < (thread_base + sizeof(struct thread_info)) ||
@@ -27,11 +29,19 @@ void save_stack_trace(struct stack_trace *trace)
 			break;
 
 		rw = (struct reg_window *) fp;
+		regs = (struct pt_regs *) (rw + 1);
+
+		if ((regs->magic & ~0x1ff) == PT_REGS_MAGIC) {
+			pc = regs->tpc;
+			fp = regs->u_regs[UREG_I6] + STACK_BIAS;
+		} else {
+			pc = rw->ins[7];
+			fp = rw->ins[6] + STACK_BIAS;
+		}
+
 		if (trace->skip > 0)
 			trace->skip--;
 		else
-			trace->entries[trace->nr_entries++] = rw->ins[7];
-
-		fp = rw->ins[6] + STACK_BIAS;
+			trace->entries[trace->nr_entries++] = pc;
 	} while (trace->nr_entries < trace->max_entries);
 }

+ 8 - 8
arch/sparc64/kernel/sun4v_tlb_miss.S

@@ -262,7 +262,7 @@ sun4v_iacc:
 	mov	%l5, %o2
 	call	sun4v_insn_access_exception
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Instruction Access Exception, tl1. */
 sun4v_iacc_tl1:
@@ -278,7 +278,7 @@ sun4v_iacc_tl1:
 	mov	%l5, %o2
 	call	sun4v_insn_access_exception_tl1
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Data Access Exception, tl0. */
 sun4v_dacc:
@@ -294,7 +294,7 @@ sun4v_dacc:
 	mov	%l5, %o2
 	call	sun4v_data_access_exception
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Data Access Exception, tl1. */
 sun4v_dacc_tl1:
@@ -310,7 +310,7 @@ sun4v_dacc_tl1:
 	mov	%l5, %o2
 	call	sun4v_data_access_exception_tl1
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Memory Address Unaligned.  */
 sun4v_mna:
@@ -344,7 +344,7 @@ sun4v_mna:
 	mov	%l5, %o2
 	call	sun4v_do_mna
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Privileged Action.  */
 sun4v_privact:
@@ -352,7 +352,7 @@ sun4v_privact:
 	 rd	%pc, %g7
 	call	do_privact
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Unaligned ldd float, tl0. */
 sun4v_lddfmna:
@@ -368,7 +368,7 @@ sun4v_lddfmna:
 	mov	%l5, %o2
 	call	handle_lddfmna
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	/* Unaligned std float, tl0. */
 sun4v_stdfmna:
@@ -384,7 +384,7 @@ sun4v_stdfmna:
 	mov	%l5, %o2
 	call	handle_stdfmna
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 #define BRANCH_ALWAYS	0x10680000
 #define NOP		0x01000000

+ 12 - 0
arch/sparc64/kernel/sysfs.c

@@ -273,10 +273,22 @@ static void __init check_mmu_stats(void)
 		mmu_stats_supported = 1;
 }
 
+static void register_nodes(void)
+{
+#ifdef CONFIG_NUMA
+	int i;
+
+	for (i = 0; i < MAX_NUMNODES; i++)
+		register_one_node(i);
+#endif
+}
+
 static int __init topology_init(void)
 {
 	int cpu;
 
+	register_nodes();
+
 	check_mmu_stats();
 
 	register_cpu_notifier(&sysfs_cpu_nb);

+ 15 - 4
arch/sparc64/kernel/traps.c

@@ -2091,9 +2091,8 @@ static void user_instruction_dump(unsigned int __user *pc)
 
 void show_stack(struct task_struct *tsk, unsigned long *_ksp)
 {
-	unsigned long pc, fp, thread_base, ksp;
+	unsigned long fp, thread_base, ksp;
 	struct thread_info *tp;
-	struct reg_window *rw;
 	int count = 0;
 
 	ksp = (unsigned long) _ksp;
@@ -2117,15 +2116,27 @@ void show_stack(struct task_struct *tsk, unsigned long *_ksp)
 	printk("\n");
 #endif
 	do {
+		struct reg_window *rw;
+		struct pt_regs *regs;
+		unsigned long pc;
+
 		/* Bogus frame pointer? */
 		if (fp < (thread_base + sizeof(struct thread_info)) ||
 		    fp >= (thread_base + THREAD_SIZE))
 			break;
 		rw = (struct reg_window *)fp;
-		pc = rw->ins[7];
+		regs = (struct pt_regs *) (rw + 1);
+
+		if ((regs->magic & ~0x1ff) == PT_REGS_MAGIC) {
+			pc = regs->tpc;
+			fp = regs->u_regs[UREG_I6] + STACK_BIAS;
+		} else {
+			pc = rw->ins[7];
+			fp = rw->ins[6] + STACK_BIAS;
+		}
+
 		printk(" [%016lx] ", pc);
 		print_symbol("%s\n", pc);
-		fp = rw->ins[6] + STACK_BIAS;
 	} while (++count < 16);
 #ifndef CONFIG_KALLSYMS
 	printk("\n");

+ 1 - 1
arch/sparc64/kernel/tsb.S

@@ -275,7 +275,7 @@ sparc64_realfault_common:
 	stx	%l5, [%g6 + TI_FAULT_ADDR]	! Save fault address
 	call	do_sparc64_fault		! Call fault handler
 	 add	%sp, PTREGS_OFF, %o0		! Compute pt_regs arg
-	ba,pt	%xcc, rtrap_clr_l6		! Restore cpu state
+	ba,pt	%xcc, rtrap			! Restore cpu state
 	 nop					! Delay slot (fill me)
 
 winfix_trampoline:

+ 6 - 6
arch/sparc64/kernel/winfixup.S

@@ -32,7 +32,7 @@ fill_fixup:
 	 rd	%pc, %g7
 	call	do_sparc64_fault
 	 add	%sp, PTREGS_OFF, %o0
-	ba,pt	%xcc, rtrap_clr_l6
+	ba,pt	%xcc, rtrap
 	 nop
 
 	/* Be very careful about usage of the trap globals here.
@@ -100,7 +100,7 @@ spill_fixup_dax:
 	 rd	%pc, %g7
 	call	do_sparc64_fault
 	 add	%sp, PTREGS_OFF, %o0
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 winfix_mna:
 	andn	%g3, 0x7f, %g3
@@ -122,12 +122,12 @@ fill_fixup_mna:
 	mov	%l4, %o2
 	call	sun4v_do_mna
 	 mov	%l5, %o1
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 1:	mov	%l4, %o1
 	mov	%l5, %o2
 	call	mem_address_unaligned
 	 nop
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 winfix_dax:
 	andn	%g3, 0x7f, %g3
@@ -150,7 +150,7 @@ fill_fixup_dax:
 	 add	%sp, PTREGS_OFF, %o0
 	call	sun4v_data_access_exception
 	 nop
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 1:	call	spitfire_data_access_exception
 	 nop
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap

+ 713 - 276
arch/sparc64/mm/init.c

@@ -24,6 +24,8 @@
 #include <linux/cache.h>
 #include <linux/sort.h>
 #include <linux/percpu.h>
+#include <linux/lmb.h>
+#include <linux/mmzone.h>
 
 #include <asm/head.h>
 #include <asm/system.h>
@@ -72,9 +74,7 @@ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
 #define MAX_BANKS	32
 
 static struct linux_prom64_registers pavail[MAX_BANKS] __initdata;
-static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
 static int pavail_ents __initdata;
-static int pavail_rescan_ents __initdata;
 
 static int cmp_p64(const void *a, const void *b)
 {
@@ -715,285 +715,684 @@ out:
 		smp_new_mmu_context_version();
 }
 
-/* Find a free area for the bootmem map, avoiding the kernel image
- * and the initial ramdisk.
- */
-static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn,
-					       unsigned long end_pfn)
+static int numa_enabled = 1;
+static int numa_debug;
+
+static int __init early_numa(char *p)
 {
-	unsigned long avoid_start, avoid_end, bootmap_size;
-	int i;
+	if (!p)
+		return 0;
+
+	if (strstr(p, "off"))
+		numa_enabled = 0;
+
+	if (strstr(p, "debug"))
+		numa_debug = 1;
+
+	return 0;
+}
+early_param("numa", early_numa);
 
-	bootmap_size = bootmem_bootmap_pages(end_pfn - start_pfn);
-	bootmap_size <<= PAGE_SHIFT;
+#define numadbg(f, a...) \
+do {	if (numa_debug) \
+		printk(KERN_INFO f, ## a); \
+} while (0)
 
-	avoid_start = avoid_end = 0;
+static void __init find_ramdisk(unsigned long phys_base)
+{
 #ifdef CONFIG_BLK_DEV_INITRD
-	avoid_start = initrd_start;
-	avoid_end = PAGE_ALIGN(initrd_end);
+	if (sparc_ramdisk_image || sparc_ramdisk_image64) {
+		unsigned long ramdisk_image;
+
+		/* Older versions of the bootloader only supported a
+		 * 32-bit physical address for the ramdisk image
+		 * location, stored at sparc_ramdisk_image.  Newer
+		 * SILO versions set sparc_ramdisk_image to zero and
+		 * provide a full 64-bit physical address at
+		 * sparc_ramdisk_image64.
+		 */
+		ramdisk_image = sparc_ramdisk_image;
+		if (!ramdisk_image)
+			ramdisk_image = sparc_ramdisk_image64;
+
+		/* Another bootloader quirk.  The bootloader normalizes
+		 * the physical address to KERNBASE, so we have to
+		 * factor that back out and add in the lowest valid
+		 * physical page address to get the true physical address.
+		 */
+		ramdisk_image -= KERNBASE;
+		ramdisk_image += phys_base;
+
+		numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
+			ramdisk_image, sparc_ramdisk_size);
+
+		initrd_start = ramdisk_image;
+		initrd_end = ramdisk_image + sparc_ramdisk_size;
+
+		lmb_reserve(initrd_start, initrd_end);
+	}
 #endif
+}
 
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long start, end;
+struct node_mem_mask {
+	unsigned long mask;
+	unsigned long val;
+	unsigned long bootmem_paddr;
+};
+static struct node_mem_mask node_masks[MAX_NUMNODES];
+static int num_node_masks;
 
-		start = pavail[i].phys_addr;
-		end = start + pavail[i].reg_size;
+int numa_cpu_lookup_table[NR_CPUS];
+cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 
-		while (start < end) {
-			if (start >= kern_base &&
-			    start < PAGE_ALIGN(kern_base + kern_size)) {
-				start = PAGE_ALIGN(kern_base + kern_size);
-				continue;
-			}
-			if (start >= avoid_start && start < avoid_end) {
-				start = avoid_end;
-				continue;
-			}
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
-			if ((end - start) < bootmap_size)
-				break;
+struct mdesc_mblock {
+	u64	base;
+	u64	size;
+	u64	offset; /* RA-to-PA */
+};
+static struct mdesc_mblock *mblocks;
+static int num_mblocks;
 
-			if (start < kern_base &&
-			    (start + bootmap_size) > kern_base) {
-				start = PAGE_ALIGN(kern_base + kern_size);
-				continue;
-			}
+static unsigned long ra_to_pa(unsigned long addr)
+{
+	int i;
 
-			if (start < avoid_start &&
-			    (start + bootmap_size) > avoid_start) {
-				start = avoid_end;
-				continue;
-			}
+	for (i = 0; i < num_mblocks; i++) {
+		struct mdesc_mblock *m = &mblocks[i];
 
-			/* OK, it doesn't overlap anything, use it.  */
-			return start >> PAGE_SHIFT;
+		if (addr >= m->base &&
+		    addr < (m->base + m->size)) {
+			addr += m->offset;
+			break;
 		}
 	}
-
-	prom_printf("Cannot find free area for bootmap, aborting.\n");
-	prom_halt();
+	return addr;
 }
 
-static void __init trim_pavail(unsigned long *cur_size_p,
-			       unsigned long *end_of_phys_p)
+static int find_node(unsigned long addr)
 {
-	unsigned long to_trim = *cur_size_p - cmdline_memory_size;
-	unsigned long avoid_start, avoid_end;
 	int i;
 
-	to_trim = PAGE_ALIGN(to_trim);
+	addr = ra_to_pa(addr);
+	for (i = 0; i < num_node_masks; i++) {
+		struct node_mem_mask *p = &node_masks[i];
 
-	avoid_start = avoid_end = 0;
-#ifdef CONFIG_BLK_DEV_INITRD
-	avoid_start = initrd_start;
-	avoid_end = PAGE_ALIGN(initrd_end);
+		if ((addr & p->mask) == p->val)
+			return i;
+	}
+	return -1;
+}
+
+static unsigned long nid_range(unsigned long start, unsigned long end,
+			       int *nid)
+{
+	*nid = find_node(start);
+	start += PAGE_SIZE;
+	while (start < end) {
+		int n = find_node(start);
+
+		if (n != *nid)
+			break;
+		start += PAGE_SIZE;
+	}
+
+	return start;
+}
+#else
+static unsigned long nid_range(unsigned long start, unsigned long end,
+			       int *nid)
+{
+	*nid = 0;
+	return end;
+}
 #endif
 
-	/* Trim some pavail[] entries in order to satisfy the
-	 * requested "mem=xxx" kernel command line specification.
-	 *
-	 * We must not trim off the kernel image area nor the
-	 * initial ramdisk range (if any).  Also, we must not trim
-	 * any pavail[] entry down to zero in order to preserve
-	 * the invariant that all pavail[] entries have a non-zero
-	 * size which is assumed by all of the code in here.
-	 */
-	for (i = 0; i < pavail_ents; i++) {
-		unsigned long start, end, kern_end;
-		unsigned long trim_low, trim_high, n;
+/* This must be invoked after performing all of the necessary
+ * add_active_range() calls for 'nid'.  We need to be able to get
+ * correct data from get_pfn_range_for_nid().
+ */
+static void __init allocate_node_data(int nid)
+{
+	unsigned long paddr, num_pages, start_pfn, end_pfn;
+	struct pglist_data *p;
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	paddr = lmb_alloc_nid(sizeof(struct pglist_data),
+			      SMP_CACHE_BYTES, nid, nid_range);
+	if (!paddr) {
+		prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
+		prom_halt();
+	}
+	NODE_DATA(nid) = __va(paddr);
+	memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
 
-		kern_end = PAGE_ALIGN(kern_base + kern_size);
+	NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
+#endif
 
-		trim_low = start = pavail[i].phys_addr;
-		trim_high = end = start + pavail[i].reg_size;
+	p = NODE_DATA(nid);
 
-		if (kern_base >= start &&
-		    kern_base < end) {
-			trim_low = kern_base;
-			if (kern_end >= end)
-				continue;
-		}
-		if (kern_end >= start &&
-		    kern_end < end) {
-			trim_high = kern_end;
-		}
-		if (avoid_start &&
-		    avoid_start >= start &&
-		    avoid_start < end) {
-			if (trim_low > avoid_start)
-				trim_low = avoid_start;
-			if (avoid_end >= end)
-				continue;
-		}
-		if (avoid_end &&
-		    avoid_end >= start &&
-		    avoid_end < end) {
-			if (trim_high < avoid_end)
-				trim_high = avoid_end;
+	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+	p->node_start_pfn = start_pfn;
+	p->node_spanned_pages = end_pfn - start_pfn;
+
+	if (p->node_spanned_pages) {
+		num_pages = bootmem_bootmap_pages(p->node_spanned_pages);
+
+		paddr = lmb_alloc_nid(num_pages << PAGE_SHIFT, PAGE_SIZE, nid,
+				      nid_range);
+		if (!paddr) {
+			prom_printf("Cannot allocate bootmap for nid[%d]\n",
+				  nid);
+			prom_halt();
 		}
+		node_masks[nid].bootmem_paddr = paddr;
+	}
+}
+
+static void init_node_masks_nonnuma(void)
+{
+	int i;
+
+	numadbg("Initializing tables for non-numa.\n");
+
+	node_masks[0].mask = node_masks[0].val = 0;
+	num_node_masks = 1;
+
+	for (i = 0; i < NR_CPUS; i++)
+		numa_cpu_lookup_table[i] = 0;
+
+	numa_cpumask_lookup_table[0] = CPU_MASK_ALL;
+}
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data *node_data[MAX_NUMNODES];
+
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+EXPORT_SYMBOL(numa_cpumask_lookup_table);
+EXPORT_SYMBOL(node_data);
+
+struct mdesc_mlgroup {
+	u64	node;
+	u64	latency;
+	u64	match;
+	u64	mask;
+};
+static struct mdesc_mlgroup *mlgroups;
+static int num_mlgroups;
+
+static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
+				   u32 cfg_handle)
+{
+	u64 arc;
 
-		if (trim_high <= trim_low)
+	mdesc_for_each_arc(arc, md, pio, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		const u64 *val;
+
+		val = mdesc_get_property(md, target,
+					 "cfg-handle", NULL);
+		if (val && *val == cfg_handle)
+			return 0;
+	}
+	return -ENODEV;
+}
+
+static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
+				    u32 cfg_handle)
+{
+	u64 arc, candidate, best_latency = ~(u64)0;
+
+	candidate = MDESC_NODE_NULL;
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		const char *name = mdesc_node_name(md, target);
+		const u64 *val;
+
+		if (strcmp(name, "pio-latency-group"))
 			continue;
 
-		if (trim_low == start && trim_high == end) {
-			/* Whole chunk is available for trimming.
-			 * Trim all except one page, in order to keep
-			 * entry non-empty.
-			 */
-			n = (end - start) - PAGE_SIZE;
-			if (n > to_trim)
-				n = to_trim;
-
-			if (n) {
-				pavail[i].phys_addr += n;
-				pavail[i].reg_size -= n;
-				to_trim -= n;
-			}
-		} else {
-			n = (trim_low - start);
-			if (n > to_trim)
-				n = to_trim;
-
-			if (n) {
-				pavail[i].phys_addr += n;
-				pavail[i].reg_size -= n;
-				to_trim -= n;
-			}
-			if (to_trim) {
-				n = end - trim_high;
-				if (n > to_trim)
-					n = to_trim;
-				if (n) {
-					pavail[i].reg_size -= n;
-					to_trim -= n;
-				}
-			}
+		val = mdesc_get_property(md, target, "latency", NULL);
+		if (!val)
+			continue;
+
+		if (*val < best_latency) {
+			candidate = target;
+			best_latency = *val;
 		}
+	}
+
+	if (candidate == MDESC_NODE_NULL)
+		return -ENODEV;
+
+	return scan_pio_for_cfg_handle(md, candidate, cfg_handle);
+}
+
+int of_node_to_nid(struct device_node *dp)
+{
+	const struct linux_prom64_registers *regs;
+	struct mdesc_handle *md;
+	u32 cfg_handle;
+	int count, nid;
+	u64 grp;
 
-		if (!to_trim)
+	if (!mlgroups)
+		return -1;
+
+	regs = of_get_property(dp, "reg", NULL);
+	if (!regs)
+		return -1;
+
+	cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;
+
+	md = mdesc_grab();
+
+	count = 0;
+	nid = -1;
+	mdesc_for_each_node_by_name(md, grp, "group") {
+		if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) {
+			nid = count;
 			break;
+		}
+		count++;
 	}
 
-	/* Recalculate.  */
-	*cur_size_p = 0UL;
-	for (i = 0; i < pavail_ents; i++) {
-		*end_of_phys_p = pavail[i].phys_addr +
-			pavail[i].reg_size;
-		*cur_size_p += pavail[i].reg_size;
-	}
+	mdesc_release(md);
+
+	return nid;
 }
 
-/* About pages_avail, this is the value we will use to calculate
- * the zholes_size[] argument given to free_area_init_node().  The
- * page allocator uses this to calculate nr_kernel_pages,
- * nr_all_pages and zone->present_pages.  On NUMA it is used
- * to calculate zone->min_unmapped_pages and zone->min_slab_pages.
- *
- * So this number should really be set to what the page allocator
- * actually ends up with.  This means:
- * 1) It should include bootmem map pages, we'll release those.
- * 2) It should not include the kernel image, except for the
- *    __init sections which we will also release.
- * 3) It should include the initrd image, since we'll release
- *    that too.
- */
-static unsigned long __init bootmem_init(unsigned long *pages_avail,
-					 unsigned long phys_base)
+static void add_node_ranges(void)
 {
-	unsigned long bootmap_size, end_pfn;
-	unsigned long end_of_phys_memory = 0UL;
-	unsigned long bootmap_pfn, bytes_avail, size;
 	int i;
 
-	bytes_avail = 0UL;
-	for (i = 0; i < pavail_ents; i++) {
-		end_of_phys_memory = pavail[i].phys_addr +
-			pavail[i].reg_size;
-		bytes_avail += pavail[i].reg_size;
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long size = lmb_size_bytes(&lmb.memory, i);
+		unsigned long start, end;
+
+		start = lmb.memory.region[i].base;
+		end = start + size;
+		while (start < end) {
+			unsigned long this_end;
+			int nid;
+
+			this_end = nid_range(start, end, &nid);
+
+			numadbg("Adding active range nid[%d] "
+				"start[%lx] end[%lx]\n",
+				nid, start, this_end);
+
+			add_active_range(nid,
+					 start >> PAGE_SHIFT,
+					 this_end >> PAGE_SHIFT);
+
+			start = this_end;
+		}
 	}
+}
 
-	/* Determine the location of the initial ramdisk before trying
-	 * to honor the "mem=xxx" command line argument.  We must know
-	 * where the kernel image and the ramdisk image are so that we
-	 * do not trim those two areas from the physical memory map.
-	 */
+static int __init grab_mlgroups(struct mdesc_handle *md)
+{
+	unsigned long paddr;
+	int count = 0;
+	u64 node;
+
+	mdesc_for_each_node_by_name(md, node, "memory-latency-group")
+		count++;
+	if (!count)
+		return -ENOENT;
+
+	paddr = lmb_alloc(count * sizeof(struct mdesc_mlgroup),
+			  SMP_CACHE_BYTES);
+	if (!paddr)
+		return -ENOMEM;
+
+	mlgroups = __va(paddr);
+	num_mlgroups = count;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
+		struct mdesc_mlgroup *m = &mlgroups[count++];
+		const u64 *val;
+
+		m->node = node;
+
+		val = mdesc_get_property(md, node, "latency", NULL);
+		m->latency = *val;
+		val = mdesc_get_property(md, node, "address-match", NULL);
+		m->match = *val;
+		val = mdesc_get_property(md, node, "address-mask", NULL);
+		m->mask = *val;
+
+		numadbg("MLGROUP[%d]: node[%lx] latency[%lx] "
+			"match[%lx] mask[%lx]\n",
+			count - 1, m->node, m->latency, m->match, m->mask);
+	}
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	/* Now have to check initial ramdisk, so that bootmap does not overwrite it */
-	if (sparc_ramdisk_image || sparc_ramdisk_image64) {
-		unsigned long ramdisk_image = sparc_ramdisk_image ?
-			sparc_ramdisk_image : sparc_ramdisk_image64;
-		ramdisk_image -= KERNBASE;
-		initrd_start = ramdisk_image + phys_base;
-		initrd_end = initrd_start + sparc_ramdisk_size;
-		if (initrd_end > end_of_phys_memory) {
-			printk(KERN_CRIT "initrd extends beyond end of memory "
-		                 	 "(0x%016lx > 0x%016lx)\ndisabling initrd\n",
-			       initrd_end, end_of_phys_memory);
-			initrd_start = 0;
-			initrd_end = 0;
+	return 0;
+}
+
+static int __init grab_mblocks(struct mdesc_handle *md)
+{
+	unsigned long paddr;
+	int count = 0;
+	u64 node;
+
+	mdesc_for_each_node_by_name(md, node, "mblock")
+		count++;
+	if (!count)
+		return -ENOENT;
+
+	paddr = lmb_alloc(count * sizeof(struct mdesc_mblock),
+			  SMP_CACHE_BYTES);
+	if (!paddr)
+		return -ENOMEM;
+
+	mblocks = __va(paddr);
+	num_mblocks = count;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "mblock") {
+		struct mdesc_mblock *m = &mblocks[count++];
+		const u64 *val;
+
+		val = mdesc_get_property(md, node, "base", NULL);
+		m->base = *val;
+		val = mdesc_get_property(md, node, "size", NULL);
+		m->size = *val;
+		val = mdesc_get_property(md, node,
+					 "address-congruence-offset", NULL);
+		m->offset = *val;
+
+		numadbg("MBLOCK[%d]: base[%lx] size[%lx] offset[%lx]\n",
+			count - 1, m->base, m->size, m->offset);
+	}
+
+	return 0;
+}
+
+static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
+					       u64 grp, cpumask_t *mask)
+{
+	u64 arc;
+
+	cpus_clear(*mask);
+
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
+		u64 target = mdesc_arc_target(md, arc);
+		const char *name = mdesc_node_name(md, target);
+		const u64 *id;
+
+		if (strcmp(name, "cpu"))
+			continue;
+		id = mdesc_get_property(md, target, "id", NULL);
+		if (*id < NR_CPUS)
+			cpu_set(*id, *mask);
+	}
+}
+
+static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
+{
+	int i;
+
+	for (i = 0; i < num_mlgroups; i++) {
+		struct mdesc_mlgroup *m = &mlgroups[i];
+		if (m->node == node)
+			return m;
+	}
+	return NULL;
+}
+
+static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
+				      int index)
+{
+	struct mdesc_mlgroup *candidate = NULL;
+	u64 arc, best_latency = ~(u64)0;
+	struct node_mem_mask *n;
+
+	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_FWD) {
+		u64 target = mdesc_arc_target(md, arc);
+		struct mdesc_mlgroup *m = find_mlgroup(target);
+		if (!m)
+			continue;
+		if (m->latency < best_latency) {
+			candidate = m;
+			best_latency = m->latency;
 		}
 	}
-#endif	
+	if (!candidate)
+		return -ENOENT;
+
+	if (num_node_masks != index) {
+		printk(KERN_ERR "Inconsistent NUMA state, "
+		       "index[%d] != num_node_masks[%d]\n",
+		       index, num_node_masks);
+		return -EINVAL;
+	}
 
-	if (cmdline_memory_size &&
-	    bytes_avail > cmdline_memory_size)
-		trim_pavail(&bytes_avail,
-			    &end_of_phys_memory);
+	n = &node_masks[num_node_masks++];
 
-	*pages_avail = bytes_avail >> PAGE_SHIFT;
+	n->mask = candidate->mask;
+	n->val = candidate->match;
 
-	end_pfn = end_of_phys_memory >> PAGE_SHIFT;
+	numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%lx])\n",
+		index, n->mask, n->val, candidate->latency);
 
-	/* Initialize the boot-time allocator. */
-	max_pfn = max_low_pfn = end_pfn;
-	min_low_pfn = (phys_base >> PAGE_SHIFT);
+	return 0;
+}
+
+static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
+					 int index)
+{
+	cpumask_t mask;
+	int cpu;
 
-	bootmap_pfn = choose_bootmap_pfn(min_low_pfn, end_pfn);
+	numa_parse_mdesc_group_cpus(md, grp, &mask);
 
-	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap_pfn,
-					 min_low_pfn, end_pfn);
+	for_each_cpu_mask(cpu, mask)
+		numa_cpu_lookup_table[cpu] = index;
+	numa_cpumask_lookup_table[index] = mask;
 
-	/* Now register the available physical memory with the
-	 * allocator.
-	 */
-	for (i = 0; i < pavail_ents; i++)
-		free_bootmem(pavail[i].phys_addr, pavail[i].reg_size);
+	if (numa_debug) {
+		printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
+		for_each_cpu_mask(cpu, mask)
+			printk("%d ", cpu);
+		printk("]\n");
+	}
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (initrd_start) {
-		size = initrd_end - initrd_start;
+	return numa_attach_mlgroup(md, grp, index);
+}
+
+static int __init numa_parse_mdesc(void)
+{
+	struct mdesc_handle *md = mdesc_grab();
+	int i, err, count;
+	u64 node;
+
+	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
+	if (node == MDESC_NODE_NULL) {
+		mdesc_release(md);
+		return -ENOENT;
+	}
+
+	err = grab_mblocks(md);
+	if (err < 0)
+		goto out;
+
+	err = grab_mlgroups(md);
+	if (err < 0)
+		goto out;
+
+	count = 0;
+	mdesc_for_each_node_by_name(md, node, "group") {
+		err = numa_parse_mdesc_group(md, node, count);
+		if (err < 0)
+			break;
+		count++;
+	}
+
+	add_node_ranges();
+
+	for (i = 0; i < num_node_masks; i++) {
+		allocate_node_data(i);
+		node_set_online(i);
+	}
+
+	err = 0;
+out:
+	mdesc_release(md);
+	return err;
+}
+
+static int __init numa_parse_sun4u(void)
+{
+	return -1;
+}
 
-		/* Reserve the initrd image area. */
-		reserve_bootmem(initrd_start, size, BOOTMEM_DEFAULT);
+static int __init bootmem_init_numa(void)
+{
+	int err = -1;
 
-		initrd_start += PAGE_OFFSET;
-		initrd_end += PAGE_OFFSET;
+	numadbg("bootmem_init_numa()\n");
+
+	if (numa_enabled) {
+		if (tlb_type == hypervisor)
+			err = numa_parse_mdesc();
+		else
+			err = numa_parse_sun4u();
 	}
+	return err;
+}
+
+#else
+
+static int bootmem_init_numa(void)
+{
+	return -1;
+}
+
 #endif
-	/* Reserve the kernel text/data/bss. */
-	reserve_bootmem(kern_base, kern_size, BOOTMEM_DEFAULT);
-	*pages_avail -= PAGE_ALIGN(kern_size) >> PAGE_SHIFT;
-
-	/* Add back in the initmem pages. */
-	size = ((unsigned long)(__init_end) & PAGE_MASK) -
-		PAGE_ALIGN((unsigned long)__init_begin);
-	*pages_avail += size >> PAGE_SHIFT;
-
-	/* Reserve the bootmem map.   We do not account for it
-	 * in pages_avail because we will release that memory
-	 * in free_all_bootmem.
-	 */
-	size = bootmap_size;
-	reserve_bootmem((bootmap_pfn << PAGE_SHIFT), size, BOOTMEM_DEFAULT);
 
-	for (i = 0; i < pavail_ents; i++) {
+static void __init bootmem_init_nonnuma(void)
+{
+	unsigned long top_of_ram = lmb_end_of_DRAM();
+	unsigned long total_ram = lmb_phys_mem_size();
+	unsigned int i;
+
+	numadbg("bootmem_init_nonnuma()\n");
+
+	printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
+	       top_of_ram, total_ram);
+	printk(KERN_INFO "Memory hole size: %ldMB\n",
+	       (top_of_ram - total_ram) >> 20);
+
+	init_node_masks_nonnuma();
+
+	for (i = 0; i < lmb.memory.cnt; i++) {
+		unsigned long size = lmb_size_bytes(&lmb.memory, i);
 		unsigned long start_pfn, end_pfn;
 
-		start_pfn = pavail[i].phys_addr >> PAGE_SHIFT;
-		end_pfn = (start_pfn + (pavail[i].reg_size >> PAGE_SHIFT));
-		memory_present(0, start_pfn, end_pfn);
+		if (!size)
+			continue;
+
+		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
+		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
+		add_active_range(0, start_pfn, end_pfn);
+	}
+
+	allocate_node_data(0);
+
+	node_set_online(0);
+}
+
+static void __init reserve_range_in_node(int nid, unsigned long start,
+					 unsigned long end)
+{
+	numadbg("    reserve_range_in_node(nid[%d],start[%lx],end[%lx]\n",
+		nid, start, end);
+	while (start < end) {
+		unsigned long this_end;
+		int n;
+
+		this_end = nid_range(start, end, &n);
+		if (n == nid) {
+			numadbg("      MATCH reserving range [%lx:%lx]\n",
+				start, this_end);
+			reserve_bootmem_node(NODE_DATA(nid), start,
+					     (this_end - start), BOOTMEM_DEFAULT);
+		} else
+			numadbg("      NO MATCH, advancing start to %lx\n",
+				this_end);
+
+		start = this_end;
+	}
+}
+
+static void __init trim_reserved_in_node(int nid)
+{
+	int i;
+
+	numadbg("  trim_reserved_in_node(%d)\n", nid);
+
+	for (i = 0; i < lmb.reserved.cnt; i++) {
+		unsigned long start = lmb.reserved.region[i].base;
+		unsigned long size = lmb_size_bytes(&lmb.reserved, i);
+		unsigned long end = start + size;
+
+		reserve_range_in_node(nid, start, end);
+	}
+}
+
+static void __init bootmem_init_one_node(int nid)
+{
+	struct pglist_data *p;
+
+	numadbg("bootmem_init_one_node(%d)\n", nid);
+
+	p = NODE_DATA(nid);
+
+	if (p->node_spanned_pages) {
+		unsigned long paddr = node_masks[nid].bootmem_paddr;
+		unsigned long end_pfn;
+
+		end_pfn = p->node_start_pfn + p->node_spanned_pages;
+
+		numadbg("  init_bootmem_node(%d, %lx, %lx, %lx)\n",
+			nid, paddr >> PAGE_SHIFT, p->node_start_pfn, end_pfn);
+
+		init_bootmem_node(p, paddr >> PAGE_SHIFT,
+				  p->node_start_pfn, end_pfn);
+
+		numadbg("  free_bootmem_with_active_regions(%d, %lx)\n",
+			nid, end_pfn);
+		free_bootmem_with_active_regions(nid, end_pfn);
+
+		trim_reserved_in_node(nid);
+
+		numadbg("  sparse_memory_present_with_active_regions(%d)\n",
+			nid);
+		sparse_memory_present_with_active_regions(nid);
 	}
+}
+
+static unsigned long __init bootmem_init(unsigned long phys_base)
+{
+	unsigned long end_pfn;
+	int nid;
+
+	end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
+	max_pfn = max_low_pfn = end_pfn;
+	min_low_pfn = (phys_base >> PAGE_SHIFT);
+
+	if (bootmem_init_numa() < 0)
+		bootmem_init_nonnuma();
+
+	/* XXX cpu notifier XXX */
+
+	for_each_online_node(nid)
+		bootmem_init_one_node(nid);
 
 	sparse_init();
 
@@ -1289,7 +1688,7 @@ void __init setup_per_cpu_areas(void)
 
 void __init paging_init(void)
 {
-	unsigned long end_pfn, pages_avail, shift, phys_base;
+	unsigned long end_pfn, shift, phys_base;
 	unsigned long real_end, i;
 
 	/* These build time checkes make sure that the dcache_dirty_cpu()
@@ -1330,12 +1729,26 @@ void __init paging_init(void)
 		sun4v_ktsb_init();
 	}
 
+	lmb_init();
+
 	/* Find available physical memory... */
 	read_obp_memory("available", &pavail[0], &pavail_ents);
 
 	phys_base = 0xffffffffffffffffUL;
-	for (i = 0; i < pavail_ents; i++)
+	for (i = 0; i < pavail_ents; i++) {
 		phys_base = min(phys_base, pavail[i].phys_addr);
+		lmb_add(pavail[i].phys_addr, pavail[i].reg_size);
+	}
+
+	lmb_reserve(kern_base, kern_size);
+
+	find_ramdisk(phys_base);
+
+	if (cmdline_memory_size)
+		lmb_enforce_memory_limit(phys_base + cmdline_memory_size);
+
+	lmb_analyze();
+	lmb_dump_all();
 
 	set_bit(0, mmu_context_bmap);
 
@@ -1371,14 +1784,10 @@ void __init paging_init(void)
 	if (tlb_type == hypervisor)
 		sun4v_ktsb_register();
 
-	/* Setup bootmem... */
-	pages_avail = 0;
-	last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base);
-
-	max_mapnr = last_valid_pfn;
-
-	kernel_physical_mapping_init();
-
+	/* We must setup the per-cpu areas before we pull in the
+	 * PROM and the MDESC.  The code there fills in cpu and
+	 * other information into per-cpu data structures.
+	 */
 	real_setup_per_cpu_areas();
 
 	prom_build_devicetree();
@@ -1386,20 +1795,22 @@ void __init paging_init(void)
 	if (tlb_type == hypervisor)
 		sun4v_mdesc_init();
 
+	/* Setup bootmem... */
+	last_valid_pfn = end_pfn = bootmem_init(phys_base);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	max_mapnr = last_valid_pfn;
+#endif
+	kernel_physical_mapping_init();
+
 	{
-		unsigned long zones_size[MAX_NR_ZONES];
-		unsigned long zholes_size[MAX_NR_ZONES];
-		int znum;
+		unsigned long max_zone_pfns[MAX_NR_ZONES];
 
-		for (znum = 0; znum < MAX_NR_ZONES; znum++)
-			zones_size[znum] = zholes_size[znum] = 0;
+		memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
 
-		zones_size[ZONE_NORMAL] = end_pfn;
-		zholes_size[ZONE_NORMAL] = end_pfn - pages_avail;
+		max_zone_pfns[ZONE_NORMAL] = end_pfn;
 
-		free_area_init_node(0, &contig_page_data, zones_size,
-				    __pa(PAGE_OFFSET) >> PAGE_SHIFT,
-				    zholes_size);
+		free_area_init_nodes(max_zone_pfns);
 	}
 
 	printk("Booting Linux...\n");
@@ -1408,21 +1819,52 @@ void __init paging_init(void)
 	cpu_probe();
 }
 
-static void __init taint_real_pages(void)
+int __init page_in_phys_avail(unsigned long paddr)
+{
+	int i;
+
+	paddr &= PAGE_MASK;
+
+	for (i = 0; i < pavail_ents; i++) {
+		unsigned long start, end;
+
+		start = pavail[i].phys_addr;
+		end = start + pavail[i].reg_size;
+
+		if (paddr >= start && paddr < end)
+			return 1;
+	}
+	if (paddr >= kern_base && paddr < (kern_base + kern_size))
+		return 1;
+#ifdef CONFIG_BLK_DEV_INITRD
+	if (paddr >= __pa(initrd_start) &&
+	    paddr < __pa(PAGE_ALIGN(initrd_end)))
+		return 1;
+#endif
+
+	return 0;
+}
+
+static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
+static int pavail_rescan_ents __initdata;
+
+/* Certain OBP calls, such as fetching "available" properties, can
+ * claim physical memory.  So, along with initializing the valid
+ * address bitmap, what we do here is refetch the physical available
+ * memory list again, and make sure it provides at least as much
+ * memory as 'pavail' does.
+ */
+static void setup_valid_addr_bitmap_from_pavail(void)
 {
 	int i;
 
 	read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
 
-	/* Find changes discovered in the physmem available rescan and
-	 * reserve the lost portions in the bootmem maps.
-	 */
 	for (i = 0; i < pavail_ents; i++) {
 		unsigned long old_start, old_end;
 
 		old_start = pavail[i].phys_addr;
-		old_end = old_start +
-			pavail[i].reg_size;
+		old_end = old_start + pavail[i].reg_size;
 		while (old_start < old_end) {
 			int n;
 
@@ -1440,7 +1882,16 @@ static void __init taint_real_pages(void)
 					goto do_next_page;
 				}
 			}
-			reserve_bootmem(old_start, PAGE_SIZE, BOOTMEM_DEFAULT);
+
+			prom_printf("mem_init: Lost memory in pavail\n");
+			prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
+				    pavail[i].phys_addr,
+				    pavail[i].reg_size);
+			prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
+				    pavail_rescan[i].phys_addr,
+				    pavail_rescan[i].reg_size);
+			prom_printf("mem_init: Cannot continue, aborting.\n");
+			prom_halt();
 
 		do_next_page:
 			old_start += PAGE_SIZE;
@@ -1448,32 +1899,6 @@ static void __init taint_real_pages(void)
 	}
 }
 
-int __init page_in_phys_avail(unsigned long paddr)
-{
-	int i;
-
-	paddr &= PAGE_MASK;
-
-	for (i = 0; i < pavail_rescan_ents; i++) {
-		unsigned long start, end;
-
-		start = pavail_rescan[i].phys_addr;
-		end = start + pavail_rescan[i].reg_size;
-
-		if (paddr >= start && paddr < end)
-			return 1;
-	}
-	if (paddr >= kern_base && paddr < (kern_base + kern_size))
-		return 1;
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (paddr >= __pa(initrd_start) &&
-	    paddr < __pa(PAGE_ALIGN(initrd_end)))
-		return 1;
-#endif
-
-	return 0;
-}
-
 void __init mem_init(void)
 {
 	unsigned long codepages, datapages, initpages;
@@ -1496,14 +1921,26 @@ void __init mem_init(void)
 		addr += PAGE_SIZE;
 	}
 
-	taint_real_pages();
+	setup_valid_addr_bitmap_from_pavail();
 
 	high_memory = __va(last_valid_pfn << PAGE_SHIFT);
 
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+	for_each_online_node(i) {
+		if (NODE_DATA(i)->node_spanned_pages != 0) {
+			totalram_pages +=
+				free_all_bootmem_node(NODE_DATA(i));
+		}
+	}
+#else
+	totalram_pages = free_all_bootmem();
+#endif
+
 	/* We subtract one to account for the mem_map_zero page
 	 * allocated below.
 	 */
-	totalram_pages = num_physpages = free_all_bootmem() - 1;
+	totalram_pages -= 1;
+	num_physpages = totalram_pages;
 
 	/*
 	 * Set up the zero page, mark it reserved, so that page count

+ 2 - 1
arch/sparc64/mm/tsb.c

@@ -321,7 +321,8 @@ retry_tsb_alloc:
 	if (new_size > (PAGE_SIZE * 2))
 		gfp_flags = __GFP_NOWARN | __GFP_NORETRY;
 
-	new_tsb = kmem_cache_alloc(tsb_caches[new_cache_index], gfp_flags);
+	new_tsb = kmem_cache_alloc_node(tsb_caches[new_cache_index],
+					gfp_flags, numa_node_id());
 	if (unlikely(!new_tsb)) {
 		/* Not being able to fork due to a high-order TSB
 		 * allocation failure is very bad behavior.  Just back

+ 1 - 3
arch/sparc64/mm/ultra.S

@@ -476,7 +476,6 @@ xcall_sync_tick:
 #endif
 	call		smp_synchronize_tick_client
 	 nop
-	clr		%l6
 	b		rtrap_xcall
 	 ldx		[%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
 
@@ -511,7 +510,6 @@ xcall_report_regs:
 #endif
 	call		__show_regs
 	 add		%sp, PTREGS_OFF, %o0
-	clr		%l6
 	/* Has to be a non-v9 branch due to the large distance. */
 	b		rtrap_xcall
 	 ldx		[%sp + PTREGS_OFF + PT_V9_TSTATE], %l1
@@ -576,7 +574,7 @@ __hypervisor_tlb_xcall_error:
 	mov	%l4, %o0
 	call	hypervisor_tlbop_error_xcall
 	 mov	%l5, %o1
-	ba,a,pt	%xcc, rtrap_clr_l6
+	ba,a,pt	%xcc, rtrap
 
 	.globl		__hypervisor_xcall_flush_tlb_mm
 __hypervisor_xcall_flush_tlb_mm: /* 21 insns */

+ 17 - 13
drivers/serial/sunzilog.c

@@ -1015,6 +1015,7 @@ static struct uart_ops sunzilog_pops = {
 	.verify_port	=	sunzilog_verify_port,
 };
 
+static int uart_chip_count;
 static struct uart_sunzilog_port *sunzilog_port_table;
 static struct zilog_layout __iomem **sunzilog_chip_regs;
 
@@ -1350,16 +1351,22 @@ static int zilog_irq = -1;
 
 static int __devinit zs_probe(struct of_device *op, const struct of_device_id *match)
 {
-	static int inst;
+	static int kbm_inst, uart_inst;
+	int inst;
 	struct uart_sunzilog_port *up;
 	struct zilog_layout __iomem *rp;
-	int keyboard_mouse;
+	int keyboard_mouse = 0;
 	int err;
 
-	keyboard_mouse = 0;
 	if (of_find_property(op->node, "keyboard", NULL))
 		keyboard_mouse = 1;
 
+	/* uarts must come before keyboards/mice */
+	if (keyboard_mouse)
+		inst = uart_chip_count + kbm_inst;
+	else
+		inst = uart_inst;
+
 	sunzilog_chip_regs[inst] = of_ioremap(&op->resource[0], 0,
 					      sizeof(struct zilog_layout),
 					      "zs");
@@ -1427,6 +1434,7 @@ static int __devinit zs_probe(struct of_device *op, const struct of_device_id *m
 				   rp, sizeof(struct zilog_layout));
 			return err;
 		}
+		uart_inst++;
 	} else {
 		printk(KERN_INFO "%s: Keyboard at MMIO 0x%llx (irq = %d) "
 		       "is a %s\n",
@@ -1438,12 +1446,11 @@ static int __devinit zs_probe(struct of_device *op, const struct of_device_id *m
 		       op->dev.bus_id,
 		       (unsigned long long) up[1].port.mapbase,
 		       op->irqs[0], sunzilog_type(&up[1].port));
+		kbm_inst++;
 	}
 
 	dev_set_drvdata(&op->dev, &up[0]);
 
-	inst++;
-
 	return 0;
 }
 
@@ -1491,28 +1498,25 @@ static struct of_platform_driver zs_driver = {
 static int __init sunzilog_init(void)
 {
 	struct device_node *dp;
-	int err, uart_count;
-	int num_keybms;
+	int err;
+	int num_keybms = 0;
 	int num_sunzilog = 0;
 
-	num_keybms = 0;
 	for_each_node_by_name(dp, "zs") {
 		num_sunzilog++;
 		if (of_find_property(dp, "keyboard", NULL))
 			num_keybms++;
 	}
 
-	uart_count = 0;
 	if (num_sunzilog) {
-		int uart_count;
-
 		err = sunzilog_alloc_tables(num_sunzilog);
 		if (err)
 			goto out;
 
-		uart_count = (num_sunzilog * 2) - (2 * num_keybms);
+		uart_chip_count = num_sunzilog - num_keybms;
 
-		err = sunserial_register_minors(&sunzilog_reg, uart_count);
+		err = sunserial_register_minors(&sunzilog_reg,
+						uart_chip_count * 2);
 		if (err)
 			goto out_free_tables;
 	}

+ 2 - 0
include/asm-sparc/device.h

@@ -16,6 +16,8 @@ struct dev_archdata {
 
 	struct device_node	*prom_node;
 	struct of_device	*op;
+
+	int			numa_node;
 };
 
 #endif /* _ASM_SPARC_DEVICE_H */

+ 5 - 0
include/asm-sparc/prom.h

@@ -77,6 +77,11 @@ extern int of_getintprop_default(struct device_node *np,
 				 const char *name,
 				 int def);
 extern int of_find_in_proplist(const char *list, const char *match, int len);
+#ifdef CONFIG_NUMA
+extern int of_node_to_nid(struct device_node *dp);
+#else
+#define of_node_to_nid(dp)	(-1)
+#endif
 
 extern void prom_build_devicetree(void);
 

+ 2 - 1
include/asm-sparc64/iommu.h

@@ -56,6 +56,7 @@ struct strbuf {
 };
 
 extern int iommu_table_init(struct iommu *iommu, int tsbsize,
-			    u32 dma_offset, u32 dma_addr_mask);
+			    u32 dma_offset, u32 dma_addr_mask,
+			    int numa_node);
 
 #endif /* !(_SPARC64_IOMMU_H) */

+ 17 - 0
include/asm-sparc64/mmzone.h

@@ -0,0 +1,17 @@
+#ifndef _SPARC64_MMZONE_H
+#define _SPARC64_MMZONE_H
+
+#ifdef CONFIG_NEED_MULTIPLE_NODES
+
+extern struct pglist_data *node_data[];
+
+#define NODE_DATA(nid)		(node_data[nid])
+#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
+#define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
+
+extern int numa_cpu_lookup_table[];
+extern cpumask_t numa_cpumask_lookup_table[];
+
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
+
+#endif /* _SPARC64_MMZONE_H */

+ 0 - 6
include/asm-sparc64/numnodes.h

@@ -1,6 +0,0 @@
-#ifndef _SPARC64_NUMNODES_H
-#define _SPARC64_NUMNODES_H
-
-#define NODES_SHIFT	0
-
-#endif /* !(_SPARC64_NUMNODES_H) */

+ 16 - 2
include/asm-sparc64/ptrace.h

@@ -8,6 +8,8 @@
  * stack during a system call and basically all traps.
  */
 
+#define PT_REGS_MAGIC 0x57ac6c00
+
 #ifndef __ASSEMBLY__
 
 struct pt_regs {
@@ -16,7 +18,19 @@ struct pt_regs {
 	unsigned long tpc;
 	unsigned long tnpc;
 	unsigned int y;
-	unsigned int fprs;
+
+	/* We encode a magic number, PT_REGS_MAGIC, along
+	 * with the %tt (trap type) register value at trap
+	 * entry time.  The magic number allows us to identify
+	 * accurately a trap stack frame in the stack
+	 * unwinder, and the %tt value allows us to test
+	 * things like "in a system call" etc. for an arbitray
+	 * process.
+	 *
+	 * The PT_REGS_MAGIC is choosen such that it can be
+	 * loaded completely using just a sethi instruction.
+	 */
+	unsigned int magic;
 };
 
 struct pt_regs32 {
@@ -147,7 +161,7 @@ extern void __show_regs(struct pt_regs *);
 #define PT_V9_TPC    0x88
 #define PT_V9_TNPC   0x90
 #define PT_V9_Y      0x98
-#define PT_V9_FPRS   0x9c
+#define PT_V9_MAGIC  0x9c
 #define PT_TSTATE	PT_V9_TSTATE
 #define PT_TPC		PT_V9_TPC
 #define PT_TNPC		PT_V9_TNPC

+ 1 - 1
include/asm-sparc64/sparsemem.h

@@ -3,7 +3,7 @@
 
 #ifdef __KERNEL__
 
-#define SECTION_SIZE_BITS       31
+#define SECTION_SIZE_BITS       30
 #define MAX_PHYSADDR_BITS       42
 #define MAX_PHYSMEM_BITS        42
 

+ 71 - 2
include/asm-sparc64/topology.h

@@ -1,6 +1,77 @@
 #ifndef _ASM_SPARC64_TOPOLOGY_H
 #define _ASM_SPARC64_TOPOLOGY_H
 
+#ifdef CONFIG_NUMA
+
+#include <asm/mmzone.h>
+
+static inline int cpu_to_node(int cpu)
+{
+	return numa_cpu_lookup_table[cpu];
+}
+
+#define parent_node(node)	(node)
+
+static inline cpumask_t node_to_cpumask(int node)
+{
+	return numa_cpumask_lookup_table[node];
+}
+
+/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
+#define node_to_cpumask_ptr(v, node)		\
+		cpumask_t *v = &(numa_cpumask_lookup_table[node])
+
+#define node_to_cpumask_ptr_next(v, node)	\
+			   v = &(numa_cpumask_lookup_table[node])
+
+static inline int node_to_first_cpu(int node)
+{
+	cpumask_t tmp;
+	tmp = node_to_cpumask(node);
+	return first_cpu(tmp);
+}
+
+struct pci_bus;
+#ifdef CONFIG_PCI
+extern int pcibus_to_node(struct pci_bus *pbus);
+#else
+static inline int pcibus_to_node(struct pci_bus *pbus)
+{
+	return -1;
+}
+#endif
+
+#define pcibus_to_cpumask(bus)	\
+	(pcibus_to_node(bus) == -1 ? \
+	 CPU_MASK_ALL : \
+	 node_to_cpumask(pcibus_to_node(bus)))
+
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.min_interval		= 8,			\
+	.max_interval		= 32,			\
+	.busy_factor		= 32,			\
+	.imbalance_pct		= 125,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 2,			\
+	.newidle_idx		= 0, 			\
+	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
+	.flags			= SD_LOAD_BALANCE	\
+				| SD_BALANCE_FORK	\
+				| SD_BALANCE_EXEC	\
+				| SD_SERIALIZE		\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+}
+
+#else /* CONFIG_NUMA */
+
+#include <asm-generic/topology.h>
+
+#endif /* !(CONFIG_NUMA) */
+
 #ifdef CONFIG_SMP
 #define topology_physical_package_id(cpu)	(cpu_data(cpu).proc_id)
 #define topology_core_id(cpu)			(cpu_data(cpu).core_id)
@@ -10,8 +81,6 @@
 #define smt_capable()				(sparc64_multi_core)
 #endif /* CONFIG_SMP */
 
-#include <asm-generic/topology.h>
-
 #define cpu_coregroup_map(cpu)			(cpu_core_map[cpu])
 
 #endif /* _ASM_SPARC64_TOPOLOGY_H */

+ 7 - 7
include/asm-sparc64/ttable.h

@@ -28,7 +28,7 @@
 	call	routine;				\
 	 add	%sp, PTREGS_OFF, %o0;			\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;					\
+	 nop;						\
 	nop;
 
 #define TRAP_7INSNS(routine)				\
@@ -38,7 +38,7 @@
 	call	routine;				\
 	 add	%sp, PTREGS_OFF, %o0;			\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;
+	 nop;
 
 #define TRAP_SAVEFPU(routine)				\
 	sethi	%hi(109f), %g7;				\
@@ -47,7 +47,7 @@
 	call	routine;				\
 	 add	%sp, PTREGS_OFF, %o0;			\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;					\
+	 nop;						\
 	nop;
 
 #define TRAP_NOSAVE(routine)				\
@@ -67,7 +67,7 @@
 	call	routine;				\
 	 add	%sp, PTREGS_OFF, %o0;			\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;					\
+	 nop;						\
 	nop;
 	
 #define TRAP_ARG(routine, arg)				\
@@ -78,7 +78,7 @@
 	call	routine;				\
 	 mov	arg, %o1;				\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;
+	 nop;
 	
 #define TRAPTL1_ARG(routine, arg)			\
 	sethi	%hi(109f), %g7;				\
@@ -88,7 +88,7 @@
 	call	routine;				\
 	 mov	arg, %o1;				\
 	ba,pt	%xcc, rtrap;				\
-	 clr	%l6;
+	 nop;
 	
 #define SYSCALL_TRAP(routine, systbl)			\
 	sethi	%hi(109f), %g7;				\
@@ -166,7 +166,7 @@
 	ldx	[%sp + PTREGS_OFF + PT_V9_TNPC], %l1;			\
 	add	%l1, 4, %l2;						\
 	stx	%l1, [%sp + PTREGS_OFF + PT_V9_TPC];			\
-	ba,pt	%xcc, rtrap_clr_l6;					\
+	ba,pt	%xcc, rtrap;						\
 	 stx	%l2, [%sp + PTREGS_OFF + PT_V9_TNPC];
 	        
 #ifdef CONFIG_KPROBES

+ 1 - 1
lib/lmb.c

@@ -346,7 +346,7 @@ u64 __init __lmb_alloc_base(u64 size, u64 align, u64 max_addr)
 			if (j < 0) {
 				/* this area isn't reserved, take it */
 				if (lmb_add_region(&lmb.reserved, base,
-						   size) < 0)
+						   lmb_align_up(size, align)) < 0)
 					return 0;
 				return base;
 			}