
Merge branches 'x86/mm', 'x86/build', 'x86/apic' and 'x86/platform' into x86/core, to apply dependent patch

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar, 10 years ago
parent
commit
34e7724c07
100 changed files with 2722 additions and 2345 deletions
  1. +6 -0     Documentation/kernel-parameters.txt
  2. +15 -3    Documentation/x86/mtrr.txt
  3. +34 -1    Documentation/x86/pat.txt
  4. +0 -2     arch/ia64/include/asm/irq_remapping.h
  5. +19 -11   arch/ia64/kernel/msi_ia64.c
  6. +8 -7     arch/x86/Kconfig
  7. +11 -0    arch/x86/Kconfig.debug
  8. +9 -0     arch/x86/Makefile
  9. +16 -36   arch/x86/ia32/ia32entry.S
 10. +18 -0    arch/x86/include/asm/alternative-asm.h
 11. +6 -0     arch/x86/include/asm/apic.h
 12. +25 -0    arch/x86/include/asm/asm.h
 13. +15 -15   arch/x86/include/asm/atomic.h
 14. +4 -4     arch/x86/include/asm/atomic64_64.h
 15. +2 -0     arch/x86/include/asm/entry_arch.h
 16. +1 -0     arch/x86/include/asm/hardirq.h
 17. +6 -10    arch/x86/include/asm/hpet.h
 18. +66 -72   arch/x86/include/asm/hw_irq.h
 19. +4 -0     arch/x86/include/asm/io.h
 20. +32 -82   arch/x86/include/asm/io_apic.h
 21. +4 -0     arch/x86/include/asm/irq.h
 22. +23 -46   arch/x86/include/asm/irq_remapping.h
 23. +13 -27   arch/x86/include/asm/irq_vectors.h
 24. +63 -0    arch/x86/include/asm/irqdomain.h
 25. +7 -0     arch/x86/include/asm/msi.h
 26. +7 -8     arch/x86/include/asm/mtrr.h
 27. +4 -3     arch/x86/include/asm/paravirt_types.h
 28. +1 -6     arch/x86/include/asm/pat.h
 29. +0 -5     arch/x86/include/asm/pci.h
 30. +38 -0    arch/x86/include/asm/special_insns.h
 31. +5 -3     arch/x86/include/asm/thread_info.h
 32. +4 -0     arch/x86/include/asm/uaccess_32.h
 33. +0 -21    arch/x86/include/asm/x86_init.h
 34. +7 -1     arch/x86/include/uapi/asm/mtrr.h
 35. +22 -51   arch/x86/kernel/acpi/boot.c
 36. +3 -3     arch/x86/kernel/acpi/wakeup_64.S
 37. +9 -0     arch/x86/kernel/alternative.c
 38. +0 -4     arch/x86/kernel/apb_timer.c
 39. +126 -47  arch/x86/kernel/apic/htirq.c
 40. +597 -706 arch/x86/kernel/apic/io_apic.c
 41. +244 -173 arch/x86/kernel/apic/msi.c
 42. +279 -169 arch/x86/kernel/apic/vector.c
 43. +2 -0     arch/x86/kernel/apic/x2apic_phys.c
 44. +21 -0    arch/x86/kernel/asm-offsets.c
 45. +0 -18    arch/x86/kernel/asm-offsets_32.c
 46. +0 -21    arch/x86/kernel/asm-offsets_64.c
 47. +0 -4     arch/x86/kernel/cpu/common.c
 48. +2 -4     arch/x86/kernel/cpu/mshyperv.c
 49. +2 -1     arch/x86/kernel/cpu/mtrr/cleanup.c
 50. +137 -72  arch/x86/kernel/cpu/mtrr/generic.c
 51. +37 -11   arch/x86/kernel/cpu/mtrr/main.c
 52. +1 -1     arch/x86/kernel/cpu/mtrr/mtrr.h
 53. +17 -24   arch/x86/kernel/devicetree.c
 54. +33 -44   arch/x86/kernel/entry_64.S
 55. +2 -2     arch/x86/kernel/head_32.S
 56. +2 -2     arch/x86/kernel/head_64.S
 57. +15 -42   arch/x86/kernel/hpet.c
 58. +4 -4     arch/x86/kernel/i8259.c
 59. +46 -10   arch/x86/kernel/irq.c
 60. +0 -6     arch/x86/kernel/irq_32.c
 61. +0 -6     arch/x86/kernel/irq_64.c
 62. +2 -8     arch/x86/kernel/irq_work.c
 63. +4 -2     arch/x86/kernel/irqinit.c
 64. +1 -6     arch/x86/kernel/mpparse.c
 65. +3 -1     arch/x86/kernel/paravirt.c
 66. +0 -1     arch/x86/kernel/paravirt_patch_64.c
 67. +1 -4     arch/x86/kernel/process_32.c
 68. +0 -3     arch/x86/kernel/process_64.c
 69. +1 -2     arch/x86/kernel/setup.c
 70. +6 -13    arch/x86/kernel/smp.c
 71. +39 -4    arch/x86/kernel/smpboot.c
 72. +2 -2     arch/x86/kernel/traps.c
 73. +0 -9     arch/x86/kernel/x86_init.c
 74. +2 -2     arch/x86/lguest/boot.c
 75. +1 -1     arch/x86/lib/Makefile
 76. +92 -24   arch/x86/lib/copy_user_64.S
 77. +0 -136   arch/x86/lib/copy_user_nocache_64.S
 78. +1 -1     arch/x86/mm/iomap_32.c
 79. +39 -5    arch/x86/mm/ioremap.c
 80. +10 -10   arch/x86/mm/pageattr.c
 81. +40 -42   arch/x86/mm/pat.c
 82. +1 -1     arch/x86/mm/pat_internal.h
 83. +3 -3     arch/x86/mm/pat_rbtree.c
 84. +46 -14   arch/x86/mm/pgtable.c
 85. +3 -3     arch/x86/pci/i386.c
 86. +3 -3     arch/x86/pci/intel_mid_pci.c
 87. +7 -6     arch/x86/pci/irq.c
 88. +1 -0     arch/x86/platform/Makefile
 89. +1 -0     arch/x86/platform/atom/Makefile
 90. +183 -0   arch/x86/platform/atom/punit_atom_debug.c
 91. +3 -2     arch/x86/platform/intel-mid/device_libs/platform_wdt.c
 92. +13 -5    arch/x86/platform/intel-mid/intel-mid.c
 93. +14 -16   arch/x86/platform/intel-mid/sfi.c
 94. +2 -5     arch/x86/platform/sfi/sfi.c
 95. +111 -187 arch/x86/platform/uv/uv_irq.c
 96. +4 -4     arch/x86/power/hibernate_asm_64.S
 97. +2 -1     arch/x86/xen/enlighten.c
 98. +4 -18    arch/x86/xen/xen-asm_64.S
 99. +2 -0     arch/x86/xen/xen-ops.h
100. +1 -13    drivers/gpu/drm/drm_ioctl.c

+ 6 - 0
Documentation/kernel-parameters.txt

@@ -746,6 +746,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	cpuidle.off=1	[CPU_IDLE]
 			disable the cpuidle sub-system
 
+	cpu_init_udelay=N
+			[X86] Delay for N microsec between assert and de-assert
+			of APIC INIT to start processors.  This delay occurs
+			on every CPU online, such as boot, and resume from suspend.
+			Default: 10000
+
 	cpcihp_generic=	[HW,PCI] Generic port I/O CompactPCI driver
 			Format:
 			<first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
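
For illustration only (not part of the patch), a boot that halves the default
INIT de-assert delay would pass:

	cpu_init_udelay=5000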

+ 15 - 3
Documentation/x86/mtrr.txt

@@ -1,7 +1,19 @@
 MTRR (Memory Type Range Register) control
-3 Jun 1999
-Richard Gooch
-<rgooch@atnf.csiro.au>
+
+Richard Gooch <rgooch@atnf.csiro.au> - 3 Jun 1999
+Luis R. Rodriguez <mcgrof@do-not-panic.com> - April 9, 2015
+
+===============================================================================
+Phasing out MTRR use
+
+MTRR use is replaced on modern x86 hardware with PAT. Over time the only type
+of effective MTRR that is expected to be supported will be for write-combining.
+As MTRR use is phased out device drivers should use arch_phys_wc_add() to make
+MTRR effective on non-PAT systems while a no-op on PAT enabled systems.
+
+For details refer to Documentation/x86/pat.txt.
+
+===============================================================================
 
   On Intel P6 family processors (Pentium Pro, Pentium II and later)
   the Memory Type Range Registers (MTRRs) may be used to control
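
A minimal sketch of the recommended pattern, assuming a hypothetical PCI
driver whose `info` structure holds the mapping and the MTRR cookie
(illustrative only; not part of this commit):

	/* Map the aperture WC via PAT; arch_phys_wc_add() then only
	 * installs a real MTRR on non-PAT systems and is a no-op
	 * (returning a dummy handle) when PAT is enabled.
	 */
	phys_addr_t base = pci_resource_start(pdev, 0);
	resource_size_t size = pci_resource_len(pdev, 0);

	info->screen_base = ioremap_wc(base, size);
	info->wc_cookie   = arch_phys_wc_add(base, size);

	/* ... and on teardown: */
	arch_phys_wc_del(info->wc_cookie);
	iounmap(info->screen_base);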

+ 34 - 1
Documentation/x86/pat.txt

@@ -34,6 +34,8 @@ ioremap                |    --    |    UC-     |       UC-        |
                        |          |            |                  |
 ioremap_cache          |    --    |    WB      |       WB         |
                        |          |            |                  |
+ioremap_uc             |    --    |    UC      |       UC         |
+                       |          |            |                  |
 ioremap_nocache        |    --    |    UC-     |       UC-        |
                        |          |            |                  |
 ioremap_wc             |    --    |    --      |       WC         |
@@ -102,7 +104,38 @@ wants to export a RAM region, it has to do set_memory_uc() or set_memory_wc()
 as step 0 above and also track the usage of those pages and use set_memory_wb()
 before the page is freed to free pool.
 
-
+MTRR effects on PAT / non-PAT systems
+-------------------------------------
+
+The following table provides the effects of using write-combining MTRRs when
+using ioremap*() calls on x86 for both non-PAT and PAT systems. Ideally
+mtrr_add() usage will be phased out in favor of arch_phys_wc_add() which will
+be a no-op on PAT enabled systems. The region over which a arch_phys_wc_add()
+is made, should already have been ioremapped with WC attributes or PAT entries,
+this can be done by using ioremap_wc() / set_memory_wc().  Devices which
+combine areas of IO memory desired to remain uncacheable with areas where
+write-combining is desirable should consider use of ioremap_uc() followed by
+set_memory_wc() to white-list effective write-combined areas.  Such use is
+nevertheless discouraged as the effective memory type is considered
+implementation defined, yet this strategy can be used as last resort on devices
+with size-constrained regions where otherwise MTRR write-combining would
+otherwise not be effective.
+
+----------------------------------------------------------------------
+MTRR Non-PAT   PAT    Linux ioremap value        Effective memory type
+----------------------------------------------------------------------
+                                                  Non-PAT |  PAT
+     PAT
+     |PCD
+     ||PWT
+     |||
+WC   000      WB      _PAGE_CACHE_MODE_WB            WC   |   WC
+WC   001      WC      _PAGE_CACHE_MODE_WC            WC*  |   WC
+WC   010      UC-     _PAGE_CACHE_MODE_UC_MINUS      WC*  |   UC
+WC   011      UC      _PAGE_CACHE_MODE_UC            UC   |   UC
+----------------------------------------------------------------------
+
+(*) denotes implementation defined and is discouraged
 
 Notes:
 
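As a sketch of the last-resort ioremap_uc()/set_memory_wc() strategy described
above (the device layout and names are hypothetical, not from this commit):

	/* Map a size-constrained, MTRR-covered BAR UC so its register
	 * window stays uncacheable, then white-list only the
	 * framebuffer pages for effective write-combining.
	 */
	void __iomem *base = ioremap_uc(bar_phys, bar_len);

	if (base)
		set_memory_wc((unsigned long)base + FB_OFFSET,
			      FB_SIZE >> PAGE_SHIFT);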

+ 0 - 2
arch/ia64/include/asm/irq_remapping.h

@@ -1,6 +1,4 @@
 #ifndef __IA64_INTR_REMAPPING_H
 #define __IA64_INTR_REMAPPING_H
 #define irq_remapping_enabled 0
-#define dmar_alloc_hwirq	create_irq
-#define dmar_free_hwirq		destroy_irq
 #endif

+ 19 - 11
arch/ia64/kernel/msi_ia64.c

@@ -165,7 +165,7 @@ static struct irq_chip dmar_msi_type = {
 	.irq_retrigger = ia64_msi_retrigger_irq,
 };
 
-static int
+static void
 msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
@@ -186,21 +186,29 @@ msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 		MSI_DATA_LEVEL_ASSERT |
 		MSI_DATA_DELIVERY_FIXED |
 		MSI_DATA_VECTOR(cfg->vector);
-	return 0;
 }
 
-int arch_setup_dmar_msi(unsigned int irq)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	int ret;
+	int irq;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq = create_irq();
+	if (irq > 0) {
+		irq_set_handler_data(irq, arg);
+		irq_set_chip_and_handler_name(irq, &dmar_msi_type,
+					      handle_edge_irq, "edge");
+		msi_compose_msg(NULL, irq, &msg);
+		dmar_msi_write(irq, &msg);
+	}
+
+	return irq;
+}
+
+void dmar_free_hwirq(int irq)
+{
+	irq_set_handler_data(irq, NULL);
+	destroy_irq(irq);
 }
 #endif /* CONFIG_INTEL_IOMMU */
 
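For context, a DMAR fault-interrupt setup path would be expected to call the
new pair roughly as follows (sketch; the iommu fields are assumptions, not
shown in this diff):

	irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);
	if (irq <= 0)
		return -EINVAL;
	/* ... request_irq(irq, ...) ... */
	dmar_free_hwirq(irq);	/* on teardown */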

+ 8 - 7
arch/x86/Kconfig

@@ -100,7 +100,7 @@ config X86
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
-	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
+	select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
 	select ARCH_HAS_SG_CHAIN
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
@@ -341,7 +341,7 @@ config X86_FEATURE_NAMES
 
 config X86_X2APIC
 	bool "Support x2apic"
-	depends on X86_LOCAL_APIC && X86_64 && IRQ_REMAP
+	depends on X86_LOCAL_APIC && X86_64 && (IRQ_REMAP || HYPERVISOR_GUEST)
 	---help---
 	  This enables x2apic support on CPUs that have this feature.
 
@@ -441,6 +441,7 @@ config X86_UV
 	depends on X86_EXTENDED_PLATFORM
 	depends on NUMA
 	depends on X86_X2APIC
+	depends on PCI
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
 	  If you don't have one of these, you should say N here.
@@ -466,7 +467,6 @@ config X86_INTEL_CE
 	select X86_REBOOTFIXUPS
 	select OF
 	select OF_EARLY_FLATTREE
-	select IRQ_DOMAIN
 	---help---
 	  Select for the Intel CE media processor (CE4100) SOC.
 	  This option compiles in support for the CE4100 SOC for settop
@@ -851,11 +851,12 @@ config NR_CPUS
 	default "1" if !SMP
 	default "8192" if MAXSMP
 	default "32" if SMP && X86_BIGSMP
-	default "8" if SMP
+	default "8" if SMP && X86_32
+	default "64" if SMP
 	---help---
 	  This allows you to specify the maximum number of CPUs which this
 	  kernel will support.  If CPUMASK_OFFSTACK is enabled, the maximum
-	  supported value is 4096, otherwise the maximum value is 512.  The
+	  supported value is 8192, otherwise the maximum value is 512.  The
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
@@ -914,12 +915,12 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
-	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+	select IRQ_DOMAIN_HIERARCHY
+	select PCI_MSI_IRQ_DOMAIN if PCI_MSI
 
 config X86_IO_APIC
 	def_bool y
 	depends on X86_LOCAL_APIC || X86_UP_IOAPIC
-	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
 	bool "Reroute for broken boot IRQs"

+ 11 - 0
arch/x86/Kconfig.debug

@@ -332,4 +332,15 @@ config X86_DEBUG_STATIC_CPU_HAS
 
 	  If unsure, say N.
 
+config PUNIT_ATOM_DEBUG
+	tristate "ATOM Punit debug driver"
+	select DEBUG_FS
+	select IOSF_MBI
+	---help---
+	  This is a debug driver, which gets the power states
+	  of all Punit North Complex devices. The power states of
+	  each device is exposed as part of the debugfs interface.
+	  The current power state can be read from
+	  /sys/kernel/debug/punit_atom/dev_power_state
+
 endmenu

+ 9 - 0
arch/x86/Makefile

@@ -77,6 +77,12 @@ else
         KBUILD_AFLAGS += -m64
         KBUILD_CFLAGS += -m64
 
+        # Align jump targets to 1 byte, not the default 16 bytes:
+        KBUILD_CFLAGS += -falign-jumps=1
+
+        # Pack loops tightly as well:
+        KBUILD_CFLAGS += -falign-loops=1
+
         # Don't autogenerate traditional x87 instructions
         KBUILD_CFLAGS += $(call cc-option,-mno-80387)
         KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
@@ -84,6 +90,9 @@ else
 	# Use -mpreferred-stack-boundary=3 if supported.
 	KBUILD_CFLAGS += $(call cc-option,-mpreferred-stack-boundary=3)
 
+	# Use -mskip-rax-setup if supported.
+	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
+
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)

+ 16 - 36
arch/x86/ia32/ia32entry.S

@@ -77,12 +77,6 @@ ENTRY(native_usergs_sysret32)
 	swapgs
 	sysretl
 ENDPROC(native_usergs_sysret32)
-
-ENTRY(native_irq_enable_sysexit)
-	swapgs
-	sti
-	sysexit
-ENDPROC(native_irq_enable_sysexit)
 #endif
 
 /*
@@ -119,7 +113,7 @@ ENTRY(ia32_sysenter_target)
 	 * it is too small to ever cause noticeable irq latency.
 	 */
 	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
@@ -142,7 +136,7 @@ ENTRY(ia32_sysenter_target)
 	pushq_cfi_reg	rsi			/* pt_regs->si */
 	pushq_cfi_reg	rdx			/* pt_regs->dx */
 	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
 	cld
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
 	CFI_ADJUST_CFA_OFFSET 10*8
@@ -169,8 +163,6 @@ sysenter_flags_fixed:
 	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	ia32_badsys
 sysenter_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl	%edi,%r8d	/* arg5 */
@@ -179,8 +171,11 @@ sysenter_do_call:
 	movl	%ebx,%edi	/* arg1 */
 	movl	%edx,%edx	/* arg3 (zero extension) */
 sysenter_dispatch:
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX(%rsp)
+1:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -247,9 +242,7 @@ sysexit_from_sys_call:
 	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
 	movl %eax,%edi			/* 1st arg: syscall number */
 	call __audit_syscall_entry
-	movl RAX(%rsp),%eax	/* reload syscall number */
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
+	movl ORIG_RAX(%rsp),%eax	/* reload syscall number */
 	movl %ebx,%edi			/* reload 1st syscall arg */
 	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
 	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
@@ -300,13 +293,10 @@ sysenter_tracesys:
 #endif
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS
-	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
 	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
-	cmpq	$(IA32_NR_syscalls-1),%rax
-	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
 	CFI_ENDPROC
 ENDPROC(ia32_sysenter_target)
@@ -356,7 +346,7 @@ ENTRY(ia32_cstar_target)
 	SWAPGS_UNSAFE_STACK
 	movl	%esp,%r8d
 	CFI_REGISTER	rsp,r8
-	movq	PER_CPU_VAR(kernel_stack),%rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack),%rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
@@ -376,7 +366,7 @@ ENTRY(ia32_cstar_target)
 	pushq_cfi_reg	rdx			/* pt_regs->dx */
 	pushq_cfi_reg	rbp			/* pt_regs->cx */
 	movl	%ebp,%ecx
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
 	CFI_ADJUST_CFA_OFFSET 10*8
 
@@ -392,8 +382,6 @@ ENTRY(ia32_cstar_target)
 	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
-	cmpq $IA32_NR_syscalls-1,%rax
-	ja  ia32_badsys
 cstar_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl	%edi,%r8d	/* arg5 */
@@ -402,8 +390,11 @@ cstar_do_call:
 	movl	%ebx,%edi	/* arg1 */
 	movl	%edx,%edx	/* arg3 (zero extension) */
 cstar_dispatch:
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call *ia32_sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
+1:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
@@ -457,14 +448,11 @@ cstar_tracesys:
 	xchgl %r9d,%ebp
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS r9
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
 	jmp cstar_do_call
 END(ia32_cstar_target)
 				
@@ -523,7 +511,7 @@ ENTRY(ia32_syscall)
 	pushq_cfi_reg	rsi			/* pt_regs->si */
 	pushq_cfi_reg	rdx			/* pt_regs->dx */
 	pushq_cfi_reg	rcx			/* pt_regs->cx */
-	pushq_cfi_reg	rax			/* pt_regs->ax */
+	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
 	cld
 	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
 	CFI_ADJUST_CFA_OFFSET 10*8
@@ -531,8 +519,6 @@ ENTRY(ia32_syscall)
 	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja ia32_badsys
 ia32_do_call:
 	/* 32bit syscall -> 64bit C ABI argument conversion */
 	movl %edi,%r8d	/* arg5 */
@@ -540,9 +526,12 @@ ia32_do_call:
 	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
 	movl %ebx,%edi	/* arg1 */
 	movl %edx,%edx	/* arg3 (zero extension) */
+	cmpq	$(IA32_NR_syscalls-1),%rax
+	ja	1f
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
 	movq %rax,RAX(%rsp)
+1:
 ia32_ret_from_sys_call:
 	CLEAR_RREGS
 	jmp int_ret_from_sys_call
@@ -550,23 +539,14 @@ ia32_ret_from_sys_call:
 ia32_tracesys:
 	SAVE_EXTRA_REGS
 	CLEAR_RREGS
-	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
-	cmpq $(IA32_NR_syscalls-1),%rax
-	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
+	CFI_ENDPROC
 END(ia32_syscall)
 
-ia32_badsys:
-	movq $0,ORIG_RAX(%rsp)
-	movq $-ENOSYS,%rax
-	jmp ia32_sysret
-
-	CFI_ENDPROC
-	
 	.macro PTREGSCALL label, func
 	ALIGN
 GLOBAL(\label)

+ 18 - 0
arch/x86/include/asm/alternative-asm.h

@@ -18,6 +18,12 @@
 	.endm
 #endif
 
+/*
+ * Issue one struct alt_instr descriptor entry (need to put it into
+ * the section .altinstructions, see below). This entry contains
+ * enough information for the alternatives patching code to patch an
+ * instruction. See apply_alternatives().
+ */
 .macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
@@ -27,6 +33,12 @@
 	.byte \pad_len
 .endm
 
+/*
+ * Define an alternative between two instructions. If @feature is
+ * present, early code in apply_alternatives() replaces @oldinstr with
+ * @newinstr. ".skip" directive takes care of proper instruction padding
+ * in case @newinstr is longer than @oldinstr.
+ */
 .macro ALTERNATIVE oldinstr, newinstr, feature
 140:
 	\oldinstr
@@ -55,6 +67,12 @@
  */
 #define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
 
+
+/*
+ * Same as ALTERNATIVE macro above but for two alternatives. If CPU
+ * has @feature1, it replaces @oldinstr with @newinstr1. If CPU has
+ * @feature2, it replaces @oldinstr with @newinstr2.
+ */
 .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
 140:
 	\oldinstr
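
For context, a representative use of ALTERNATIVE_2 from the copy_user paths
touched elsewhere in this merge (quoted for illustration) selects the best
copy routine at patch time based on CPU features:

	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
		      "jmp copy_user_generic_string",		\
		      X86_FEATURE_REP_GOOD,			\
		      "jmp copy_user_enhanced_fast_string",	\
		      X86_FEATURE_ERMS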

+ 6 - 0
arch/x86/include/asm/apic.h

@@ -644,6 +644,12 @@ static inline void entering_ack_irq(void)
 	entering_irq();
 }
 
+static inline void ipi_entering_ack_irq(void)
+{
+	ack_APIC_irq();
+	irq_enter();
+}
+
 static inline void exiting_irq(void)
 {
 	irq_exit();

+ 25 - 0
arch/x86/include/asm/asm.h

@@ -63,6 +63,31 @@
 	_ASM_ALIGN ;						\
 	_ASM_PTR (entry);					\
 	.popsection
+
+.macro ALIGN_DESTINATION
+	/* check for bad alignment of destination */
+	movl %edi,%ecx
+	andl $7,%ecx
+	jz 102f				/* already aligned */
+	subl $8,%ecx
+	negl %ecx
+	subl %ecx,%edx
+100:	movb (%rsi),%al
+101:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 100b
+102:
+	.section .fixup,"ax"
+103:	addl %ecx,%edx			/* ecx is zerorest also */
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(100b,103b)
+	_ASM_EXTABLE(101b,103b)
+	.endm
+
 #else
 # define _ASM_EXTABLE(from,to)					\
 	" .pushsection \"__ex_table\",\"a\"\n"			\

+ 15 - 15
arch/x86/include/asm/atomic.h

@@ -22,7 +22,7 @@
  *
  * Atomically reads the value of @v.
  */
-static inline int atomic_read(const atomic_t *v)
+static __always_inline int atomic_read(const atomic_t *v)
 {
 	return ACCESS_ONCE((v)->counter);
 }
@@ -34,7 +34,7 @@ static inline int atomic_read(const atomic_t *v)
  *
  * Atomically sets the value of @v to @i.
  */
-static inline void atomic_set(atomic_t *v, int i)
+static __always_inline void atomic_set(atomic_t *v, int i)
 {
 	v->counter = i;
 }
@@ -46,7 +46,7 @@ static inline void atomic_set(atomic_t *v, int i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic_add(int i, atomic_t *v)
+static __always_inline void atomic_add(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "addl %1,%0"
 		     : "+m" (v->counter)
@@ -60,7 +60,7 @@ static inline void atomic_add(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v.
  */
-static inline void atomic_sub(int i, atomic_t *v)
+static __always_inline void atomic_sub(int i, atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "subl %1,%0"
 		     : "+m" (v->counter)
@@ -76,7 +76,7 @@ static inline void atomic_sub(int i, atomic_t *v)
  * true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_sub_and_test(int i, atomic_t *v)
+static __always_inline int atomic_sub_and_test(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e");
 }
@@ -87,7 +87,7 @@ static inline int atomic_sub_and_test(int i, atomic_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic_inc(atomic_t *v)
+static __always_inline void atomic_inc(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "incl %0"
 		     : "+m" (v->counter));
@@ -99,7 +99,7 @@ static inline void atomic_inc(atomic_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic_dec(atomic_t *v)
+static __always_inline void atomic_dec(atomic_t *v)
 {
 	asm volatile(LOCK_PREFIX "decl %0"
 		     : "+m" (v->counter));
@@ -113,7 +113,7 @@ static inline void atomic_dec(atomic_t *v)
  * returns true if the result is 0, or false for all other
  * cases.
  */
-static inline int atomic_dec_and_test(atomic_t *v)
+static __always_inline int atomic_dec_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e");
 }
@@ -126,7 +126,7 @@ static inline int atomic_dec_and_test(atomic_t *v)
  * and returns true if the result is zero, or false for all
  * other cases.
  */
-static inline int atomic_inc_and_test(atomic_t *v)
+static __always_inline int atomic_inc_and_test(atomic_t *v)
 {
 	GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e");
 }
@@ -140,7 +140,7 @@ static inline int atomic_inc_and_test(atomic_t *v)
  * if the result is negative, or false when
  * result is greater than or equal to zero.
  */
-static inline int atomic_add_negative(int i, atomic_t *v)
+static __always_inline int atomic_add_negative(int i, atomic_t *v)
 {
 	GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s");
 }
@@ -152,7 +152,7 @@ static inline int atomic_add_negative(int i, atomic_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
 */
-static inline int atomic_add_return(int i, atomic_t *v)
+static __always_inline int atomic_add_return(int i, atomic_t *v)
 {
 	return i + xadd(&v->counter, i);
 }
@@ -164,7 +164,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
  *
  * Atomically subtracts @i from @v and returns @v - @i
 */
-static inline int atomic_sub_return(int i, atomic_t *v)
+static __always_inline int atomic_sub_return(int i, atomic_t *v)
 {
 	return atomic_add_return(-i, v);
 }
@@ -172,7 +172,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 #define atomic_inc_return(v)  (atomic_add_return(1, v))
 #define atomic_dec_return(v)  (atomic_sub_return(1, v))
 
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	return cmpxchg(&v->counter, old, new);
 }
@@ -191,7 +191,7 @@ static inline int atomic_xchg(atomic_t *v, int new)
 * Atomically adds @a to @v, so long as @v was not already @u.
 * Returns the old value of @v.
 */
-static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+static __always_inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
 	c = atomic_read(v);
@@ -213,7 +213,7 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 * Atomically adds 1 to @v
 * Returns the new value of @u
 */
-static inline short int atomic_inc_short(short int *v)
+static __always_inline short int atomic_inc_short(short int *v)
 {
 	asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
 	return *v;

+ 4 - 4
arch/x86/include/asm/atomic64_64.h

@@ -40,7 +40,7 @@ static inline void atomic64_set(atomic64_t *v, long i)
  *
  * Atomically adds @i to @v.
  */
-static inline void atomic64_add(long i, atomic64_t *v)
+static __always_inline void atomic64_add(long i, atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "addq %1,%0"
 		     : "=m" (v->counter)
@@ -81,7 +81,7 @@ static inline int atomic64_sub_and_test(long i, atomic64_t *v)
  *
  * Atomically increments @v by 1.
  */
-static inline void atomic64_inc(atomic64_t *v)
+static __always_inline void atomic64_inc(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "incq %0"
 		     : "=m" (v->counter)
@@ -94,7 +94,7 @@ static inline void atomic64_inc(atomic64_t *v)
  *
  * Atomically decrements @v by 1.
  */
-static inline void atomic64_dec(atomic64_t *v)
+static __always_inline void atomic64_dec(atomic64_t *v)
 {
 	asm volatile(LOCK_PREFIX "decq %0"
 		     : "=m" (v->counter)
@@ -148,7 +148,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v)
  *
  * Atomically adds @i to @v and returns @i + @v
 */
-static inline long atomic64_add_return(long i, atomic64_t *v)
+static __always_inline long atomic64_add_return(long i, atomic64_t *v)
 {
 	return i + xadd(&v->counter, i);
 }

+ 2 - 0
arch/x86/include/asm/entry_arch.h

@@ -23,6 +23,8 @@ BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
 #ifdef CONFIG_HAVE_KVM
 BUILD_INTERRUPT3(kvm_posted_intr_ipi, POSTED_INTR_VECTOR,
 		 smp_kvm_posted_intr_ipi)
+BUILD_INTERRUPT3(kvm_posted_intr_wakeup_ipi, POSTED_INTR_WAKEUP_VECTOR,
+		 smp_kvm_posted_intr_wakeup_ipi)
 #endif
 
 /*

+ 1 - 0
arch/x86/include/asm/hardirq.h

@@ -14,6 +14,7 @@ typedef struct {
 #endif
 #ifdef CONFIG_HAVE_KVM
 	unsigned int kvm_posted_intr_ipis;
+	unsigned int kvm_posted_intr_wakeup_ipis;
 #endif
 	unsigned int x86_platform_ipis;	/* arch dependent */
 	unsigned int apic_perf_irqs;

+ 6 - 10
arch/x86/include/asm/hpet.h

@@ -74,20 +74,16 @@ extern unsigned int hpet_readl(unsigned int a);
 extern void force_hpet_resume(void);
 
 struct irq_data;
+struct hpet_dev;
+struct irq_domain;
+
 extern void hpet_msi_unmask(struct irq_data *data);
 extern void hpet_msi_mask(struct irq_data *data);
-struct hpet_dev;
 extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
 extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
-
-#ifdef CONFIG_PCI_MSI
-extern int default_setup_hpet_msi(unsigned int irq, unsigned int id);
-#else
-static inline int default_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
-	return -EINVAL;
-}
-#endif
+extern struct irq_domain *hpet_create_irq_domain(int hpet_id);
+extern int hpet_assign_irq(struct irq_domain *domain,
+			   struct hpet_dev *dev, int dev_num);
 
 #ifdef CONFIG_HPET_EMULATE_RTC
 
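Sketch of how a caller might use the two new entry points (the locals `hdev`
and `hpet_blockid` are assumed here for illustration):

	struct irq_domain *domain = hpet_create_irq_domain(hpet_blockid);
	int irq;

	if (!domain)
		return -ENODEV;

	irq = hpet_assign_irq(domain, hdev, hdev->num);
	if (irq <= 0)
		return -EINVAL;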

+ 66 - 72
arch/x86/include/asm/hw_irq.h

@@ -29,6 +29,7 @@
 extern asmlinkage void apic_timer_interrupt(void);
 extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
+extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
 extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 
@@ -36,40 +37,6 @@ extern asmlinkage void spurious_interrupt(void);
 extern asmlinkage void thermal_interrupt(void);
 extern asmlinkage void reschedule_interrupt(void);
 
-extern asmlinkage void invalidate_interrupt(void);
-extern asmlinkage void invalidate_interrupt0(void);
-extern asmlinkage void invalidate_interrupt1(void);
-extern asmlinkage void invalidate_interrupt2(void);
-extern asmlinkage void invalidate_interrupt3(void);
-extern asmlinkage void invalidate_interrupt4(void);
-extern asmlinkage void invalidate_interrupt5(void);
-extern asmlinkage void invalidate_interrupt6(void);
-extern asmlinkage void invalidate_interrupt7(void);
-extern asmlinkage void invalidate_interrupt8(void);
-extern asmlinkage void invalidate_interrupt9(void);
-extern asmlinkage void invalidate_interrupt10(void);
-extern asmlinkage void invalidate_interrupt11(void);
-extern asmlinkage void invalidate_interrupt12(void);
-extern asmlinkage void invalidate_interrupt13(void);
-extern asmlinkage void invalidate_interrupt14(void);
-extern asmlinkage void invalidate_interrupt15(void);
-extern asmlinkage void invalidate_interrupt16(void);
-extern asmlinkage void invalidate_interrupt17(void);
-extern asmlinkage void invalidate_interrupt18(void);
-extern asmlinkage void invalidate_interrupt19(void);
-extern asmlinkage void invalidate_interrupt20(void);
-extern asmlinkage void invalidate_interrupt21(void);
-extern asmlinkage void invalidate_interrupt22(void);
-extern asmlinkage void invalidate_interrupt23(void);
-extern asmlinkage void invalidate_interrupt24(void);
-extern asmlinkage void invalidate_interrupt25(void);
-extern asmlinkage void invalidate_interrupt26(void);
-extern asmlinkage void invalidate_interrupt27(void);
-extern asmlinkage void invalidate_interrupt28(void);
-extern asmlinkage void invalidate_interrupt29(void);
-extern asmlinkage void invalidate_interrupt30(void);
-extern asmlinkage void invalidate_interrupt31(void);
-
 extern asmlinkage void irq_move_cleanup_interrupt(void);
 extern asmlinkage void reboot_interrupt(void);
 extern asmlinkage void threshold_interrupt(void);
@@ -92,55 +59,87 @@ extern void trace_call_function_single_interrupt(void);
 #define trace_irq_move_cleanup_interrupt  irq_move_cleanup_interrupt
 #define trace_reboot_interrupt  reboot_interrupt
 #define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
+#define trace_kvm_posted_intr_wakeup_ipi kvm_posted_intr_wakeup_ipi
 #endif /* CONFIG_TRACING */
 
-#ifdef CONFIG_IRQ_REMAP
-/* Intel specific interrupt remapping information */
-struct irq_2_iommu {
-	struct intel_iommu *iommu;
-	u16 irte_index;
-	u16 sub_handle;
-	u8  irte_mask;
-};
-
-/* AMD specific interrupt remapping information */
-struct irq_2_irte {
-	u16 devid; /* Device ID for IRTE table */
-	u16 index; /* Index into IRTE table*/
-};
-#endif	/* CONFIG_IRQ_REMAP */
-
 #ifdef	CONFIG_X86_LOCAL_APIC
 struct irq_data;
+struct pci_dev;
+struct msi_desc;
+
+enum irq_alloc_type {
+	X86_IRQ_ALLOC_TYPE_IOAPIC = 1,
+	X86_IRQ_ALLOC_TYPE_HPET,
+	X86_IRQ_ALLOC_TYPE_MSI,
+	X86_IRQ_ALLOC_TYPE_MSIX,
+	X86_IRQ_ALLOC_TYPE_DMAR,
+	X86_IRQ_ALLOC_TYPE_UV,
+};
 
-struct irq_cfg {
-	cpumask_var_t		domain;
-	cpumask_var_t		old_domain;
-	u8			vector;
-	u8			move_in_progress : 1;
-#ifdef CONFIG_IRQ_REMAP
-	u8			remapped : 1;
+struct irq_alloc_info {
+	enum irq_alloc_type	type;
+	u32			flags;
+	const struct cpumask	*mask;	/* CPU mask for vector allocation */
 	union {
-		struct irq_2_iommu irq_2_iommu;
-		struct irq_2_irte  irq_2_irte;
-	};
+		int		unused;
+#ifdef	CONFIG_HPET_TIMER
+		struct {
+			int		hpet_id;
+			int		hpet_index;
+			void		*hpet_data;
+		};
 #endif
-	union {
-#ifdef CONFIG_X86_IO_APIC
+#ifdef	CONFIG_PCI_MSI
 		struct {
-			struct list_head	irq_2_pin;
+			struct pci_dev	*msi_dev;
+			irq_hw_number_t	msi_hwirq;
+		};
+#endif
+#ifdef	CONFIG_X86_IO_APIC
+		struct {
+			int		ioapic_id;
+			int		ioapic_pin;
+			int		ioapic_node;
+			u32		ioapic_trigger : 1;
+			u32		ioapic_polarity : 1;
+			u32		ioapic_valid : 1;
+			struct IO_APIC_route_entry *ioapic_entry;
+		};
+#endif
+#ifdef	CONFIG_DMAR_TABLE
+		struct {
+			int		dmar_id;
+			void		*dmar_data;
+		};
+#endif
+#ifdef	CONFIG_HT_IRQ
+		struct {
+			int		ht_pos;
+			int		ht_idx;
+			struct pci_dev	*ht_dev;
+			void		*ht_update;
+		};
+#endif
+#ifdef	CONFIG_X86_UV
+		struct {
+			int		uv_limit;
+			int		uv_blade;
+			unsigned long	uv_offset;
+			char		*uv_name;
 		};
 #endif
 	};
 };
 
+struct irq_cfg {
+	unsigned int		dest_apicid;
+	u8			vector;
+};
+
 extern struct irq_cfg *irq_cfg(unsigned int irq);
 extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
-extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
 extern void lock_vector_lock(void);
 extern void unlock_vector_lock(void);
-extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
-extern void clear_irq_vector(int irq, struct irq_cfg *cfg);
 extern void setup_vector_irq(int cpu);
 #ifdef CONFIG_SMP
 extern void send_cleanup_vector(struct irq_cfg *);
@@ -150,10 +149,7 @@ static inline void send_cleanup_vector(struct irq_cfg *c) { }
 static inline void irq_complete_move(struct irq_cfg *c) { }
 #endif
 
-extern int apic_retrigger_irq(struct irq_data *data);
 extern void apic_ack_edge(struct irq_data *data);
-extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-			     unsigned int *dest_id);
 #else	/*  CONFIG_X86_LOCAL_APIC */
 static inline void lock_vector_lock(void) {}
 static inline void unlock_vector_lock(void) {}
@@ -163,8 +159,7 @@ static inline void unlock_vector_lock(void) {}
 extern atomic_t irq_err_count;
 extern atomic_t irq_mis_count;
 
-/* EISA */
-extern void eisa_set_level_irq(unsigned int irq);
+extern void elcr_set_level_irq(unsigned int irq);
 
 /* SMP */
 extern __visible void smp_apic_timer_interrupt(struct pt_regs *);
@@ -178,7 +173,6 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
 extern __visible void smp_reschedule_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_interrupt(struct pt_regs *);
 extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
-extern __visible void smp_invalidate_interrupt(struct pt_regs *);
 #endif
 
 extern char irq_entries_start[];

+ 4 - 0
arch/x86/include/asm/io.h

@@ -177,6 +177,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
  * look at pci_iomap().
  */
 extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 				unsigned long prot_val);
@@ -338,6 +339,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 #define IO_SPACE_LIMIT 0xffff
 
 #ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
 extern int __must_check arch_phys_wc_add(unsigned long base,
 					 unsigned long size);
 extern void arch_phys_wc_del(int handle);

+ 32 - 82
arch/x86/include/asm/io_apic.h

@@ -95,9 +95,22 @@ struct IR_IO_APIC_route_entry {
 		index		: 15;
 		index		: 15;
 } __attribute__ ((packed));
 } __attribute__ ((packed));
 
 
-#define IOAPIC_AUTO     -1
-#define IOAPIC_EDGE     0
-#define IOAPIC_LEVEL    1
+struct irq_alloc_info;
+struct ioapic_domain_cfg;
+
+#define IOAPIC_AUTO			-1
+#define IOAPIC_EDGE			0
+#define IOAPIC_LEVEL			1
+
+#define IOAPIC_MASKED			1
+#define IOAPIC_UNMASKED			0
+
+#define IOAPIC_POL_HIGH			0
+#define IOAPIC_POL_LOW			1
+
+#define IOAPIC_DEST_MODE_PHYSICAL	0
+#define IOAPIC_DEST_MODE_LOGICAL	1
+
 #define	IOAPIC_MAP_ALLOC		0x1
 #define	IOAPIC_MAP_ALLOC		0x1
 #define	IOAPIC_MAP_CHECK		0x2
 #define	IOAPIC_MAP_CHECK		0x2
 
 
@@ -110,9 +123,6 @@ extern int nr_ioapics;
 
 
 extern int mpc_ioapic_id(int ioapic);
 extern int mpc_ioapic_id(int ioapic);
 extern unsigned int mpc_ioapic_addr(int ioapic);
 extern unsigned int mpc_ioapic_addr(int ioapic);
-extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
-
-#define MP_MAX_IOAPIC_PIN 127
 
 
 /* # of MP IRQ source entries */
 /* # of MP IRQ source entries */
 extern int mp_irq_entries;
 extern int mp_irq_entries;
@@ -120,9 +130,6 @@ extern int mp_irq_entries;
 /* MP IRQ source entries */
 /* MP IRQ source entries */
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 extern struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 
 
-/* Older SiS APIC requires we rewrite the index register */
-extern int sis_apic_bug;
-
 /* 1 if "noapic" boot option passed */
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
 extern int skip_ioapic_setup;
 
 
@@ -132,6 +139,8 @@ extern int noioapicquirk;
 /* -1 if "noapic" boot option passed */
 /* -1 if "noapic" boot option passed */
 extern int noioapicreroute;
 extern int noioapicreroute;
 
 
+extern u32 gsi_top;
+
 extern unsigned long io_apic_irqs;
 extern unsigned long io_apic_irqs;
 
 
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
 #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
@@ -147,13 +156,6 @@ struct irq_cfg;
 extern void ioapic_insert_resources(void);
 extern void ioapic_insert_resources(void);
 extern int arch_early_ioapic_init(void);
 extern int arch_early_ioapic_init(void);
 
 
-extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
-				     unsigned int, int,
-				     struct io_apic_irq_attr *);
-extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
-
-extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
-
 extern int save_ioapic_entries(void);
 extern int save_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
 extern void mask_ioapic_entries(void);
 extern int restore_ioapic_entries(void);
 extern int restore_ioapic_entries(void);
@@ -161,82 +163,32 @@ extern int restore_ioapic_entries(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
 
-struct io_apic_irq_attr {
-	int ioapic;
-	int ioapic_pin;
-	int trigger;
-	int polarity;
-};
-
-enum ioapic_domain_type {
-	IOAPIC_DOMAIN_INVALID,
-	IOAPIC_DOMAIN_LEGACY,
-	IOAPIC_DOMAIN_STRICT,
-	IOAPIC_DOMAIN_DYNAMIC,
-};
-
-struct device_node;
-struct irq_domain;
-struct irq_domain_ops;
-
-struct ioapic_domain_cfg {
-	enum ioapic_domain_type		type;
-	const struct irq_domain_ops	*ops;
-	struct device_node		*dev;
-};
-
-struct mp_ioapic_gsi{
-	u32 gsi_base;
-	u32 gsi_end;
-};
-extern u32 gsi_top;
-
 extern int mp_find_ioapic(u32 gsi);
 extern int mp_find_ioapic(u32 gsi);
 extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
 extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
-extern u32 mp_pin_to_gsi(int ioapic, int pin);
-extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
+extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+			     struct irq_alloc_info *info);
 extern void mp_unmap_irq(int irq);
 extern void mp_unmap_irq(int irq);
 extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
 			      struct ioapic_domain_cfg *cfg);
 			      struct ioapic_domain_cfg *cfg);
 extern int mp_unregister_ioapic(u32 gsi_base);
 extern int mp_unregister_ioapic(u32 gsi_base);
 extern int mp_ioapic_registered(u32 gsi_base);
 extern int mp_ioapic_registered(u32 gsi_base);
-extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-			    irq_hw_number_t hwirq);
-extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
-extern int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node);
-extern void __init pre_init_apic_IRQ0(void);
+
+extern void ioapic_set_alloc_attr(struct irq_alloc_info *info,
+				  int node, int trigger, int polarity);
 
 
 extern void mp_save_irq(struct mpc_intsrc *m);
 extern void mp_save_irq(struct mpc_intsrc *m);
 
 
 extern void disable_ioapic_support(void);
 extern void disable_ioapic_support(void);
 
 
-extern void __init native_io_apic_init_mappings(void);
+extern void __init io_apic_init_mappings(void);
 extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
 extern unsigned int native_io_apic_read(unsigned int apic, unsigned int reg);
-extern void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int val);
-extern void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val);
 extern void native_disable_io_apic(void);
 extern void native_disable_io_apic(void);
-extern void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern void intel_ir_io_apic_print_entries(unsigned int apic, unsigned int nr_entries);
-extern int native_ioapic_set_affinity(struct irq_data *,
-				      const struct cpumask *,
-				      bool);
 
 
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
 {
 {
 	return x86_io_apic_ops.read(apic, reg);
 	return x86_io_apic_ops.read(apic, reg);
 }
 }
 
 
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.write(apic, reg, value);
-}
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	x86_io_apic_ops.modify(apic, reg, value);
-}
-
-extern void io_apic_eoi(unsigned int apic, unsigned int vector);
-
 extern void setup_IO_APIC(void);
 extern void setup_IO_APIC(void);
 extern void enable_IO_APIC(void);
 extern void enable_IO_APIC(void);
 extern void disable_IO_APIC(void);
 extern void disable_IO_APIC(void);
@@ -253,8 +205,12 @@ static inline int arch_early_ioapic_init(void) { return 0; }
 static inline void print_IO_APICs(void) {}
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
-static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
-static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
+static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags,
+				    struct irq_alloc_info *info)
+{
+	return gsi;
+}
+
 static inline void mp_unmap_irq(int irq) { }
 
 static inline int save_ioapic_entries(void)
@@ -268,17 +224,11 @@ static inline int restore_ioapic_entries(void)
 	return -ENOMEM;
 }
 
-static inline void mp_save_irq(struct mpc_intsrc *m) { };
+static inline void mp_save_irq(struct mpc_intsrc *m) { }
 static inline void disable_ioapic_support(void) { }
-#define native_io_apic_init_mappings	NULL
+static inline void io_apic_init_mappings(void) { }
 #define native_io_apic_read		NULL
-#define native_io_apic_write		NULL
-#define native_io_apic_modify		NULL
 #define native_disable_io_apic		NULL
-#define native_io_apic_print_entries	NULL
-#define native_ioapic_set_affinity	NULL
-#define native_setup_ioapic_entry	NULL
-#define native_eoi_ioapic_pin		NULL
 
 static inline void setup_IO_APIC(void) { }
 static inline void enable_IO_APIC(void) { }
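The reworked mapping API above takes a caller-supplied struct irq_alloc_info in place of the removed global per-pin attribute table (mp_set_gsi_attr). A minimal sketch of a caller, assuming a level-triggered, active-low pin; the function name is hypothetical, but the call pattern matches acpi_register_gsi_ioapic() later in this commit:

/* Hypothetical caller of the reworked GSI mapping API (sketch). */
static int example_map_gsi(u32 gsi, int node)
{
	struct irq_alloc_info info;

	/* trigger = 1 (level), polarity = 1 (active low), as for PCI */
	ioapic_set_alloc_attr(&info, node, 1, 1);

	/* Allocates a Linux irq for the GSI if one is not mapped yet. */
	return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
}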

+ 4 - 0
arch/x86/include/asm/irq.h

@@ -30,6 +30,10 @@ extern void fixup_irqs(void);
 extern void irq_force_complete_move(int);
 #endif
 
+#ifdef CONFIG_HAVE_KVM
+extern void kvm_set_posted_intr_wakeup_handler(void (*handler)(void));
+#endif
+
 extern void (*x86_platform_ipi_callback)(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
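The new hook lets KVM install a callback for the posted-interrupt wakeup vector added to irq_vectors.h below. A hedged sketch of the registration side (the handler body is a placeholder; clearing with NULL is an assumption about the intended usage):

#ifdef CONFIG_HAVE_KVM
/* Hypothetical consumer of the new registration hook (sketch). */
static void example_pi_wakeup_handler(void)
{
	/* wake up vCPUs blocked with pending posted interrupts */
}

static void example_register(void)
{
	kvm_set_posted_intr_wakeup_handler(example_pi_wakeup_handler);
}

static void example_unregister(void)
{
	kvm_set_posted_intr_wakeup_handler(NULL);
}
#endif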

+ 23 - 46
arch/x86/include/asm/irq_remapping.h

@@ -22,14 +22,12 @@
 #ifndef __X86_IRQ_REMAPPING_H
 #define __X86_IRQ_REMAPPING_H
 
+#include <asm/irqdomain.h>
+#include <asm/hw_irq.h>
 #include <asm/io_apic.h>
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_chip;
 struct msi_msg;
-struct pci_dev;
-struct irq_cfg;
+struct irq_alloc_info;
 
 #ifdef CONFIG_IRQ_REMAP
 
@@ -39,22 +37,21 @@ extern int irq_remapping_enable(void);
 extern void irq_remapping_disable(void);
 extern int irq_remapping_reenable(int);
 extern int irq_remap_enable_fault_handling(void);
-extern int setup_ioapic_remapped_entry(int irq,
-				       struct IO_APIC_route_entry *entry,
-				       unsigned int destination,
-				       int vector,
-				       struct io_apic_irq_attr *attr);
-extern void free_remapped_irq(int irq);
-extern void compose_remapped_msi_msg(struct pci_dev *pdev,
-				     unsigned int irq, unsigned int dest,
-				     struct msi_msg *msg, u8 hpet_id);
-extern int setup_hpet_msi_remapped(unsigned int irq, unsigned int id);
 extern void panic_if_irq_remap(const char *msg);
-extern bool setup_remapped_irq(int irq,
-			       struct irq_cfg *cfg,
-			       struct irq_chip *chip);
 
-void irq_remap_modify_chip_defaults(struct irq_chip *chip);
+extern struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info);
+extern struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info);
+
+/* Create PCI MSI/MSIx irqdomain, use @parent as the parent irqdomain. */
+extern struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent);
+
+/* Get parent irqdomain for interrupt remapping irqdomain */
+static inline struct irq_domain *arch_get_ir_parent_domain(void)
+{
+	return x86_vector_domain;
+}
 
 #else  /* CONFIG_IRQ_REMAP */
 
@@ -64,42 +61,22 @@ static inline int irq_remapping_enable(void) { return -ENODEV; }
 static inline void irq_remapping_disable(void) { }
 static inline int irq_remapping_reenable(int eim) { return -ENODEV; }
 static inline int irq_remap_enable_fault_handling(void) { return -ENODEV; }
-static inline int setup_ioapic_remapped_entry(int irq,
-					      struct IO_APIC_route_entry *entry,
-					      unsigned int destination,
-					      int vector,
-					      struct io_apic_irq_attr *attr)
-{
-	return -ENODEV;
-}
-static inline void free_remapped_irq(int irq) { }
-static inline void compose_remapped_msi_msg(struct pci_dev *pdev,
-					    unsigned int irq, unsigned int dest,
-					    struct msi_msg *msg, u8 hpet_id)
-{
-}
-static inline int setup_hpet_msi_remapped(unsigned int irq, unsigned int id)
-{
-	return -ENODEV;
-}
 
 static inline void panic_if_irq_remap(const char *msg)
 {
 }
 
-static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_ir_irq_domain(struct irq_alloc_info *info)
 {
+	return NULL;
 }
 
-static inline bool setup_remapped_irq(int irq,
-				      struct irq_cfg *cfg,
-				      struct irq_chip *chip)
+static inline struct irq_domain *
+irq_remapping_get_irq_domain(struct irq_alloc_info *info)
 {
-	return false;
+	return NULL;
 }
-#endif /* CONFIG_IRQ_REMAP */
-
-#define dmar_alloc_hwirq()	irq_alloc_hwirq(-1)
-#define dmar_free_hwirq		irq_free_hwirq
 
 
+#endif /* CONFIG_IRQ_REMAP */
 #endif /* __X86_IRQ_REMAPPING_H */
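With the per-irq setup/teardown entry points gone, the remapping layer is now consulted only for irqdomain lookup. A sketch (an assumption about the intended call pattern, not code from this commit) of how an allocation path might pick its parent domain:

/* Hypothetical parent-domain selection helper (sketch). */
static struct irq_domain *example_pick_parent(struct irq_alloc_info *info)
{
	struct irq_domain *parent;

	/* Prefer a remapped domain when interrupt remapping is active... */
	parent = irq_remapping_get_ir_irq_domain(info);
	if (!parent)
		/* ...otherwise fall back to the plain vector domain. */
		parent = x86_vector_domain;

	return parent;
}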

+ 13 - 27
arch/x86/include/asm/irq_vectors.h

@@ -47,31 +47,12 @@
 #define IRQ_MOVE_CLEANUP_VECTOR		FIRST_EXTERNAL_VECTOR
 
 #define IA32_SYSCALL_VECTOR		0x80
-#ifdef CONFIG_X86_32
-# define SYSCALL_VECTOR			0x80
-#endif
 
 /*
  * Vectors 0x30-0x3f are used for ISA interrupts.
  *   round up to the next 16-vector boundary
  */
-#define IRQ0_VECTOR			((FIRST_EXTERNAL_VECTOR + 16) & ~15)
-
-#define IRQ1_VECTOR			(IRQ0_VECTOR +  1)
-#define IRQ2_VECTOR			(IRQ0_VECTOR +  2)
-#define IRQ3_VECTOR			(IRQ0_VECTOR +  3)
-#define IRQ4_VECTOR			(IRQ0_VECTOR +  4)
-#define IRQ5_VECTOR			(IRQ0_VECTOR +  5)
-#define IRQ6_VECTOR			(IRQ0_VECTOR +  6)
-#define IRQ7_VECTOR			(IRQ0_VECTOR +  7)
-#define IRQ8_VECTOR			(IRQ0_VECTOR +  8)
-#define IRQ9_VECTOR			(IRQ0_VECTOR +  9)
-#define IRQ10_VECTOR			(IRQ0_VECTOR + 10)
-#define IRQ11_VECTOR			(IRQ0_VECTOR + 11)
-#define IRQ12_VECTOR			(IRQ0_VECTOR + 12)
-#define IRQ13_VECTOR			(IRQ0_VECTOR + 13)
-#define IRQ14_VECTOR			(IRQ0_VECTOR + 14)
-#define IRQ15_VECTOR			(IRQ0_VECTOR + 15)
+#define ISA_IRQ_VECTOR(irq)		(((FIRST_EXTERNAL_VECTOR + 16) & ~15) + irq)
 
 /*
  * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
@@ -105,6 +86,7 @@
 /* Vector for KVM to deliver posted interrupt IPI */
 #ifdef CONFIG_HAVE_KVM
 #define POSTED_INTR_VECTOR		0xf2
+#define POSTED_INTR_WAKEUP_VECTOR	0xf1
 #endif
 
 /*
@@ -155,18 +137,22 @@ static inline int invalid_vm86_irq(int irq)
  * static arrays.
  */
 
-#define NR_IRQS_LEGACY			  16
+#define NR_IRQS_LEGACY			16
 
-#define IO_APIC_VECTOR_LIMIT		( 32 * MAX_IO_APICS )
+#define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
+#define IO_APIC_VECTOR_LIMIT		(32 * MAX_IO_APICS)
 
-#ifdef CONFIG_X86_IO_APIC
-# define CPU_VECTOR_LIMIT		(64 * NR_CPUS)
-# define NR_IRQS					\
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_PCI_MSI)
+#define NR_IRQS						\
 	(CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ?	\
 		(NR_VECTORS + CPU_VECTOR_LIMIT)  :	\
 		(NR_VECTORS + IO_APIC_VECTOR_LIMIT))
-#else /* !CONFIG_X86_IO_APIC: */
-# define NR_IRQS			NR_IRQS_LEGACY
+#elif defined(CONFIG_X86_IO_APIC)
+#define	NR_IRQS				(NR_VECTORS + IO_APIC_VECTOR_LIMIT)
+#elif defined(CONFIG_PCI_MSI)
+#define NR_IRQS				(NR_VECTORS + CPU_VECTOR_LIMIT)
+#else
+#define NR_IRQS				NR_IRQS_LEGACY
 #endif
 
 #endif /* _ASM_X86_IRQ_VECTORS_H */
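Worked example of the new macro, assuming FIRST_EXTERNAL_VECTOR is 0x20: (0x20 + 16) & ~15 rounds up to 0x30, so ISA_IRQ_VECTOR(0) is 0x30 and ISA_IRQ_VECTOR(15) is 0x3f, reproducing the sixteen IRQ0_VECTOR..IRQ15_VECTOR constants that this hunk deletes. A compile-time check of that assumption, as a sketch:

/* Sketch: verify the ISA vector block under the 0x20 assumption. */
static inline void example_isa_vector_check(void)
{
	BUILD_BUG_ON(ISA_IRQ_VECTOR(0)  != 0x30);
	BUILD_BUG_ON(ISA_IRQ_VECTOR(15) != 0x3f);
}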

+ 63 - 0
arch/x86/include/asm/irqdomain.h

@@ -0,0 +1,63 @@
+#ifndef _ASM_IRQDOMAIN_H
+#define _ASM_IRQDOMAIN_H
+
+#include <linux/irqdomain.h>
+#include <asm/hw_irq.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+enum {
+	/* Allocate contiguous CPU vectors */
+	X86_IRQ_ALLOC_CONTIGUOUS_VECTORS		= 0x1,
+};
+
+extern struct irq_domain *x86_vector_domain;
+
+extern void init_irq_alloc_info(struct irq_alloc_info *info,
+				const struct cpumask *mask);
+extern void copy_irq_alloc_info(struct irq_alloc_info *dst,
+				struct irq_alloc_info *src);
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#ifdef CONFIG_X86_IO_APIC
+struct device_node;
+struct irq_data;
+
+enum ioapic_domain_type {
+	IOAPIC_DOMAIN_INVALID,
+	IOAPIC_DOMAIN_LEGACY,
+	IOAPIC_DOMAIN_STRICT,
+	IOAPIC_DOMAIN_DYNAMIC,
+};
+
+struct ioapic_domain_cfg {
+	enum ioapic_domain_type		type;
+	const struct irq_domain_ops	*ops;
+	struct device_node		*dev;
+};
+
+extern const struct irq_domain_ops mp_ioapic_irqdomain_ops;
+
+extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg);
+extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs);
+extern void mp_irqdomain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data);
+extern void mp_irqdomain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data);
+extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_PCI_MSI
+extern void arch_init_msi_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_msi_domain(struct irq_domain *domain) { }
+#endif
+
+#ifdef CONFIG_HT_IRQ
+extern void arch_init_htirq_domain(struct irq_domain *domain);
+#else
+static inline void arch_init_htirq_domain(struct irq_domain *domain) { }
+#endif
+
+#endif
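The shared mp_irqdomain_* callbacks and mp_ioapic_irqdomain_ops declared in this new header replace the per-subsystem map/unmap callbacks. A sketch of how a platform wires an IOAPIC to them; this mirrors the ACPI hunk later in this commit, and the helper name is hypothetical:

/* Hypothetical registration helper using the shared domain ops (sketch). */
static int example_register_ioapic(int id, u32 address, u32 gsi_base)
{
	struct ioapic_domain_cfg cfg = {
		.type = IOAPIC_DOMAIN_DYNAMIC,
		.ops  = &mp_ioapic_irqdomain_ops,
	};

	return mp_register_ioapic(id, address, gsi_base, &cfg);
}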

+ 7 - 0
arch/x86/include/asm/msi.h

@@ -0,0 +1,7 @@
+#ifndef _ASM_X86_MSI_H
+#define _ASM_X86_MSI_H
+#include <asm/hw_irq.h>
+
+typedef struct irq_alloc_info msi_alloc_info_t;
+
+#endif /* _ASM_X86_MSI_H */

+ 7 - 8
arch/x86/include/asm/mtrr.h

@@ -31,7 +31,7 @@
  * arch_phys_wc_add and arch_phys_wc_del.
  */
 # ifdef CONFIG_MTRR
-extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
 extern void mtrr_save_fixed_ranges(void *);
 extern void mtrr_save_state(void);
 extern int mtrr_add(unsigned long base, unsigned long size,
@@ -48,14 +48,13 @@ extern void mtrr_aps_init(void);
 extern void mtrr_bp_restore(void);
 extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
 extern int amd_special_default_mtrr(void);
-extern int phys_wc_to_mtrr_index(int handle);
 #  else
-static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
 {
 	/*
 	 * Return no-MTRRs:
 	 */
-	return 0xff;
+	return MTRR_TYPE_INVALID;
 }
 #define mtrr_save_fixed_ranges(arg) do {} while (0)
 #define mtrr_save_state() do {} while (0)
@@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
 static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
 {
 }
-static inline int phys_wc_to_mtrr_index(int handle)
-{
-	return -1;
-}
 
 #define mtrr_ap_init() do {} while (0)
 #define mtrr_bp_init() do {} while (0)
@@ -127,4 +122,8 @@ struct mtrr_gentry32 {
 				 _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry32)
 #endif /* CONFIG_COMPAT */
 
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_MTRR_FIXED_ENABLED	0x01
+#define MTRR_STATE_MTRR_ENABLED		0x02
+
 #endif /* _ASM_X86_MTRR_H */
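The new out-parameter of mtrr_type_lookup() reports whether the whole queried range is covered by a single MTRR type, so callers can reject mixed-type ranges. A hedged usage sketch (helper name is hypothetical):

/* Sketch: look up a range and flag whether its type is uniform. */
static u8 example_range_type(u64 start, u64 end, bool *single_type)
{
	u8 uniform;
	u8 type = mtrr_type_lookup(start, end, &uniform);

	*single_type = (type != MTRR_TYPE_INVALID) && uniform;
	return type;
}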

+ 4 - 3
arch/x86/include/asm/paravirt_types.h

@@ -160,13 +160,14 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Atomically enable interrupts and return to userspace.  This
-	 * is only ever used to return to 32-bit processes; in a
-	 * 64-bit kernel, it's used for 32-on-64 compat processes, but
-	 * never native 64-bit processes.  (Jump, not call.)
+	 * is only used in 32-bit kernels.  64-bit kernels use
+	 * usergs_sysret32 instead.
 	 */
 	void (*irq_enable_sysexit)(void);
+#endif
 
 	/*
 	 * Switch to usermode gs and return to 64-bit usermode using

+ 1 - 6
arch/x86/include/asm/pat.h

@@ -4,12 +4,7 @@
 #include <linux/types.h>
 #include <asm/pgtable_types.h>
 
-#ifdef CONFIG_X86_PAT
-extern int pat_enabled;
-#else
-static const int pat_enabled;
-#endif
-
+bool pat_enabled(void);
 extern void pat_init(void);
 void pat_init_cache_modes(void);
 

+ 0 - 5
arch/x86/include/asm/pci.h

@@ -96,15 +96,10 @@ extern void pci_iommu_alloc(void);
 #ifdef CONFIG_PCI_MSI
 /* implemented in arch/x86/kernel/apic/io_apic. */
 struct msi_desc;
-void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
-			    unsigned int dest, struct msi_msg *msg, u8 hpet_id);
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
 void native_teardown_msi_irq(unsigned int irq);
 void native_restore_msi_irqs(struct pci_dev *dev);
-		  unsigned int irq_base, unsigned int irq_offset);
 #else
 #else
-#define native_compose_msi_msg		NULL
 #define native_setup_msi_irqs		NULL
 #define native_teardown_msi_irq		NULL
 #endif

+ 38 - 0
arch/x86/include/asm/special_insns.h

@@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p)
 		: [pax] "a" (p));
 }
 
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM.  The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ *         unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ *         void *vend = vaddr + size;
+ *         void *p;
+ *
+ *         for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ *              p < vend; p += boot_cpu_data.x86_clflush_size)
+ *                 clwb(p);
+ *
+ *         // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ *         // MFENCE via mb() also works
+ *         wmb();
+ *
+ *         // PCOMMIT and the required SFENCE for ordering
+ *         pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
 static inline void pcommit_sfence(void)
 {
 	alternative(ASM_NOP7,

+ 5 - 3
arch/x86/include/asm/thread_info.h

@@ -177,8 +177,6 @@ struct thread_info {
  */
 #ifndef __ASSEMBLY__
 
-DECLARE_PER_CPU(unsigned long, kernel_stack);
-
 static inline struct thread_info *current_thread_info(void)
 {
 	return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
@@ -197,9 +195,13 @@ static inline unsigned long current_stack_pointer(void)
 
 #else /* !__ASSEMBLY__ */
 
+#ifdef CONFIG_X86_64
+# define cpu_current_top_of_stack (cpu_tss + TSS_sp0)
+#endif
+
 /* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
-	_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
+	_ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
 	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*

+ 4 - 0
arch/x86/include/asm/uaccess_32.h

@@ -59,6 +59,10 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
 			__put_user_size(*(u32 *)from, (u32 __user *)to,
 					4, ret, 4);
 			return ret;
+		case 8:
+			__put_user_size(*(u64 *)from, (u64 __user *)to,
+					8, ret, 8);
+			return ret;
 		}
 	}
 	return __copy_to_user_ll(to, from, n);
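With the new case 8 above, constant-sized 8-byte copies now stay on the inlined fast path instead of calling out to __copy_to_user_ll() on 32-bit kernels. A sketch of a caller that benefits (the helper name is hypothetical):

/* Sketch: sizeof(val) == 8 is a compile-time constant, so the
 * switch above handles it inline; returns bytes not copied. */
static unsigned long example_put_u64(u64 __user *uptr, u64 val)
{
	return __copy_to_user_inatomic(uptr, &val, sizeof(val));
}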

+ 0 - 21
arch/x86/include/asm/x86_init.h

@@ -171,38 +171,17 @@ struct x86_platform_ops {
 };
 
 struct pci_dev;
-struct msi_msg;
 
 struct x86_msi_ops {
 	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
-	void (*compose_msi_msg)(struct pci_dev *dev, unsigned int irq,
-				unsigned int dest, struct msi_msg *msg,
-			       u8 hpet_id);
 	void (*teardown_msi_irq)(unsigned int irq);
 	void (*teardown_msi_irqs)(struct pci_dev *dev);
 	void (*restore_msi_irqs)(struct pci_dev *dev);
-	int  (*setup_hpet_msi)(unsigned int irq, unsigned int id);
 };
 
-struct IO_APIC_route_entry;
-struct io_apic_irq_attr;
-struct irq_data;
-struct cpumask;
-
 struct x86_io_apic_ops {
-	void		(*init)   (void);
 	unsigned int	(*read)   (unsigned int apic, unsigned int reg);
-	void		(*write)  (unsigned int apic, unsigned int reg, unsigned int value);
-	void		(*modify) (unsigned int apic, unsigned int reg, unsigned int value);
 	void		(*disable)(void);
-	void		(*print_entries)(unsigned int apic, unsigned int nr_entries);
-	int		(*set_affinity)(struct irq_data *data,
-					const struct cpumask *mask,
-					bool force);
-	int		(*setup_entry)(int irq, struct IO_APIC_route_entry *entry,
-				       unsigned int destination, int vector,
-				       struct io_apic_irq_attr *attr);
-	void		(*eoi_ioapic_pin)(int apic, int pin, int vector);
 };
 
 extern struct x86_init_ops x86_init;

+ 7 - 1
arch/x86/include/uapi/asm/mtrr.h

@@ -103,7 +103,7 @@ struct mtrr_state_type {
 #define MTRRIOC_GET_PAGE_ENTRY   _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
 #define MTRRIOC_KILL_PAGE_ENTRY  _IOW(MTRR_IOCTL_BASE,  9, struct mtrr_sentry)
 
-/*  These are the region types  */
+/* MTRR memory types, which are defined in SDM */
 #define MTRR_TYPE_UNCACHABLE 0
 #define MTRR_TYPE_WRCOMB     1
 /*#define MTRR_TYPE_         2*/
@@ -113,5 +113,11 @@ struct mtrr_state_type {
 #define MTRR_TYPE_WRBACK     6
 #define MTRR_NUM_TYPES       7
 
+/*
+ * Invalid MTRR memory type.  mtrr_type_lookup() returns this value when
+ * MTRRs are disabled.  Note, this value is allocated from the reserved
+ * values (0x7-0xff) of the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID    0xff
 
 #endif /* _UAPI_ASM_X86_MTRR_H */
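Because MTRR_TYPE_INVALID is carved out of the reserved encodings (0x7-0xff), a validity check reduces to a range test. A sketch under that assumption:

/* Sketch: any type at or above MTRR_NUM_TYPES is reserved/invalid. */
static inline bool example_mtrr_type_valid(u8 type)
{
	return type < MTRR_NUM_TYPES;
}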

+ 22 - 51
arch/x86/kernel/acpi/boot.c

@@ -31,12 +31,12 @@
 #include <linux/module.h>
 #include <linux/dmi.h>
 #include <linux/irq.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
 #include <linux/pci.h>
 
+#include <asm/irqdomain.h>
 #include <asm/pci_x86.h>
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -400,57 +400,13 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 	return 0;
 }
 
-static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
-			   int polarity)
-{
-	int irq, node;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return gsi;
-
-	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
-	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
-	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
-	if (mp_set_gsi_attr(gsi, trigger, polarity, node)) {
-		pr_warn("Failed to set pin attr for GSI%d\n", gsi);
-		return -1;
-	}
-
-	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
-	if (irq < 0)
-		return irq;
-
-	/* Don't set up the ACPI SCI because it's already set up */
-	if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi)
-		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
-
-	return irq;
-}
-
-static void mp_unregister_gsi(u32 gsi)
-{
-	int irq;
-
-	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
-		return;
-
-	irq = mp_map_gsi_to_irq(gsi, 0);
-	if (irq > 0)
-		mp_unmap_irq(irq);
-}
-
-static struct irq_domain_ops acpi_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static int __init
 acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
 {
 	struct acpi_madt_io_apic *ioapic = NULL;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic = (struct acpi_madt_io_apic *)header;
@@ -652,7 +608,7 @@ static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (trigger == ACPI_LEVEL_SENSITIVE)
-		eisa_set_level_irq(gsi);
+		elcr_set_level_irq(gsi);
 #endif
 
 	return gsi;
@@ -663,10 +619,21 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 				    int trigger, int polarity)
 {
 	int irq = gsi;
-
 #ifdef CONFIG_X86_IO_APIC
+	int node;
+	struct irq_alloc_info info;
+
+	node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
+	trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
+	polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
+	ioapic_set_alloc_attr(&info, node, trigger, polarity);
+
 	mutex_lock(&acpi_ioapic_lock);
-	irq = mp_register_gsi(dev, gsi, trigger, polarity);
+	irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info);
+	/* Don't set up the ACPI SCI because it's already set up */
+	if (irq >= 0 && enable_update_mptable &&
+	    acpi_gbl_FADT.sci_interrupt != gsi)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 
@@ -676,8 +643,12 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
 static void acpi_unregister_gsi_ioapic(u32 gsi)
 {
 #ifdef CONFIG_X86_IO_APIC
+	int irq;
+
 	mutex_lock(&acpi_ioapic_lock);
-	mp_unregister_gsi(gsi);
+	irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+	if (irq > 0)
+		mp_unmap_irq(irq);
 	mutex_unlock(&acpi_ioapic_lock);
 #endif
 }
@@ -786,7 +757,7 @@ int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
 	u64 addr;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_DYNAMIC,
-		.ops = &acpi_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);

+ 3 - 3
arch/x86/kernel/acpi/wakeup_64.S

@@ -62,7 +62,7 @@ ENTRY(do_suspend_lowlevel)
 	pushfq
 	popq	pt_regs_flags(%rax)
 
-	movq	$resume_point, saved_rip(%rip)
+	movq	$.Lresume_point, saved_rip(%rip)
 
 	movq	%rsp, saved_rsp
 	movq	%rbp, saved_rbp
@@ -75,10 +75,10 @@ ENTRY(do_suspend_lowlevel)
 	xorl	%eax, %eax
 	call	x86_acpi_enter_sleep_state
 	/* in case something went wrong, restore the machine status and go on */
-	jmp	resume_point
+	jmp	.Lresume_point
 
 	.align 4
-resume_point:
+.Lresume_point:
 	/* We don't restore %rax, it must be 0 anyway */
 	movq	$saved_context, %rax
 	movq	saved_context_cr4(%rax), %rbx

+ 9 - 0
arch/x86/kernel/alternative.c

@@ -227,6 +227,15 @@ void __init arch_init_ideal_nops(void)
 #endif
 		}
 		break;
+
+	case X86_VENDOR_AMD:
+		if (boot_cpu_data.x86 > 0xf) {
+			ideal_nops = p6_nops;
+			return;
+		}
+
+		/* fall through */
+
 	default:
 #ifdef CONFIG_X86_64
 		ideal_nops = k8_nops;

+ 0 - 4
arch/x86/kernel/apb_timer.c

@@ -171,10 +171,6 @@ static int __init apbt_clockevent_register(void)
 
 static void apbt_setup_irq(struct apbt_dev *adev)
 {
-	/* timer0 irq has been setup early */
-	if (adev->irq == 0)
-		return;
-
 	irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
 	irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
 }

+ 126 - 47
arch/x86/kernel/apic/htirq.c

@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Add support for hierarchical irqdomains
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,78 +16,112 @@
 #include <linux/device.h>
 #include <linux/pci.h>
 #include <linux/htirq.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/hypertransport.h>
 
+static struct irq_domain *htirq_domain;
+
 /*
  * Hypertransport interrupt support
  */
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
-	struct ht_irq_msg msg;
-
-	fetch_ht_irq_msg(irq, &msg);
-
-	msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
-	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-	msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
-	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-	write_ht_irq_msg(irq, &msg);
-}
-
 static int
 ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
+	struct irq_data *parent = data->parent_data;
 	int ret;
 
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
-
-	target_ht_irq(data->irq, dest, cfg->vector);
-	return IRQ_SET_MASK_OK_NOCOPY;
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		struct ht_irq_msg msg;
+		struct irq_cfg *cfg = irqd_cfg(data);
+
+		fetch_ht_irq_msg(data->irq, &msg);
+		msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
+				    HT_IRQ_LOW_DEST_ID_MASK);
+		msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
+				  HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
+		msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+		msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
+		write_ht_irq_msg(data->irq, &msg);
+	}
+
+	return ret;
 }
 
 static struct irq_chip ht_irq_chip = {
 	.name			= "PCI-HT",
 	.irq_mask		= mask_ht_irq,
 	.irq_unmask		= unmask_ht_irq,
-	.irq_ack		= apic_ack_edge,
+	.irq_ack		= irq_chip_ack_parent,
 	.irq_set_affinity	= ht_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };
 
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
-	struct irq_cfg *cfg;
-	struct ht_irq_msg msg;
-	unsigned dest;
-	int err;
+	struct ht_irq_cfg *ht_cfg;
+	struct irq_alloc_info *info = arg;
+	struct pci_dev *dev;
+	irq_hw_number_t hwirq;
+	int ret;
 
 
-	if (disable_apic)
-		return -ENXIO;
+	if (nr_irqs > 1 || !info)
+		return -EINVAL;
 
 
-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	dev = info->ht_dev;
+	hwirq = (info->ht_idx & 0xFF) |
+		PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
+		(pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
+	if (irq_find_mapping(domain, hwirq) > 0)
+		return -EEXIST;
 
-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
+	if (!ht_cfg)
+		return -ENOMEM;
 
-	msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(ht_cfg);
+		return ret;
+	}
+
+	/* Initialize msg to a value that will never match the first write. */
+	ht_cfg->msg.address_lo = 0xffffffff;
+	ht_cfg->msg.address_hi = 0xffffffff;
+	ht_cfg->dev = info->ht_dev;
+	ht_cfg->update = info->ht_update;
+	ht_cfg->pos = info->ht_pos;
+	ht_cfg->idx = 0x10 + (info->ht_idx * 2);
+	irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
+			    handle_edge_irq, ht_cfg, "edge");
+
+	return 0;
+}
+
+static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs)
+{
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
+}
 
 
+static void htirq_domain_activate(struct irq_domain *domain,
+				  struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
+
+	msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
 	msg.address_lo =
 	msg.address_lo =
 		HT_IRQ_LOW_BASE |
+		HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
 		HT_IRQ_LOW_VECTOR(cfg->vector) |
 		HT_IRQ_LOW_VECTOR(cfg->vector) |
 		((apic->irq_dest_mode == 0) ?
 			HT_IRQ_LOW_DM_PHYSICAL :
 			HT_IRQ_LOW_MT_FIXED :
 			HT_IRQ_LOW_MT_FIXED :
 			HT_IRQ_LOW_MT_ARBITRATED) |
 		HT_IRQ_LOW_IRQ_MASKED;
+}
 
 
-	write_ht_irq_msg(irq, &msg);
+static void htirq_domain_deactivate(struct irq_domain *domain,
+				    struct irq_data *irq_data)
+{
+	struct ht_irq_msg msg;
 
 
-	irq_set_chip_and_handler_name(irq, &ht_irq_chip,
-				      handle_edge_irq, "edge");
+	memset(&msg, 0, sizeof(msg));
+	write_ht_irq_msg(irq_data->irq, &msg);
+}
 
-	dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+static const struct irq_domain_ops htirq_domain_ops = {
+	.alloc		= htirq_domain_alloc,
+	.free		= htirq_domain_free,
+	.activate	= htirq_domain_activate,
+	.deactivate	= htirq_domain_deactivate,
+};
 
-	return 0;
+void arch_init_htirq_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;
+
+	htirq_domain = irq_domain_add_tree(NULL, &htirq_domain_ops, NULL);
+	if (!htirq_domain)
+		pr_warn("failed to initialize irqdomain for HTIRQ.\n");
+	else
+		htirq_domain->parent = parent;
+}
+
+int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
+		      ht_irq_update_t *update)
+{
+	struct irq_alloc_info info;
+
+	if (!htirq_domain)
+		return -ENOSYS;
+
+	init_irq_alloc_info(&info, NULL);
+	info.ht_idx = idx;
+	info.ht_pos = pos;
+	info.ht_dev = dev;
+	info.ht_update = update;
+
+	return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
+				     &info);
+}
+
+void arch_teardown_ht_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
 }
 }
+ 597 - 706
arch/x86/kernel/apic/io_apic.c

@@ -18,6 +18,16 @@
  *					and Rolf G. Tews
  *					for testing these extensively
  *	Paul Diefenbaugh	:	Added full ACPI support
+ *
+ * Historical information which is worth preserving:
+ *
+ * - SiS APIC rmw bug:
+ *
+ *	We used to have a workaround for a bug in SiS chips which
+ *	required rewriting the index register for a read-modify-write
+ *	operation, as the chip lost the index information which was
+ *	set up for the read already. We cache the data now, so that
+ *	workaround has been removed.
  */
 
 #include <linux/mm.h>
@@ -31,13 +41,13 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/syscore_ops.h>
-#include <linux/irqdomain.h>
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>	/* time_after() */
 #include <linux/slab.h>
 #include <linux/bootmem.h>
 
+#include <asm/irqdomain.h>
 #include <asm/idle.h>
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -63,27 +73,31 @@
 #define	for_each_ioapic_pin(idx, pin)	\
 	for_each_ioapic((idx))		\
 		for_each_pin((idx), (pin))
-
 #define for_each_irq_pin(entry, head) \
 	list_for_each_entry(entry, &head, list)
 
-/*
- *      Is the SiS APIC rmw bug present ?
- *      -1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
 static DEFINE_RAW_SPINLOCK(ioapic_lock);
 static DEFINE_MUTEX(ioapic_mutex);
 static unsigned int ioapic_dynirq_base;
 static int ioapic_initialized;
 
-struct mp_pin_info {
+struct irq_pin_list {
+	struct list_head list;
+	int apic, pin;
+};
+
+struct mp_chip_data {
+	struct list_head irq_2_pin;
+	struct IO_APIC_route_entry entry;
 	int trigger;
 	int polarity;
-	int node;
-	int set;
 	u32 count;
+	bool isa_irq;
+};
+
+struct mp_ioapic_gsi {
+	u32 gsi_base;
+	u32 gsi_end;
 };
 
 static struct ioapic {
@@ -101,7 +115,6 @@ static struct ioapic {
 	struct mp_ioapic_gsi  gsi_config;
 	struct ioapic_domain_cfg irqdomain_cfg;
 	struct irq_domain *irqdomain;
-	struct mp_pin_info *pin_info;
 	struct resource *iomem_res;
 } ioapics[MAX_IO_APICS];
 
@@ -117,7 +130,7 @@ unsigned int mpc_ioapic_addr(int ioapic_idx)
 	return ioapics[ioapic_idx].mp_config.apicaddr;
 }
 
-struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
+static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx)
 {
 	return &ioapics[ioapic_idx].gsi_config;
 }
@@ -129,11 +142,16 @@ static inline int mp_ioapic_pin_count(int ioapic)
 	return gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
 }
 
-u32 mp_pin_to_gsi(int ioapic, int pin)
+static inline u32 mp_pin_to_gsi(int ioapic, int pin)
 {
 	return mp_ioapic_gsi_routing(ioapic)->gsi_base + pin;
 }
 
+static inline bool mp_is_legacy_irq(int irq)
+{
+	return irq >= 0 && irq < nr_legacy_irqs();
+}
+
 /*
  * Initialize all legacy IRQs and all pins on the first IOAPIC
  * if we have legacy interrupt controller. Kernel boot option "pirq="
@@ -144,12 +162,7 @@ static inline int mp_init_irq_at_boot(int ioapic, int irq)
 	if (!nr_legacy_irqs())
 		return 0;
 
-	return ioapic == 0 || (irq >= 0 && irq < nr_legacy_irqs());
-}
-
-static inline struct mp_pin_info *mp_pin_info(int ioapic_idx, int pin)
-{
-	return ioapics[ioapic_idx].pin_info + pin;
+	return ioapic == 0 || mp_is_legacy_irq(irq);
 }
 
 static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
@@ -216,16 +229,6 @@ void mp_save_irq(struct mpc_intsrc *m)
 		panic("Max # of irq sources exceeded!!\n");
 }
 
-struct irq_pin_list {
-	struct list_head list;
-	int apic, pin;
-};
-
-static struct irq_pin_list *alloc_irq_pin_list(int node)
-{
-	return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
-}
-
 static void alloc_ioapic_saved_registers(int idx)
 {
 	size_t size;
@@ -247,8 +250,7 @@ static void free_ioapic_saved_registers(int idx)
 
 int __init arch_early_ioapic_init(void)
 {
-	struct irq_cfg *cfg;
-	int i, node = cpu_to_node(0);
+	int i;
 
 	if (!nr_legacy_irqs())
 		io_apic_irqs = ~0UL;
@@ -256,16 +258,6 @@ int __init arch_early_ioapic_init(void)
 	for_each_ioapic(i)
 		alloc_ioapic_saved_registers(i);
 
-	/*
-	 * For legacy IRQ's, start with assigning irq0 to irq15 to
-	 * IRQ0_VECTOR to IRQ15_VECTOR for all cpu's.
-	 */
-	for (i = 0; i < nr_legacy_irqs(); i++) {
-		cfg = alloc_irq_and_cfg_at(i, node);
-		cfg->vector = IRQ0_VECTOR + i;
-		cpumask_setall(cfg->domain);
-	}
-
 	return 0;
 }
 
@@ -283,7 +275,7 @@ static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
 		+ (mpc_ioapic_addr(idx) & ~PAGE_MASK);
 }
 
-void io_apic_eoi(unsigned int apic, unsigned int vector)
+static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 	writel(vector, &io_apic->eoi);
@@ -296,7 +288,8 @@ unsigned int native_io_apic_read(unsigned int apic, unsigned int reg)
 	return readl(&io_apic->data);
 }
 
-void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+static void io_apic_write(unsigned int apic, unsigned int reg,
+			  unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
 
@@ -304,21 +297,6 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu
 	writel(value, &io_apic->data);
 }
 
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-
-	if (sis_apic_bug)
-		writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -378,7 +356,7 @@ static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
 static void ioapic_mask_entry(int apic, int pin)
 {
 	unsigned long flags;
-	union entry_union eu = { .entry.mask = 1 };
+	union entry_union eu = { .entry.mask = IOAPIC_MASKED };
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
@@ -391,16 +369,17 @@ static void ioapic_mask_entry(int apic, int pin)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static int __add_pin_to_irq_node(struct mp_chip_data *data,
+				 int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	/* don't allow duplicates */
-	for_each_irq_pin(entry, cfg->irq_2_pin)
+	for_each_irq_pin(entry, data->irq_2_pin)
 		if (entry->apic == apic && entry->pin == pin)
 			return 0;
 
-	entry = alloc_irq_pin_list(node);
+	entry = kzalloc_node(sizeof(struct irq_pin_list), GFP_ATOMIC, node);
 	if (!entry) {
 		pr_err("can not alloc irq_pin_list (%d,%d,%d)\n",
 		       node, apic, pin);
@@ -408,16 +387,16 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
 	}
 	entry->apic = apic;
 	entry->pin = pin;
+	list_add_tail(&entry->list, &data->irq_2_pin);
 
-	list_add_tail(&entry->list, &cfg->irq_2_pin);
 	return 0;
 }
 
-static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
+static void __remove_pin_from_irq(struct mp_chip_data *data, int apic, int pin)
 {
 	struct irq_pin_list *tmp, *entry;
 
-	list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
+	list_for_each_entry_safe(entry, tmp, &data->irq_2_pin, list)
 		if (entry->apic == apic && entry->pin == pin) {
 			list_del(&entry->list);
 			kfree(entry);
@@ -425,22 +404,23 @@ static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
 		}
 }
 
-static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
+static void add_pin_to_irq_node(struct mp_chip_data *data,
+				int node, int apic, int pin)
 {
-	if (__add_pin_to_irq_node(cfg, node, apic, pin))
+	if (__add_pin_to_irq_node(data, node, apic, pin))
 		panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
 }
 
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
+static void __init replace_pin_at_irq_node(struct mp_chip_data *data, int node,
 					   int oldapic, int oldpin,
 					   int newapic, int newpin)
 {
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
@@ -450,32 +430,26 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 	}
 
 	/* old apic/pin didn't exist, so just add new ones */
-	add_pin_to_irq_node(cfg, node, newapic, newpin);
-}
-
-static void __io_apic_modify_irq(struct irq_pin_list *entry,
-				 int mask_and, int mask_or,
-				 void (*final)(struct irq_pin_list *entry))
-{
-	unsigned int reg, pin;
-
-	pin = entry->pin;
-	reg = io_apic_read(entry->apic, 0x10 + pin * 2);
-	reg &= mask_and;
-	reg |= mask_or;
-	io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
-	if (final)
-		final(entry);
+	add_pin_to_irq_node(data, node, newapic, newpin);
 }
 
-static void io_apic_modify_irq(struct irq_cfg *cfg,
+static void io_apic_modify_irq(struct mp_chip_data *data,
 			       int mask_and, int mask_or,
 			       void (*final)(struct irq_pin_list *entry))
 {
+	union entry_union eu;
 	struct irq_pin_list *entry;
 
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		__io_apic_modify_irq(entry, mask_and, mask_or, final);
+	eu.entry = data->entry;
+	eu.w1 &= mask_and;
+	eu.w1 |= mask_or;
+	data->entry = eu.entry;
+
+	for_each_irq_pin(entry, data->irq_2_pin) {
+		io_apic_write(entry->apic, 0x10 + 2 * entry->pin, eu.w1);
+		if (final)
+			final(entry);
+	}
 }
 }
 
 static void io_apic_sync(struct irq_pin_list *entry)
 	readl(&io_apic->data);
 	readl(&io_apic->data);
 }
 
+static void mask_ioapic_irq(struct irq_data *irq_data)
 {
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 	unsigned long flags;
 
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+	io_apic_modify_irq(data, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+static void __unmask_ioapic(struct mp_chip_data *data)
 {
 {
-	mask_ioapic(irqd_cfg(data));
+	io_apic_modify_irq(data, ~IO_APIC_REDIR_MASKED, 0, NULL);
 }
 }
 
 
-static void __unmask_ioapic(struct irq_cfg *cfg)
-{
-	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
-}
-
-static void unmask_ioapic(struct irq_cfg *cfg)
+static void unmask_ioapic_irq(struct irq_data *irq_data)
 {
 {
+	struct mp_chip_data *data = irq_data->chip_data;
 	unsigned long flags;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	__unmask_ioapic(data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-{
-	unmask_ioapic(irqd_cfg(data));
-}
-
 /*
 /*
  * IO-APIC versions below 0x20 don't support EOI register.
  * For the record, here is the information about various versions:
  * Otherwise, we simulate the EOI message manually by changing the trigger
  * Otherwise, we simulate the EOI message manually by changing the trigger
  * mode to edge and then back to level, with RTE being masked during this.
  */
+static void __eoi_ioapic_pin(int apic, int pin, int vector)
 {
 {
 	if (mpc_ioapic_ver(apic) >= 0x20) {
 		io_apic_eoi(apic, vector);
 		/*
 		/*
 		 * Mask the entry and change the trigger mode to edge.
 		 */
+		entry1.mask = IOAPIC_MASKED;
 		entry1.trigger = IOAPIC_EDGE;
 		entry1.trigger = IOAPIC_EDGE;
 
 		__ioapic_write_entry(apic, pin, entry1);
 	}
 	}
 }
 }
 
 
-void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+void eoi_ioapic_pin(int vector, struct mp_chip_data *data)
 {
 {
-	struct irq_pin_list *entry;
 	unsigned long flags;
 	unsigned long flags;
+	struct irq_pin_list *entry;
 
 
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		x86_io_apic_ops.eoi_ioapic_pin(entry->apic, entry->pin,
-					       cfg->vector);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__eoi_ioapic_pin(entry->apic, entry->pin, vector);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
 	 * Make sure the entry is masked and re-read the contents to check
 	 * Make sure the entry is masked and re-read the contents to check
 	 * if it is a level triggered pin and if the remote-IRR is set.
 	 */
-		entry.mask = 1;
+	if (entry.mask == IOAPIC_UNMASKED) {
+		entry.mask = IOAPIC_MASKED;
 		ioapic_write_entry(apic, pin, entry);
 		ioapic_write_entry(apic, pin, entry);
 		entry = ioapic_read_entry(apic, pin);
 	}
 		 * doesn't clear the remote-IRR if the trigger mode is not
 		 * doesn't clear the remote-IRR if the trigger mode is not
 		 * set to level.
 		 */
+		if (entry.trigger == IOAPIC_EDGE) {
 			entry.trigger = IOAPIC_LEVEL;
 			entry.trigger = IOAPIC_LEVEL;
 			ioapic_write_entry(apic, pin, entry);
 			ioapic_write_entry(apic, pin, entry);
 		}
-
 		raw_spin_lock_irqsave(&ioapic_lock, flags);
-		x86_io_apic_ops.eoi_ioapic_pin(apic, pin, entry.vector);
+		__eoi_ioapic_pin(apic, pin, entry.vector);
 		raw_spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 
 			struct IO_APIC_route_entry entry;
 			struct IO_APIC_route_entry entry;
 
 			entry = ioapics[apic].saved_registers[pin];
-				entry.mask = 1;
+			if (entry.mask == IOAPIC_UNMASKED) {
+				entry.mask = IOAPIC_MASKED;
 				ioapic_write_entry(apic, pin, entry);
 				ioapic_write_entry(apic, pin, entry);
 			}
 		}
 
 
 #endif
 #endif
 
 
-/* ISA interrupts are always polarity zero edge triggered,
+/* ISA interrupts are always active high edge triggered,
  * when listed as conforming in the MP table. */
  * when listed as conforming in the MP table. */
 
-#define default_ISA_polarity(idx)	(0)
+#define default_ISA_trigger(idx)	(IOAPIC_EDGE)
+#define default_ISA_polarity(idx)	(IOAPIC_POL_HIGH)
 
 
 /* EISA interrupts are always polarity zero and can be edge or level
 /* EISA interrupts are always polarity zero and can be edge or level
  * trigger depending on the ELCR value.  If an interrupt is listed as
  * trigger depending on the ELCR value.  If an interrupt is listed as
@@ -823,53 +787,55 @@ static int EISA_ELCR(unsigned int irq)
 #define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].srcbusirq))
 #define default_EISA_polarity(idx)	default_ISA_polarity(idx)
 
+/* PCI interrupts are always active low level triggered,
  * when listed as conforming in the MP table. */
  * when listed as conforming in the MP table. */
 
-#define default_PCI_polarity(idx)	(1)
+#define default_PCI_trigger(idx)	(IOAPIC_LEVEL)
+#define default_PCI_polarity(idx)	(IOAPIC_POL_LOW)
 
 
 static int irq_polarity(int idx)
 static int irq_polarity(int idx)
 {
 {
 	int bus = mp_irqs[idx].srcbus;
 	int bus = mp_irqs[idx].srcbus;
-	int polarity;
 
 
 	/*
 	/*
 	 * Determine IRQ line polarity (high active or low active):
 	 * Determine IRQ line polarity (high active or low active):
 	 */
 	 */
-	switch (mp_irqs[idx].irqflag & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent polarity */
-			if (test_bit(bus, mp_bus_not_pci))
-				polarity = default_ISA_polarity(idx);
-			else
-				polarity = default_PCI_polarity(idx);
-			break;
-		case 1: /* high active */
-		{
-			polarity = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
-		case 3: /* low active */
-		{
-			polarity = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			polarity = 1;
-			break;
-		}
+	switch (mp_irqs[idx].irqflag & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent polarity */
+		if (test_bit(bus, mp_bus_not_pci))
+			return default_ISA_polarity(idx);
+		else
+			return default_PCI_polarity(idx);
+	case 1:
+		return IOAPIC_POL_HIGH;
+	case 2:
+		pr_warn("IOAPIC: Invalid polarity: 2, defaulting to low\n");
+	case 3:
+	default: /* Pointless default required due to gcc stupidity */
+		return IOAPIC_POL_LOW;
+	}
+}
+
+#ifdef CONFIG_EISA
+static int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	switch (mp_bus_id_to_type[bus]) {
+	case MP_BUS_PCI:
+	case MP_BUS_ISA:
+		return trigger;
+	case MP_BUS_EISA:
+		return default_EISA_trigger(idx);
 	}
-	return polarity;
+	pr_warn("IOAPIC: Invalid srcbus: %d defaulting to level\n", bus);
+	return IOAPIC_LEVEL;
 }
+#else
+static inline int eisa_irq_trigger(int idx, int bus, int trigger)
+{
+	return trigger;
+}
+#endif

 static int irq_trigger(int idx)
 {
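irq_polarity() above and irq_trigger() below both decode the MP-table irqflag field: bits 1:0 select the polarity, bits 3:2 the trigger mode, and value 0 in either field means "conforms to the bus type". A minimal standalone decoder illustrating just that bit layout (the helper name is hypothetical):

/* MP-table irqflag layout: bits 1:0 = polarity, bits 3:2 = trigger */
static void mp_irqflag_decode_sketch(unsigned int irqflag)
{
	unsigned int pol  = irqflag & 0x03;	   /* 0=conforms 1=high 3=low   */
	unsigned int trig = (irqflag >> 2) & 0x03; /* 0=conforms 1=edge 3=level */

	pr_info("polarity field=%u, trigger field=%u\n", pol, trig);
}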
@@ -879,153 +845,227 @@ static int irq_trigger(int idx)
 	/*
 	 * Determine IRQ trigger mode (edge or level sensitive):
 	 */
-	switch ((mp_irqs[idx].irqflag>>2) & 3)
-	{
-		case 0: /* conforms, ie. bus-type dependent */
-			if (test_bit(bus, mp_bus_not_pci))
-				trigger = default_ISA_trigger(idx);
-			else
-				trigger = default_PCI_trigger(idx);
-#ifdef CONFIG_EISA
-			switch (mp_bus_id_to_type[bus]) {
-				case MP_BUS_ISA: /* ISA pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				case MP_BUS_EISA: /* EISA pin */
-				{
-					trigger = default_EISA_trigger(idx);
-					break;
-				}
-				case MP_BUS_PCI: /* PCI pin */
-				{
-					/* set before the switch */
-					break;
-				}
-				default:
-				{
-					pr_warn("broken BIOS!!\n");
-					trigger = 1;
-					break;
-				}
-			}
+	switch ((mp_irqs[idx].irqflag >> 2) & 0x03) {
+	case 0:
+		/* conforms to spec, ie. bus-type dependent trigger mode */
+		if (test_bit(bus, mp_bus_not_pci))
+			trigger = default_ISA_trigger(idx);
+		else
+			trigger = default_PCI_trigger(idx);
+		/* Take EISA into account */
+		return eisa_irq_trigger(idx, bus, trigger);
+	case 1:
+		return IOAPIC_EDGE;
+	case 2:
+		pr_warn("IOAPIC: Invalid trigger mode 2 defaulting to level\n");
+	case 3:
+	default: /* Pointless default required due to gcc stupidity */
+		return IOAPIC_LEVEL;
+	}
+}
+
+void ioapic_set_alloc_attr(struct irq_alloc_info *info, int node,
+			   int trigger, int polarity)
+{
+	init_irq_alloc_info(info, NULL);
+	info->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info->ioapic_node = node;
+	info->ioapic_trigger = trigger;
+	info->ioapic_polarity = polarity;
+	info->ioapic_valid = 1;
+}
+
+#ifndef CONFIG_ACPI
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity);
 #endif
-			break;
-		case 1: /* edge */
-		{
-			trigger = 0;
-			break;
-		}
-		case 2: /* reserved */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 1;
-			break;
-		}
-		case 3: /* level */
-		{
-			trigger = 1;
-			break;
-		}
-		default: /* invalid */
-		{
-			pr_warn("broken BIOS!!\n");
-			trigger = 0;
-			break;
+
+static void ioapic_copy_alloc_attr(struct irq_alloc_info *dst,
+				   struct irq_alloc_info *src,
+				   u32 gsi, int ioapic_idx, int pin)
+{
+	int trigger, polarity;
+
+	copy_irq_alloc_info(dst, src);
+	dst->type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	dst->ioapic_id = mpc_ioapic_id(ioapic_idx);
+	dst->ioapic_pin = pin;
+	dst->ioapic_valid = 1;
+	if (src && src->ioapic_valid) {
+		dst->ioapic_node = src->ioapic_node;
+		dst->ioapic_trigger = src->ioapic_trigger;
+		dst->ioapic_polarity = src->ioapic_polarity;
+	} else {
+		dst->ioapic_node = NUMA_NO_NODE;
+		if (acpi_get_override_irq(gsi, &trigger, &polarity) >= 0) {
+			dst->ioapic_trigger = trigger;
+			dst->ioapic_polarity = polarity;
+		} else {
+			/*
+			 * PCI interrupts are always active low level
+			 * triggered.
+			 */
+			dst->ioapic_trigger = IOAPIC_LEVEL;
+			dst->ioapic_polarity = IOAPIC_POL_LOW;
 		}
 	}
-	return trigger;
 }

-static int alloc_irq_from_domain(struct irq_domain *domain, u32 gsi, int pin)
+static int ioapic_alloc_attr_node(struct irq_alloc_info *info)
+{
+	return (info && info->ioapic_valid) ? info->ioapic_node : NUMA_NO_NODE;
+}
+
+static void mp_register_handler(unsigned int irq, unsigned long trigger)
+{
+	irq_flow_handler_t hdl;
+	bool fasteoi;
+
+	if (trigger) {
+		irq_set_status_flags(irq, IRQ_LEVEL);
+		fasteoi = true;
+	} else {
+		irq_clear_status_flags(irq, IRQ_LEVEL);
+		fasteoi = false;
+	}
+
+	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
+	__irq_set_handler(irq, hdl, 0, fasteoi ? "fasteoi" : "edge");
+}
+
+static bool mp_check_pin_attr(int irq, struct irq_alloc_info *info)
 {
+	struct mp_chip_data *data = irq_get_chip_data(irq);
+
+	/*
+	 * setup_IO_APIC_irqs() programs all legacy IRQs with default trigger
+	 * and polarity attributes. So allow the first user to reprogram the
+	 * pin with real trigger and polarity attributes.
+	 */
+	if (irq < nr_legacy_irqs() && data->count == 1) {
+		if (info->ioapic_trigger != data->trigger)
+			mp_register_handler(irq, data->trigger);
+		data->entry.trigger = data->trigger = info->ioapic_trigger;
+		data->entry.polarity = data->polarity = info->ioapic_polarity;
+	}
+
+	return data->trigger == info->ioapic_trigger &&
+	       data->polarity == info->ioapic_polarity;
+}
+
+static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi,
+				 struct irq_alloc_info *info)
+{
+	bool legacy = false;
 	int irq = -1;
-	int ioapic = (int)(long)domain->host_data;
 	int type = ioapics[ioapic].irqdomain_cfg.type;

 	switch (type) {
 	case IOAPIC_DOMAIN_LEGACY:
 		/*
-		 * Dynamically allocate IRQ number for non-ISA IRQs in the first 16
-		 * GSIs on some weird platforms.
+		 * Dynamically allocate IRQ number for non-ISA IRQs in the first
+		 * 16 GSIs on some weird platforms.
 		 */
-		if (gsi < nr_legacy_irqs())
-			irq = irq_create_mapping(domain, pin);
-		else if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
+		if (!ioapic_initialized || gsi >= nr_legacy_irqs())
 			irq = gsi;
+		legacy = mp_is_legacy_irq(irq);
 		break;
 	case IOAPIC_DOMAIN_STRICT:
-		if (irq_create_strict_mappings(domain, gsi, pin, 1) == 0)
-			irq = gsi;
+		irq = gsi;
 		break;
 	case IOAPIC_DOMAIN_DYNAMIC:
-		irq = irq_create_mapping(domain, pin);
 		break;
 	default:
 		WARN(1, "ioapic: unknown irqdomain type %d\n", type);
-		break;
+		return -1;
+	}
+
+	return __irq_domain_alloc_irqs(domain, irq, 1,
+				       ioapic_alloc_attr_node(info),
+				       info, legacy);
+}
+
+/*
+ * Need special handling for ISA IRQs because there may be multiple IOAPIC pins
+ * sharing the same ISA IRQ number and irqdomain only supports 1:1 mapping
+ * between IOAPIC pin and IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are
+ * used for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
+ * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are available, and
+ * some BIOSes may use MP Interrupt Source records to override IRQ numbers for
+ * PIRQs instead of reprogramming the interrupt routing logic. Thus there may be
+ * multiple pins sharing the same legacy IRQ number when ACPI is disabled.
+ */
+static int alloc_isa_irq_from_domain(struct irq_domain *domain,
+				     int irq, int ioapic, int pin,
+				     struct irq_alloc_info *info)
+{
+	struct mp_chip_data *data;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	int node = ioapic_alloc_attr_node(info);
+
+	/*
+	 * Legacy ISA IRQ has already been allocated, just add pin to
+	 * the pin list associated with this IRQ and program the IOAPIC
+	 * entry.
+	 */
+	if (irq_data && irq_data->parent_data) {
+		if (!mp_check_pin_attr(irq, info))
+			return -EBUSY;
+		if (__add_pin_to_irq_node(irq_data->chip_data, node, ioapic,
+					  info->ioapic_pin))
+			return -ENOMEM;
+	} else {
+		irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true);
+		if (irq >= 0) {
+			irq_data = irq_domain_get_irq_data(domain, irq);
+			data = irq_data->chip_data;
+			data->isa_irq = true;
+		}
 	}

-	return irq > 0 ? irq : -1;
+	return irq;
 }

 static int mp_map_pin_to_irq(u32 gsi, int idx, int ioapic, int pin,
-			     unsigned int flags)
+			     unsigned int flags, struct irq_alloc_info *info)
 {
 	int irq;
+	bool legacy = false;
+	struct irq_alloc_info tmp;
+	struct mp_chip_data *data;
 	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
-	struct mp_pin_info *info = mp_pin_info(ioapic, pin);

 	if (!domain)
-		return -1;
+		return -ENOSYS;

-	mutex_lock(&ioapic_mutex);
-
-	/*
-	 * Don't use irqdomain to manage ISA IRQs because there may be
-	 * multiple IOAPIC pins sharing the same ISA IRQ number and
-	 * irqdomain only supports 1:1 mapping between IOAPIC pin and
-	 * IRQ number. A typical IOAPIC has 24 pins, pin 0-15 are used
-	 * for legacy IRQs and pin 16-23 are used for PCI IRQs (PIRQ A-H).
-	 * When ACPI is disabled, only legacy IRQ numbers (IRQ0-15) are
-	 * available, and some BIOSes may use MP Interrupt Source records
-	 * to override IRQ numbers for PIRQs instead of reprogramming
-	 * the interrupt routing logic. Thus there may be multiple pins
-	 * sharing the same legacy IRQ number when ACPI is disabled.
-	 */
 	if (idx >= 0 && test_bit(mp_irqs[idx].srcbus, mp_bus_not_pci)) {
 		irq = mp_irqs[idx].srcbusirq;
-		if (flags & IOAPIC_MAP_ALLOC) {
-			if (info->count == 0 &&
-			    mp_irqdomain_map(domain, irq, pin) != 0)
-				irq = -1;
+		legacy = mp_is_legacy_irq(irq);
+	}

-			/* special handling for timer IRQ0 */
+	mutex_lock(&ioapic_mutex);
+	if (!(flags & IOAPIC_MAP_ALLOC)) {
+		if (!legacy) {
+			irq = irq_find_mapping(domain, pin);
 			if (irq == 0)
-				info->count++;
+				irq = -ENOENT;
 		}
 	} else {
-		irq = irq_find_mapping(domain, pin);
-		if (irq <= 0 && (flags & IOAPIC_MAP_ALLOC))
-			irq = alloc_irq_from_domain(domain, gsi, pin);
-	}
-
-	if (flags & IOAPIC_MAP_ALLOC) {
-		/* special handling for legacy IRQs */
-		if (irq < nr_legacy_irqs() && info->count == 1 &&
-		    mp_irqdomain_map(domain, irq, pin) != 0)
-			irq = -1;
-
-		if (irq > 0)
-			info->count++;
-		else if (info->count == 0)
-			info->set = 0;
+		ioapic_copy_alloc_attr(&tmp, info, gsi, ioapic, pin);
+		if (legacy)
+			irq = alloc_isa_irq_from_domain(domain, irq,
+							ioapic, pin, &tmp);
+		else if ((irq = irq_find_mapping(domain, pin)) == 0)
+			irq = alloc_irq_from_domain(domain, ioapic, gsi, &tmp);
+		else if (!mp_check_pin_attr(irq, &tmp))
+			irq = -EBUSY;
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			data->count++;
+		}
 	}
-
 	mutex_unlock(&ioapic_mutex);

-	return irq > 0 ? irq : -1;
+	return irq;
 }

 static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
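The rewritten mp_map_pin_to_irq() above splits lookup (no IOAPIC_MAP_ALLOC: just irq_find_mapping()) from allocation (ISA pins go through alloc_isa_irq_from_domain(), everything else through the irqdomain). A hypothetical caller showing how the new info-carrying path is meant to be used, built only from functions and flags introduced in this patch:

/* Sketch: translate a GSI to a Linux IRQ, allocating with explicit attributes */
static int probe_gsi_sketch(u32 gsi)
{
	struct irq_alloc_info info;

	ioapic_set_alloc_attr(&info, NUMA_NO_NODE, IOAPIC_LEVEL, IOAPIC_POL_LOW);
	return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK, &info);
}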
@@ -1058,10 +1098,10 @@ static int pin_2_irq(int idx, int ioapic, int pin, unsigned int flags)
 	}
 #endif

-	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return  mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, NULL);
 }

-int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
+int mp_map_gsi_to_irq(u32 gsi, unsigned int flags, struct irq_alloc_info *info)
 {
 	int ioapic, pin, idx;

@@ -1074,31 +1114,24 @@ int mp_map_gsi_to_irq(u32 gsi, unsigned int flags)
 	if ((flags & IOAPIC_MAP_CHECK) && idx < 0)
 		return -1;

-	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags);
+	return mp_map_pin_to_irq(gsi, idx, ioapic, pin, flags, info);
 }

 void mp_unmap_irq(int irq)
 {
-	struct irq_data *data = irq_get_irq_data(irq);
-	struct mp_pin_info *info;
-	int ioapic, pin;
+	struct irq_data *irq_data = irq_get_irq_data(irq);
+	struct mp_chip_data *data;

-	if (!data || !data->domain)
+	if (!irq_data || !irq_data->domain)
 		return;

-	ioapic = (int)(long)data->domain->host_data;
-	pin = (int)data->hwirq;
-	info = mp_pin_info(ioapic, pin);
+	data = irq_data->chip_data;
+	if (!data || data->isa_irq)
+		return;

 	mutex_lock(&ioapic_mutex);
-	if (--info->count == 0) {
-		info->set = 0;
-		if (irq < nr_legacy_irqs() &&
-		    ioapics[ioapic].irqdomain_cfg.type == IOAPIC_DOMAIN_LEGACY)
-			mp_irqdomain_unmap(data->domain, irq);
-		else
-			irq_dispose_mapping(irq);
-	}
+	if (--data->count == 0)
+		irq_domain_free_irqs(irq, 1);
 	mutex_unlock(&ioapic_mutex);
 }

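Each successful IOAPIC_MAP_ALLOC mapping bumps the per-pin mp_chip_data::count, and mp_unmap_irq() above drops it, freeing the whole irqdomain hierarchy at zero (ISA IRQs are deliberately kept alive). A hypothetical pairing, sketched with the APIs from this patch:

/* Sketch: every ALLOC-mapped GSI is later balanced by mp_unmap_irq() */
static void gsi_transaction_sketch(u32 gsi)
{
	int irq = mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, NULL);

	if (irq >= 0) {
		/* ... request_irq()/free_irq() would bracket the use ... */
		mp_unmap_irq(irq);	/* drops data->count, frees at zero */
	}
}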
@@ -1165,7 +1198,7 @@ out:
 }
 EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);

-static struct irq_chip ioapic_chip;
+static struct irq_chip ioapic_chip, ioapic_ir_chip;

 #ifdef CONFIG_X86_32
 static inline int IO_APIC_irq_trigger(int irq)
@@ -1189,96 +1222,6 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 #endif

-static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
-				 unsigned long trigger)
-{
-	struct irq_chip *chip = &ioapic_chip;
-	irq_flow_handler_t hdl;
-	bool fasteoi;
-
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL) {
-		irq_set_status_flags(irq, IRQ_LEVEL);
-		fasteoi = true;
-	} else {
-		irq_clear_status_flags(irq, IRQ_LEVEL);
-		fasteoi = false;
-	}
-
-	if (setup_remapped_irq(irq, cfg, chip))
-		fasteoi = trigger != 0;
-
-	hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
-	irq_set_chip_and_handler_name(irq, chip, hdl,
-				      fasteoi ? "fasteoi" : "edge");
-}
-
-int native_setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry,
-			      unsigned int destination, int vector,
-			      struct io_apic_irq_attr *attr)
-{
-	memset(entry, 0, sizeof(*entry));
-
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode     = apic->irq_dest_mode;
-	entry->dest	     = destination;
-	entry->vector	     = vector;
-	entry->mask	     = 0;			/* enable IRQ */
-	entry->trigger	     = attr->trigger;
-	entry->polarity	     = attr->polarity;
-
-	/*
-	 * Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (attr->trigger)
-		entry->mask = 1;
-
-	return 0;
-}
-
-static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
-				struct io_apic_irq_attr *attr)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	if (!IO_APIC_IRQ(irq))
-		return;
-
-	if (assign_irq_vector(irq, cfg, apic->target_cpus()))
-		return;
-
-	if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(),
-					 &dest)) {
-		pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	apic_printk(APIC_VERBOSE,KERN_DEBUG
-		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
-		    "IRQ %d Mode:%i Active:%i Dest:%d)\n",
-		    attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin,
-		    cfg->vector, irq, attr->trigger, attr->polarity, dest);
-
-	if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
-		pr_warn("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
-			mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
-		clear_irq_vector(irq, cfg);
-
-		return;
-	}
-
-	ioapic_register_intr(irq, cfg, attr->trigger);
-	if (irq < nr_legacy_irqs())
-		legacy_pic->mask(irq);
-
-	ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry);
-}
-
 static void __init setup_IO_APIC_irqs(void)
 {
 	unsigned int ioapic, pin;
@@ -1298,106 +1241,41 @@ static void __init setup_IO_APIC_irqs(void)
 	}
 }

-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
-					unsigned int pin, int vector)
-{
-	struct IO_APIC_route_entry entry;
-	unsigned int dest;
-
-	memset(&entry, 0, sizeof(entry));
-
-	/*
-	 * We use logical delivery to get the timer IRQ
-	 * to the first CPU.
-	 */
-	if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
-						  apic->target_cpus(), &dest)))
-		dest = BAD_APICID;
-
-	entry.dest_mode = apic->irq_dest_mode;
-	entry.mask = 0;			/* don't mask IRQ for edge */
-	entry.dest = dest;
-	entry.delivery_mode = apic->irq_delivery_mode;
-	entry.polarity = 0;
-	entry.trigger = 0;
-	entry.vector = vector;
-
-	/*
-	 * The timer IRQ doesn't have to know that behind the
-	 * scene we may have a 8259A-master in AEOI mode ...
-	 */
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
-
-	/*
-	 * Add it to the IO-APIC irq-routing table:
-	 */
-	ioapic_write_entry(ioapic_idx, pin, entry);
-}
-
-void native_io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
+void ioapic_zap_locks(void)
 {
-	int i;
-
-	pr_debug(" NR Dst Mask Trig IRR Pol Stat Dmod Deli Vect:\n");
-
-	for (i = 0; i <= nr_entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		pr_debug(" %02x %02X  ", i, entry.dest);
-		pr_cont("%1d    %1d    %1d   %1d   %1d    "
-			"%1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector);
-	}
+	raw_spin_lock_init(&ioapic_lock);
 }

-void intel_ir_io_apic_print_entries(unsigned int apic,
-				    unsigned int nr_entries)
+static void io_apic_print_entries(unsigned int apic, unsigned int nr_entries)
 {
 	int i;
+	char buf[256];
+	struct IO_APIC_route_entry entry;
+	struct IR_IO_APIC_route_entry *ir_entry = (void *)&entry;

-	pr_debug(" NR Indx Fmt Mask Trig IRR Pol Stat Indx2 Zero Vect:\n");
-
+	printk(KERN_DEBUG "IOAPIC %d:\n", apic);
 	for (i = 0; i <= nr_entries; i++) {
-		struct IR_IO_APIC_route_entry *ir_entry;
-		struct IO_APIC_route_entry entry;
-
 		entry = ioapic_read_entry(apic, i);
-
-		ir_entry = (struct IR_IO_APIC_route_entry *)&entry;
-
-		pr_debug(" %02x %04X ", i, ir_entry->index);
-		pr_cont("%1d   %1d    %1d    %1d   %1d   "
-			"%1d    %1d     %X    %02X\n",
-			ir_entry->format,
-			ir_entry->mask,
-			ir_entry->trigger,
-			ir_entry->irr,
-			ir_entry->polarity,
-			ir_entry->delivery_status,
-			ir_entry->index2,
-			ir_entry->zero,
-			ir_entry->vector);
+		snprintf(buf, sizeof(buf),
+			 " pin%02x, %s, %s, %s, V(%02X), IRR(%1d), S(%1d)",
+			 i,
+			 entry.mask == IOAPIC_MASKED ? "disabled" : "enabled ",
+			 entry.trigger == IOAPIC_LEVEL ? "level" : "edge ",
+			 entry.polarity == IOAPIC_POL_LOW ? "low " : "high",
+			 entry.vector, entry.irr, entry.delivery_status);
+		if (ir_entry->format)
+			printk(KERN_DEBUG "%s, remapped, I(%04X),  Z(%X)\n",
+			       buf, (ir_entry->index2 << 15) | ir_entry->index,
+			       ir_entry->zero);
+		else
+			printk(KERN_DEBUG "%s, %s, D(%02X), M(%1d)\n",
+			       buf,
+			       entry.dest_mode == IOAPIC_DEST_MODE_LOGICAL ?
+			       "logical " : "physical",
+			       entry.dest, entry.delivery_mode);
 	}
 }

-void ioapic_zap_locks(void)
-{
-	raw_spin_lock_init(&ioapic_lock);
-}
-
 static void __init print_IO_APIC(int ioapic_idx)
 {
 	union IO_APIC_reg_00 reg_00;
@@ -1451,16 +1329,13 @@ static void __init print_IO_APIC(int ioapic_idx)
 	}

 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-	x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
+	io_apic_print_entries(ioapic_idx, reg_01.bits.entries);
 }

 void __init print_IO_APICs(void)
 {
 	int ioapic_idx;
-	struct irq_cfg *cfg;
 	unsigned int irq;
-	struct irq_chip *chip;

 	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for_each_ioapic(ioapic_idx)
@@ -1480,18 +1355,20 @@ void __init print_IO_APICs(void)
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
 	for_each_active_irq(irq) {
 	for_each_active_irq(irq) {
 		struct irq_pin_list *entry;
 		struct irq_pin_list *entry;
+		struct irq_chip *chip;
+		struct mp_chip_data *data;
 
 
 		chip = irq_get_chip(irq);
 		chip = irq_get_chip(irq);
-		if (chip != &ioapic_chip)
+		if (chip != &ioapic_chip && chip != &ioapic_ir_chip)
 			continue;
 			continue;
-
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = irq_get_chip_data(irq);
+		if (!data)
 			continue;
 			continue;
-		if (list_empty(&cfg->irq_2_pin))
+		if (list_empty(&data->irq_2_pin))
 			continue;
 			continue;
+
 		printk(KERN_DEBUG "IRQ%d ", irq);
 		printk(KERN_DEBUG "IRQ%d ", irq);
-		for_each_irq_pin(entry, cfg->irq_2_pin)
+		for_each_irq_pin(entry, data->irq_2_pin)
 			pr_cont("-> %d:%d", entry->apic, entry->pin);
 			pr_cont("-> %d:%d", entry->apic, entry->pin);
 		pr_cont("\n");
 		pr_cont("\n");
 	}
 	}
@@ -1564,15 +1441,12 @@ void native_disable_io_apic(void)
 		struct IO_APIC_route_entry entry;

 		memset(&entry, 0, sizeof(entry));
-		entry.mask            = 0; /* Enabled */
-		entry.trigger         = 0; /* Edge */
-		entry.irr             = 0;
-		entry.polarity        = 0; /* High */
-		entry.delivery_status = 0;
-		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-		entry.vector          = 0;
-		entry.dest            = read_apic_id();
+		entry.mask		= IOAPIC_UNMASKED;
+		entry.trigger		= IOAPIC_EDGE;
+		entry.polarity		= IOAPIC_POL_HIGH;
+		entry.dest_mode		= IOAPIC_DEST_MODE_PHYSICAL;
+		entry.delivery_mode	= dest_ExtINT;
+		entry.dest		= read_apic_id();

 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1582,7 +1456,6 @@ void native_disable_io_apic(void)
 
 
 	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-
 }

 /*
@@ -1792,7 +1665,6 @@ static int __init timer_irq_works(void)
  * This is not complete - we should be able to fake
  * an edge even if it isn't on the 8259A...
  */
-
 static unsigned int startup_ioapic_irq(struct irq_data *data)
 {
 	int was_pending = 0, irq = data->irq;
@@ -1804,74 +1676,22 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
 		if (legacy_pic->irq_pending(irq))
 			was_pending = 1;
 	}
-	__unmask_ioapic(irqd_cfg(data));
+	__unmask_ioapic(data->chip_data);
 	raw_spin_unlock_irqrestore(&ioapic_lock, flags);

 	return was_pending;
 }

-/*
- * Level and edge triggered IO-APIC interrupts need different handling,
- * so we use two separate IRQ descriptors. Edge triggered IRQs can be
- * handled with the level-triggered descriptor, but that one has slightly
- * more overhead. Level-triggered interrupts cannot be handled with the
- * edge-triggered handler, without risking IRQ storms and other ugly
- * races.
- */
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
-		unsigned int reg;
-
-		apic = entry->apic;
-		pin = entry->pin;
-
-		io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-	}
-}
-
-int native_ioapic_set_affinity(struct irq_data *data,
-			       const struct cpumask *mask,
-			       bool force)
-{
-	unsigned int dest, irq = data->irq;
-	unsigned long flags;
-	int ret;
-
-	if (!config_enabled(CONFIG_SMP))
-		return -EPERM;
-
-	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	ret = apic_set_affinity(data, mask, &dest);
-	if (!ret) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, irqd_cfg(data));
-		ret = IRQ_SET_MASK_OK_NOCOPY;
-	}
-	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
-	return ret;
-}
-
 atomic_t irq_mis_count;

 #ifdef CONFIG_GENERIC_PENDING_IRQ
-static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
+static bool io_apic_level_ack_pending(struct mp_chip_data *data)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;

 	raw_spin_lock_irqsave(&ioapic_lock, flags);
-	for_each_irq_pin(entry, cfg->irq_2_pin) {
+	for_each_irq_pin(entry, data->irq_2_pin) {
 		unsigned int reg;
 		int pin;

@@ -1888,18 +1708,17 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
 	return false;
 }

-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	/* If we are moving the irq we need to mask it */
 	if (unlikely(irqd_is_setaffinity_pending(data))) {
-		mask_ioapic(cfg);
+		mask_ioapic_irq(data);
 		return true;
 	}
 	return false;
 }

-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 	if (unlikely(masked)) {
 		/* Only migrate the irq if the ack has been received.
@@ -1928,31 +1747,30 @@ static inline void ioapic_irqd_unmask(struct irq_data *data,
 		 * accurate and is causing problems then it is a hardware bug
 		 * and you can go talk to the chipset vendor about it.
 		 */
-		if (!io_apic_level_ack_pending(cfg))
+		if (!io_apic_level_ack_pending(data->chip_data))
 			irq_move_masked_irq(data);
-		unmask_ioapic(cfg);
+		unmask_ioapic_irq(data);
 	}
 }
 #else
-static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
+static inline bool ioapic_irqd_mask(struct irq_data *data)
 {
 	return false;
 }
-static inline void ioapic_irqd_unmask(struct irq_data *data,
-				      struct irq_cfg *cfg, bool masked)
+static inline void ioapic_irqd_unmask(struct irq_data *data, bool masked)
 {
 }
 #endif

-static void ack_ioapic_level(struct irq_data *data)
+static void ioapic_ack_level(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	int i, irq = data->irq;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	unsigned long v;
 	bool masked;
+	int i;

 	irq_complete_move(cfg);
-	masked = ioapic_irqd_mask(data, cfg);
+	masked = ioapic_irqd_mask(irq_data);

 	/*
 	 * It appears there is an erratum which affects at least version 0x11
@@ -2004,11 +1822,49 @@ static void ack_ioapic_level(struct irq_data *data)
 	 */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
+		eoi_ioapic_pin(cfg->vector, irq_data->chip_data);
+	}
+
+	ioapic_irqd_unmask(irq_data, masked);
+}
+
+static void ioapic_ir_ack_level(struct irq_data *irq_data)
+{
+	struct mp_chip_data *data = irq_data->chip_data;
+
+	/*
+	 * Intr-remapping uses pin number as the virtual vector
+	 * in the RTE. Actual vector is programmed in
+	 * intr-remapping table entry. Hence for the io-apic
+	 * EOI we use the pin number.
+	 */
+	ack_APIC_irq();
+	eoi_ioapic_pin(data->entry.vector, data);
+}

-		eoi_ioapic_irq(irq, cfg);
+static int ioapic_set_affinity(struct irq_data *irq_data,
+			       const struct cpumask *mask, bool force)
+{
+	struct irq_data *parent = irq_data->parent_data;
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_pin_list *entry;
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) {
+		cfg = irqd_cfg(irq_data);
+		data->entry.dest = cfg->dest_apicid;
+		data->entry.vector = cfg->vector;
+		for_each_irq_pin(entry, data->irq_2_pin)
+			__ioapic_write_entry(entry->apic, entry->pin,
+					     data->entry);
 	}
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);

-	ioapic_irqd_unmask(data, cfg, masked);
+	return ret;
 }

 static struct irq_chip ioapic_chip __read_mostly = {
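ioapic_set_affinity() above is the canonical hierarchical-irqdomain shape: delegate to the parent (the vector domain), and only if the parent actually changed something, propagate the new vector/destination into the hardware. A generic sketch of that pattern, using only symbols that appear in this patch:

/* Sketch: parent-first affinity in a hierarchical irq_chip callback */
static int child_set_affinity_sketch(struct irq_data *irqd,
				     const struct cpumask *mask, bool force)
{
	struct irq_data *parent = irqd->parent_data;
	int ret;

	/* 1) parent (x86_vector_domain) assigns vector + destination */
	ret = parent->chip->irq_set_affinity(parent, mask, force);
	if (ret < 0 || ret == IRQ_SET_MASK_OK_DONE)
		return ret;

	/* 2) child pushes irqd_cfg(irqd)->vector/dest_apicid to its hardware */
	return ret;
}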
@@ -2016,10 +1872,20 @@ static struct irq_chip ioapic_chip __read_mostly = {
 	.irq_startup		= startup_ioapic_irq,
 	.irq_mask		= mask_ioapic_irq,
 	.irq_unmask		= unmask_ioapic_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_eoi		= ack_ioapic_level,
-	.irq_set_affinity	= native_ioapic_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+static struct irq_chip ioapic_ir_chip __read_mostly = {
+	.name			= "IR-IO-APIC",
+	.irq_startup		= startup_ioapic_irq,
+	.irq_mask		= mask_ioapic_irq,
+	.irq_unmask		= unmask_ioapic_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_eoi		= ioapic_ir_ack_level,
+	.irq_set_affinity	= ioapic_set_affinity,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };

@@ -2113,12 +1979,12 @@ static inline void __init unlock_ExtINT_logic(void)

 	memset(&entry1, 0, sizeof(entry1));

-	entry1.dest_mode = 0;			/* physical delivery */
-	entry1.mask = 0;			/* unmask IRQ now */
+	entry1.dest_mode = IOAPIC_DEST_MODE_PHYSICAL;
+	entry1.mask = IOAPIC_UNMASKED;
 	entry1.dest = hard_smp_processor_id();
 	entry1.delivery_mode = dest_ExtINT;
 	entry1.polarity = entry0.polarity;
-	entry1.trigger = 0;
+	entry1.trigger = IOAPIC_EDGE;
 	entry1.vector = 0;

 	ioapic_write_entry(apic, pin, entry1);
@@ -2152,6 +2018,25 @@ static int __init disable_timer_pin_setup(char *arg)
 }
 early_param("disable_timer_pin_1", disable_timer_pin_setup);

+static int mp_alloc_timer_irq(int ioapic, int pin)
+{
+	int irq = -1;
+	struct irq_domain *domain = mp_ioapic_irqdomain(ioapic);
+
+	if (domain) {
+		struct irq_alloc_info info;
+
+		ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 0, 0);
+		info.ioapic_id = mpc_ioapic_id(ioapic);
+		info.ioapic_pin = pin;
+		mutex_lock(&ioapic_mutex);
+		irq = alloc_isa_irq_from_domain(domain, 0, ioapic, pin, &info);
+		mutex_unlock(&ioapic_mutex);
+	}
+
+	return irq;
+}
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
@@ -2162,7 +2047,9 @@ early_param("disable_timer_pin_1", disable_timer_pin_setup);
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg(0);
+	struct irq_data *irq_data = irq_get_irq_data(0);
+	struct mp_chip_data *data = irq_data->chip_data;
+	struct irq_cfg *cfg = irqd_cfg(irq_data);
 	int node = cpu_to_node(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
@@ -2174,7 +2061,6 @@ static inline void __init check_timer(void)
 	 * get/set the timer IRQ vector:
 	 */
 	legacy_pic->mask(0);
-	assign_irq_vector(0, cfg, apic->target_cpus());

 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
@@ -2215,23 +2101,21 @@ static inline void __init check_timer(void)
 	}

 	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
+		/* Ok, does IRQ0 through the IOAPIC work? */
 		if (no_pin1) {
-			add_pin_to_irq_node(cfg, node, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
+			mp_alloc_timer_irq(apic1, pin1);
 		} else {
-			/* for edge trigger, setup_ioapic_irq already
-			 * leave it unmasked.
+			/*
+			 * for edge trigger, it's already unmasked,
 			 * so only need to unmask if it is level-trigger
 			 * do we really have level trigger timer?
 			 */
 			int idx;
 			idx = find_irq_entry(apic1, pin1, mp_INT);
 			if (idx != -1 && irq_trigger(idx))
-				unmask_ioapic(cfg);
+				unmask_ioapic_irq(irq_get_irq_data(0));
 		}
+		irq_domain_activate_irq(irq_data);
 		if (timer_irq_works()) {
 			if (disable_timer_pin_1 > 0)
 				clear_IO_APIC_pin(0, pin1);
@@ -2251,8 +2135,8 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
-		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
+		replace_pin_at_irq_node(data, node, apic1, pin1, apic2, pin2);
+		irq_domain_activate_irq(irq_data);
 		legacy_pic->unmask(0);
 		if (timer_irq_works()) {
 			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
@@ -2329,36 +2213,35 @@ out:

 static int mp_irqdomain_create(int ioapic)
 {
-	size_t size;
+	struct irq_alloc_info info;
+	struct irq_domain *parent;
 	int hwirqs = mp_ioapic_pin_count(ioapic);
 	struct ioapic *ip = &ioapics[ioapic];
 	struct ioapic_domain_cfg *cfg = &ip->irqdomain_cfg;
 	struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(ioapic);

-	size = sizeof(struct mp_pin_info) * mp_ioapic_pin_count(ioapic);
-	ip->pin_info = kzalloc(size, GFP_KERNEL);
-	if (!ip->pin_info)
-		return -ENOMEM;
-
 	if (cfg->type == IOAPIC_DOMAIN_INVALID)
 		return 0;

+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_IOAPIC;
+	info.ioapic_id = mpc_ioapic_id(ioapic);
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (!parent)
+		parent = x86_vector_domain;
+
 	ip->irqdomain = irq_domain_add_linear(cfg->dev, hwirqs, cfg->ops,
 					      (void *)(long)ioapic);
-	if(!ip->irqdomain) {
-		kfree(ip->pin_info);
-		ip->pin_info = NULL;
+	if (!ip->irqdomain)
 		return -ENOMEM;
-	}
+
+	ip->irqdomain->parent = parent;

 	if (cfg->type == IOAPIC_DOMAIN_LEGACY ||
 	    cfg->type == IOAPIC_DOMAIN_STRICT)
 		ioapic_dynirq_base = max(ioapic_dynirq_base,
 					 gsi_cfg->gsi_end + 1);

-	if (gsi_cfg->gsi_base == 0)
-		irq_set_default_host(ip->irqdomain);
-
 	return 0;
 }

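mp_irqdomain_create() above parents every IOAPIC domain to either the interrupt-remapping domain or x86_vector_domain, so each allocation walks down the hierarchy. A hypothetical helper showing an allocation flowing through that chain, using only APIs visible in this patch:

/* Sketch: allocate one IRQ through the IOAPIC domain hierarchy */
static int ioapic_domain_alloc_sketch(int ioapic, struct irq_alloc_info *info)
{
	struct irq_domain *d = mp_ioapic_irqdomain(ioapic);

	if (!d)
		return -ENOSYS;
	/* recurses d -> d->parent (IR domain or x86_vector_domain) */
	return __irq_domain_alloc_irqs(d, -1, 1, NUMA_NO_NODE, info, false);
}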
@@ -2368,8 +2251,6 @@ static void ioapic_destroy_irqdomain(int idx)
 		irq_domain_remove(ioapics[idx].irqdomain);
 		ioapics[idx].irqdomain = NULL;
 	}
-	kfree(ioapics[idx].pin_info);
-	ioapics[idx].pin_info = NULL;
 }

 void __init setup_IO_APIC(void)
@@ -2399,20 +2280,6 @@ void __init setup_IO_APIC(void)
 	ioapic_initialized = 1;
 }

-/*
- *      Called after all the initialization is done. If we didn't find any
- *      APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
-	if (sis_apic_bug == -1)
-		sis_apic_bug = 0;
-	return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
 static void resume_ioapic_id(int ioapic_idx)
 {
 	unsigned long flags;
@@ -2451,20 +2318,6 @@ static int __init ioapic_init_ops(void)
 
 
 device_initcall(ioapic_init_ops);

-static int
-io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
-{
-	struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
-	int ret;
-
-	if (!cfg)
-		return -EINVAL;
-	ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
-	if (!ret)
-		setup_ioapic_irq(irq, cfg, attr);
-	return ret;
-}
-
 static int io_apic_get_redir_entries(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
@@ -2692,7 +2545,7 @@ void __init setup_ioapic_dest(void)
 		else
 			mask = apic->target_cpus();

-		x86_io_apic_ops.set_affinity(idata, mask, false);
+		irq_set_affinity(irq, mask);
 	}

 }
@@ -2737,7 +2590,7 @@ static struct resource * __init ioapic_setup_resources(void)
 	return res;
 }

-void __init native_io_apic_init_mappings(void)
+void __init io_apic_init_mappings(void)
 {
 	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
 	struct resource *ioapic_res;
@@ -2962,7 +2815,6 @@ int mp_unregister_ioapic(u32 gsi_base)
 {
 	int ioapic, pin;
 	int found = 0;
-	struct mp_pin_info *pin_info;

 	for_each_ioapic(ioapic)
 		if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
@@ -2975,11 +2827,17 @@ int mp_unregister_ioapic(u32 gsi_base)
 	}

 	for_each_pin(ioapic, pin) {
-		pin_info = mp_pin_info(ioapic, pin);
-		if (pin_info->count) {
-			pr_warn("pin%d on IOAPIC%d is still in use.\n",
-				pin, ioapic);
-			return -EBUSY;
+		u32 gsi = mp_pin_to_gsi(ioapic, pin);
+		int irq = mp_map_gsi_to_irq(gsi, 0, NULL);
+		struct mp_chip_data *data;
+
+		if (irq >= 0) {
+			data = irq_get_chip_data(irq);
+			if (data && data->count) {
+				pr_warn("pin%d on IOAPIC%d is still in use.\n",
+					pin, ioapic);
+				return -EBUSY;
+			}
 		}
 	}

@@ -3006,108 +2864,141 @@ int mp_ioapic_registered(u32 gsi_base)
 	return 0;
 }

-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
-					int ioapic, int ioapic_pin,
-					int trigger, int polarity)
+static void mp_irqdomain_get_attr(u32 gsi, struct mp_chip_data *data,
+				  struct irq_alloc_info *info)
 {
-	irq_attr->ioapic	= ioapic;
-	irq_attr->ioapic_pin	= ioapic_pin;
-	irq_attr->trigger	= trigger;
-	irq_attr->polarity	= polarity;
+	if (info && info->ioapic_valid) {
+		data->trigger = info->ioapic_trigger;
+		data->polarity = info->ioapic_polarity;
+	} else if (acpi_get_override_irq(gsi, &data->trigger,
+					 &data->polarity) < 0) {
+		/* PCI interrupts are always active low level triggered. */
+		data->trigger = IOAPIC_LEVEL;
+		data->polarity = IOAPIC_POL_LOW;
+	}
 }

-int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
-		     irq_hw_number_t hwirq)
+static void mp_setup_entry(struct irq_cfg *cfg, struct mp_chip_data *data,
+			   struct IO_APIC_route_entry *entry)
 {
-	int ioapic = (int)(long)domain->host_data;
-	struct mp_pin_info *info = mp_pin_info(ioapic, hwirq);
-	struct io_apic_irq_attr attr;
+	memset(entry, 0, sizeof(*entry));
+	entry->delivery_mode = apic->irq_delivery_mode;
+	entry->dest_mode     = apic->irq_dest_mode;
+	entry->dest	     = cfg->dest_apicid;
+	entry->vector	     = cfg->vector;
+	entry->trigger	     = data->trigger;
+	entry->polarity	     = data->polarity;
+	/*
+	 * Mask level triggered irqs. Edge triggered irqs are masked
+	 * by the irq core code in case they fire.
+	 */
+	if (data->trigger == IOAPIC_LEVEL)
+		entry->mask = IOAPIC_MASKED;
+	else
+		entry->mask = IOAPIC_UNMASKED;
+}

-	/* Get default attribute if not set by caller yet */
-	if (!info->set) {
-		u32 gsi = mp_pin_to_gsi(ioapic, hwirq);
+int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs, void *arg)
+{
+	int ret, ioapic, pin;
+	struct irq_cfg *cfg;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;
+	struct irq_alloc_info *info = arg;

-		if (acpi_get_override_irq(gsi, &info->trigger,
-					  &info->polarity) < 0) {
-			/*
-			 * PCI interrupts are always polarity one level
-			 * triggered.
-			 */
-			info->trigger = 1;
-			info->polarity = 1;
-		}
-		info->node = NUMA_NO_NODE;
+	if (!info || nr_irqs > 1)
+		return -EINVAL;
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (!irq_data)
+		return -EINVAL;

-		/*
-		 * setup_IO_APIC_irqs() programs all legacy IRQs with default
-		 * trigger and polarity attributes. Don't set the flag for that
-		 * case so the first legacy IRQ user could reprogram the pin
-		 * with real trigger and polarity attributes.
-		 */
-		if (virq >= nr_legacy_irqs() || info->count)
-			info->set = 1;
-	}
-	set_io_apic_irq_attr(&attr, ioapic, hwirq, info->trigger,
-			     info->polarity);
+	ioapic = mp_irqdomain_ioapic_idx(domain);
+	pin = info->ioapic_pin;
+	if (irq_find_mapping(domain, (irq_hw_number_t)pin) > 0)
+		return -EEXIST;

-	return io_apic_setup_irq_pin(virq, info->node, &attr);
-}
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;

-void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
-{
-	struct irq_data *data = irq_get_irq_data(virq);
-	struct irq_cfg *cfg = irq_cfg(virq);
-	int ioapic = (int)(long)domain->host_data;
-	int pin = (int)data->hwirq;
+	info->ioapic_entry = &data->entry;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
+	if (ret < 0) {
+		kfree(data);
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&data->irq_2_pin);
+	irq_data->hwirq = info->ioapic_pin;
+	irq_data->chip = (domain->parent == x86_vector_domain) ?
+			  &ioapic_chip : &ioapic_ir_chip;
+	irq_data->chip_data = data;
+	mp_irqdomain_get_attr(mp_pin_to_gsi(ioapic, pin), data, info);
+
+	cfg = irqd_cfg(irq_data);
+	add_pin_to_irq_node(data, ioapic_alloc_attr_node(info), ioapic, pin);
+	if (info->ioapic_entry)
+		mp_setup_entry(cfg, data, info->ioapic_entry);
+	mp_register_handler(virq, data->trigger);
+	if (virq < nr_legacy_irqs())
+		legacy_pic->mask(virq);
+
+	apic_printk(APIC_VERBOSE, KERN_DEBUG
+		    "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i Dest:%d)\n",
+		    ioapic, mpc_ioapic_id(ioapic), pin, cfg->vector,
+		    virq, data->trigger, data->polarity, cfg->dest_apicid);

-	ioapic_mask_entry(ioapic, pin);
-	__remove_pin_from_irq(cfg, ioapic, pin);
-	WARN_ON(!list_empty(&cfg->irq_2_pin));
-	arch_teardown_hwirq(virq);
+	return 0;
 }

-int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
+void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
+		       unsigned int nr_irqs)
 {
-	int ret = 0;
-	int ioapic, pin;
-	struct mp_pin_info *info;
+	struct irq_data *irq_data;
+	struct mp_chip_data *data;

-	ioapic = mp_find_ioapic(gsi);
-	if (ioapic < 0)
-		return -ENODEV;
-
-	pin = mp_find_ioapic_pin(ioapic, gsi);
-	info = mp_pin_info(ioapic, pin);
-	trigger = trigger ? 1 : 0;
-	polarity = polarity ? 1 : 0;
-
-	mutex_lock(&ioapic_mutex);
-	if (!info->set) {
-		info->trigger = trigger;
-		info->polarity = polarity;
-		info->node = node;
-		info->set = 1;
-	} else if (info->trigger != trigger || info->polarity != polarity) {
-		ret = -EBUSY;
+	BUG_ON(nr_irqs != 1);
+	irq_data = irq_domain_get_irq_data(domain, virq);
+	if (irq_data && irq_data->chip_data) {
+		data = irq_data->chip_data;
+		__remove_pin_from_irq(data, mp_irqdomain_ioapic_idx(domain),
+				      (int)irq_data->hwirq);
+		WARN_ON(!list_empty(&data->irq_2_pin));
+		kfree(irq_data->chip_data);
 	}
-	mutex_unlock(&ioapic_mutex);
-
-	return ret;
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }

-/* Enable IOAPIC early just for system timer */
-void __init pre_init_apic_IRQ0(void)
+void mp_irqdomain_activate(struct irq_domain *domain,
+			   struct irq_data *irq_data)
 {
-	struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
+	unsigned long flags;
+	struct irq_pin_list *entry;
+	struct mp_chip_data *data = irq_data->chip_data;

-	printk(KERN_INFO "Early APIC setup for system timer0\n");
-#ifndef CONFIG_SMP
-	physid_set_mask_of_physid(boot_cpu_physical_apicid,
-					 &phys_cpu_present_map);
-#endif
-	setup_local_APIC();
+	raw_spin_lock_irqsave(&ioapic_lock, flags);
+	for_each_irq_pin(entry, data->irq_2_pin)
+		__ioapic_write_entry(entry->apic, entry->pin, data->entry);
+	raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+}

-	io_apic_setup_irq_pin(0, 0, &attr);
-	irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
-				      "edge");
+void mp_irqdomain_deactivate(struct irq_domain *domain,
+			     struct irq_data *irq_data)
+{
+	/* Not called for an IRQ with multiple IOAPIC pins associated */
+	ioapic_mask_entry(mp_irqdomain_ioapic_idx(domain),
+			  (int)irq_data->hwirq);
+}
+
+int mp_irqdomain_ioapic_idx(struct irq_domain *domain)
+{
+	return (int)(long)domain->host_data;
 }
+
+const struct irq_domain_ops mp_ioapic_irqdomain_ops = {
+	.alloc		= mp_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
+};
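The ops table above is the pin's whole life cycle: .alloc reserves chip data and a parent vector, .activate finally writes the RTE, .deactivate masks it, .free tears everything down. A hypothetical walk-through built from calls that appear in this patch:

/* Sketch: one IOAPIC pin driven through mp_ioapic_irqdomain_ops */
static void mp_irqdomain_lifecycle_sketch(struct irq_domain *domain,
					  struct irq_alloc_info *info)
{
	int virq = __irq_domain_alloc_irqs(domain, -1, 1, NUMA_NO_NODE,
					   info, false);		/* .alloc */

	if (virq >= 0) {
		irq_domain_activate_irq(irq_get_irq_data(virq));	/* .activate */
		irq_domain_free_irqs(virq, 1);		/* .deactivate + .free */
	}
}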

+ 244 - 173
arch/x86/kernel/apic/msi.c

@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Convert to hierarchical irqdomain
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -14,22 +16,23 @@
 #include <linux/dmar.h>
 #include <linux/hpet.h>
 #include <linux/msi.h>
+#include <asm/irqdomain.h>
 #include <asm/msidef.h>
 #include <asm/hpet.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>

-void native_compose_msi_msg(struct pci_dev *pdev,
-			    unsigned int irq, unsigned int dest,
-			    struct msi_msg *msg, u8 hpet_id)
+static struct irq_domain *msi_default_domain;
+
+static void irq_msi_compose_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_cfg *cfg = irqd_cfg(data);

 	msg->address_hi = MSI_ADDR_BASE_HI;

 	if (x2apic_enabled())
-		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+		msg->address_hi |= MSI_ADDR_EXT_DEST_ID(cfg->dest_apicid);

 	msg->address_lo =
 		MSI_ADDR_BASE_LO |
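irq_msi_compose_msg() packs the destination APIC ID into the address word and the vector into the data word. A sketch that pulls those routing fields back out of a composed message, using the msidef.h masks referenced in this hunk (the helper itself, and the assumption that MSI_ADDR_DEST_ID_SHIFT matches the mask, are mine):

/* Sketch: decode the routing fields of a composed MSI message */
static void msi_msg_decode_sketch(const struct msi_msg *msg)
{
	unsigned int dest   = (msg->address_lo & MSI_ADDR_DEST_ID_MASK)
			      >> MSI_ADDR_DEST_ID_SHIFT;
	unsigned int vector = msg->data & MSI_DATA_VECTOR_MASK;

	pr_debug("MSI -> APIC %#x, vector %#x\n", dest, vector);
}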
@@ -39,7 +42,7 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		((apic->irq_delivery_mode != dest_LowestPrio) ?
 			MSI_ADDR_REDIRECTION_CPU :
 			MSI_ADDR_REDIRECTION_LOWPRI) |
-		MSI_ADDR_DEST_ID(dest);
+		MSI_ADDR_DEST_ID(cfg->dest_apicid);

 	msg->data =
 		MSI_DATA_TRIGGER_EDGE |
@@ -50,180 +53,201 @@ void native_compose_msi_msg(struct pci_dev *pdev,
 		MSI_DATA_VECTOR(cfg->vector);
 }

-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
-			   struct msi_msg *msg, u8 hpet_id)
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip pci_msi_controller = {
+	.name			= "PCI-MSI",
+	.irq_unmask		= pci_msi_unmask_irq,
+	.irq_mask		= pci_msi_mask_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 {
-	struct irq_cfg *cfg;
-	int err;
-	unsigned dest;
+	struct irq_domain *domain;
+	struct irq_alloc_info info;

-	if (disable_apic)
-		return -ENXIO;
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_MSI;
+	info.msi_dev = dev;

-	cfg = irq_cfg(irq);
-	err = assign_irq_vector(irq, cfg, apic->target_cpus());
-	if (err)
-		return err;
+	domain = irq_remapping_get_irq_domain(&info);
+	if (domain == NULL)
+		domain = msi_default_domain;
+	if (domain == NULL)
+		return -ENOSYS;

-	err = apic->cpu_mask_to_apicid_and(cfg->domain,
-					   apic->target_cpus(), &dest);
-	if (err)
-		return err;
+	return pci_msi_domain_alloc_irqs(domain, dev, nvec, type);
+}

-	x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+void native_teardown_msi_irq(unsigned int irq)
+{
+	irq_domain_free_irqs(irq, 1);
+}

-	return 0;
+static irq_hw_number_t pci_msi_get_hwirq(struct msi_domain_info *info,
+					 msi_alloc_info_t *arg)
+{
+	return arg->msi_hwirq;
 }

-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+static int pci_msi_prepare(struct irq_domain *domain, struct device *dev,
+			   int nvec, msi_alloc_info_t *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct msi_desc *desc = first_pci_msi_entry(pdev);
+
+	init_irq_alloc_info(arg, NULL);
+	arg->msi_dev = pdev;
+	if (desc->msi_attrib.is_msix) {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSIX;
+	} else {
+		arg->type = X86_IRQ_ALLOC_TYPE_MSI;
+		arg->flags |= X86_IRQ_ALLOC_CONTIGUOUS_VECTORS;
+	}

-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	return 0;
+}

-	__get_cached_msi_msg(data->msi_desc, &msg);
+static void pci_msi_set_desc(msi_alloc_info_t *arg, struct msi_desc *desc)
+{
+	arg->msi_hwirq = pci_msi_domain_calc_hwirq(arg->msi_dev, desc);
+}
+
+static struct msi_domain_ops pci_msi_domain_ops = {
+	.get_hwirq	= pci_msi_get_hwirq,
+	.msi_prepare	= pci_msi_prepare,
+	.set_desc	= pci_msi_set_desc,
+};

-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static struct msi_domain_info pci_msi_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};

-	__pci_write_msi_msg(data->msi_desc, &msg);
+void arch_init_msi_domain(struct irq_domain *parent)
+{
+	if (disable_apic)
+		return;

-	return IRQ_SET_MASK_OK_NOCOPY;
+	msi_default_domain = pci_msi_create_irq_domain(NULL,
+					&pci_msi_domain_info, parent);
+	if (!msi_default_domain)
+		pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
 }

-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
-	.name			= "PCI-MSI",
+#ifdef CONFIG_IRQ_REMAP
+static struct irq_chip pci_msi_ir_controller = {
+	.name			= "IR-PCI-MSI",
 	.irq_unmask		= pci_msi_unmask_irq,
 	.irq_mask		= pci_msi_mask_irq,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_set_vcpu_affinity	= irq_chip_set_vcpu_affinity_parent,
 	.flags			= IRQCHIP_SKIP_SET_WAKE,
 };

-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
-		  unsigned int irq_base, unsigned int irq_offset)
-{
-	struct irq_chip *chip = &msi_chip;
-	struct msi_msg msg;
-	unsigned int irq = irq_base + irq_offset;
-	int ret;
-
-	ret = msi_compose_msg(dev, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-
-	irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
-
-	/*
-	 * MSI-X message is written per-IRQ, the offset is always 0.
-	 * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
-	 */
-	if (!irq_offset)
-		pci_write_msi_msg(irq, &msg);
+static struct msi_domain_info pci_msi_ir_domain_info = {
+	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
+			  MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX,
+	.ops		= &pci_msi_domain_ops,
+	.chip		= &pci_msi_ir_controller,
+	.handler	= handle_edge_irq,
+	.handler_name	= "edge",
+};

-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
+{
+	return pci_msi_create_irq_domain(NULL, &pci_msi_ir_domain_info, parent);
+}
+#endif

-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+#ifdef CONFIG_DMAR_TABLE
+static void dmar_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
+{
+	dmar_msi_write(data->irq, msg);
+}

-	dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq);
+static struct irq_chip dmar_msi_controller = {
+	.name			= "DMAR-MSI",
+	.irq_unmask		= dmar_msi_unmask,
+	.irq_mask		= dmar_msi_mask,
+	.irq_ack		= irq_chip_ack_parent,
+	.irq_set_affinity	= msi_domain_set_affinity,
+	.irq_retrigger		= irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg	= irq_msi_compose_msg,
+	.irq_write_msi_msg	= dmar_msi_write_msg,
+	.flags			= IRQCHIP_SKIP_SET_WAKE,
+};

-	return 0;
+static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->dmar_id;
 }

-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+static int dmar_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
 {
-	struct msi_desc *msidesc;
-	unsigned int irq;
-	int node, ret;
+	irq_domain_set_info(domain, virq, arg->dmar_id, info->chip, NULL,
+			    handle_edge_irq, arg->dmar_data, "edge");

-	/* Multiple MSI vectors only supported with interrupt remapping */
-	if (type == PCI_CAP_ID_MSI && nvec > 1)
-		return 1;
+	return 0;
+}

-	node = dev_to_node(&dev->dev);
+static struct msi_domain_ops dmar_msi_domain_ops = {
+	.get_hwirq	= dmar_msi_get_hwirq,
+	.msi_init	= dmar_msi_init,
+};

-	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = irq_alloc_hwirq(node);
-		if (!irq)
-			return -ENOSPC;
+static struct msi_domain_info dmar_msi_domain_info = {
+	.ops		= &dmar_msi_domain_ops,
+	.chip		= &dmar_msi_controller,
+};

-		ret = setup_msi_irq(dev, msidesc, irq, 0);
-		if (ret < 0) {
-			irq_free_hwirq(irq);
-			return ret;
-		}
+static struct irq_domain *dmar_get_irq_domain(void)
+{
+	static struct irq_domain *dmar_domain;
+	static DEFINE_MUTEX(dmar_lock);

-	}
-	return 0;
-}
+	mutex_lock(&dmar_lock);
+	if (dmar_domain == NULL)
+		dmar_domain = msi_create_irq_domain(NULL, &dmar_msi_domain_info,
+						    x86_vector_domain);
+	mutex_unlock(&dmar_lock);

-void native_teardown_msi_irq(unsigned int irq)
-{
-	irq_free_hwirq(irq);
+	return dmar_domain;
 }

-#ifdef CONFIG_DMAR_TABLE
-static int
-dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      bool force)
+int dmar_alloc_hwirq(int id, int node, void *arg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest, irq = data->irq;
-	struct msi_msg msg;
-	int ret;
-
-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+	struct irq_domain *domain = dmar_get_irq_domain();
+	struct irq_alloc_info info;

-	dmar_msi_read(irq, &msg);
+	if (!domain)
+		return -1;

-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-	msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_DMAR;
+	info.dmar_id = id;
+	info.dmar_data = arg;

-	dmar_msi_write(irq, &msg);
-
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return irq_domain_alloc_irqs(domain, 1, node, &info);
 }

-static struct irq_chip dmar_msi_type = {
-	.name			= "DMAR_MSI",
-	.irq_unmask		= dmar_msi_unmask,
-	.irq_mask		= dmar_msi_mask,
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= dmar_msi_set_affinity,
-	.irq_retrigger		= apic_retrigger_irq,
-	.flags			= IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
+void dmar_free_hwirq(int irq)
 {
-	int ret;
-	struct msi_msg msg;
-
-	ret = msi_compose_msg(NULL, irq, &msg, -1);
-	if (ret < 0)
-		return ret;
-	dmar_msi_write(irq, &msg);
-	irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
-				      "edge");
-	return 0;
+	irq_domain_free_irqs(irq, 1);
 }
 #endif

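For context, a hedged sketch of the caller side, modeled on dmar_set_interrupt() in drivers/iommu/dmar.c (the demo_ wrapper and the request_irq flags are illustrative): the id parameter becomes the hwirq within the DMAR domain, and arg is stashed as handler data:

static int demo_dmar_set_interrupt(struct intel_iommu *iommu)
{
	/* seq_id becomes the hwirq; the iommu pointer rides in dmar_data. */
	int irq = dmar_alloc_hwirq(iommu->seq_id, iommu->node, iommu);

	if (irq <= 0)
		return -EINVAL;

	iommu->irq = irq;
	return request_irq(irq, dmar_fault, 0, "dmar_fault", iommu);
}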
@@ -231,56 +255,103 @@ int arch_setup_dmar_msi(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_HPET_TIMER
+static inline int hpet_dev_id(struct irq_domain *domain)
+{
+	struct msi_domain_info *info = msi_get_domain_info(domain);
+
+	return (int)(long)info->data;
+}

-static int hpet_msi_set_affinity(struct irq_data *data,
-				 const struct cpumask *mask, bool force)
+static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	struct msi_msg msg;
-	unsigned int dest;
-	int ret;
+	hpet_msi_write(data->handler_data, msg);
+}

-	ret = apic_set_affinity(data, mask, &dest);
-	if (ret)
-		return ret;
+static struct irq_chip hpet_msi_controller = {
+	.name = "HPET-MSI",
+	.irq_unmask = hpet_msi_unmask,
+	.irq_mask = hpet_msi_mask,
+	.irq_ack = irq_chip_ack_parent,
+	.irq_set_affinity = msi_domain_set_affinity,
+	.irq_retrigger = irq_chip_retrigger_hierarchy,
+	.irq_compose_msi_msg = irq_msi_compose_msg,
+	.irq_write_msi_msg = hpet_msi_write_msg,
+	.flags = IRQCHIP_SKIP_SET_WAKE,
+};

-	hpet_msi_read(data->handler_data, &msg);
+static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
+					  msi_alloc_info_t *arg)
+{
+	return arg->hpet_index;
+}

-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(cfg->vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+static int hpet_msi_init(struct irq_domain *domain,
+			 struct msi_domain_info *info, unsigned int virq,
+			 irq_hw_number_t hwirq, msi_alloc_info_t *arg)
+{
+	irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_domain_set_info(domain, virq, arg->hpet_index, info->chip, NULL,
+			    handle_edge_irq, arg->hpet_data, "edge");

-	hpet_msi_write(data->handler_data, &msg);
+	return 0;
+}

-	return IRQ_SET_MASK_OK_NOCOPY;
+static void hpet_msi_free(struct irq_domain *domain,
+			  struct msi_domain_info *info, unsigned int virq)
+{
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
 }
 }

-	.name = "HPET_MSI",
-	.irq_unmask = hpet_msi_unmask,
-	.irq_mask = hpet_msi_mask,
-	.irq_ack = apic_ack_edge,
-	.irq_set_affinity = hpet_msi_set_affinity,
-	.irq_retrigger = apic_retrigger_irq,
-	.flags = IRQCHIP_SKIP_SET_WAKE,
+static struct msi_domain_ops hpet_msi_domain_ops = {
+	.get_hwirq	= hpet_msi_get_hwirq,
+	.msi_init	= hpet_msi_init,
+	.msi_free	= hpet_msi_free,
+};
+
+static struct msi_domain_info hpet_msi_domain_info = {
+	.ops		= &hpet_msi_domain_ops,
+	.chip		= &hpet_msi_controller,
 };

-int default_setup_hpet_msi(unsigned int irq, unsigned int id)
+struct irq_domain *hpet_create_irq_domain(int hpet_id)
 {
-	struct irq_chip *chip = &hpet_msi_type;
-	struct msi_msg msg;
-	int ret;
+	struct irq_domain *parent;
+	struct irq_alloc_info info;
+	struct msi_domain_info *domain_info;
+
+	if (x86_vector_domain == NULL)
+		return NULL;
+
+	domain_info = kzalloc(sizeof(*domain_info), GFP_KERNEL);
+	if (!domain_info)
+		return NULL;
+
+	*domain_info = hpet_msi_domain_info;
+	domain_info->data = (void *)(long)hpet_id;
+
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_id = hpet_id;
+	parent = irq_remapping_get_ir_irq_domain(&info);
+	if (parent == NULL)
+		parent = x86_vector_domain;
+	else
+		hpet_msi_controller.name = "IR-HPET-MSI";
+
+	return msi_create_irq_domain(NULL, domain_info, parent);
+}

-	ret = msi_compose_msg(NULL, irq, &msg, id);
-	if (ret < 0)
-		return ret;
+int hpet_assign_irq(struct irq_domain *domain, struct hpet_dev *dev,
+		    int dev_num)
+{
+	struct irq_alloc_info info;

-	hpet_msi_write(irq_get_handler_data(irq), &msg);
-	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-	setup_remapped_irq(irq, irq_cfg(irq), chip);
+	init_irq_alloc_info(&info, NULL);
+	info.type = X86_IRQ_ALLOC_TYPE_HPET;
+	info.hpet_data = dev;
+	info.hpet_id = hpet_dev_id(domain);
+	info.hpet_index = dev_num;

-	irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
-	return 0;
+	return irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, &info);
 }
 #endif
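The hpet.c consumer side then looks roughly like this (a sketch under the patch's API; demo_setup_hpet_channel is illustrative, struct hpet_dev is hpet.c's per-comparator bookkeeping):

static struct irq_domain *hpet_domain;

static int demo_setup_hpet_channel(struct hpet_dev *hdev, int hpet_id, int num)
{
	if (!hpet_domain)
		hpet_domain = hpet_create_irq_domain(hpet_id);
	if (!hpet_domain)
		return -ENODEV;

	/* The comparator index (hpet_index) becomes the hwirq. */
	return hpet_assign_irq(hpet_domain, hdev, num);
}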

+ 279 - 169
arch/x86/kernel/apic/vector.c

@@ -3,6 +3,8 @@
  *
  * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
  *	Moved from arch/x86/kernel/apic/io_apic.c.
+ * Jiang Liu <jiang.liu@linux.intel.com>
+ *	Enable support of hierarchical irqdomains
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -11,15 +13,28 @@
 #include <linux/interrupt.h>
 #include <linux/init.h>
 #include <linux/compiler.h>
-#include <linux/irqdomain.h>
 #include <linux/slab.h>
+#include <asm/irqdomain.h>
 #include <asm/hw_irq.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/desc.h>
 #include <asm/irq_remapping.h>

+struct apic_chip_data {
+	struct irq_cfg		cfg;
+	cpumask_var_t		domain;
+	cpumask_var_t		old_domain;
+	u8			move_in_progress : 1;
+};
+
+struct irq_domain *x86_vector_domain;
 static DEFINE_RAW_SPINLOCK(vector_lock);
+static cpumask_var_t vector_cpumask;
+static struct irq_chip lapic_controller;
+#ifdef	CONFIG_X86_IO_APIC
+static struct apic_chip_data *legacy_irq_data[NR_IRQS_LEGACY];
+#endif

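These globals are the root of the new hierarchy; the resulting stacking looks like this (illustrative diagram, not from the patch):

/*
 *  [PCI-MSI]  [HPET-MSI]  [DMAR-MSI]  [IO-APIC]  [HT]    per-source domains
 *       \          |           |          |        /
 *         [interrupt-remapping domain, if present]
 *                          |
 *                  x86_vector_domain                     vector + APIC dest
 */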
 void lock_vector_lock(void)
 {
@@ -34,71 +49,59 @@ void unlock_vector_lock(void)
 	raw_spin_unlock(&vector_lock);
 }

-struct irq_cfg *irq_cfg(unsigned int irq)
+static struct apic_chip_data *apic_chip_data(struct irq_data *irq_data)
 {
-	return irq_get_chip_data(irq);
+	if (!irq_data)
+		return NULL;
+
+	while (irq_data->parent_data)
+		irq_data = irq_data->parent_data;
+
+	return irq_data->chip_data;
 }

 struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
 {
-	return irq_data->chip_data;
+	struct apic_chip_data *data = apic_chip_data(irq_data);
+
+	return data ? &data->cfg : NULL;
 }

-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+struct irq_cfg *irq_cfg(unsigned int irq)
 {
-	struct irq_cfg *cfg;
+	return irqd_cfg(irq_get_irq_data(irq));
+}

-	cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
-	if (!cfg)
+static struct apic_chip_data *alloc_apic_chip_data(int node)
+{
+	struct apic_chip_data *data;
+
+	data = kzalloc_node(sizeof(*data), GFP_KERNEL, node);
+	if (!data)
 		return NULL;
-	if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
-		goto out_cfg;
-	if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+	if (!zalloc_cpumask_var_node(&data->domain, GFP_KERNEL, node))
+		goto out_data;
+	if (!zalloc_cpumask_var_node(&data->old_domain, GFP_KERNEL, node))
 		goto out_domain;
-#ifdef	CONFIG_X86_IO_APIC
-	INIT_LIST_HEAD(&cfg->irq_2_pin);
-#endif
-	return cfg;
+	return data;
 out_domain:
-	free_cpumask_var(cfg->domain);
-out_cfg:
-	kfree(cfg);
+	free_cpumask_var(data->domain);
+out_data:
+	kfree(data);
 	return NULL;
 }

-struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+static void free_apic_chip_data(struct apic_chip_data *data)
 {
-	int res = irq_alloc_desc_at(at, node);
-	struct irq_cfg *cfg;
-
-	if (res < 0) {
-		if (res != -EEXIST)
-			return NULL;
-		cfg = irq_cfg(at);
-		if (cfg)
-			return cfg;
+	if (data) {
+		free_cpumask_var(data->domain);
+		free_cpumask_var(data->old_domain);
+		kfree(data);
 	}
-
-	cfg = alloc_irq_cfg(at, node);
-	if (cfg)
-		irq_set_chip_data(at, cfg);
-	else
-		irq_free_desc(at);
-	return cfg;
-}
-
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
-	if (!cfg)
-		return;
-	irq_set_chip_data(at, NULL);
-	free_cpumask_var(cfg->domain);
-	free_cpumask_var(cfg->old_domain);
-	kfree(cfg);
 }

-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int __assign_irq_vector(int irq, struct apic_chip_data *d,
+			       const struct cpumask *mask)
 {
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -114,36 +117,33 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
 	static int current_offset = VECTOR_OFFSET_START % 16;
 	int cpu, err;
-	cpumask_var_t tmp_mask;

-	if (cfg->move_in_progress)
+	if (d->move_in_progress)
 		return -EBUSY;

-	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
-		return -ENOMEM;
-
 	/* Only try and allocate irqs on cpus that are present */
 	err = -ENOSPC;
-	cpumask_clear(cfg->old_domain);
+	cpumask_clear(d->old_domain);
 	cpu = cpumask_first_and(mask, cpu_online_mask);
 	while (cpu < nr_cpu_ids) {
 		int new_cpu, vector, offset;

-		apic->vector_allocation_domain(cpu, tmp_mask, mask);
+		apic->vector_allocation_domain(cpu, vector_cpumask, mask);

-		if (cpumask_subset(tmp_mask, cfg->domain)) {
+		if (cpumask_subset(vector_cpumask, d->domain)) {
 			err = 0;
-			if (cpumask_equal(tmp_mask, cfg->domain))
+			if (cpumask_equal(vector_cpumask, d->domain))
 				break;
 			/*
 			 * New cpumask using the vector is a proper subset of
 			 * the current in use mask. So cleanup the vector
 			 * allocation for the members that are not used anymore.
 			 */
-			cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
-			cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+			cpumask_andnot(d->old_domain, d->domain,
+				       vector_cpumask);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
+			cpumask_and(d->domain, d->domain, vector_cpumask);
 			break;
 		}

@@ -157,16 +157,18 @@ next:
 		}

 		if (unlikely(current_vector == vector)) {
-			cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
-			cpumask_andnot(tmp_mask, mask, cfg->old_domain);
-			cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+			cpumask_or(d->old_domain, d->old_domain,
+				   vector_cpumask);
+			cpumask_andnot(vector_cpumask, mask, d->old_domain);
+			cpu = cpumask_first_and(vector_cpumask,
+						cpu_online_mask);
 			continue;
 		}

 		if (test_bit(vector, used_vectors))
 			goto next;

-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
 			if (per_cpu(vector_irq, new_cpu)[vector] >
 			    VECTOR_UNDEFINED)
 				goto next;
@@ -174,55 +176,73 @@ next:
 		/* Found one! */
 		current_vector = vector;
 		current_offset = offset;
-		if (cfg->vector) {
-			cpumask_copy(cfg->old_domain, cfg->domain);
-			cfg->move_in_progress =
-			   cpumask_intersects(cfg->old_domain, cpu_online_mask);
+		if (d->cfg.vector) {
+			cpumask_copy(d->old_domain, d->domain);
+			d->move_in_progress =
+			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
-		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
-		cfg->vector = vector;
-		cpumask_copy(cfg->domain, tmp_mask);
+		d->cfg.vector = vector;
+		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
 		break;
 	}
-	free_cpumask_var(tmp_mask);
+
+	if (!err) {
+		/* cache destination APIC IDs into cfg->dest_apicid */
+		err = apic->cpu_mask_to_apicid_and(mask, d->domain,
+						   &d->cfg.dest_apicid);
+	}

 	return err;
 }

-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+static int assign_irq_vector(int irq, struct apic_chip_data *data,
+			     const struct cpumask *mask)
 {
 	int err;
 	unsigned long flags;

 	raw_spin_lock_irqsave(&vector_lock, flags);
-	err = __assign_irq_vector(irq, cfg, mask);
+	err = __assign_irq_vector(irq, data, mask);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 	return err;
 }

-void clear_irq_vector(int irq, struct irq_cfg *cfg)
+static int assign_irq_vector_policy(int irq, int node,
+				    struct apic_chip_data *data,
+				    struct irq_alloc_info *info)
+{
+	if (info && info->mask)
+		return assign_irq_vector(irq, data, info->mask);
+	if (node != NUMA_NO_NODE &&
+	    assign_irq_vector(irq, data, cpumask_of_node(node)) == 0)
+		return 0;
+	return assign_irq_vector(irq, data, apic->target_cpus());
+}
+
+static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
 	int cpu, vector;
 	unsigned long flags;

 	raw_spin_lock_irqsave(&vector_lock, flags);
-	BUG_ON(!cfg->vector);
+	BUG_ON(!data->cfg.vector);

-	vector = cfg->vector;
-	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+	vector = data->cfg.vector;
+	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
 		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;

-	cfg->vector = 0;
-	cpumask_clear(cfg->domain);
+	data->cfg.vector = 0;
+	cpumask_clear(data->domain);

-	if (likely(!cfg->move_in_progress)) {
+	if (likely(!data->move_in_progress)) {
 		raw_spin_unlock_irqrestore(&vector_lock, flags);
 		return;
 	}

-	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
@@ -231,10 +251,95 @@ void clear_irq_vector(int irq, struct irq_cfg *cfg)
 			break;
 		}
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
 	raw_spin_unlock_irqrestore(&vector_lock, flags);
 }

+void init_irq_alloc_info(struct irq_alloc_info *info,
+			 const struct cpumask *mask)
+{
+	memset(info, 0, sizeof(*info));
+	info->mask = mask;
+}
+
+void copy_irq_alloc_info(struct irq_alloc_info *dst, struct irq_alloc_info *src)
+{
+	if (src)
+		*dst = *src;
+	else
+		memset(dst, 0, sizeof(*dst));
+}
+
+static void x86_vector_free_irqs(struct irq_domain *domain,
+				 unsigned int virq, unsigned int nr_irqs)
+{
+	struct irq_data *irq_data;
+	int i;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(x86_vector_domain, virq + i);
+		if (irq_data && irq_data->chip_data) {
+			clear_irq_vector(virq + i, irq_data->chip_data);
+			free_apic_chip_data(irq_data->chip_data);
+#ifdef	CONFIG_X86_IO_APIC
+			if (virq + i < nr_legacy_irqs())
+				legacy_irq_data[virq + i] = NULL;
+#endif
+			irq_domain_reset_irq_data(irq_data);
+		}
+	}
+}
+
+static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
+				 unsigned int nr_irqs, void *arg)
+{
+	struct irq_alloc_info *info = arg;
+	struct apic_chip_data *data;
+	struct irq_data *irq_data;
+	int i, err;
+
+	if (disable_apic)
+		return -ENXIO;
+
+	/* Currently vector allocator can't guarantee contiguous allocations */
+	if ((info->flags & X86_IRQ_ALLOC_CONTIGUOUS_VECTORS) && nr_irqs > 1)
+		return -ENOSYS;
+
+	for (i = 0; i < nr_irqs; i++) {
+		irq_data = irq_domain_get_irq_data(domain, virq + i);
+		BUG_ON(!irq_data);
+#ifdef	CONFIG_X86_IO_APIC
+		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
+			data = legacy_irq_data[virq + i];
+		else
+#endif
+			data = alloc_apic_chip_data(irq_data->node);
+		if (!data) {
+			err = -ENOMEM;
+			goto error;
+		}
+
+		irq_data->chip = &lapic_controller;
+		irq_data->chip_data = data;
+		irq_data->hwirq = virq + i;
+		err = assign_irq_vector_policy(virq, irq_data->node, data,
+					       info);
+		if (err)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	x86_vector_free_irqs(domain, virq, i + 1);
+	return err;
+}
+
+static const struct irq_domain_ops x86_vector_domain_ops = {
+	.alloc	= x86_vector_alloc_irqs,
+	.free	= x86_vector_free_irqs,
+};
+
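With these .alloc/.free callbacks in place, every child domain allocates through x86_vector_domain. A hedged sketch of the pattern a child domain's .alloc callback follows under the hierarchical irqdomain API of this era (the demo_ name is illustrative):

static int demo_child_alloc(struct irq_domain *domain, unsigned int virq,
			    unsigned int nr_irqs, void *arg)
{
	int ret;

	/* First let the parent (ultimately x86_vector_domain) assign vectors. */
	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
	if (ret < 0)
		return ret;

	/* ...then stack this domain's chip and chip_data on top. */
	return 0;
}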
 int __init arch_probe_nr_irqs(void)
 {
 	int nr;
@@ -258,8 +363,43 @@ int __init arch_probe_nr_irqs(void)
 	return nr_legacy_irqs();
 }

+#ifdef	CONFIG_X86_IO_APIC
+static void init_legacy_irqs(void)
+{
+	int i, node = cpu_to_node(0);
+	struct apic_chip_data *data;
+
+	/*
+	 * For legacy IRQ's, start with assigning irq0 to irq15 to
+	 * ISA_IRQ_VECTOR(i) for all cpu's.
+	 */
+	for (i = 0; i < nr_legacy_irqs(); i++) {
+		data = legacy_irq_data[i] = alloc_apic_chip_data(node);
+		BUG_ON(!data);
+
+		data->cfg.vector = ISA_IRQ_VECTOR(i);
+		cpumask_setall(data->domain);
+		irq_set_chip_data(i, data);
+	}
+}
+#else
+static void init_legacy_irqs(void) { }
+#endif
+
 int __init arch_early_irq_init(void)
 {
+	init_legacy_irqs();
+
+	x86_vector_domain = irq_domain_add_tree(NULL, &x86_vector_domain_ops,
+						NULL);
+	BUG_ON(x86_vector_domain == NULL);
+	irq_set_default_host(x86_vector_domain);
+
+	arch_init_msi_domain(x86_vector_domain);
+	arch_init_htirq_domain(x86_vector_domain);
+
+	BUG_ON(!alloc_cpumask_var(&vector_cpumask, GFP_KERNEL));
+
 	return arch_early_ioapic_init();
 }

@@ -267,7 +407,7 @@ static void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	int irq, vector;
-	struct irq_cfg *cfg;
+	struct apic_chip_data *data;

 	/*
 	 * vector_lock will make sure that we don't run into irq vector
@@ -277,13 +417,13 @@ static void __setup_vector_irq(int cpu)
 	raw_spin_lock(&vector_lock);
 	/* Mark the inuse vectors */
 	for_each_active_irq(irq) {
-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!data)
 			continue;

-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		if (!cpumask_test_cpu(cpu, data->domain))
 			continue;
-		vector = cfg->vector;
+		vector = data->cfg.vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
 	}
 	/* Mark the free vectors */
@@ -292,8 +432,8 @@ static void __setup_vector_irq(int cpu)
 		if (irq <= VECTOR_UNDEFINED)
 			continue;

-		cfg = irq_cfg(irq);
-		if (!cpumask_test_cpu(cpu, cfg->domain))
+		data = apic_chip_data(irq_get_irq_data(irq));
+		if (!cpumask_test_cpu(cpu, data->domain))
 			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
@@ -314,20 +454,20 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;

 	__setup_vector_irq(cpu);
 }

-int apic_retrigger_irq(struct irq_data *data)
+static int apic_retrigger_irq(struct irq_data *irq_data)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
+	struct apic_chip_data *data = apic_chip_data(irq_data);
 	unsigned long flags;
 	int cpu;

 	raw_spin_lock_irqsave(&vector_lock, flags);
-	cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
-	apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+	cpu = cpumask_first_and(data->domain, cpu_online_mask);
+	apic->send_IPI_mask(cpumask_of(cpu), data->cfg.vector);
 	raw_spin_unlock_irqrestore(&vector_lock, flags);

 	return 1;
@@ -340,73 +480,76 @@ void apic_ack_edge(struct irq_data *data)
 	ack_APIC_irq();
 }

-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
-		      unsigned int *dest_id)
+static int apic_set_affinity(struct irq_data *irq_data,
+			     const struct cpumask *dest, bool force)
 {
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int irq = data->irq;
-	int err;
+	struct apic_chip_data *data = irq_data->chip_data;
+	int err, irq = irq_data->irq;

 	if (!config_enabled(CONFIG_SMP))
 		return -EPERM;

-	if (!cpumask_intersects(mask, cpu_online_mask))
+	if (!cpumask_intersects(dest, cpu_online_mask))
 		return -EINVAL;

-	err = assign_irq_vector(irq, cfg, mask);
-	if (err)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+	err = assign_irq_vector(irq, data, dest);
 	if (err) {
-		if (assign_irq_vector(irq, cfg, data->affinity))
+		struct irq_data *top = irq_get_irq_data(irq);
+
+		if (assign_irq_vector(irq, data, top->affinity))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}

-	cpumask_copy(data->affinity, mask);
-
-	return 0;
+	return IRQ_SET_MASK_OK;
 }

+static struct irq_chip lapic_controller = {
+	.irq_ack		= apic_ack_edge,
+	.irq_set_affinity	= apic_set_affinity,
+	.irq_retrigger		= apic_retrigger_irq,
+};
+
 #ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
+static void __send_cleanup_vector(struct apic_chip_data *data)
 {
 	cpumask_var_t cleanup_mask;

 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;

-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+		for_each_cpu_and(i, data->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i),
 					    IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cpumask_and(cleanup_mask, data->old_domain, cpu_online_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
-	cfg->move_in_progress = 0;
+	data->move_in_progress = 0;
+}
+
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	struct apic_chip_data *data;
+
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (data->move_in_progress)
+		__send_cleanup_vector(data);
 }

 asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;

-	ack_APIC_irq();
-	irq_enter();
-	exit_idle();
+	entering_ack_irq();

 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
-		struct irq_cfg *cfg;
+		struct apic_chip_data *data;

 		irq = __this_cpu_read(vector_irq[vector]);

@@ -417,8 +560,8 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		if (!desc)
 			continue;

-		cfg = irq_cfg(irq);
-		if (!cfg)
+		data = apic_chip_data(&desc->irq_data);
+		if (!data)
 			continue;

 		raw_spin_lock(&desc->lock);
@@ -427,10 +570,11 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 		 * Check if the irq migration is in progress. If so, we
 		 * haven't received the cleanup request yet for this irq.
 		 */
-		if (cfg->move_in_progress)
+		if (data->move_in_progress)
 			goto unlock;

-		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+		if (vector == data->cfg.vector &&
+		    cpumask_test_cpu(me, data->domain))
 			goto unlock;

 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -450,20 +594,21 @@ unlock:
 		raw_spin_unlock(&desc->lock);
 	}

-	irq_exit();
+	exiting_irq();
 }

 static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
 {
 	unsigned me;
+	struct apic_chip_data *data;

-	if (likely(!cfg->move_in_progress))
+	data = container_of(cfg, struct apic_chip_data, cfg);
+	if (likely(!data->move_in_progress))
 		return;

 	me = smp_processor_id();
-
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
-		send_cleanup_vector(cfg);
+	if (vector == data->cfg.vector && cpumask_test_cpu(me, data->domain))
+		__send_cleanup_vector(data);
 }

 void irq_complete_move(struct irq_cfg *cfg)
@@ -475,46 +620,11 @@ void irq_force_complete_move(int irq)
 {
 	struct irq_cfg *cfg = irq_cfg(irq);

-	if (!cfg)
-		return;
-
-	__irq_complete_move(cfg, cfg->vector);
+	if (cfg)
+		__irq_complete_move(cfg, cfg->vector);
 }
 #endif

-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	int ret;
-
-	cfg = alloc_irq_cfg(irq, node);
-	if (!cfg)
-		return -ENOMEM;
-
-	raw_spin_lock_irqsave(&vector_lock, flags);
-	ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
-	raw_spin_unlock_irqrestore(&vector_lock, flags);
-
-	if (!ret)
-		irq_set_chip_data(irq, cfg);
-	else
-		free_irq_cfg(irq, cfg);
-	return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
-	struct irq_cfg *cfg = irq_cfg(irq);
-
-	free_remapped_irq(irq);
-	clear_irq_vector(irq, cfg);
-	free_irq_cfg(irq, cfg);
-}
-
 static void __init print_APIC_field(int base)
 {
 	int i;

+ 2 - 0
arch/x86/kernel/apic/x2apic_phys.c

@@ -21,11 +21,13 @@ early_param("x2apic_phys", set_x2apic_phys_mode);

 static bool x2apic_fadt_phys(void)
 {
+#ifdef CONFIG_ACPI
 	if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
 		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "System requires x2apic physical mode\n");
 		return true;
 	}
+#endif
 	return false;
 }


+ 21 - 0
arch/x86/kernel/asm-offsets.c

@@ -41,6 +41,25 @@ void common(void) {
 	OFFSET(pbe_orig_address, pbe, orig_address);
 	OFFSET(pbe_next, pbe, next);

+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	BLANK();
+	OFFSET(IA32_SIGCONTEXT_ax, sigcontext_ia32, ax);
+	OFFSET(IA32_SIGCONTEXT_bx, sigcontext_ia32, bx);
+	OFFSET(IA32_SIGCONTEXT_cx, sigcontext_ia32, cx);
+	OFFSET(IA32_SIGCONTEXT_dx, sigcontext_ia32, dx);
+	OFFSET(IA32_SIGCONTEXT_si, sigcontext_ia32, si);
+	OFFSET(IA32_SIGCONTEXT_di, sigcontext_ia32, di);
+	OFFSET(IA32_SIGCONTEXT_bp, sigcontext_ia32, bp);
+	OFFSET(IA32_SIGCONTEXT_sp, sigcontext_ia32, sp);
+	OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip);
+
+	BLANK();
+	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
+
+	BLANK();
+	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
+#endif
+
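As a reminder of the mechanism (a simplified sketch of include/linux/kbuild.h, not part of this patch): OFFSET() emits the constant into generated assembly, from which the build produces asm-offsets.h, so consolidating the entries here changes only where the #defines are generated, not their values:

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define BLANK() asm volatile("\n->" : : )

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))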
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
@@ -49,7 +68,9 @@ void common(void) {
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+#ifdef CONFIG_X86_32
 	OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
+#endif
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
 #endif

+ 0 - 18
arch/x86/kernel/asm-offsets_32.c

@@ -17,17 +17,6 @@ void foo(void);

 void foo(void)
 {
-	OFFSET(IA32_SIGCONTEXT_ax, sigcontext, ax);
-	OFFSET(IA32_SIGCONTEXT_bx, sigcontext, bx);
-	OFFSET(IA32_SIGCONTEXT_cx, sigcontext, cx);
-	OFFSET(IA32_SIGCONTEXT_dx, sigcontext, dx);
-	OFFSET(IA32_SIGCONTEXT_si, sigcontext, si);
-	OFFSET(IA32_SIGCONTEXT_di, sigcontext, di);
-	OFFSET(IA32_SIGCONTEXT_bp, sigcontext, bp);
-	OFFSET(IA32_SIGCONTEXT_sp, sigcontext, sp);
-	OFFSET(IA32_SIGCONTEXT_ip, sigcontext, ip);
-	BLANK();
-
 	OFFSET(CPUINFO_x86, cpuinfo_x86, x86);
 	OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor);
 	OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model);
@@ -37,10 +26,6 @@ void foo(void)
 	OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
 	BLANK();

-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	OFFSET(TI_cpu, thread_info, cpu);
-	BLANK();
-
 	OFFSET(PT_EBX, pt_regs, bx);
 	OFFSET(PT_ECX, pt_regs, cx);
 	OFFSET(PT_EDX, pt_regs, dx);
@@ -60,9 +45,6 @@ void foo(void)
 	OFFSET(PT_OLDSS,  pt_regs, ss);
 	BLANK();

-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
-	BLANK();
-
 	OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
 	BLANK();


+ 0 - 21
arch/x86/kernel/asm-offsets_64.c

@@ -29,27 +29,6 @@ int main(void)
 	BLANK();
 #endif

-#ifdef CONFIG_IA32_EMULATION
-	OFFSET(TI_sysenter_return, thread_info, sysenter_return);
-	BLANK();
-
-#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
-	ENTRY(ax);
-	ENTRY(bx);
-	ENTRY(cx);
-	ENTRY(dx);
-	ENTRY(si);
-	ENTRY(di);
-	ENTRY(bp);
-	ENTRY(sp);
-	ENTRY(ip);
-	BLANK();
-#undef ENTRY
-
-	OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
-	BLANK();
-#endif
-
 #define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
 	ENTRY(bx);
 	ENTRY(cx);

+ 0 - 4
arch/x86/kernel/cpu/common.c

@@ -1155,10 +1155,6 @@ static __init int setup_disablecpuid(char *arg)
 }
 __setup("clearcpuid=", setup_disablecpuid);

-DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union + THREAD_SIZE;
-EXPORT_PER_CPU_SYMBOL(kernel_stack);
-
 #ifdef CONFIG_X86_64
 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1,

+ 2 - 4
arch/x86/kernel/cpu/mshyperv.c

@@ -39,14 +39,12 @@ void hyperv_vector_handler(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);

-	irq_enter();
-	exit_idle();
-
+	entering_irq();
 	inc_irq_stat(irq_hv_callback_count);
 	if (vmbus_handler)
 		vmbus_handler();

-	irq_exit();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }


+ 2 - 1
arch/x86/kernel/cpu/mtrr/cleanup.c

@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
 			continue;
 		base = range_state[i].base_pfn;
 		if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
-		    (mtrr_state.enabled & 1)) {
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+		    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
 			/* Var MTRR contains UC entry below 1M? Skip it: */
 			printk(BIOS_BUG_MSG, i);
 			if (base + size <= (1<<(20-PAGE_SHIFT)))

+ 137 - 72
arch/x86/kernel/cpu/mtrr/generic.c

@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
 	return 0;
 }

-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- *		corresponds only to [start:*partial_end].
- *		Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following address ranges:
+ *  0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ *  0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ *  0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+	int idx;
+
+	if (start >= 0x100000)
+		return MTRR_TYPE_INVALID;
+
+	/* 0x0 - 0x7FFFF */
+	if (start < 0x80000) {
+		idx = 0;
+		idx += (start >> 16);
+		return mtrr_state.fixed_ranges[idx];
+	/* 0x80000 - 0xBFFFF */
+	} else if (start < 0xC0000) {
+		idx = 1 * 8;
+		idx += ((start - 0x80000) >> 14);
+		return mtrr_state.fixed_ranges[idx];
+	}
+
+	/* 0xC0000 - 0xFFFFF */
+	idx = 3 * 8;
+	idx += ((start - 0xC0000) >> 12);
+	return mtrr_state.fixed_ranges[idx];
+}
+
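A worked example of the index arithmetic above (ours, for illustration): start = 0x90000 takes the middle branch, so idx = 1 * 8 + ((0x90000 - 0x80000) >> 14) = 8 + 4 = 12, i.e. fixed_ranges[12], the 16KB sub-range covering 0x90000-0x93FFF. The 1 * 8 skips the eight 64KB entries, and the 4KB branch starts at 3 * 8 = 24 because the sixteen 16KB entries occupy slots 8-23.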
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ *	    returned corresponds only to [start:*partial_end].  Caller has
+ *	    to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+				    int *repeat, u8 *uniform)
 {
 	int i;
 	u64 base, mask;
 	u8 prev_match, curr_match;

 	*repeat = 0;
-	if (!mtrr_state_set)
-		return 0xFF;
-
-	if (!mtrr_state.enabled)
-		return 0xFF;
+	*uniform = 1;

-	/* Make end inclusive end, instead of exclusive */
+	/* Make end inclusive instead of exclusive */
 	end--;

-	/* Look in fixed ranges. Just return the type as per start */
-	if (mtrr_state.have_fixed && (start < 0x100000)) {
-		int idx;
-
-		if (start < 0x80000) {
-			idx = 0;
-			idx += (start >> 16);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0xC0000) {
-			idx = 1 * 8;
-			idx += ((start - 0x80000) >> 14);
-			return mtrr_state.fixed_ranges[idx];
-		} else if (start < 0x1000000) {
-			idx = 3 * 8;
-			idx += ((start - 0xC0000) >> 12);
-			return mtrr_state.fixed_ranges[idx];
-		}
-	}
-
-	/*
-	 * Look in variable ranges
-	 * Look of multiple ranges matching this address and pick type
-	 * as per MTRR precedence
-	 */
-	if (!(mtrr_state.enabled & 2))
-		return mtrr_state.def_type;
-
-	prev_match = 0xFF;
+	prev_match = MTRR_TYPE_INVALID;
 	for (i = 0; i < num_var_ranges; ++i) {
-		unsigned short start_state, end_state;
+		unsigned short start_state, end_state, inclusive;

 		if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
 			continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)

 		start_state = ((start & mask) == (base & mask));
 		end_state = ((end & mask) == (base & mask));
+		inclusive = ((start < base) && (end > base));

-		if (start_state != end_state) {
+		if ((start_state != end_state) || inclusive) {
 			/*
 			 * We have start:end spanning across an MTRR.
-			 * We split the region into
-			 * either
-			 * (start:mtrr_end) (mtrr_end:end)
-			 * or
-			 * (start:mtrr_start) (mtrr_start:end)
+			 * We split the region into either
+			 *
+			 * - start_state:1
+			 * (start:mtrr_end)(mtrr_end:end)
+			 * - end_state:1
+			 * (start:mtrr_start)(mtrr_start:end)
+			 * - inclusive:1
+			 * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+			 *
 			 * depending on kind of overlap.
-			 * Return the type for first region and a pointer to
-			 * the start of second region so that caller will
-			 * lookup again on the second region.
-			 * Note: This way we handle multiple overlaps as well.
+			 *
+			 * Return the type of the first region and a pointer
+			 * to the start of next region so that caller will be
+			 * advised to lookup again after having adjusted start
+			 * and end.
+			 *
+			 * Note: This way we handle overlaps with multiple
+			 * entries and the default type properly.
 			 */
 			if (start_state)
 				*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)

 			end = *partial_end - 1; /* end is inclusive */
 			*repeat = 1;
+			*uniform = 0;
 		}

 		if ((start & mask) != (base & mask))
 			continue;

 		curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
-		if (prev_match == 0xFF) {
+		if (prev_match == MTRR_TYPE_INVALID) {
 			prev_match = curr_match;
 			continue;
 		}

+		*uniform = 0;
 		if (check_type_overlap(&prev_match, &curr_match))
 			return curr_match;
 	}

-	if (mtrr_tom2) {
-		if (start >= (1ULL<<32) && (end < mtrr_tom2))
-			return MTRR_TYPE_WRBACK;
-	}
-
-	if (prev_match != 0xFF)
+	if (prev_match != MTRR_TYPE_INVALID)
 		return prev_match;

 	return mtrr_state.def_type;
 }

-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type)  - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ *	     region is fully covered by a single MTRR entry or the default
+ *	     type.
  */
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
 {
-	u8 type, prev_type;
+	u8 type, prev_type, is_uniform = 1, dummy;
 	int repeat;
 	u64 partial_end;

-	type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+	if (!mtrr_state_set)
+		return MTRR_TYPE_INVALID;
+
+	if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+		return MTRR_TYPE_INVALID;
+
+	/*
+	 * Look up the fixed ranges first, which take priority over
+	 * the variable ranges.
+	 */
+	if ((start < 0x100000) &&
+	    (mtrr_state.have_fixed) &&
+	    (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+		is_uniform = 0;
+		type = mtrr_type_lookup_fixed(start, end);
+		goto out;
+	}
+
+	/*
+	 * Look up the variable ranges.  Look for multiple ranges matching
+	 * this address and pick type as per MTRR precedence.
+	 */
+	type = mtrr_type_lookup_variable(start, end, &partial_end,
+					 &repeat, &is_uniform);

 	/*
 	 * Common path is with repeat = 0.
 	 * However, we can have cases where [start:end] spans across some
-	 * MTRR range. Do repeated lookups for that case here.
+	 * MTRR ranges and/or the default type.  Do repeated lookups for
+	 * that case here.
 	 */
 	while (repeat) {
 		prev_type = type;
 		start = partial_end;
-		type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+		is_uniform = 0;
+		type = mtrr_type_lookup_variable(start, end, &partial_end,
+						 &repeat, &dummy);

 		if (check_type_overlap(&prev_type, &type))
-			return type;
+			goto out;
 	}

+	if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+		type = MTRR_TYPE_WRBACK;
+
+out:
+	*uniform = is_uniform;
 	return type;
 }

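A hedged sketch of the kind of caller the new 'uniform' output argument serves (modeled on the huge-page mapping checks this series enables; the demo_ name is ours): refuse a large mapping when MTRRs cover the range non-uniformly:

static bool demo_can_map_huge(u64 addr, u64 size)
{
	u8 uniform;
	u8 type = mtrr_type_lookup(addr, addr + size, &uniform);

	/* MTRR_TYPE_INVALID: MTRRs are disabled, so no conflict is possible. */
	if (type == MTRR_TYPE_INVALID)
		return true;

	/* Map huge only if one MTRR (or the default type) covers it all. */
	return uniform;
}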
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
 		 mtrr_attrib_to_str(mtrr_state.def_type));
 	if (mtrr_state.have_fixed) {
 		pr_debug("MTRR fixed ranges %sabled:\n",
-			 mtrr_state.enabled & 1 ? "en" : "dis");
+			((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+			 (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+			 "en" : "dis");
 		print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
 		for (i = 0; i < 2; ++i)
 			print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
 		print_fixed_last();
 	}
 	pr_debug("MTRR variable ranges %sabled:\n",
-		 mtrr_state.enabled & 2 ? "en" : "dis");
+		 mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
 	high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;

 	for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
 }

 /* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
 {
 	struct mtrr_var_range *vrs;
 	unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)

 	post_set();
 	local_irq_restore(flags);
+
+	return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
 }

 /* Some BIOS's are messed up and don't set all MTRRs the same! */

+ 37 - 11
arch/x86/kernel/cpu/mtrr/main.c

@@ -59,6 +59,12 @@
 #define MTRR_TO_PHYS_WC_OFFSET 1000

 u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+	return __mtrr_enabled;
+}
 
 
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	int i, replace, error;
 	mtrr_type ltype;

-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return -ENXIO;

 	error = mtrr_if->validate_add_page(base, size, type);
@@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size)
 int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	     bool increment)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 	unsigned long lbase, lsize;
 	int error = -EINVAL;

-	if (!mtrr_if)
-		return -ENXIO;
+	if (!mtrr_enabled())
+		return -ENODEV;

 	max = num_var_ranges;
 	/* No CPU hotplug when we change MTRR entries */
@@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
  */
 int mtrr_del(int reg, unsigned long base, unsigned long size)
 {
+	if (!mtrr_enabled())
+		return -ENODEV;
 	if (mtrr_check(base, size))
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del);
  * attempts to add a WC MTRR covering size bytes starting at base and
  * logs an error if this fails.
  *
+ * The caller should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
  * Drivers must store the return value to pass to mtrr_del_wc_if_needed,
  * but drivers should not try to interpret that return value.
  */
@@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
 {
 	int ret;

-	if (pat_enabled)
+	if (pat_enabled() || !mtrr_enabled())
 		return 0;  /* Success!  (We don't need to do anything.) */

 	ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle)
 EXPORT_SYMBOL(arch_phys_wc_del);

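Driver-side usage, for context (a sketch; the demo_ names are ours, while both arch_phys_wc_* calls are the real exported API):

static int demo_map_framebuffer(struct pci_dev *pdev, int bar, int *wc_handle)
{
	/* A no-op when PAT is enabled; otherwise this consumes one MTRR. */
	*wc_handle = arch_phys_wc_add(pci_resource_start(pdev, bar),
				      pci_resource_len(pdev, bar));
	return 0;
}

static void demo_unmap_framebuffer(int wc_handle)
{
	arch_phys_wc_del(wc_handle);
}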
 /*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
  * @handle: Return value from arch_phys_wc_add
  *
  * This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
  * in printk line.  Alas there is an illegitimate use in some ancient
  * drm ioctls.
  */
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
 {
 	if (handle < MTRR_TO_PHYS_WC_OFFSET)
 		return -1;
 	else
 		return handle - MTRR_TO_PHYS_WC_OFFSET;
 }
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);

 /*
  * HACK ALERT!
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void)
 	}

 	if (mtrr_if) {
+		__mtrr_enabled = true;
 		set_num_var_ranges();
 		init_table();
 		if (use_intel()) {
-			get_mtrr_state();
+			/* BIOS may override */
+			__mtrr_enabled = get_mtrr_state();

 			if (mtrr_cleanup(phys_addr)) {
 				changed_by_mtrr_cleanup = 1;
@@ -745,10 +760,16 @@ void __init mtrr_bp_init(void)
 			}
 		}
 	}
+
+	if (!mtrr_enabled())
+		pr_info("MTRR: Disabled\n");
 }

 void mtrr_ap_init(void)
 {
+	if (!mtrr_enabled())
+		return;

 	if (!use_intel() || mtrr_aps_delayed_init)
 		return;
 	/*
 {
 {
 	int first_cpu;

+		return;
+		return;

 	get_online_cpus();
 	first_cpu = cpumask_first(cpu_online_mask);
 	smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
 
 void set_mtrr_aps_delayed_init(void)
 void set_mtrr_aps_delayed_init(void)
 {
 {
+	if (!mtrr_enabled())
+		return;
 	if (!use_intel())
 	if (!use_intel())
 		return;
 		return;
 
 
@@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void)
  */
  */
 void mtrr_aps_init(void)
 void mtrr_aps_init(void)
 {
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;
 		return;

 	/*
 
 
 void mtrr_bp_restore(void)
 void mtrr_bp_restore(void)
 {
 {
-	if (!use_intel())
+	if (!use_intel() || !mtrr_enabled())
 		return;

 	mtrr_if->set_all();
@@ -818,7 +844,7 @@ void mtrr_bp_restore(void)

 static int __init mtrr_init_finialize(void)
 {
-	if (!mtrr_if)
+	if (!mtrr_enabled())
 		return 0;

 	if (use_intel()) {

+ 1 - 1
arch/x86/kernel/cpu/mtrr/mtrr.h

@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);

 void fill_mtrr_var_range(unsigned int index,
 		u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
 
 
 extern void set_mtrr_ops(const struct mtrr_ops *ops);
 extern void set_mtrr_ops(const struct mtrr_ops *ops);
 
 

+ 17 - 24
arch/x86/kernel/devicetree.c

@@ -4,7 +4,6 @@
 #include <linux/bootmem.h>
 #include <linux/export.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 #include <linux/interrupt.h>
 #include <linux/list.h>
 #include <linux/of.h>
@@ -17,6 +16,7 @@
 #include <linux/of_pci.h>
 #include <linux/initrd.h>

+#include <asm/irqdomain.h>
 #include <asm/hpet.h>
 #include <asm/apic.h>
 #include <asm/pci_x86.h>
@@ -196,38 +196,31 @@ static struct of_ioapic_type of_ioapic_type[] =
 	},
 };

-static int ioapic_xlate(struct irq_domain *domain,
-			struct device_node *controller,
-			const u32 *intspec, u32 intsize,
-			irq_hw_number_t *out_hwirq, u32 *out_type)
+static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
+			      unsigned int nr_irqs, void *arg)
 {
+	struct of_phandle_args *irq_data = (void *)arg;
 	struct of_ioapic_type *it;
-	u32 line, idx, gsi;
+	struct irq_alloc_info tmp;

-	if (WARN_ON(intsize < 2))
+	if (WARN_ON(irq_data->args_count < 2))
 		return -EINVAL;
-
-	line = intspec[0];
-
-	if (intspec[1] >= ARRAY_SIZE(of_ioapic_type))
+	if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type))
 		return -EINVAL;

-	it = &of_ioapic_type[intspec[1]];
+	it = &of_ioapic_type[irq_data->args[1]];
+	ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity);
+	tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain));
+	tmp.ioapic_pin = irq_data->args[0];

-	idx = (u32)(long)domain->host_data;
-	gsi = mp_pin_to_gsi(idx, line);
-	if (mp_set_gsi_attr(gsi, it->trigger, it->polarity, cpu_to_node(0)))
-		return -EBUSY;
-
-	*out_hwirq = line;
-	*out_type = it->out_type;
-	return 0;
+	return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp);
 }

-const struct irq_domain_ops ioapic_irq_domain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-	.xlate = ioapic_xlate,
+static const struct irq_domain_ops ioapic_irq_domain_ops = {
+	.alloc		= dt_irqdomain_alloc,
+	.free		= mp_irqdomain_free,
+	.activate	= mp_irqdomain_activate,
+	.deactivate	= mp_irqdomain_deactivate,
 };

 static void __init dtb_add_ioapic(struct device_node *dn)
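
dt_irqdomain_alloc() reads a two-cell specifier: args[0] is the I/O APIC pin, args[1] selects trigger mode and polarity from of_ioapic_type[]. A minimal .alloc callback in the same hierarchical style is sketched below; demo_domain_alloc and fill_alloc_info are hypothetical, and only the cell layout and the mp_irqdomain_alloc() call come from the diff.

	/* Hypothetical hierarchical .alloc; cell layout as in dt_irqdomain_alloc(). */
	static int demo_domain_alloc(struct irq_domain *domain, unsigned int virq,
				     unsigned int nr_irqs, void *arg)
	{
		struct of_phandle_args *spec = arg;
		struct irq_alloc_info info;

		if (spec->args_count < 2)	/* pin + type, nothing less */
			return -EINVAL;

		fill_alloc_info(&info, spec->args[0], spec->args[1]); /* hypothetical */

		/* One call allocates this level and everything below it. */
		return mp_irqdomain_alloc(domain, virq, nr_irqs, &info);
	}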

+ 33 - 44
arch/x86/kernel/entry_64.S

@@ -216,7 +216,7 @@ ENTRY(system_call)
 GLOBAL(system_call_after_swapgs)

 	movq	%rsp,PER_CPU_VAR(rsp_scratch)
-	movq	PER_CPU_VAR(kernel_stack),%rsp
+	movq	PER_CPU_VAR(cpu_current_top_of_stack),%rsp

 	/* Construct struct pt_regs on stack */
 	pushq_cfi $__USER_DS			/* pt_regs->ss */
@@ -419,26 +419,27 @@ syscall_return:
 	 * a completely clean 64-bit userspace context.
 	 */
 	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	movq RIP(%rsp),%r11
+	cmpq %rcx,%r11			/* RCX == RIP */
 	jne opportunistic_sysret_failed

 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
 	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.  It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses.  (This will pessimize vsyscall=native.
-	 * Big deal.)
+	 * the kernel, since userspace controls RSP.
 	 *
-	 * If virtual addresses ever become wider, this will need
+	 * If width of "canonical tail" ever becomes variable, this will need
 	 * to be updated to remain correct on both old and new CPUs.
 	 */
 	.ifne __VIRTUAL_MASK_SHIFT - 47
 	.error "virtual address width changed -- SYSRET checks need update"
 	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
+	/* Change top 16 bits to be the sign-extension of 47th bit */
+	shl	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	sar	$(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+	/* If this changed %rcx, it was not canonical */
+	cmpq	%rcx, %r11
+	jne	opportunistic_sysret_failed

 	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
 	jne opportunistic_sysret_failed
@@ -475,8 +476,8 @@ syscall_return:
 	 */
syscall_return_via_sysret:
 	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
+	/* rcx and r11 are already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq RSP(%rsp),%rsp
 	USERGS_SYSRET64
 	CFI_RESTORE_STATE
@@ -533,40 +534,27 @@ GLOBAL(stub_execveat)
 	CFI_ENDPROC
 END(stub_execveat)

-#ifdef CONFIG_X86_X32_ABI
+#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
 	.align	8
 GLOBAL(stub_x32_execve)
+GLOBAL(stub32_execve)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8
 	call	compat_sys_execve
 	jmp	return_from_execve
 	CFI_ENDPROC
+END(stub32_execve)
 END(stub_x32_execve)
 	.align	8
 GLOBAL(stub_x32_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execveat
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execveat)
-#endif
-
-#ifdef CONFIG_IA32_EMULATION
-	.align	8
-GLOBAL(stub32_execve)
-	CFI_STARTPROC
-	call	compat_sys_execve
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub32_execve)
-	.align	8
 GLOBAL(stub32_execveat)
 	CFI_STARTPROC
+	DEFAULT_FRAME 0, 8
 	call	compat_sys_execveat
 	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub32_execveat)
+END(stub_x32_execveat)
 #endif

 /*
@@ -622,7 +610,7 @@ ENTRY(ret_from_fork)

 	RESTORE_EXTRA_REGS

-	testl $3,CS(%rsp)			# from kernel_thread?
+	testb	$3, CS(%rsp)			# from kernel_thread?

 	/*
 	 * By the time we get here, we have no idea whether our pt_regs,
@@ -686,8 +674,8 @@ END(irq_entries_start)

 	leaq -RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */

-	testl $3, CS-RBP(%rsp)
-	je 1f
+	testb	$3, CS-RBP(%rsp)
+	jz	1f
 	SWAPGS
 1:
 	/*
@@ -741,8 +729,8 @@ ret_from_intr:
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET	RBP

-	testl $3,CS(%rsp)
-	je retint_kernel
+	testb	$3, CS(%rsp)
+	jz	retint_kernel
 	/* Interrupt came from user space */

 	GET_THREAD_INFO(%rcx)
@@ -928,6 +916,8 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
 #ifdef CONFIG_HAVE_KVM
 apicinterrupt3 POSTED_INTR_VECTOR \
 	kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
+	kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
 #endif

 #ifdef CONFIG_X86_MCE_THRESHOLD
@@ -989,7 +979,7 @@ ENTRY(\sym)
 	.if \paranoid
 	.if \paranoid == 1
 	CFI_REMEMBER_STATE
-	testl $3, CS(%rsp)		/* If coming from userspace, switch */
+	testb	$3, CS(%rsp)		/* If coming from userspace, switch */
 	jnz 1f				/* stacks. */
 	.endif
 	call paranoid_entry
@@ -1202,17 +1192,17 @@ ENTRY(xen_failsafe_callback)
 	/*CFI_REL_OFFSET ds,DS*/
 	CFI_REL_OFFSET r11,8
 	CFI_REL_OFFSET rcx,0
-	movw %ds,%cx
+	movl %ds,%ecx
 	cmpw %cx,0x10(%rsp)
 	CFI_REMEMBER_STATE
 	jne 1f
-	movw %es,%cx
+	movl %es,%ecx
 	cmpw %cx,0x18(%rsp)
 	jne 1f
-	movw %fs,%cx
+	movl %fs,%ecx
 	cmpw %cx,0x20(%rsp)
 	jne 1f
-	movw %gs,%cx
+	movl %gs,%ecx
 	cmpw %cx,0x28(%rsp)
 	jne 1f
 	/* All segments match their saved values => Category 2 (Bad IRET). */
@@ -1330,8 +1320,8 @@ ENTRY(error_entry)
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
 	xorl %ebx,%ebx
-	testl $3,CS+8(%rsp)
-	je error_kernelspace
+	testb	$3, CS+8(%rsp)
+	jz	error_kernelspace
 error_swapgs:
 	SWAPGS
 error_sti:
@@ -1382,7 +1372,7 @@ ENTRY(error_exit)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
 	testl %eax,%eax
-	jne retint_kernel
+	jnz retint_kernel
 	LOCKDEP_SYS_EXIT_IRQ
 	movl TI_flags(%rcx),%edx
 	movl $_TIF_WORK_MASK,%edi
@@ -1627,7 +1617,6 @@ end_repeat_nmi:
 	je 1f
 	movq %r12, %cr2
 1:
-	
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz nmi_restore
 nmi_swapgs:
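
The shl/sar pair in the SYSRET path is an exact canonicalness test: shifting left by 64 - 48 = 16 bits and arithmetically shifting back forces bits 63:48 to copies of bit 47, which is precisely the definition of a canonical 48-bit virtual address. The same check expressed in C, purely for illustration (the sign-propagating right shift on a negative value is what the sar instruction guarantees; C leaves it implementation-defined):

	#include <stdbool.h>
	#include <stdint.h>

	/* Illustrative C equivalent of the shl/sar canonicalness test. */
	static bool is_canonical_48(uint64_t addr)
	{
		/* Sign-extend bit 47 through bits 63:48, then compare. */
		int64_t sext = ((int64_t)(addr << 16)) >> 16;

		return (uint64_t)sext == addr;
	}

	/* 0x00007fffffffffff and 0xffff800000000000 pass;
	 * 0x0000800000000000 (bit 47 set, upper bits clear) fails. */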

+ 2 - 2
arch/x86/kernel/head_32.S

@@ -547,7 +547,7 @@ ENTRY(early_idt_handler)
 	cld

 	cmpl $2,(%esp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI

 	cmpl $2,%ss:early_recursion_flag
 	je hlt_loop
@@ -600,7 +600,7 @@ ex_entry:
 	pop %ecx
 	pop %eax
 	decl %ss:early_recursion_flag
-is_nmi:
+.Lis_nmi:
 	addl $8,%esp		/* drop vector number and error code */
 	iret
 ENDPROC(early_idt_handler)

+ 2 - 2
arch/x86/kernel/head_64.S

@@ -344,7 +344,7 @@ ENTRY(early_idt_handler)
 	cld

 	cmpl $2,(%rsp)		# X86_TRAP_NMI
-	je is_nmi		# Ignore NMI
+	je .Lis_nmi		# Ignore NMI

 	cmpl $2,early_recursion_flag(%rip)
 	jz  1f
@@ -409,7 +409,7 @@ ENTRY(early_idt_handler)
 	popq %rcx
 	popq %rax
 	decl early_recursion_flag(%rip)
-is_nmi:
+.Lis_nmi:
 	addq $16,%rsp		# drop vector number and error code
 	INTERRUPT_RETURN
 ENDPROC(early_idt_handler)

+ 15 - 42
arch/x86/kernel/hpet.c

@@ -12,6 +12,7 @@
 #include <linux/pm.h>
 #include <linux/io.h>

+#include <asm/irqdomain.h>
 #include <asm/fixmap.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
@@ -305,8 +306,6 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }

-static int hpet_setup_msi_irq(unsigned int irq);
-
 static void hpet_set_mode(enum clock_event_mode mode,
 			  struct clock_event_device *evt, int timer)
 {
@@ -357,7 +356,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
 			hpet_enable_legacy_int();
 		} else {
 			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			hpet_setup_msi_irq(hdev->irq);
+			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
 			disable_irq(hdev->irq);
 			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
 			enable_irq(hdev->irq);
@@ -423,6 +422,7 @@ static int hpet_legacy_next_event(unsigned long delta,

 static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
 static struct hpet_dev	*hpet_devs;
+static struct irq_domain *hpet_domain;

 void hpet_msi_unmask(struct irq_data *data)
 {
@@ -473,31 +473,6 @@ static int hpet_msi_next_event(unsigned long delta,
 	return hpet_next_event(delta, evt, hdev->num);
 }

-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	if (x86_msi.setup_hpet_msi(irq, hpet_blockid)) {
-		irq_free_hwirq(irq);
-		return -EINVAL;
-	}
-	return 0;
-}
-
-static int hpet_assign_irq(struct hpet_dev *dev)
-{
-	unsigned int irq = irq_alloc_hwirq(-1);
-
-	if (!irq)
-		return -EINVAL;
-
-	irq_set_handler_data(irq, dev);
-
-	if (hpet_setup_msi_irq(irq))
-		return -EINVAL;
-
-	dev->irq = irq;
-	return 0;
-}
-
 static irqreturn_t hpet_interrupt_handler(int irq, void *data)
 {
 	struct hpet_dev *dev = (struct hpet_dev *)data;
@@ -540,9 +515,6 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
 	if (!(hdev->flags & HPET_DEV_VALID))
 		return;

-	if (hpet_setup_msi_irq(hdev->irq))
-		return;
-
 	hdev->cpu = cpu;
 	per_cpu(cpu_hpet_dev, cpu) = hdev;
 	evt->name = hdev->name;
@@ -574,7 +546,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	unsigned int id;
 	unsigned int num_timers;
 	unsigned int num_timers_used = 0;
-	int i;
+	int i, irq;

 	if (hpet_msi_disable)
 		return;
@@ -587,6 +559,10 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 	num_timers++; /* Value read out starts from 0 */
 	hpet_print_config();

+	hpet_domain = hpet_create_irq_domain(hpet_blockid);
+	if (!hpet_domain)
+		return;
+
 	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
 	if (!hpet_devs)
 		return;
@@ -601,15 +577,16 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
 		if (!(cfg & HPET_TN_FSB_CAP))
 			continue;

+		irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
+		if (irq < 0)
+			continue;
+
+		sprintf(hdev->name, "hpet%d", i);
+		hdev->num = i;
+		hdev->irq = irq;
 		hdev->flags = 0;
 		if (cfg & HPET_TN_PERIODIC_CAP)
 			hdev->flags |= HPET_DEV_PERI_CAP;
-		hdev->num = i;
-
-		sprintf(hdev->name, "hpet%d", i);
-		if (hpet_assign_irq(hdev))
-			continue;
-
 		hdev->flags |= HPET_DEV_FSB_CAP;
 		hdev->flags |= HPET_DEV_VALID;
 		num_timers_used++;
@@ -709,10 +686,6 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
 }
 #else

-static int hpet_setup_msi_irq(unsigned int irq)
-{
-	return 0;
-}
 static void hpet_msi_capability_lookup(unsigned int start_timer)
 {
 	return;
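
The rewritten lookup loop separates the two steps the old code fused together: one irqdomain is created per HPET block, then each FSB-capable timer asks that domain for an interrupt. The order of operations, reduced to a sketch with error paths and field setup trimmed (the loop shape is assumed; only the two calls and their arguments come from the diff):

	/* Flow sketch only, not the file's code. */
	hpet_domain = hpet_create_irq_domain(hpet_blockid);	/* once per block */
	if (!hpet_domain)
		return;

	for (i = 0; i < num_timers; i++) {	/* per FSB-capable timer */
		irq = hpet_assign_irq(hpet_domain, hdev, hdev->num);
		if (irq < 0)
			continue;
		hdev->irq = irq;	/* activated later via irq_domain_activate_irq() */
	}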

+ 4 - 4
arch/x86/kernel/i8259.c

@@ -329,8 +329,8 @@ static void init_8259A(int auto_eoi)
 	 */
 	outb_pic(0x11, PIC_MASTER_CMD);	/* ICW1: select 8259A-1 init */

-	/* ICW2: 8259A-1 IR0-7 mapped to 0x30-0x37 */
-	outb_pic(IRQ0_VECTOR, PIC_MASTER_IMR);
+	/* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */
+	outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR);

 	/* 8259A-1 (the master) has a slave on IR2 */
 	outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR);
@@ -342,8 +342,8 @@ static void init_8259A(int auto_eoi)

 	outb_pic(0x11, PIC_SLAVE_CMD);	/* ICW1: select 8259A-2 init */

-	/* ICW2: 8259A-2 IR0-7 mapped to IRQ8_VECTOR */
-	outb_pic(IRQ8_VECTOR, PIC_SLAVE_IMR);
+	/* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */
+	outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR);
 	/* 8259A-2 is a slave on master's IR2 */
 	outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR);
 	/* (slave's support for AEOI in flat mode is to be investigated) */
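
ISA_IRQ_VECTOR(x) replaces the sixteen IRQ0_VECTOR..IRQ15_VECTOR constants with a single mapping from ISA IRQ number to IDT vector. Assuming the macro keeps the old 16-aligned base (the removed comment pins IRQ0 at 0x30), it reduces to base plus offset; this is a sketch, the real definition lives in asm/irq_vectors.h:

	/* Sketch of the assumed definition. */
	#define FIRST_EXTERNAL_VECTOR	0x20
	#define ISA_IRQ_VECTOR(irq)	(((FIRST_EXTERNAL_VECTOR + 16) & ~15) + (irq))

	/* ISA_IRQ_VECTOR(0) == 0x30 and ISA_IRQ_VECTOR(8) == 0x38: the same
	 * values the PICs were programmed with before, minus the per-IRQ names. */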

+ 46 - 10
arch/x86/kernel/irq.c

@@ -22,6 +22,12 @@
 #define CREATE_TRACE_POINTS
 #include <asm/trace/irq_vectors.h>

+DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+DEFINE_PER_CPU(struct pt_regs *, irq_regs);
+EXPORT_PER_CPU_SYMBOL(irq_regs);
+
 atomic_t irq_err_count;

 /* Function pointer for generic interrupt vector handling */
@@ -135,6 +141,18 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
 	seq_printf(p, "%*s: %10u\n", prec, "MIS", atomic_read(&irq_mis_count));
+#endif
+#ifdef CONFIG_HAVE_KVM
+	seq_printf(p, "%*s: ", prec, "PIN");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->kvm_posted_intr_ipis);
+	seq_puts(p, "  Posted-interrupt notification event\n");
+
+	seq_printf(p, "%*s: ", prec, "PIW");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ",
+			   irq_stats(j)->kvm_posted_intr_wakeup_ipis);
+	seq_puts(p, "  Posted-interrupt wakeup event\n");
 #endif
 	return 0;
 }
@@ -192,8 +210,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;

-	irq_enter();
-	exit_idle();
+	entering_irq();

 	irq = __this_cpu_read(vector_irq[vector]);

@@ -209,7 +226,7 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 		}
 	}

-	irq_exit();
+	exiting_irq();

 	set_irq_regs(old_regs);
 	return 1;
@@ -237,6 +254,18 @@ __visible void smp_x86_platform_ipi(struct pt_regs *regs)
 }

 #ifdef CONFIG_HAVE_KVM
+static void dummy_handler(void) {}
+static void (*kvm_posted_intr_wakeup_handler)(void) = dummy_handler;
+
+void kvm_set_posted_intr_wakeup_handler(void (*handler)(void))
+{
+	if (handler)
+		kvm_posted_intr_wakeup_handler = handler;
+	else
+		kvm_posted_intr_wakeup_handler = dummy_handler;
+}
+EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
+
 /*
  * Handler for POSTED_INTERRUPT_VECTOR.
  */
@@ -244,16 +273,23 @@ __visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);

-	ack_APIC_irq();
-
-	irq_enter();
-
-	exit_idle();
-
+	entering_ack_irq();
 	inc_irq_stat(kvm_posted_intr_ipis);
+	exiting_irq();
+	set_irq_regs(old_regs);
+}

-	irq_exit();
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);

+	entering_ack_irq();
+	inc_irq_stat(kvm_posted_intr_wakeup_ipis);
+	kvm_posted_intr_wakeup_handler();
+	exiting_irq();
 	set_irq_regs(old_regs);
 }
 #endif
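
kvm_set_posted_intr_wakeup_handler() is the whole consumer API: install a callback for the wakeup IPI, or pass NULL to fall back to the no-op. A hedged usage sketch (the consumer module and function names are hypothetical):

	/* Hypothetical consumer of the hook exported above. */
	static void my_wakeup(void)
	{
		/* runs from smp_kvm_posted_intr_wakeup_ipi(), IRQ context */
	}

	static int consumer_init(void)
	{
		kvm_set_posted_intr_wakeup_handler(my_wakeup);
		return 0;
	}

	static void consumer_exit(void)
	{
		kvm_set_posted_intr_wakeup_handler(NULL); /* back to dummy_handler */
	}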

+ 0 - 6
arch/x86/kernel/irq_32.c

@@ -21,12 +21,6 @@

 #include <asm/apic.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 #ifdef CONFIG_DEBUG_STACKOVERFLOW

 int sysctl_panic_on_stackoverflow __read_mostly;

+ 0 - 6
arch/x86/kernel/irq_64.c

@@ -20,12 +20,6 @@
 #include <asm/idle.h>
 #include <asm/apic.h>

-DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
-EXPORT_PER_CPU_SYMBOL(irq_stat);
-
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 int sysctl_panic_on_stackoverflow;

 /*

+ 2 - 8
arch/x86/kernel/irq_work.c

@@ -10,12 +10,6 @@
 #include <asm/apic.h>
 #include <asm/trace/irq_vectors.h>

-static inline void irq_work_entering_irq(void)
-{
-	irq_enter();
-	ack_APIC_irq();
-}
-
 static inline void __smp_irq_work_interrupt(void)
 {
 	inc_irq_stat(apic_irq_work_irqs);
@@ -24,14 +18,14 @@ static inline void __smp_irq_work_interrupt(void)

 __visible void smp_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_irq_work_interrupt();
 	exiting_irq();
 }

 __visible void smp_trace_irq_work_interrupt(struct pt_regs *regs)
 {
-	irq_work_entering_irq();
+	ipi_entering_ack_irq();
 	trace_irq_work_entry(IRQ_WORK_VECTOR);
 	__smp_irq_work_interrupt();
 	trace_irq_work_exit(IRQ_WORK_VECTOR);

+ 4 - 2
arch/x86/kernel/irqinit.c

@@ -86,7 +86,7 @@ void __init init_IRQ(void)
 	int i;

 	/*
-	 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
+	 * On cpu 0, Assign ISA_IRQ_VECTOR(irq) to IRQ 0..15.
 	 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
 	 * then this configuration will likely be static after the boot. If
 	 * these IRQ's are handled by more mordern controllers like IO-APIC,
@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;

 	x86_init.irqs.intr_init();
 }
@@ -144,6 +144,8 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_HAVE_KVM
 	/* IPI for KVM to deliver posted interrupt */
 	alloc_intr_gate(POSTED_INTR_VECTOR, kvm_posted_intr_ipi);
+	/* IPI for KVM to deliver interrupt to wake up tasks */
+	alloc_intr_gate(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi);
 #endif

 	/* IPI vectors for APIC spurious and error interrupts */

+ 1 - 6
arch/x86/kernel/mpparse.c

@@ -19,8 +19,8 @@
 #include <linux/module.h>
 #include <linux/smp.h>
 #include <linux/pci.h>
-#include <linux/irqdomain.h>

+#include <asm/irqdomain.h>
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
 #include <asm/pgalloc.h>
@@ -113,11 +113,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		pr_warn("Unknown bustype %s - ignoring\n", str);
 }

-static struct irq_domain_ops mp_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-	.unmap = mp_irqdomain_unmap,
-};
-
 static void __init MP_ioapic_info(struct mpc_ioapic *m)
 {
 	struct ioapic_domain_cfg cfg = {

+ 3 - 1
arch/x86/kernel/paravirt.c

@@ -154,7 +154,9 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		ret = paravirt_patch_ident_64(insnbuf, len);

 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+#ifdef CONFIG_X86_32
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
+#endif
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64))
 		/* If operation requires a jmp, then jmp */
@@ -371,7 +373,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {

 	.load_sp0 = native_load_sp0,

-#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_32)
 	.irq_enable_sysexit = native_irq_enable_sysexit,
 #endif
 #ifdef CONFIG_X86_64

+ 0 - 1
arch/x86/kernel/paravirt_patch_64.c

@@ -49,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
-		PATCH_SITE(pv_cpu_ops, irq_enable_sysexit);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret32);
 		PATCH_SITE(pv_cpu_ops, usergs_sysret64);
 		PATCH_SITE(pv_cpu_ops, swapgs);

+ 1 - 4
arch/x86/kernel/process_32.c

@@ -302,13 +302,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	arch_end_context_switch(next_p);

 	/*
-	 * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+	 * Reload esp0 and cpu_current_top_of_stack.  This changes
 	 * current_thread_info().
 	 */
 	load_sp0(tss, next);
-	this_cpu_write(kernel_stack,
-		       (unsigned long)task_stack_page(next_p) +
-		       THREAD_SIZE);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);

+ 0 - 3
arch/x86/kernel/process_64.c

@@ -409,9 +409,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* Reload esp0 and ss1.  This changes current_thread_info(). */
 	load_sp0(tss, next);

-	this_cpu_write(kernel_stack,
-		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
-
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
 	 */

+ 1 - 2
arch/x86/kernel/setup.c

@@ -1222,8 +1222,7 @@ void __init setup_arch(char **cmdline_p)
 	init_cpu_to_node();

 	init_apic_mappings();
-	if (x86_io_apic_ops.init)
-		x86_io_apic_ops.init();
+	io_apic_init_mappings();

 	kvm_guest_init();


+ 6 - 13
arch/x86/kernel/smp.c

@@ -170,8 +170,7 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)

 asmlinkage __visible void smp_reboot_interrupt(void)
 {
-	ack_APIC_irq();
-	irq_enter();
+	ipi_entering_ack_irq();
 	stop_this_cpu(NULL);
 	irq_exit();
 }
@@ -265,12 +264,6 @@ __visible void smp_reschedule_interrupt(struct pt_regs *regs)
 	 */
 }

-static inline void smp_entering_irq(void)
-{
-	ack_APIC_irq();
-	irq_enter();
-}
-
 __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 {
 	/*
@@ -279,7 +272,7 @@ __visible void smp_trace_reschedule_interrupt(struct pt_regs *regs)
 	 * scheduler_ipi(). This is OK, since those functions are allowed
 	 * to nest.
 	 */
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_reschedule_entry(RESCHEDULE_VECTOR);
 	__smp_reschedule_interrupt();
 	trace_reschedule_exit(RESCHEDULE_VECTOR);
@@ -297,14 +290,14 @@ static inline void __smp_call_function_interrupt(void)

 __visible void smp_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_interrupt();
 	exiting_irq();
 }

 __visible void smp_trace_call_function_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_entry(CALL_FUNCTION_VECTOR);
 	__smp_call_function_interrupt();
 	trace_call_function_exit(CALL_FUNCTION_VECTOR);
@@ -319,14 +312,14 @@ static inline void __smp_call_function_single_interrupt(void)

 __visible void smp_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	__smp_call_function_single_interrupt();
 	exiting_irq();
 }

 __visible void smp_trace_call_function_single_interrupt(struct pt_regs *regs)
 {
-	smp_entering_irq();
+	ipi_entering_ack_irq();
 	trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
 	__smp_call_function_single_interrupt();
 	trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
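
Both deleted local helpers, smp_entering_irq() here and irq_work_entering_irq() in irq_work.c, had the same two-line body, which is the point of the change: every IPI entry path now shares one inline. The smp_reboot_interrupt() hunk shows its exact effect, ack_APIC_irq() followed by irq_enter(). A sketch of the shared pair; the real definitions are assumed to live in asm/apic.h:

	/* Sketch of the shared helpers these call sites now use (assumed). */
	static inline void ipi_entering_ack_irq(void)
	{
		ack_APIC_irq();		/* ack first: IPIs carry no further state */
		irq_enter();
	}

	static inline void exiting_irq(void)
	{
		irq_exit();
	}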

+ 39 - 4
arch/x86/kernel/smpboot.c

@@ -513,6 +513,40 @@ void __inquire_remote_apic(int apicid)
 	}
 }

+/*
+ * The Multiprocessor Specification 1.4 (1997) example code suggests
+ * that there should be a 10ms delay between the BSP asserting INIT
+ * and de-asserting INIT, when starting a remote processor.
+ * But that slows boot and resume on modern processors, which include
+ * many cores and don't require that delay.
+ *
+ * Cmdline "init_cpu_udelay=" is available to over-ride this delay.
+ * Modern processor families are quirked to remove the delay entirely.
+ */
+#define UDELAY_10MS_DEFAULT 10000

+static unsigned int init_udelay = UDELAY_10MS_DEFAULT;
+
+static int __init cpu_init_udelay(char *str)
+{
+	get_option(&str, &init_udelay);
+
+	return 0;
+}
+early_param("cpu_init_udelay", cpu_init_udelay);
+
+static void __init smp_quirk_init_udelay(void)
+{
+	/* if cmdline changed it from default, leave it alone */
+	if (init_udelay != UDELAY_10MS_DEFAULT)
+		return;
+
+	/* if modern processor, use no delay */
+	if (((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 6)) ||
+	    ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && (boot_cpu_data.x86 >= 0xF)))
+		init_udelay = 0;
+}
+
 /*
  * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
@@ -555,7 +589,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
 static int
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
-	unsigned long send_status, accept_status = 0;
+	unsigned long send_status = 0, accept_status = 0;
 	int maxlvt, num_starts, j;

 	maxlvt = lapic_get_maxlvt();
@@ -583,7 +617,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();

-	mdelay(10);
+	udelay(init_udelay);

 	pr_debug("Deasserting INIT\n");

@@ -651,6 +685,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(200);
+
 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
@@ -792,8 +827,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
 #endif
-	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 }

 /*
@@ -1176,6 +1209,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 		uv_system_init();

 	set_mtrr_aps_delayed_init();
+
+	smp_quirk_init_udelay();
 }

 void arch_enable_nonboot_cpus_begin(void)
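
Because the parameter is registered with early_param("cpu_init_udelay", ...), the quirk can be defeated from the boot command line, and any value other than UDELAY_10MS_DEFAULT counts as an explicit user choice. Illustrative usage; the values are examples, not recommendations:

	/*
	 * cpu_init_udelay=10000	keep the MP-spec 10ms INIT delay everywhere
	 * cpu_init_udelay=0		skip the delay even on unquirked parts
	 *
	 * smp_quirk_init_udelay() then honors the override:
	 */
	if (init_udelay != UDELAY_10MS_DEFAULT)	/* changed on the cmdline */
		return;				/* leave the user's value alone */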

+ 2 - 2
arch/x86/kernel/traps.c

@@ -997,8 +997,8 @@ void __init trap_init(void)
 #endif

 #ifdef CONFIG_X86_32
-	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-	set_bit(SYSCALL_VECTOR, used_vectors);
+	set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif

 	/*

+ 0 - 9
arch/x86/kernel/x86_init.c

@@ -111,11 +111,9 @@ EXPORT_SYMBOL_GPL(x86_platform);
 #if defined(CONFIG_PCI_MSI)
 struct x86_msi_ops x86_msi = {
 	.setup_msi_irqs		= native_setup_msi_irqs,
-	.compose_msi_msg	= native_compose_msi_msg,
 	.teardown_msi_irq	= native_teardown_msi_irq,
 	.teardown_msi_irqs	= default_teardown_msi_irqs,
 	.restore_msi_irqs	= default_restore_msi_irqs,
-	.setup_hpet_msi		= default_setup_hpet_msi,
 };

 /* MSI arch specific hooks */
@@ -141,13 +139,6 @@ void arch_restore_msi_irqs(struct pci_dev *dev)
 #endif

 struct x86_io_apic_ops x86_io_apic_ops = {
-	.init			= native_io_apic_init_mappings,
 	.read			= native_io_apic_read,
-	.write			= native_io_apic_write,
-	.modify			= native_io_apic_modify,
 	.disable		= native_disable_io_apic,
-	.print_entries		= native_io_apic_print_entries,
-	.set_affinity		= native_ioapic_set_affinity,
-	.setup_entry		= native_setup_ioapic_entry,
-	.eoi_ioapic_pin		= native_eoi_ioapic_pin,
 };

+ 2 - 2
arch/x86/lguest/boot.c

@@ -90,7 +90,7 @@ struct lguest_data lguest_data = {
 	.noirq_iret = (u32)lguest_noirq_iret,
 	.kernel_address = PAGE_OFFSET,
 	.blocked_interrupts = { 1 }, /* Block timer interrupts */
-	.syscall_vec = SYSCALL_VECTOR,
+	.syscall_vec = IA32_SYSCALL_VECTOR,
 };

 /*G:037
@@ -866,7 +866,7 @@ static void __init lguest_init_IRQ(void)
 	for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
 		/* Some systems map "vectors" to interrupts weirdly.  Not us! */
 		__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
-		if (i != SYSCALL_VECTOR)
+		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, irq_entries_start +
 					8 * (i - FIRST_EXTERNAL_VECTOR));
 	}

+ 1 - 1
arch/x86/lib/Makefile

@@ -40,6 +40,6 @@ else
        lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
        lib-y += clear_page_64.o copy_page_64.o
        lib-y += memmove_64.o memset_64.o
-        lib-y += copy_user_64.o copy_user_nocache_64.o
+        lib-y += copy_user_64.o
	lib-y += cmpxchg16b_emu.o
 endif

+ 92 - 24
arch/x86/lib/copy_user_64.S

@@ -16,30 +16,6 @@
 #include <asm/asm.h>
 #include <asm/smap.h>

-	.macro ALIGN_DESTINATION
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-	.endm
-
 /* Standard copy_to_user with segment limit checking */
 ENTRY(_copy_to_user)
 	CFI_STARTPROC
@@ -266,3 +242,95 @@ ENTRY(copy_user_enhanced_fast_string)
 	_ASM_EXTABLE(1b,12b)
 	CFI_ENDPROC
 ENDPROC(copy_user_enhanced_fast_string)
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ */
+ENTRY(__copy_user_nocache)
+	CFI_STARTPROC
+	ASM_STAC
+	cmpl $8,%edx
+	jb 20f		/* less then 8 bytes, go to byte copy loop */
+	ALIGN_DESTINATION
+	movl %edx,%ecx
+	andl $63,%edx
+	shrl $6,%ecx
+	jz 17f
+1:	movq (%rsi),%r8
+2:	movq 1*8(%rsi),%r9
+3:	movq 2*8(%rsi),%r10
+4:	movq 3*8(%rsi),%r11
+5:	movnti %r8,(%rdi)
+6:	movnti %r9,1*8(%rdi)
+7:	movnti %r10,2*8(%rdi)
+8:	movnti %r11,3*8(%rdi)
+9:	movq 4*8(%rsi),%r8
+10:	movq 5*8(%rsi),%r9
+11:	movq 6*8(%rsi),%r10
+12:	movq 7*8(%rsi),%r11
+13:	movnti %r8,4*8(%rdi)
+14:	movnti %r9,5*8(%rdi)
+15:	movnti %r10,6*8(%rdi)
+16:	movnti %r11,7*8(%rdi)
+	leaq 64(%rsi),%rsi
+	leaq 64(%rdi),%rdi
+	decl %ecx
+	jnz 1b
+17:	movl %edx,%ecx
+	andl $7,%edx
+	shrl $3,%ecx
+	jz 20f
+18:	movq (%rsi),%r8
+19:	movnti %r8,(%rdi)
+	leaq 8(%rsi),%rsi
+	leaq 8(%rdi),%rdi
+	decl %ecx
+	jnz 18b
+20:	andl %edx,%edx
+	jz 23f
+	movl %edx,%ecx
+21:	movb (%rsi),%al
+22:	movb %al,(%rdi)
+	incq %rsi
+	incq %rdi
+	decl %ecx
+	jnz 21b
+23:	xorl %eax,%eax
+	ASM_CLAC
+	sfence
+	ret
+
+	.section .fixup,"ax"
+30:	shll $6,%ecx
+	addl %ecx,%edx
+	jmp 60f
+40:	lea (%rdx,%rcx,8),%rdx
+	jmp 60f
+50:	movl %ecx,%edx
+60:	sfence
+	jmp copy_user_handle_tail
+	.previous
+
+	_ASM_EXTABLE(1b,30b)
+	_ASM_EXTABLE(2b,30b)
+	_ASM_EXTABLE(3b,30b)
+	_ASM_EXTABLE(4b,30b)
+	_ASM_EXTABLE(5b,30b)
+	_ASM_EXTABLE(6b,30b)
+	_ASM_EXTABLE(7b,30b)
+	_ASM_EXTABLE(8b,30b)
+	_ASM_EXTABLE(9b,30b)
+	_ASM_EXTABLE(10b,30b)
+	_ASM_EXTABLE(11b,30b)
+	_ASM_EXTABLE(12b,30b)
+	_ASM_EXTABLE(13b,30b)
+	_ASM_EXTABLE(14b,30b)
+	_ASM_EXTABLE(15b,30b)
+	_ASM_EXTABLE(16b,30b)
+	_ASM_EXTABLE(18b,40b)
+	_ASM_EXTABLE(19b,40b)
+	_ASM_EXTABLE(21b,50b)
+	_ASM_EXTABLE(22b,50b)
+	CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
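
The body of __copy_user_nocache above is a three-tier loop: 64-byte cache lines via eight movnti stores, then 8-byte words, then a byte tail, with an sfence before returning so the non-temporal stores become globally visible. The same structure in C using the SSE2 intrinsic; this is a sketch, not the kernel's routine, which additionally wires every load and store into the exception table:

	#include <emmintrin.h>	/* _mm_stream_si64, _mm_sfence */
	#include <stddef.h>
	#include <stdint.h>

	/* Structural sketch of a cache-bypassing copy (no fault handling). */
	static void copy_nocache(void *dst, const void *src, size_t len)
	{
		int64_t *d = dst;
		const int64_t *s = src;

		while (len >= 64) {		/* whole cache lines */
			for (int i = 0; i < 8; i++)
				_mm_stream_si64(&d[i], s[i]);
			d += 8; s += 8; len -= 64;
		}
		while (len >= 8) {		/* remaining 8-byte words */
			_mm_stream_si64(d++, *s++);
			len -= 8;
		}
		if (len)			/* byte tail, ordinary stores */
			__builtin_memcpy(d, s, len);
		_mm_sfence();			/* order the NT stores */
	}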

+ 0 - 136
arch/x86/lib/copy_user_nocache_64.S

@@ -1,136 +0,0 @@
-/*
- * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
- * Copyright 2002 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License v2.
- *
- * Functions to copy from and to user space.
- */
-
-#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jz 102f				/* already aligned */
-	subl $8,%ecx
-	negl %ecx
-	subl %ecx,%edx
-100:	movb (%rsi),%al
-101:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 100b
-102:
-	.section .fixup,"ax"
-103:	addl %ecx,%edx			/* ecx is zerorest also */
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(100b,103b)
-	_ASM_EXTABLE(101b,103b)
-#endif
-	.endm
-
-/*
- * copy_user_nocache - Uncached memory copy with exception handling
- * This will force destination/source out of cache for more performance.
- */
-ENTRY(__copy_user_nocache)
-	CFI_STARTPROC
-	ASM_STAC
-	cmpl $8,%edx
-	jb 20f		/* less then 8 bytes, go to byte copy loop */
-	ALIGN_DESTINATION
-	movl %edx,%ecx
-	andl $63,%edx
-	shrl $6,%ecx
-	jz 17f
-1:	movq (%rsi),%r8
-2:	movq 1*8(%rsi),%r9
-3:	movq 2*8(%rsi),%r10
-4:	movq 3*8(%rsi),%r11
-5:	movnti %r8,(%rdi)
-6:	movnti %r9,1*8(%rdi)
-7:	movnti %r10,2*8(%rdi)
-8:	movnti %r11,3*8(%rdi)
-9:	movq 4*8(%rsi),%r8
-10:	movq 5*8(%rsi),%r9
-11:	movq 6*8(%rsi),%r10
-12:	movq 7*8(%rsi),%r11
-13:	movnti %r8,4*8(%rdi)
-14:	movnti %r9,5*8(%rdi)
-15:	movnti %r10,6*8(%rdi)
-16:	movnti %r11,7*8(%rdi)
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-	decl %ecx
-	jnz 1b
-17:	movl %edx,%ecx
-	andl $7,%edx
-	shrl $3,%ecx
-	jz 20f
-18:	movq (%rsi),%r8
-19:	movnti %r8,(%rdi)
-	leaq 8(%rsi),%rsi
-	leaq 8(%rdi),%rdi
-	decl %ecx
-	jnz 18b
-20:	andl %edx,%edx
-	jz 23f
-	movl %edx,%ecx
-21:	movb (%rsi),%al
-22:	movb %al,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz 21b
-23:	xorl %eax,%eax
-	ASM_CLAC
-	sfence
-	ret
-
-	.section .fixup,"ax"
-30:	shll $6,%ecx
-	addl %ecx,%edx
-	jmp 60f
-40:	lea (%rdx,%rcx,8),%rdx
-	jmp 60f
-50:	movl %ecx,%edx
-60:	sfence
-	jmp copy_user_handle_tail
-	.previous
-
-	_ASM_EXTABLE(1b,30b)
-	_ASM_EXTABLE(2b,30b)
-	_ASM_EXTABLE(3b,30b)
-	_ASM_EXTABLE(4b,30b)
-	_ASM_EXTABLE(5b,30b)
-	_ASM_EXTABLE(6b,30b)
-	_ASM_EXTABLE(7b,30b)
-	_ASM_EXTABLE(8b,30b)
-	_ASM_EXTABLE(9b,30b)
-	_ASM_EXTABLE(10b,30b)
-	_ASM_EXTABLE(11b,30b)
-	_ASM_EXTABLE(12b,30b)
-	_ASM_EXTABLE(13b,30b)
-	_ASM_EXTABLE(14b,30b)
-	_ASM_EXTABLE(15b,30b)
-	_ASM_EXTABLE(16b,30b)
-	_ASM_EXTABLE(18b,40b)
-	_ASM_EXTABLE(19b,40b)
-	_ASM_EXTABLE(21b,50b)
-	_ASM_EXTABLE(22b,50b)
-	CFI_ENDPROC
-ENDPROC(__copy_user_nocache)

+ 1 - 1
arch/x86/mm/iomap_32.c

@@ -82,7 +82,7 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
 	 * MTRR is UC or WC.  UC_MINUS gets the real intention, of the
 	 * user, which is "WC if the MTRR is WC, UC if you can't do that."
 	 */
-	if (!pat_enabled && pgprot_val(prot) ==
+	if (!pat_enabled() && pgprot_val(prot) ==
 	    (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
 		prot = __pgprot(__PAGE_KERNEL |
 				cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));

+ 39 - 5
arch/x86/mm/ioremap.c

@@ -234,10 +234,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 {
 	/*
 	 * Ideally, this should be:
-	 *	pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+	 *	pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
 	 *
 	 * Till we fix all X drivers to use ioremap_wc(), we will use
-	 * UC MINUS.
+	 * UC MINUS. Drivers that are certain they need or can already
+	 * be converted over to strong UC can use ioremap_uc().
 	 */
 	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;

@@ -246,6 +247,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
 }
 EXPORT_SYMBOL(ioremap_nocache);

+/**
+ * ioremap_uc     -   map bus memory into CPU space as strongly uncachable
+ * @phys_addr:    bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC.  This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+	enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+	return __ioremap_caller(phys_addr, size, pcm,
+				__builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
 /**
  * ioremap_wc	-	map memory into CPU space write combined
  * @phys_addr:	bus address of the memory
@@ -258,7 +292,7 @@ EXPORT_SYMBOL(ioremap_nocache);
  */
 void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
 {
-	if (pat_enabled)
+	if (pat_enabled())
 		return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
 					__builtin_return_address(0));
 	else
@@ -331,7 +365,7 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);

-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
 {
 #ifdef CONFIG_X86_64
 	return cpu_has_gbpages;
@@ -340,7 +374,7 @@ int arch_ioremap_pud_supported(void)
 #endif
 }

-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
 {
 	return cpu_has_pse;
 }
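
A typical caller of the new ioremap_uc() maps a small register window where neither write combining nor MTRR-granted caching is acceptable. The device, BAR choice, and register value below are hypothetical; only ioremap_uc() and iounmap() come from this file:

	/* Hypothetical driver mapping a BAR as strongly uncachable. */
	static int demo_map_regs(struct pci_dev *pdev)
	{
		void __iomem *regs = ioremap_uc(pci_resource_start(pdev, 0),
						pci_resource_len(pdev, 0));
		if (!regs)
			return -ENOMEM;

		writel(0x1, regs);	/* reaches the device uncombined */
		iounmap(regs);
		return 0;
	}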

+ 10 - 10
arch/x86/mm/pageattr.c

@@ -129,16 +129,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
  */
 void clflush_cache_range(void *vaddr, unsigned int size)
 {
-	void *vend = vaddr + size - 1;
+	unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+	void *vend = vaddr + size;
+	void *p;

 	mb();

-	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
-		clflushopt(vaddr);
-	/*
-	 * Flush any possible final partial cacheline:
-	 */
-	clflushopt(vend);
+	for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+	     p < vend; p += boot_cpu_data.x86_clflush_size)
+		clflushopt(p);

 	mb();
 }
@@ -418,13 +417,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
 	phys_addr_t phys_addr;
 	unsigned long offset;
 	enum pg_level level;
-	unsigned long psize;
 	unsigned long pmask;
 	pte_t *pte;

 	pte = lookup_address(virt_addr, &level);
 	BUG_ON(!pte);
-	psize = page_level_size(level);
 	pmask = page_level_mask(level);
 	offset = virt_addr & ~pmask;
 	phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1468,6 +1465,9 @@ int _set_memory_uc(unsigned long addr, int numpages)
 {
 	/*
 	 * for now UC MINUS. see comments in ioremap_nocache()
+	 * If you really need strong UC use ioremap_uc(), but note
+	 * that you cannot override IO areas with set_memory_*() as
+	 * these helpers cannot work with IO memory.
 	 */
 	return change_page_attr_set(&addr, numpages,
 				    cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1571,7 +1571,7 @@ int set_memory_wc(unsigned long addr, int numpages)
 {
 	int ret;

-	if (!pat_enabled)
+	if (!pat_enabled())
 		return set_memory_uc(addr, numpages);

 	ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
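
Rounding the start address down to a cache-line boundary makes the clflush loop cover every line the range touches exactly once, where the old code could flush the final line twice. Worked example with 64-byte lines: vaddr = 0x1000, size = 0x40; the old code flushed vaddr (line 0x1000) and then "the final partial cacheline" at vend = 0x103f, the same line again, while the new loop visits it once. The masking idiom in isolation (illustrative, GNU C void-pointer arithmetic as in the kernel):

	/* Iterate the cache lines touched by [vaddr, vaddr + size). */
	unsigned long mask = clflush_size - 1;	/* clflush_size is a power of two */
	void *p    = (void *)((unsigned long)vaddr & ~mask);	/* round down   */
	void *vend = vaddr + size;				/* exclusive end */

	for (; p < vend; p += clflush_size)
		clflushopt(p);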

+ 40 - 42
arch/x86/mm/pat.c

@@ -33,13 +33,15 @@
 #include "pat_internal.h"
 #include "mm_internal.h"

-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);

 static inline void pat_disable(const char *reason)
 {
-	pat_enabled = 0;
-	printk(KERN_INFO "%s\n", reason);
+	__pat_enabled = 0;
+	pr_info("x86/PAT: %s\n", reason);
 }

 static int __init nopat(char *str)
@@ -48,13 +50,12 @@ static int __init nopat(char *str)
 	return 0;
 }
 early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
 {
-	(void)reason;
+	return !!__pat_enabled;
 }
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);

 int pat_debug_enable;

@@ -188,7 +189,7 @@ void pat_init_cache_modes(void)
 					   pat_msg + 4 * i);
 		update_cache_mode_entry(i, cache);
 	}
-	pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+	pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
 }

 #define PAT(x, y)	((u64)PAT_ ## y << ((x)*8))
@@ -198,7 +199,7 @@ void pat_init(void)
 	u64 pat;
 	bool boot_cpu = !boot_pat_state;

-	if (!pat_enabled)
+	if (!pat_enabled())
 		return;

 	if (!cpu_has_pat) {
@@ -211,8 +212,7 @@ void pat_init(void)
 			 * switched to PAT on the boot CPU. We have no way to
 			 * undo PAT.
 			 */
-			printk(KERN_ERR "PAT enabled, "
-			       "but not supported by secondary CPU\n");
+			pr_err("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
 			BUG();
 		}
 	}
@@ -267,9 +267,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end,
 	 * request is for WB.
 	 */
 	if (req_type == _PAGE_CACHE_MODE_WB) {
-		u8 mtrr_type;
+		u8 mtrr_type, uniform;

-		mtrr_type = mtrr_type_lookup(start, end);
+		mtrr_type = mtrr_type_lookup(start, end, &uniform);
 		if (mtrr_type != MTRR_TYPE_WRBACK)
 			return _PAGE_CACHE_MODE_UC_MINUS;

@@ -347,7 +347,7 @@ static int reserve_ram_pages_type(u64 start, u64 end,
 		page = pfn_to_page(pfn);
 		type = get_page_memtype(page);
 		if (type != -1) {
-			pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+			pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
 				start, end - 1, type, req_type);
 			if (new_type)
 				*new_type = type;
@@ -400,7 +400,7 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,

 	BUG_ON(start >= end); /* end is exclusive */

-	if (!pat_enabled) {
+	if (!pat_enabled()) {
 		/* This is identical to page table setting without PAT */
 		if (new_type) {
 			if (req_type == _PAGE_CACHE_MODE_WC)
@@ -451,9 +451,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,

 	err = rbt_memtype_check_insert(new, new_type);
 	if (err) {
-		printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
-		       start, end - 1,
-		       cattr_name(new->type), cattr_name(req_type));
+		pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+			start, end - 1,
+			cattr_name(new->type), cattr_name(req_type));
 		kfree(new);
 		spin_unlock(&memtype_lock);

@@ -475,7 +475,7 @@ int free_memtype(u64 start, u64 end)
 	int is_range_ram;
 	struct memtype *entry;

-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;

 	/* Low ISA region is always mapped WB. No need to track */
@@ -497,8 +497,8 @@ int free_memtype(u64 start, u64 end)
 	spin_unlock(&memtype_lock);

 	if (!entry) {
-		printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
-		       current->comm, current->pid, start, end - 1);
+		pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+			current->comm, current->pid, start, end - 1);
 		return -EINVAL;
 	}

@@ -623,13 +623,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 	u64 to = from + size;
 	u64 cursor = from;

-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 1;

 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
-			       current->comm, from, to - 1);
+			pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+				current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
@@ -659,7 +659,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
 	 * caching for the high addresses through the KEN pin, but
 	 * we maintain the tradition of paranoia in this code.
 	 */
-	if (!pat_enabled &&
+	if (!pat_enabled() &&
 	    !(boot_cpu_has(X86_FEATURE_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_K6_MTRR) ||
 	      boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -698,8 +698,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
 				size;

 	if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
-		printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
-			"for [mem %#010Lx-%#010Lx]\n",
+		pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
 			current->comm, current->pid,
 			cattr_name(pcm),
 			base, (unsigned long long)(base + size-1));
@@ -729,12 +728,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 	 * the type requested matches the type of first page in the range.
 	 */
 	if (is_ram) {
-		if (!pat_enabled)
+		if (!pat_enabled())
 			return 0;

 		pcm = lookup_memtype(paddr);
 		if (want_pcm != pcm) {
-			printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+			pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
 				current->comm, current->pid,
 				cattr_name(want_pcm),
 				(unsigned long long)paddr,
@@ -755,13 +754,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
 		if (strict_prot ||
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 		    !is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
 			free_memtype(paddr, paddr + size);
 			free_memtype(paddr, paddr + size);
-			printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
-				" for [mem %#010Lx-%#010Lx], got %s\n",
-				current->comm, current->pid,
-				cattr_name(want_pcm),
-				(unsigned long long)paddr,
-				(unsigned long long)(paddr + size - 1),
-				cattr_name(pcm));
+			pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+			       current->comm, current->pid,
+			       cattr_name(want_pcm),
+			       (unsigned long long)paddr,
+			       (unsigned long long)(paddr + size - 1),
+			       cattr_name(pcm));
 			return -EINVAL;
 			return -EINVAL;
 		}
 		}
 		/*
 		/*
@@ -844,7 +842,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
 		return ret;
 		return ret;
 	}
 	}
 
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 		return 0;
 
 
 	/*
 	/*
@@ -872,7 +870,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
 {
 {
 	enum page_cache_mode pcm;
 	enum page_cache_mode pcm;
 
 
-	if (!pat_enabled)
+	if (!pat_enabled())
 		return 0;
 		return 0;
 
 
 	/* Set prot based on lookup */
 	/* Set prot based on lookup */
@@ -913,7 +911,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 
 
 pgprot_t pgprot_writecombine(pgprot_t prot)
 pgprot_t pgprot_writecombine(pgprot_t prot)
 {
 {
-	if (pat_enabled)
+	if (pat_enabled())
 		return __pgprot(pgprot_val(prot) |
 		return __pgprot(pgprot_val(prot) |
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
 				cachemode2protval(_PAGE_CACHE_MODE_WC));
 	else
 	else
@@ -996,7 +994,7 @@ static const struct file_operations memtype_fops = {
 
 
 static int __init pat_memtype_list_init(void)
 static int __init pat_memtype_list_init(void)
 {
 {
-	if (pat_enabled) {
+	if (pat_enabled()) {
 		debugfs_create_file("pat_memtype_list", S_IRUSR,
 		debugfs_create_file("pat_memtype_list", S_IRUSR,
 				    arch_debugfs_dir, NULL, &memtype_fops);
 				    arch_debugfs_dir, NULL, &memtype_fops);
 	}
 	}
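
The pat.c hunks above convert every direct read of the pat_enabled flag into a pat_enabled() accessor call and give every message a uniform "x86/PAT:" prefix. A minimal user-space sketch of that accessor pattern follows; the names and the main() driver are hypothetical, purely to show why the flag itself can stay private to one file:

#include <stdbool.h>
#include <stdio.h>

/* Sketch only: the flag is file-local, callers go through the accessor. */
static bool __pat_enabled = true;

static inline bool pat_enabled(void)
{
	return __pat_enabled;
}

static void pat_disable(const char *reason)
{
	__pat_enabled = false;
	printf("x86/PAT: %s\n", reason);
}

int main(void)
{
	if (pat_enabled())
		printf("x86/PAT: Configuration active\n");
	pat_disable("PAT support disabled for this demo");
	printf("pat_enabled() now reports %d\n", pat_enabled());
	return 0;
}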

+ 1 - 1
arch/x86/mm/pat_internal.h

@@ -4,7 +4,7 @@
 extern int pat_debug_enable;
 
 #define dprintk(fmt, arg...) \
-	do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+	do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
 
 struct memtype {
 	u64			start;

+ 3 - 3
arch/x86/mm/pat_rbtree.c

@@ -160,9 +160,9 @@ success:
 	return 0;
 
 failure:
-	printk(KERN_INFO "%s:%d conflicting memory types "
-		"%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
-		end, cattr_name(found_type), cattr_name(match->type));
+	pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+		current->comm, current->pid, start, end,
+		cattr_name(found_type), cattr_name(match->type));
 	return -EBUSY;
 }
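
The failure path above fires when a requested range overlaps an already-tracked range of a different cache type. A simplified standalone sketch of that overlap-plus-type check (not the kernel's interval-tree code; types and values are illustrative):

#include <stdio.h>

typedef unsigned long long u64;

struct memtype { u64 start; u64 end; int type; };

/* Ranges conflict when they overlap and request different cache types. */
static int check_conflict(const struct memtype *a, const struct memtype *b)
{
	if (a->end <= b->start || b->end <= a->start)
		return 0;		/* disjoint: no conflict */
	return a->type != b->type;	/* overlap: types must match */
}

int main(void)
{
	struct memtype tracked = { 0x1000, 0x3000, 0 /* say, WB */ };
	struct memtype req     = { 0x2000, 0x4000, 1 /* say, UC- */ };

	if (check_conflict(&tracked, &req))
		printf("x86/PAT: conflicting memory types %llx-%llx\n",
		       req.start, req.end);
	return 0;
}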
 
 

+ 46 - 14
arch/x86/mm/pgtable.c

@@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ *   has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK))
 		return 0;
 
 	prot = pgprot_4k_2_large(prot);
@@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 {
-	u8 mtrr;
+	u8 mtrr, uniform;
 
-	/*
-	 * Do not use a huge page when the range is covered by non-WB type
-	 * of MTRRs.
-	 */
-	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
-	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+	if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+	    (mtrr != MTRR_TYPE_WRBACK)) {
+		pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+			     __func__, addr, addr + PMD_SIZE);
 		return 0;
+	}
 
 	prot = pgprot_4k_2_large(prot);
 
@@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
 	return 1;
 }
 
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
 int pud_clear_huge(pud_t *pud)
 {
 	if (pud_large(*pud)) {
@@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud)
 	return 0;
 }
 
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
 int pmd_clear_huge(pmd_t *pmd)
 {
 	if (pmd_large(*pmd)) {
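
The new pud_set_huge()/pmd_set_huge() predicate refuses a huge mapping only when MTRRs are active, the range is not uniformly covered by one MTRR, and the MTRR type is not WB. A standalone truth-table sketch of that condition (the constants mirror the diff; the helper and driver are illustrative):

#include <stdio.h>

#define MTRR_TYPE_WRBACK	6
#define MTRR_TYPE_INVALID	0xFF	/* lookup failed or MTRRs disabled */

/*
 * Mirror of the diff's check: a huge page is allowed unless the type is
 * valid, the range is non-uniform, and the type is not write-back.
 */
static int huge_page_allowed(unsigned char mtrr, unsigned char uniform)
{
	return !((mtrr != MTRR_TYPE_INVALID) && !uniform &&
		 (mtrr != MTRR_TYPE_WRBACK));
}

int main(void)
{
	printf("MTRRs disabled:        %d\n", huge_page_allowed(MTRR_TYPE_INVALID, 0));
	printf("uniform UC range:      %d\n", huge_page_allowed(0 /* UC */, 1));
	printf("non-uniform WB range:  %d\n", huge_page_allowed(MTRR_TYPE_WRBACK, 0));
	printf("non-uniform UC range:  %d\n", huge_page_allowed(0 /* UC */, 0));
	return 0;
}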

+ 3 - 3
arch/x86/pci/i386.c

@@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 	 * Caller can followup with UC MINUS request and add a WC mtrr if there
 	 * is a free mtrr slot.
 	 */
-	if (!pat_enabled && write_combine)
+	if (!pat_enabled() && write_combine)
 		return -EINVAL;
 
-	if (pat_enabled && write_combine)
+	if (pat_enabled() && write_combine)
 		prot |= cachemode2protval(_PAGE_CACHE_MODE_WC);
-	else if (pat_enabled || boot_cpu_data.x86 > 3)
+	else if (pat_enabled() || boot_cpu_data.x86 > 3)
 		/*
 		 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
 		 * To avoid attribute conflicts, request UC MINUS here

+ 3 - 3
arch/x86/pci/intel_mid_pci.c

@@ -208,6 +208,7 @@ static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
 
 static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 {
+	struct irq_alloc_info info;
 	int polarity;
 
 	if (dev->irq_managed && dev->irq > 0)
@@ -217,14 +218,13 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 		polarity = 0; /* active high */
 	else
 		polarity = 1; /* active low */
+	ioapic_set_alloc_attr(&info, dev_to_node(&dev->dev), 1, polarity);
 
 	/*
 	 * MRST only have IOAPIC, the PCI irq lines are 1:1 mapped to
 	 * IOAPIC RTE entries, so we just enable RTE for the device.
 	 */
-	if (mp_set_gsi_attr(dev->irq, 1, polarity, dev_to_node(&dev->dev)))
-		return -EBUSY;
-	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
+	if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC, &info) < 0)
 		return -EBUSY;
 
 	dev->irq_managed = 1;

+ 7 - 6
arch/x86/pci/irq.c

@@ -146,19 +146,20 @@ static void __init pirq_peer_trick(void)
 
 /*
  *  Code for querying and setting of IRQ routes on various interrupt routers.
+ *  PIC Edge/Level Control Registers (ELCR) 0x4d0 & 0x4d1.
  */
 
-void eisa_set_level_irq(unsigned int irq)
+void elcr_set_level_irq(unsigned int irq)
 {
 	unsigned char mask = 1 << (irq & 7);
 	unsigned int port = 0x4d0 + (irq >> 3);
 	unsigned char val;
-	static u16 eisa_irq_mask;
+	static u16 elcr_irq_mask;
 
-	if (irq >= 16 || (1 << irq) & eisa_irq_mask)
+	if (irq >= 16 || (1 << irq) & elcr_irq_mask)
 		return;
 
-	eisa_irq_mask |= (1 << irq);
+	elcr_irq_mask |= (1 << irq);
 	printk(KERN_DEBUG "PCI: setting IRQ %u as level-triggered\n", irq);
 	val = inb(port);
 	if (!(val & mask)) {
@@ -965,11 +966,11 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 	} else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
 	((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
 		msg = "found";
-		eisa_set_level_irq(irq);
+		elcr_set_level_irq(irq);
 	} else if (newirq && r->set &&
 		(dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
 		if (r->set(pirq_router_dev, dev, pirq, newirq)) {
-			eisa_set_level_irq(newirq);
+			elcr_set_level_irq(newirq);
 			msg = "assigned";
 			irq = newirq;
 		}
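
elcr_set_level_irq() addresses one trigger bit per ISA IRQ across the two ELCR registers at I/O ports 0x4d0/0x4d1. This standalone sketch only prints the port/bit decoding; the kernel code does inb()/outb() on the real ports:

#include <stdio.h>

/* port = 0x4d0 + (irq >> 3), bit = irq & 7, for ISA IRQs 0..15. */
int main(void)
{
	unsigned int irq;

	for (irq = 0; irq < 16; irq++) {
		unsigned int port = 0x4d0 + (irq >> 3);
		unsigned char mask = 1 << (irq & 7);
		printf("IRQ %2u -> port 0x%x, mask 0x%02x\n", irq, port, mask);
	}
	return 0;
}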

+ 1 - 0
arch/x86/platform/Makefile

@@ -1,4 +1,5 @@
 # Platform specific code goes here
+obj-y	+= atom/
 obj-y	+= ce4100/
 obj-y	+= efi/
 obj-y	+= geode/

+ 1 - 0
arch/x86/platform/atom/Makefile

@@ -0,0 +1 @@
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o

+ 183 - 0
arch/x86/platform/atom/punit_atom_debug.c

@@ -0,0 +1,183 @@
+/*
+ * Intel SOC Punit device state debug driver
+ * Punit controls power management for North Complex devices (Graphics
+ * blocks, Image Signal Processing, video processing, display, DSP etc.)
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+#include <asm/cpu_device_id.h>
+#include <asm/iosf_mbi.h>
+
+/* Side band Interface port */
+#define PUNIT_PORT		0x04
+/* Power gate status reg */
+#define PWRGT_STATUS		0x61
+/* Subsystem config/status Video processor */
+#define VED_SS_PM0		0x32
+/* Subsystem config/status ISP (Image Signal Processor) */
+#define ISP_SS_PM0		0x39
+/* Subsystem config/status Input/output controller */
+#define MIO_SS_PM		0x3B
+/* Shift bits for getting status for video, isp and i/o */
+#define SSS_SHIFT		24
+/* Shift bits for getting status for graphics rendering */
+#define RENDER_POS		0
+/* Shift bits for getting status for media control */
+#define MEDIA_POS		2
+/* Shift bits for getting status for Valley View/Baytrail display */
+#define VLV_DISPLAY_POS		6
+/* Subsystem config/status display for Cherry Trail SOC */
+#define CHT_DSP_SSS		0x36
+/* Shift bits for getting status for display */
+#define CHT_DSP_SSS_POS		16
+
+struct punit_device {
+	char *name;
+	int reg;
+	int sss_pos;
+};
+
+static const struct punit_device punit_device_byt[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	PWRGT_STATUS,	VLV_DISPLAY_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const struct punit_device punit_device_cht[] = {
+	{ "GFX RENDER",	PWRGT_STATUS,	RENDER_POS },
+	{ "GFX MEDIA",	PWRGT_STATUS,	MEDIA_POS },
+	{ "DISPLAY",	CHT_DSP_SSS,	CHT_DSP_SSS_POS },
+	{ "VED",	VED_SS_PM0,	SSS_SHIFT },
+	{ "ISP",	ISP_SS_PM0,	SSS_SHIFT },
+	{ "MIO",	MIO_SS_PM,	SSS_SHIFT },
+	{ NULL }
+};
+
+static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
+{
+	u32 punit_pwr_status;
+	struct punit_device *punit_devp = seq_file->private;
+	int index;
+	int status;
+
+	seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
+	while (punit_devp->name) {
+		status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
+				       punit_devp->reg,
+				       &punit_pwr_status);
+		if (status) {
+			seq_printf(seq_file, "%9s : Read Failed\n",
+				   punit_devp->name);
+		} else  {
+			index = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+			seq_printf(seq_file, "%9s : %s\n", punit_devp->name,
+				   dstates[index]);
+		}
+		punit_devp++;
+	}
+
+	return 0;
+}
+
+static int punit_dev_state_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, punit_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations punit_dev_state_ops = {
+	.open		= punit_dev_state_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *punit_dbg_file;
+
+static int punit_dbgfs_register(struct punit_device *punit_device)
+{
+	static struct dentry *dev_state;
+
+	punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
+	if (!punit_dbg_file)
+		return -ENXIO;
+
+	dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO,
+					punit_dbg_file, punit_device,
+					&punit_dev_state_ops);
+	if (!dev_state) {
+		pr_err("punit_dev_state register failed\n");
+		debugfs_remove(punit_dbg_file);
+		return -ENXIO;
+	}
+
+	return 0;
+}
+
+static void punit_dbgfs_unregister(void)
+{
+	debugfs_remove_recursive(punit_dbg_file);
+}
+
+#define ICPU(model, drv_data) \
+	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\
+	  (kernel_ulong_t)&drv_data }
+
+static const struct x86_cpu_id intel_punit_cpu_ids[] = {
+	ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */
+	ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */
+	{}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
+
+static int __init punit_atom_debug_init(void)
+{
+	const struct x86_cpu_id *id;
+	int ret;
+
+	id = x86_match_cpu(intel_punit_cpu_ids);
+	if (!id)
+		return -ENODEV;
+
+	ret = punit_dbgfs_register((struct punit_device *)id->driver_data);
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+static void __exit punit_atom_debug_exit(void)
+{
+	punit_dbgfs_unregister();
+}
+
+module_init(punit_atom_debug_init);
+module_exit(punit_atom_debug_exit);
+
+MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_DESCRIPTION("Driver for Punit devices states debugging");
+MODULE_LICENSE("GPL v2");
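
In the driver above, each subsystem's D-state is a 2-bit field inside a Punit status register; punit_dev_state_show() shifts by the per-device position and masks with 3. A standalone decoding sketch (the register value below is illustrative, not read from real hardware; the bit positions come from the driver's #defines):

#include <stdio.h>

static const char * const dstates[] = { "D0", "D0i1", "D0i2", "D0i3" };

/* Extract a 2-bit power-state field, as punit_dev_state_show() does. */
static const char *decode_dstate(unsigned int status, int pos)
{
	return dstates[(status >> pos) & 3];
}

int main(void)
{
	unsigned int pwrgt_status = 0x03000040;	/* made-up sample value */

	printf("GFX RENDER : %s\n", decode_dstate(pwrgt_status, 0));
	printf("GFX MEDIA  : %s\n", decode_dstate(pwrgt_status, 2));
	printf("DISPLAY    : %s\n", decode_dstate(pwrgt_status, 6));
	printf("VED        : %s\n", decode_dstate(pwrgt_status, 24));
	return 0;
}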

+ 3 - 2
arch/x86/platform/intel-mid/device_libs/platform_wdt.c

@@ -27,6 +27,7 @@ static struct platform_device wdt_dev = {
 static int tangier_probe(struct platform_device *pdev)
 {
 	int gsi;
+	struct irq_alloc_info info;
 	struct intel_mid_wdt_pdata *pdata = pdev->dev.platform_data;
 
 	if (!pdata)
@@ -34,8 +35,8 @@ static int tangier_probe(struct platform_device *pdev)
 
 	/* IOAPIC builds identity mapping between GSI and IRQ on MID */
 	gsi = pdata->irq;
-	if (mp_set_gsi_attr(gsi, 1, 0, cpu_to_node(0)) ||
-	    mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC) <= 0) {
+	ioapic_set_alloc_attr(&info, cpu_to_node(0), 1, 0);
+	if (mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC, &info) <= 0) {
 		dev_warn(&pdev->dev, "cannot find interrupt %d in ioapic\n",
 			 gsi);
 		return -EINVAL;

+ 13 - 5
arch/x86/platform/intel-mid/intel-mid.c

@@ -81,26 +81,34 @@ static unsigned long __init intel_mid_calibrate_tsc(void)
 	return 0;
 }
 
+static void __init intel_mid_setup_bp_timer(void)
+{
+	apbt_time_init();
+	setup_boot_APIC_clock();
+}
+
 static void __init intel_mid_time_init(void)
 {
 	sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
+
 	switch (intel_mid_timer_options) {
 	case INTEL_MID_TIMER_APBT_ONLY:
 		break;
 	case INTEL_MID_TIMER_LAPIC_APBT:
-		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+		/* Use apbt and local apic */
+		x86_init.timers.setup_percpu_clockev = intel_mid_setup_bp_timer;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
-		break;
+		return;
 	default:
 		if (!boot_cpu_has(X86_FEATURE_ARAT))
 			break;
+		/* Lapic only, no apbt */
 		x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
 		x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
 		return;
 	}
-	/* we need at least one APB timer */
-	pre_init_apic_IRQ0();
-	apbt_time_init();
+
+	x86_init.timers.setup_percpu_clockev = apbt_time_init;
 }
 
 static void intel_mid_arch_setup(void)

+ 14 - 16
arch/x86/platform/intel-mid/sfi.c

@@ -95,18 +95,16 @@ int __init sfi_parse_mtmr(struct sfi_table_header *table)
 		pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz, irq = %d\n",
 			totallen, (u32)pentry->phys_addr,
 			pentry->freq_hz, pentry->irq);
-			if (!pentry->irq)
-				continue;
-			mp_irq.type = MP_INTSRC;
-			mp_irq.irqtype = mp_INT;
-/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
-			mp_irq.irqflag = 5;
-			mp_irq.srcbus = MP_BUS_ISA;
-			mp_irq.srcbusirq = pentry->irq;	/* IRQ */
-			mp_irq.dstapic = MP_APIC_ALL;
-			mp_irq.dstirq = pentry->irq;
-			mp_save_irq(&mp_irq);
-			mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_irq.type = MP_INTSRC;
+		mp_irq.irqtype = mp_INT;
+		/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
+		mp_irq.irqflag = 5;
+		mp_irq.srcbus = MP_BUS_ISA;
+		mp_irq.srcbusirq = pentry->irq;	/* IRQ */
+		mp_irq.dstapic = MP_APIC_ALL;
+		mp_irq.dstirq = pentry->irq;
+		mp_save_irq(&mp_irq);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 
 	return 0;
@@ -177,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
 		mp_irq.dstapic = MP_APIC_ALL;
 		mp_irq.dstirq = pentry->irq;
 		mp_save_irq(&mp_irq);
-		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC);
+		mp_map_gsi_to_irq(pentry->irq, IOAPIC_MAP_ALLOC, NULL);
 	}
 	return 0;
 }
@@ -436,6 +434,7 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 	struct devs_id *dev = NULL;
 	int num, i, ret;
 	int polarity;
+	struct irq_alloc_info info;
 
 	sb = (struct sfi_table_simple *)table;
 	num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
@@ -469,9 +468,8 @@ static int __init sfi_parse_devs(struct sfi_table_header *table)
 				polarity = 1;
 			}
 
-			ret = mp_set_gsi_attr(irq, 1, polarity, NUMA_NO_NODE);
-			if (ret == 0)
-				ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC);
+			ioapic_set_alloc_attr(&info, NUMA_NO_NODE, 1, polarity);
+			ret = mp_map_gsi_to_irq(irq, IOAPIC_MAP_ALLOC, &info);
 			WARN_ON(ret < 0);
 		}
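
The mp_irq.irqflag = 5 used above packs MP-table interrupt attributes: polarity in bits 0-1 and trigger mode in bits 2-3, so 5 (0b0101) means active-high, edge-triggered. A standalone decoding sketch:

#include <stdio.h>

int main(void)
{
	unsigned int irqflag = 5;
	unsigned int polarity = irqflag & 0x3;		/* bits 0-1 */
	unsigned int trigger = (irqflag >> 2) & 0x3;	/* bits 2-3 */

	printf("polarity bits: %u (%s)\n", polarity,
	       polarity == 1 ? "active high" : "other");
	printf("trigger bits:  %u (%s)\n", trigger,
	       trigger == 1 ? "edge" : "other");
	return 0;
}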
 
 

+ 2 - 5
arch/x86/platform/sfi/sfi.c

@@ -25,8 +25,8 @@
 #include <linux/init.h>
 #include <linux/sfi.h>
 #include <linux/io.h>
-#include <linux/irqdomain.h>
 
+#include <asm/irqdomain.h>
 #include <asm/io_apic.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
@@ -71,9 +71,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
 #endif /* CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_IO_APIC
-static struct irq_domain_ops sfi_ioapic_irqdomain_ops = {
-	.map = mp_irqdomain_map,
-};
 
 static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 {
@@ -82,7 +79,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 	int i, num;
 	struct ioapic_domain_cfg cfg = {
 		.type = IOAPIC_DOMAIN_STRICT,
-		.ops = &sfi_ioapic_irqdomain_ops,
+		.ops = &mp_ioapic_irqdomain_ops,
 	};
 
 	sb = (struct sfi_table_simple *)table;

+ 111 - 187
arch/x86/platform/uv/uv_irq.c

@@ -13,22 +13,37 @@
 #include <linux/slab.h>
 #include <linux/irq.h>
 
+#include <asm/irqdomain.h>
 #include <asm/apic.h>
 #include <asm/uv/uv_irq.h>
 #include <asm/uv/uv_hub.h>
 
 /* MMR offset and pnode of hub sourcing interrupts for a given irq */
-struct uv_irq_2_mmr_pnode{
-	struct rb_node		list;
+struct uv_irq_2_mmr_pnode {
 	unsigned long		offset;
 	int			pnode;
-	int			irq;
 };
 
-static DEFINE_SPINLOCK(uv_irq_lock);
-static struct rb_root		uv_irq_root;
+static void uv_program_mmr(struct irq_cfg *cfg, struct uv_irq_2_mmr_pnode *info)
+{
+	unsigned long mmr_value;
+	struct uv_IO_APIC_route_entry *entry;
+
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
+		     sizeof(unsigned long));
+
+	mmr_value = 0;
+	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+	entry->vector		= cfg->vector;
+	entry->delivery_mode	= apic->irq_delivery_mode;
+	entry->dest_mode	= apic->irq_dest_mode;
+	entry->polarity		= 0;
+	entry->trigger		= 0;
+	entry->mask		= 0;
+	entry->dest		= cfg->dest_apicid;
 
-static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
+	uv_write_global_mmr64(info->pnode, info->offset, mmr_value);
+}
 
 static void uv_noop(struct irq_data *data) { }
 
 
@@ -37,6 +52,23 @@ static void uv_ack_apic(struct irq_data *data)
 	ack_APIC_irq();
 }
 
+static int
+uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
+		    bool force)
+{
+	struct irq_data *parent = data->parent_data;
+	struct irq_cfg *cfg = irqd_cfg(data);
+	int ret;
+
+	ret = parent->chip->irq_set_affinity(parent, mask, force);
+	if (ret >= 0) {
+		uv_program_mmr(cfg, data->chip_data);
+		send_cleanup_vector(cfg);
+	}
+
+	return ret;
+}
+
 static struct irq_chip uv_irq_chip = {
 	.name			= "UV-CORE",
 	.irq_mask		= uv_noop,
@@ -45,189 +77,99 @@ static struct irq_chip uv_irq_chip = {
 	.irq_set_affinity	= uv_set_irq_affinity,
 };
 
-/*
- * Add offset and pnode information of the hub sourcing interrupts to the
- * rb tree for a specific irq.
- */
-static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
+static int uv_domain_alloc(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs, void *arg)
 {
-	struct rb_node **link = &uv_irq_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct uv_irq_2_mmr_pnode *n;
-	struct uv_irq_2_mmr_pnode *e;
-	unsigned long irqflags;
-
-	n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
-				uv_blade_to_memory_nid(blade));
-	if (!n)
+	struct uv_irq_2_mmr_pnode *chip_data;
+	struct irq_alloc_info *info = arg;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+	int ret;
+
+	if (nr_irqs > 1 || !info || info->type != X86_IRQ_ALLOC_TYPE_UV)
+		return -EINVAL;
+
+	chip_data = kmalloc_node(sizeof(*chip_data), GFP_KERNEL,
+				 irq_data->node);
+	if (!chip_data)
 		return -ENOMEM;
 
-	n->irq = irq;
-	n->offset = offset;
-	n->pnode = uv_blade_to_pnode(blade);
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	/* Find the right place in the rbtree: */
-	while (*link) {
-		parent = *link;
-		e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
-
-		if (unlikely(irq == e->irq)) {
-			/* irq entry exists */
-			e->pnode = uv_blade_to_pnode(blade);
-			e->offset = offset;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			kfree(n);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			link = &(*link)->rb_left;
+	ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg);
+	if (ret >= 0) {
+		if (info->uv_limit == UV_AFFINITY_CPU)
+			irq_set_status_flags(virq, IRQ_NO_BALANCING);
 		else
-			link = &(*link)->rb_right;
+			irq_set_status_flags(virq, IRQ_MOVE_PCNTXT);
+
+		chip_data->pnode = uv_blade_to_pnode(info->uv_blade);
+		chip_data->offset = info->uv_offset;
+		irq_domain_set_info(domain, virq, virq, &uv_irq_chip, chip_data,
+				    handle_percpu_irq, NULL, info->uv_name);
+	} else {
+		kfree(chip_data);
 	}
 
-	/* Insert the node into the rbtree. */
-	rb_link_node(&n->list, parent, link);
-	rb_insert_color(&n->list, &uv_irq_root);
-
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return 0;
+	return ret;
 }
 
-/* Retrieve offset and pnode information from the rb tree for a specific irq */
-int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
+static void uv_domain_free(struct irq_domain *domain, unsigned int virq,
+			   unsigned int nr_irqs)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-
-		if (e->irq == irq) {
-			*offset = e->offset;
-			*pnode = e->pnode;
-			spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-			return 0;
-		}
-
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	return -1;
+	struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
+
+	BUG_ON(nr_irqs != 1);
+	kfree(irq_data->chip_data);
+	irq_clear_status_flags(virq, IRQ_MOVE_PCNTXT);
+	irq_clear_status_flags(virq, IRQ_NO_BALANCING);
+	irq_domain_free_irqs_top(domain, virq, nr_irqs);
 }
 
 /*
  * Re-target the irq to the specified CPU and enable the specified MMR located
  * on the specified blade to allow the sending of MSIs to the specified CPU.
  */
-static int
-arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
-		       unsigned long mmr_offset, int limit)
+static void uv_domain_activate(struct irq_domain *domain,
+			       struct irq_data *irq_data)
 {
-	const struct cpumask *eligible_cpu = cpumask_of(cpu);
-	struct irq_cfg *cfg = irq_cfg(irq);
-	unsigned long mmr_value;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode, err;
-	unsigned int dest;
-
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
-	err = assign_irq_vector(irq, cfg, eligible_cpu);
-	if (err != 0)
-		return err;
-
-	err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest);
-	if (err != 0)
-		return err;
-
-	if (limit == UV_AFFINITY_CPU)
-		irq_set_status_flags(irq, IRQ_NO_BALANCING);
-	else
-		irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
-
-	irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
-				      irq_name);
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
-
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
-
-	return irq;
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
 /*
  * Disable the specified MMR located on the specified blade so that MSIs are
  * longer allowed to be sent.
  */
-static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
+static void uv_domain_deactivate(struct irq_domain *domain,
+				 struct irq_data *irq_data)
 {
 	unsigned long mmr_value;
 	struct uv_IO_APIC_route_entry *entry;
 
-	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
-			sizeof(unsigned long));
-
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
 	entry->mask = 1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+	uv_program_mmr(irqd_cfg(irq_data), irq_data->chip_data);
 }
 
-static int
-uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
-		    bool force)
-{
-	struct irq_cfg *cfg = irqd_cfg(data);
-	unsigned int dest;
-	unsigned long mmr_value, mmr_offset;
-	struct uv_IO_APIC_route_entry *entry;
-	int mmr_pnode;
-
-	if (apic_set_affinity(data, mask, &dest))
-		return -1;
-
-	mmr_value = 0;
-	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-
-	entry->vector		= cfg->vector;
-	entry->delivery_mode	= apic->irq_delivery_mode;
-	entry->dest_mode	= apic->irq_dest_mode;
-	entry->polarity		= 0;
-	entry->trigger		= 0;
-	entry->mask		= 0;
-	entry->dest		= dest;
-
-	/* Get previously stored MMR and pnode of hub sourcing interrupts */
-	if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
-		return -1;
-
-	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+static const struct irq_domain_ops uv_domain_ops = {
+	.alloc		= uv_domain_alloc,
+	.free		= uv_domain_free,
+	.activate	= uv_domain_activate,
+	.deactivate	= uv_domain_deactivate,
+};
 
-	if (cfg->move_in_progress)
-		send_cleanup_vector(cfg);
+static struct irq_domain *uv_get_irq_domain(void)
+{
+	static struct irq_domain *uv_domain;
+	static DEFINE_MUTEX(uv_lock);
+
+	mutex_lock(&uv_lock);
+	if (uv_domain == NULL) {
+		uv_domain = irq_domain_add_tree(NULL, &uv_domain_ops, NULL);
+		if (uv_domain)
+			uv_domain->parent = x86_vector_domain;
+	}
+	mutex_unlock(&uv_lock);
 
-	return IRQ_SET_MASK_OK_NOCOPY;
+	return uv_domain;
 }
 
 /*
@@ -238,19 +180,21 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
 int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
 		 unsigned long mmr_offset, int limit)
 {
-	int ret, irq = irq_alloc_hwirq(uv_blade_to_memory_nid(mmr_blade));
+	struct irq_alloc_info info;
+	struct irq_domain *domain = uv_get_irq_domain();
 
-	if (!irq)
-		return -EBUSY;
+	if (!domain)
+		return -ENOMEM;
 
-	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
-		limit);
-	if (ret == irq)
-		uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
-	else
-		irq_free_hwirq(irq);
+	init_irq_alloc_info(&info, cpumask_of(cpu));
+	info.type = X86_IRQ_ALLOC_TYPE_UV;
+	info.uv_limit = limit;
+	info.uv_blade = mmr_blade;
+	info.uv_offset = mmr_offset;
+	info.uv_name = irq_name;
 
-	return ret;
+	return irq_domain_alloc_irqs(domain, 1,
+				     uv_blade_to_memory_nid(mmr_blade), &info);
 }
 EXPORT_SYMBOL_GPL(uv_setup_irq);
 
 
@@ -263,26 +207,6 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
 */
 void uv_teardown_irq(unsigned int irq)
 {
-	struct uv_irq_2_mmr_pnode *e;
-	struct rb_node *n;
-	unsigned long irqflags;
-
-	spin_lock_irqsave(&uv_irq_lock, irqflags);
-	n = uv_irq_root.rb_node;
-	while (n) {
-		e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
-		if (e->irq == irq) {
-			arch_disable_uv_irq(e->pnode, e->offset);
-			rb_erase(n, &uv_irq_root);
-			kfree(e);
-			break;
-		}
-		if (irq < e->irq)
-			n = n->rb_left;
-		else
-			n = n->rb_right;
-	}
-	spin_unlock_irqrestore(&uv_irq_lock, irqflags);
-	irq_free_hwirq(irq);
+	irq_domain_free_irqs(irq, 1);
 }
 EXPORT_SYMBOL_GPL(uv_teardown_irq);
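
uv_get_irq_domain() above creates its irq domain lazily on first use, under a mutex, and every later caller gets the same instance. A toy user-space model of that create-once-under-a-lock pattern; the struct, names, and pthread locking are stand-ins, not the kernel API:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct irq_domain { const char *name; };

/* Create the singleton on first call, under a lock so callers cannot race. */
static struct irq_domain *uv_get_irq_domain(void)
{
	static struct irq_domain *uv_domain;
	static pthread_mutex_t uv_lock = PTHREAD_MUTEX_INITIALIZER;

	pthread_mutex_lock(&uv_lock);
	if (uv_domain == NULL) {
		uv_domain = malloc(sizeof(*uv_domain));
		if (uv_domain)
			uv_domain->name = "UV-CORE";
	}
	pthread_mutex_unlock(&uv_lock);

	return uv_domain;
}

int main(void)
{
	printf("first  call: %p\n", (void *)uv_get_irq_domain());
	printf("second call: %p\n", (void *)uv_get_irq_domain());
	return 0;
}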

+ 4 - 4
arch/x86/power/hibernate_asm_64.S

@@ -78,9 +78,9 @@ ENTRY(restore_image)
 
 	/* code below has been relocated to a safe page */
 ENTRY(core_restore_code)
-loop:
+.Lloop:
 	testq	%rdx, %rdx
-	jz	done
+	jz	.Ldone
 
 	/* get addresses from the pbe and copy the page */
 	movq	pbe_address(%rdx), %rsi
@@ -91,8 +91,8 @@ loop:
 
 	/* progress to the next pbe */
 	movq	pbe_next(%rdx), %rdx
-	jmp	loop
-done:
+	jmp	.Lloop
+.Ldone:
 	/* jump to the restore_registers address from the image header */
 	jmpq	*%rax
 	/*

+ 2 - 1
arch/x86/xen/enlighten.c

@@ -1181,10 +1181,11 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_tscp = native_read_tscp,
 
 	.iret = xen_iret,
-	.irq_enable_sysexit = xen_sysexit,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
 	.usergs_sysret64 = xen_sysret64,
+#else
+	.irq_enable_sysexit = xen_sysexit,
 #endif
 
 	.load_tr_desc = paravirt_nop,

+ 4 - 18
arch/x86/xen/xen-asm_64.S

@@ -15,6 +15,8 @@
 #include <asm/percpu.h>
 #include <asm/processor-flags.h>
 #include <asm/segment.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
 
 #include <xen/interface/xen.h>
 
@@ -47,29 +49,13 @@ ENTRY(xen_iret)
 ENDPATCH(xen_iret)
 RELOC(xen_iret, 1b+1)
 
-/*
- * sysexit is not used for 64-bit processes, so it's only ever used to
- * return to 32-bit compat userspace.
- */
-ENTRY(xen_sysexit)
-	pushq $__USER32_DS
-	pushq %rcx
-	pushq $X86_EFLAGS_IF
-	pushq $__USER32_CS
-	pushq %rdx
-
-	pushq $0
-1:	jmp hypercall_iret
-ENDPATCH(xen_sysexit)
-RELOC(xen_sysexit, 1b+1)
-
 ENTRY(xen_sysret64)
 	/*
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER_DS
 	pushq PER_CPU_VAR(rsp_scratch)
@@ -88,7 +74,7 @@ ENTRY(xen_sysret32)
 	 * still with the kernel gs, so we can easily switch back
 	 */
 	movq %rsp, PER_CPU_VAR(rsp_scratch)
-	movq PER_CPU_VAR(kernel_stack), %rsp
+	movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	pushq $__USER32_DS
 	pushq PER_CPU_VAR(rsp_scratch)

+ 2 - 0
arch/x86/xen/xen-ops.h

@@ -134,7 +134,9 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
 /* These are not functions, and cannot be called normally */
 __visible void xen_iret(void);
+#ifdef CONFIG_X86_32
 __visible void xen_sysexit(void);
+#endif
 __visible void xen_sysret32(void);
 __visible void xen_sysret64(void);
 __visible void xen_adjust_exception_frame(void);

+ 1 - 13
drivers/gpu/drm/drm_ioctl.c

@@ -36,9 +36,6 @@
 
 #include <linux/pci.h>
 #include <linux/export.h>
-#ifdef CONFIG_X86
-#include <asm/mtrr.h>
-#endif
 
 static int drm_version(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
@@ -197,16 +194,7 @@ static int drm_getmap(struct drm_device *dev, void *data,
 	map->type = r_list->map->type;
 	map->flags = r_list->map->flags;
 	map->handle = (void *)(unsigned long) r_list->user_token;
-
-#ifdef CONFIG_X86
-	/*
-	 * There appears to be exactly one user of the mtrr index: dritest.
-	 * It's easy enough to keep it working on non-PAT systems.
-	 */
-	map->mtrr = phys_wc_to_mtrr_index(r_list->map->mtrr);
-#else
-	map->mtrr = -1;
-#endif
+	map->mtrr = arch_phys_wc_index(r_list->map->mtrr);
 
 	mutex_unlock(&dev->struct_mutex);
 
 

Some files were not shown because too many files changed in this diff