
Merge tag 'for-linus-4.4-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from David Vrabel:

 - Improve balloon driver memory hotplug placement.

 - Use unpopulated hotplugged memory for foreign pages (if
   supported/enabled).

 - Support 64 KiB guest pages on arm64.

 - CPU hotplug support on arm/arm64.

* tag 'for-linus-4.4-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (44 commits)
  xen: fix the check of e_pfn in xen_find_pfn_range
  x86/xen: add reschedule point when mapping foreign GFNs
  xen/arm: don't try to re-register vcpu_info on cpu_hotplug.
  xen, cpu_hotplug: call device_offline instead of cpu_down
  xen/arm: Enable cpu_hotplug.c
  xenbus: Support multiple grants ring with 64KB
  xen/grant-table: Add an helper to iterate over a specific number of grants
  xen/xenbus: Rename *RING_PAGE* to *RING_GRANT*
  xen/arm: correct comment in enlighten.c
  xen/gntdev: use types from linux/types.h in userspace headers
  xen/gntalloc: use types from linux/types.h in userspace headers
  xen/balloon: Use the correct sizeof when declaring frame_list
  xen/swiotlb: Add support for 64KB page granularity
  xen/swiotlb: Pass addresses rather than frame numbers to xen_arch_need_swiotlb
  arm/xen: Add support for 64KB page granularity
  xen/privcmd: Add support for Linux 64KB page granularity
  net/xen-netback: Make it running on 64KB page granularity
  net/xen-netfront: Make it running on 64KB page granularity
  block/xen-blkback: Make it running on 64KB page granularity
  block/xen-blkfront: Make it running on 64KB page granularity
  ...
Linus Torvalds, 10 years ago
parent
commit
41ecf1404b
41 files changed, 1365 insertions(+), 647 deletions(-)
  1. arch/arm/include/asm/xen/hypervisor.h (+10 -0)
  2. arch/arm/include/asm/xen/page-coherent.h (+17 -9)
  3. arch/arm/include/asm/xen/page.h (+15 -7)
  4. arch/arm/xen/enlighten.c (+16 -4)
  5. arch/arm/xen/mm.c (+29 -10)
  6. arch/arm/xen/p2m.c (+3 -3)
  7. arch/x86/include/asm/xen/hypervisor.h (+5 -0)
  8. arch/x86/include/asm/xen/page.h (+5 -3)
  9. arch/x86/xen/enlighten.c (+15 -0)
  10. arch/x86/xen/grant-table.c (+1 -1)
  11. arch/x86/xen/mmu.c (+1 -0)
  12. arch/x86/xen/p2m.c (+13 -6)
  13. arch/x86/xen/setup.c (+6 -3)
  14. drivers/block/xen-blkback/blkback.c (+7 -6)
  15. drivers/block/xen-blkback/common.h (+13 -4)
  16. drivers/block/xen-blkback/xenbus.c (+7 -4)
  17. drivers/block/xen-blkfront.c (+360 -200)
  18. drivers/net/xen-netback/common.h (+11 -5)
  19. drivers/net/xen-netback/netback.c (+105 -62)
  20. drivers/net/xen-netfront.c (+86 -36)
  21. drivers/tty/hvc/hvc_xen.c (+2 -2)
  22. drivers/xen/Makefile (+0 -2)
  23. drivers/xen/balloon.c (+224 -117)
  24. drivers/xen/biomerge.c (+8 -0)
  25. drivers/xen/cpu_hotplug.c (+11 -3)
  26. drivers/xen/events/events_base.c (+1 -1)
  27. drivers/xen/events/events_fifo.c (+1 -1)
  28. drivers/xen/grant-table.c (+52 -4)
  29. drivers/xen/privcmd.c (+6 -4)
  30. drivers/xen/swiotlb-xen.c (+21 -22)
  31. drivers/xen/xenbus/xenbus_client.c (+87 -41)
  32. drivers/xen/xenbus/xenbus_probe.c (+2 -1)
  33. drivers/xen/xlate_mmu.c (+84 -40)
  34. include/linux/memory_hotplug.h (+2 -0)
  35. include/uapi/xen/gntalloc.h (+12 -10)
  36. include/uapi/xen/gntdev.h (+18 -16)
  37. include/xen/balloon.h (+3 -9)
  38. include/xen/grant_table.h (+57 -0)
  39. include/xen/page.h (+26 -1)
  40. include/xen/xenbus.h (+2 -2)
  41. mm/memory_hotplug.c (+21 -8)
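
The common thread in the diffs below is the split between the guest's page size (PAGE_SIZE, 64KB on an arm64 kernel built with 64KB pages) and the Xen grant size (XEN_PAGE_SIZE, always 4KB). As a minimal sketch of the conversions the series leans on, mirroring rather than quoting what include/xen/page.h provides once these patches land:

#define XEN_PAGE_SHIFT        12
#define XEN_PAGE_SIZE         (1UL << XEN_PAGE_SHIFT)    /* always 4KB */
#define XEN_PAGE_MASK         (~(XEN_PAGE_SIZE - 1))
#define XEN_PFN_DOWN(x)       ((x) >> XEN_PAGE_SHIFT)
#define xen_offset_in_page(p) ((unsigned long)(p) & ~XEN_PAGE_MASK)

/*
 * Number of 4KB Xen pages covered by one Linux page: 1 on a 4KB
 * kernel, 16 on a 64KB arm64 kernel.
 */
#define XEN_PFN_PER_PAGE      (PAGE_SIZE / XEN_PAGE_SIZE)

/* First 4KB frame of a Linux page; its other frames follow it. */
#define page_to_xen_pfn(page) \
	((page_to_pfn(page) << PAGE_SHIFT) >> XEN_PAGE_SHIFT)

Every PAGE_SIZE/PAGE_SHIFT that faced the hypervisor ABI is rewritten below in terms of these 4KB-granular definitions.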

+ 10 - 0
arch/arm/include/asm/xen/hypervisor.h

@@ -26,4 +26,14 @@ void __init xen_early_init(void);
 static inline void xen_early_init(void) { return; }
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+static inline void xen_arch_register_cpu(int num)
+{
+}
+
+static inline void xen_arch_unregister_cpu(int num)
+{
+}
+#endif
+
 #endif /* _ASM_ARM_XEN_HYPERVISOR_H */

+ 17 - 9
arch/arm/include/asm/xen/page-coherent.h

@@ -35,11 +35,15 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page,
 	     dma_addr_t dev_addr, unsigned long offset, size_t size,
 	     enum dma_data_direction dir, struct dma_attrs *attrs)
 {
-	bool local = PFN_DOWN(dev_addr) == page_to_pfn(page);
-	/* Dom0 is mapped 1:1, so if pfn == mfn the page is local otherwise
-	 * is a foreign page grant-mapped in dom0. If the page is local we
-	 * can safely call the native dma_ops function, otherwise we call
-	 * the xen specific function. */
+	bool local = XEN_PFN_DOWN(dev_addr) == page_to_xen_pfn(page);
+	/*
+	 * Dom0 is mapped 1:1. While a Linux page can span multiple Xen
+	 * pages, it's not possible to have a mix of local and foreign Xen
+	 * pages. So if the first xen_pfn == mfn the page is local, otherwise
+	 * it's a foreign page grant-mapped in dom0. If the page is local we
+	 * can safely call the native dma_ops function, otherwise we call the
+	 * xen specific function.
+	 */
 	if (local)
 		__generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs);
 	else
@@ -51,10 +55,14 @@ static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle,
 		struct dma_attrs *attrs)
 {
 	unsigned long pfn = PFN_DOWN(handle);
-	/* Dom0 is mapped 1:1, so calling pfn_valid on a foreign mfn will
-	 * always return false. If the page is local we can safely call the
-	 * native dma_ops function, otherwise we call the xen specific
-	 * function. */
+	/*
+	 * Dom0 is mapped 1:1. While a Linux page can span multiple Xen
+	 * pages, it's not possible to have a mix of local and foreign Xen
+	 * pages, and calling pfn_valid on a foreign mfn will always return
+	 * false. If the page is local we can safely call the native dma_ops
+	 * function, otherwise we call the xen specific function.
+	 */
 	if (pfn_valid(pfn)) {
 		if (__generic_dma_ops(hwdev)->unmap_page)
 			__generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs);

+ 15 - 7
arch/arm/include/asm/xen/page.h

@@ -13,9 +13,6 @@
 
 #define phys_to_machine_mapping_valid(pfn) (1)
 
-#define pte_mfn	    pte_pfn
-#define mfn_pte	    pfn_pte
-
 /* Xen machine address */
 typedef struct xmaddr {
 	phys_addr_t maddr;
@@ -31,6 +28,17 @@ typedef struct xpaddr {
 
 #define INVALID_P2M_ENTRY      (~0UL)
 
+/*
+ * The pseudo-physical frame (pfn) used in all the helpers is always based
+ * on Xen page granularity (i.e. 4KB).
+ *
+ * A Linux page may be split across multiple non-contiguous Xen pages, so we
+ * have to keep track of frames at 4KB page granularity.
+ *
+ * PV drivers should never use these helpers directly (particularly
+ * pfn_to_gfn and gfn_to_pfn).
+ */
+
 unsigned long __pfn_to_mfn(unsigned long pfn);
 extern struct rb_root phys_to_mach;
 
@@ -67,8 +75,8 @@ static inline unsigned long bfn_to_pfn(unsigned long bfn)
 #define bfn_to_local_pfn(bfn)	bfn_to_pfn(bfn)
 
 /* VIRT <-> GUEST conversion */
-#define virt_to_gfn(v)		(pfn_to_gfn(virt_to_pfn(v)))
-#define gfn_to_virt(m)		(__va(gfn_to_pfn(m) << PAGE_SHIFT))
+#define virt_to_gfn(v)		(pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT))
+#define gfn_to_virt(m)		(__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT))
 
 /* Only used in PV code. But ARM guests are always HVM. */
 static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr)
@@ -107,8 +115,8 @@ static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 #define xen_unmap(cookie) iounmap((cookie))
 
 bool xen_arch_need_swiotlb(struct device *dev,
-			   unsigned long pfn,
-			   unsigned long bfn);
+			   phys_addr_t phys,
+			   dma_addr_t dev_addr);
 unsigned long xen_get_swiotlb_free_pages(unsigned int order);
 
 #endif /* _ASM_ARM_XEN_PAGE_H */
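
A worked example of why virt_to_gfn now shifts by XEN_PAGE_SHIFT instead of reusing virt_to_pfn; the numbers are illustrative only and assume a 64KB-page guest:

/*
 * Assume virt_to_phys(v) == 0x80010000, with PAGE_SHIFT == 16 and
 * XEN_PAGE_SHIFT == 12:
 *
 *   old: virt_to_pfn(v)                    == 0x80010000 >> 16 == 0x8001
 *   new: virt_to_phys(v) >> XEN_PAGE_SHIFT == 0x80010000 >> 12 == 0x80010
 *
 * The hypervisor ABI always speaks in 4KB frames, so the new form is
 * what hypercalls such as VCPUOP_register_vcpu_info and
 * XENMAPSPACE_shared_info (see the enlighten.c hunks below) expect;
 * the old one handed Xen a 64KB-granular frame number.
 */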

+ 16 - 4
arch/arm/xen/enlighten.c

@@ -86,16 +86,25 @@ static void xen_percpu_init(void)
 	int err;
 	int cpu = get_cpu();
 
+	/* 
+	 * VCPUOP_register_vcpu_info cannot be called twice for the same
+	 * vcpu, so if vcpu_info is already registered, just get out. This
+	 * can happen with cpu-hotplug.
+	 */
+	if (per_cpu(xen_vcpu, cpu) != NULL)
+		goto after_register_vcpu_info;
+
 	pr_info("Xen: initializing cpu%d\n", cpu);
 	vcpup = per_cpu_ptr(xen_vcpu_info, cpu);
 
-	info.mfn = __pa(vcpup) >> PAGE_SHIFT;
-	info.offset = offset_in_page(vcpup);
+	info.mfn = virt_to_gfn(vcpup);
+	info.offset = xen_offset_in_page(vcpup);
 
 	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);
 	BUG_ON(err);
 	per_cpu(xen_vcpu, cpu) = vcpup;
 
+after_register_vcpu_info:
 	enable_percpu_irq(xen_events_irq, 0);
 	put_cpu();
 }
@@ -124,6 +133,9 @@ static int xen_cpu_notification(struct notifier_block *self,
 	case CPU_STARTING:
 		xen_percpu_init();
 		break;
+	case CPU_DYING:
+		disable_percpu_irq(xen_events_irq);
+		break;
 	default:
 		break;
 	}
@@ -213,7 +225,7 @@ static int __init xen_guest_init(void)
 	xatp.domid = DOMID_SELF;
 	xatp.idx = 0;
 	xatp.space = XENMAPSPACE_shared_info;
-	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+	xatp.gpfn = virt_to_gfn(shared_info_page);
 	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
 		BUG();
 
@@ -284,7 +296,7 @@ void xen_arch_resume(void) { }
 void xen_arch_suspend(void) { }
 
 
-/* In the hypervisor.S file. */
+/* In the hypercall.S file. */
 EXPORT_SYMBOL_GPL(HYPERVISOR_event_channel_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_grant_table_op);
 EXPORT_SYMBOL_GPL(HYPERVISOR_xen_version);

+ 29 - 10
arch/arm/xen/mm.c

@@ -48,22 +48,22 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset,
 	size_t size, enum dma_data_direction dir, enum dma_cache_op op)
 {
 	struct gnttab_cache_flush cflush;
-	unsigned long pfn;
+	unsigned long xen_pfn;
 	size_t left = size;
 
-	pfn = (handle >> PAGE_SHIFT) + offset / PAGE_SIZE;
-	offset %= PAGE_SIZE;
+	xen_pfn = (handle >> XEN_PAGE_SHIFT) + offset / XEN_PAGE_SIZE;
+	offset %= XEN_PAGE_SIZE;
 
 	do {
 		size_t len = left;
 	
 		/* buffers in highmem or foreign pages cannot cross page
 		 * boundaries */
-		if (len + offset > PAGE_SIZE)
-			len = PAGE_SIZE - offset;
+		if (len + offset > XEN_PAGE_SIZE)
+			len = XEN_PAGE_SIZE - offset;
 
 		cflush.op = 0;
-		cflush.a.dev_bus_addr = pfn << PAGE_SHIFT;
+		cflush.a.dev_bus_addr = xen_pfn << XEN_PAGE_SHIFT;
 		cflush.offset = offset;
 		cflush.length = len;
 
@@ -79,7 +79,7 @@ static void dma_cache_maint(dma_addr_t handle, unsigned long offset,
 			HYPERVISOR_grant_table_op(GNTTABOP_cache_flush, &cflush, 1);
 
 		offset = 0;
-		pfn++;
+		xen_pfn++;
 		left -= len;
 	} while (left);
 }
@@ -138,10 +138,29 @@ void __xen_dma_sync_single_for_device(struct device *hwdev,
 }
 
 bool xen_arch_need_swiotlb(struct device *dev,
-			   unsigned long pfn,
-			   unsigned long bfn)
+			   phys_addr_t phys,
+			   dma_addr_t dev_addr)
 {
-	return (!hypercall_cflush && (pfn != bfn) && !is_device_dma_coherent(dev));
+	unsigned int xen_pfn = XEN_PFN_DOWN(phys);
+	unsigned int bfn = XEN_PFN_DOWN(dev_addr);
+
+	/*
+	 * The swiotlb buffer should be used if
+	 *	- Xen doesn't have the cache flush hypercall
+	 *	- The Linux page refers to foreign memory
+	 *	- The device doesn't support coherent DMA requests
+	 *
+	 * A Linux page may span multiple Xen pages, although it's not
+	 * possible to have a mix of local and foreign Xen pages.
+	 * Furthermore, range_straddles_page_boundary already checks
+	 * whether the buffer is physically contiguous in host RAM.
+	 *
+	 * Therefore we only need to check the first Xen page to know if we
+	 * require a bounce buffer because the device doesn't support coherent
+	 * memory and we are not able to flush the cache.
+	 */
+	return (!hypercall_cflush && (xen_pfn != bfn) &&
+		!is_device_dma_coherent(dev));
 }
 
 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,

+ 3 - 3
arch/arm/xen/p2m.c

@@ -93,8 +93,8 @@ int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
 	for (i = 0; i < count; i++) {
 		if (map_ops[i].status)
 			continue;
-		set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT,
-				    map_ops[i].dev_bus_addr >> PAGE_SHIFT);
+		set_phys_to_machine(map_ops[i].host_addr >> XEN_PAGE_SHIFT,
+				    map_ops[i].dev_bus_addr >> XEN_PAGE_SHIFT);
 	}
 
 	return 0;
@@ -108,7 +108,7 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
 	int i;
 
 	for (i = 0; i < count; i++) {
-		set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT,
+		set_phys_to_machine(unmap_ops[i].host_addr >> XEN_PAGE_SHIFT,
 				    INVALID_P2M_ENTRY);
 	}
 

+ 5 - 0
arch/x86/include/asm/xen/hypervisor.h

@@ -57,4 +57,9 @@ static inline bool xen_x2apic_para_available(void)
 }
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num);
+void xen_arch_unregister_cpu(int num);
+#endif
+
 #endif /* _ASM_X86_XEN_HYPERVISOR_H */

+ 5 - 3
arch/x86/include/asm/xen/page.h

@@ -12,7 +12,7 @@
 #include <asm/pgtable.h>
 
 #include <xen/interface/xen.h>
-#include <xen/grant_table.h>
+#include <xen/interface/grant_table.h>
 #include <xen/features.h>
 
 /* Xen machine address */
@@ -43,6 +43,8 @@ extern unsigned long *xen_p2m_addr;
 extern unsigned long  xen_p2m_size;
 extern unsigned long  xen_max_p2m_pfn;
 
+extern int xen_alloc_p2m_entry(unsigned long pfn);
+
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -296,8 +298,8 @@ void make_lowmem_page_readwrite(void *vaddr);
 #define xen_unmap(cookie) iounmap((cookie))
 
 static inline bool xen_arch_need_swiotlb(struct device *dev,
-					 unsigned long pfn,
-					 unsigned long bfn)
+					 phys_addr_t phys,
+					 dma_addr_t dev_addr)
 {
 	return false;
 }

+ 15 - 0
arch/x86/xen/enlighten.c

@@ -75,6 +75,7 @@
 #include <asm/mwait.h>
 #include <asm/pci_x86.h>
 #include <asm/pat.h>
+#include <asm/cpu.h>
 
 #ifdef CONFIG_ACPI
 #include <linux/acpi.h>
@@ -1899,3 +1900,17 @@ const struct hypervisor_x86 x86_hyper_xen = {
 	.set_cpu_features       = xen_set_cpu_features,
 };
 EXPORT_SYMBOL(x86_hyper_xen);
+
+#ifdef CONFIG_HOTPLUG_CPU
+void xen_arch_register_cpu(int num)
+{
+	arch_register_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_register_cpu);
+
+void xen_arch_unregister_cpu(int num)
+{
+	arch_unregister_cpu(num);
+}
+EXPORT_SYMBOL(xen_arch_unregister_cpu);
+#endif

+ 1 - 1
arch/x86/xen/grant-table.c

@@ -133,7 +133,7 @@ static int __init xlated_setup_gnttab_pages(void)
 		kfree(pages);
 		return -ENOMEM;
 	}
-	rc = alloc_xenballooned_pages(nr_grant_frames, pages, 0 /* lowmem */);
+	rc = alloc_xenballooned_pages(nr_grant_frames, pages);
 	if (rc) {
 		pr_warn("%s Couldn't balloon alloc %ld pfns rc:%d\n", __func__,
 			nr_grant_frames, rc);

+ 1 - 0
arch/x86/xen/mmu.c

@@ -2888,6 +2888,7 @@ static int do_remap_gfn(struct vm_area_struct *vma,
 		addr += range;
 		if (err_ptr)
 			err_ptr += batch;
+		cond_resched();
 	}
 out:
 

+ 13 - 6
arch/x86/xen/p2m.c

@@ -530,7 +530,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
  * the new pages are installed with cmpxchg; if we lose the race then
  * simply free the page we allocated and use the one that's there.
  */
-static bool alloc_p2m(unsigned long pfn)
+int xen_alloc_p2m_entry(unsigned long pfn)
 {
 	unsigned topidx;
 	unsigned long *top_mfn_p, *mid_mfn;
@@ -540,6 +540,9 @@ static bool alloc_p2m(unsigned long pfn)
 	unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
 	unsigned long p2m_pfn;
 
+	if (xen_feature(XENFEAT_auto_translated_physmap))
+		return 0;
+
 	ptep = lookup_address(addr, &level);
 	BUG_ON(!ptep || level != PG_LEVEL_4K);
 	pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
@@ -548,7 +551,7 @@ static bool alloc_p2m(unsigned long pfn)
 		/* PMD level is missing, allocate a new one */
 		ptep = alloc_p2m_pmd(addr, pte_pg);
 		if (!ptep)
-			return false;
+			return -ENOMEM;
 	}
 
 	if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
@@ -566,7 +569,7 @@ static bool alloc_p2m(unsigned long pfn)
 
 			mid_mfn = alloc_p2m_page();
 			if (!mid_mfn)
-				return false;
+				return -ENOMEM;
 
 			p2m_mid_mfn_init(mid_mfn, p2m_missing);
 
@@ -592,7 +595,7 @@ static bool alloc_p2m(unsigned long pfn)
 
 		p2m = alloc_p2m_page();
 		if (!p2m)
-			return false;
+			return -ENOMEM;
 
 		if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
 			p2m_init(p2m);
@@ -625,8 +628,9 @@ static bool alloc_p2m(unsigned long pfn)
 		HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
 	}
 
-	return true;
+	return 0;
 }
+EXPORT_SYMBOL(xen_alloc_p2m_entry);
 
 unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 				      unsigned long pfn_e)
@@ -688,7 +692,10 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 {
 	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
-		if (!alloc_p2m(pfn))
+		int ret;
+
+		ret = xen_alloc_p2m_entry(pfn);
+		if (ret < 0)
 			return false;
 
 		return __set_phys_to_machine(pfn, mfn);
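
xen_alloc_p2m_entry() is exported so code outside p2m.c can make sure the intermediate p2m pages exist for a pfn before populating it, and it now reports failure as an errno rather than a bool. A hypothetical caller sketch; prepare_pfn_range() is an invented name for illustration:

static int prepare_pfn_range(unsigned long first_pfn, unsigned long count)
{
	unsigned long pfn;
	int ret;

	for (pfn = first_pfn; pfn < first_pfn + count; pfn++) {
		/* Allocates any missing p2m pages; returns 0 or -errno. */
		ret = xen_alloc_p2m_entry(pfn);
		if (ret < 0)
			return ret;
	}
	return 0;
}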

+ 6 - 3
arch/x86/xen/setup.c

@@ -212,7 +212,7 @@ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
 		e_pfn = PFN_DOWN(entry->addr + entry->size);
 
 		/* We only care about E820 after this */
-		if (e_pfn < *min_pfn)
+		if (e_pfn <= *min_pfn)
 			continue;
 
 		s_pfn = PFN_UP(entry->addr);
@@ -829,6 +829,8 @@ char * __init xen_memory_setup(void)
 	addr = xen_e820_map[0].addr;
 	size = xen_e820_map[0].size;
 	while (i < xen_e820_map_entries) {
+		bool discard = false;
+
 		chunk_size = size;
 		type = xen_e820_map[i].type;
 
@@ -843,10 +845,11 @@ char * __init xen_memory_setup(void)
 				xen_add_extra_mem(pfn_s, n_pfns);
 				xen_max_p2m_pfn = pfn_s + n_pfns;
 			} else
-				type = E820_UNUSABLE;
+				discard = true;
 		}
 
-		xen_align_and_add_e820_region(addr, chunk_size, type);
+		if (!discard)
+			xen_align_and_add_e820_region(addr, chunk_size, type);
 
 		addr += chunk_size;
 		size -= chunk_size;

+ 7 - 6
drivers/block/xen-blkback/blkback.c

@@ -87,7 +87,7 @@ MODULE_PARM_DESC(max_persistent_grants,
  * Maximum order of pages to be used for the shared ring between front and
  * backend, 4KB page granularity is used.
  */
-unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
 module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 /*
@@ -961,7 +961,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req,
 		seg[n].nsec = segments[i].last_sect -
 			segments[i].first_sect + 1;
 		seg[n].offset = (segments[i].first_sect << 9);
-		if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
+		if ((segments[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
 		    (segments[i].last_sect < segments[i].first_sect)) {
 			rc = -EINVAL;
 			goto unmap;
@@ -1210,6 +1210,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 
 	req_operation = req->operation == BLKIF_OP_INDIRECT ?
 			req->u.indirect.indirect_op : req->operation;
+
 	if ((req->operation == BLKIF_OP_INDIRECT) &&
 	    (req_operation != BLKIF_OP_READ) &&
 	    (req_operation != BLKIF_OP_WRITE)) {
@@ -1268,7 +1269,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 			seg[i].nsec = req->u.rw.seg[i].last_sect -
 				req->u.rw.seg[i].first_sect + 1;
 			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
-			if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
 			    (req->u.rw.seg[i].last_sect <
 			     req->u.rw.seg[i].first_sect))
 				goto fail_response;
@@ -1445,10 +1446,10 @@ static int __init xen_blkif_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
 		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
-			xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
-		xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
+		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
 	}
 
 	rc = xen_blkif_interface_init();

+ 13 - 4
drivers/block/xen-blkback/common.h

@@ -39,6 +39,7 @@
 #include <asm/pgalloc.h>
 #include <asm/hypervisor.h>
 #include <xen/grant_table.h>
+#include <xen/page.h>
 #include <xen/xenbus.h>
 #include <xen/interface/io/ring.h>
 #include <xen/interface/io/blkif.h>
@@ -51,12 +52,20 @@ extern unsigned int xen_blkif_max_ring_order;
  */
 #define MAX_INDIRECT_SEGMENTS 256
 
-#define SEGS_PER_INDIRECT_FRAME \
-	(PAGE_SIZE/sizeof(struct blkif_request_segment))
+/*
+ * Xen uses 4KB pages. The guest may use a different page size (4KB or
+ * 64KB). Number of Xen pages per segment:
+ */
+#define XEN_PAGES_PER_SEGMENT   (PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define XEN_PAGES_PER_INDIRECT_FRAME \
+	(XEN_PAGE_SIZE/sizeof(struct blkif_request_segment))
+#define SEGS_PER_INDIRECT_FRAME	\
+	(XEN_PAGES_PER_INDIRECT_FRAME / XEN_PAGES_PER_SEGMENT)
+
 #define MAX_INDIRECT_PAGES \
 	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
-#define INDIRECT_PAGES(_segs) \
-	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) DIV_ROUND_UP(_segs, XEN_PAGES_PER_INDIRECT_FRAME)
 
 /* Not a real protocol.  Used to generate ring structs which contain
  * the elements common to all protocols only.  This way we get a
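
For concreteness, here is how the new constants work out, assuming sizeof(struct blkif_request_segment) == 8 (a 4-byte gref plus two sector bytes and padding):

/*
 * 4KB guest (PAGE_SIZE == 4096):
 *   XEN_PAGES_PER_SEGMENT        == 4096 / 4096  == 1
 *   XEN_PAGES_PER_INDIRECT_FRAME == 4096 / 8     == 512
 *   SEGS_PER_INDIRECT_FRAME      == 512 / 1      == 512  (as before)
 *
 * 64KB guest (PAGE_SIZE == 65536):
 *   XEN_PAGES_PER_SEGMENT        == 65536 / 4096 == 16
 *   XEN_PAGES_PER_INDIRECT_FRAME == 4096 / 8     == 512
 *   SEGS_PER_INDIRECT_FRAME      == 512 / 16     == 32
 *
 * An indirect frame still describes 512 grants either way, but those
 * grants only cover 32 Linux-page-sized segments on a 64KB guest.
 */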

+ 7 - 4
drivers/block/xen-blkback/xenbus.c

@@ -176,21 +176,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
 	{
 		struct blkif_sring *sring;
 		sring = (struct blkif_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
+		BACK_RING_INIT(&blkif->blk_rings.native, sring,
+			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_32:
 	{
 		struct blkif_x86_32_sring *sring_x86_32;
 		sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
+		BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
 	case BLKIF_PROTOCOL_X86_64:
 	{
 		struct blkif_x86_64_sring *sring_x86_64;
 		sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
+		BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+			       XEN_PAGE_SIZE * nr_grefs);
 		break;
 	}
 	default:
@@ -826,7 +829,7 @@ again:
 static int connect_ring(struct backend_info *be)
 {
 	struct xenbus_device *dev = be->dev;
-	unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
+	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
 	unsigned int evtchn, nr_grefs, ring_page_order;
 	unsigned int pers_grants;
 	char protocol[64] = "";

+ 360 - 200
drivers/block/xen-blkfront.c

@@ -68,7 +68,7 @@ enum blkif_state {
 
 struct grant {
 	grant_ref_t gref;
-	unsigned long pfn;
+	struct page *page;
 	struct list_head node;
 };
 
@@ -78,6 +78,7 @@ struct blk_shadow {
 	struct grant **grants_used;
 	struct grant **indirect_grants;
 	struct scatterlist *sg;
+	unsigned int num_sg;
 };
 
 struct split_bio {
@@ -106,8 +107,12 @@ static unsigned int xen_blkif_max_ring_order;
 module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
 MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
 
-#define BLK_RING_SIZE(info) __CONST_RING_SIZE(blkif, PAGE_SIZE * (info)->nr_ring_pages)
-#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
+#define BLK_RING_SIZE(info)	\
+	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
+
+#define BLK_MAX_RING_SIZE	\
+	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
+
 /*
  * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
  * characters are enough. Define to 20 to keep consist with backend.
@@ -128,7 +133,7 @@ struct blkfront_info
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
-	int ring_ref[XENBUS_MAX_RING_PAGES];
+	int ring_ref[XENBUS_MAX_RING_GRANTS];
 	unsigned int nr_ring_pages;
 	struct blkif_front_ring ring;
 	unsigned int evtchn, irq;
@@ -146,6 +151,7 @@ struct blkfront_info
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int feature_persistent:1;
+	/* Number of 4KB segments handled */
 	unsigned int max_indirect_segments;
 	int is_ready;
 	struct blk_mq_tag_set tag_set;
@@ -174,10 +180,23 @@ static DEFINE_SPINLOCK(minor_lock);
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
-#define SEGS_PER_INDIRECT_FRAME \
-	(PAGE_SIZE/sizeof(struct blkif_request_segment))
-#define INDIRECT_GREFS(_segs) \
-	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+/*
+ * Grants are always the same size as a Xen page (i.e. 4KB).
+ * A physical segment is always the same size as a Linux page.
+ * Number of grants per physical segment
+ */
+#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define GRANTS_PER_INDIRECT_FRAME \
+	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
+
+#define PSEGS_PER_INDIRECT_FRAME	\
+	(GRANTS_PER_INDIRECT_FRAME / GRANTS_PER_PSEG)
+
+#define INDIRECT_GREFS(_grants)		\
+	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
+
+#define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)
 
 static int blkfront_setup_indirect(struct blkfront_info *info);
 static int blkfront_gather_backend_features(struct blkfront_info *info);
@@ -221,7 +240,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num)
 				kfree(gnt_list_entry);
 				goto out_of_memory;
 			}
-			gnt_list_entry->pfn = page_to_pfn(granted_page);
+			gnt_list_entry->page = granted_page;
 		}
 
 		gnt_list_entry->gref = GRANT_INVALID_REF;
@@ -236,7 +255,7 @@ out_of_memory:
 	                         &info->grants, node) {
 		list_del(&gnt_list_entry->node);
 		if (info->feature_persistent)
-			__free_page(pfn_to_page(gnt_list_entry->pfn));
+			__free_page(gnt_list_entry->page);
 		kfree(gnt_list_entry);
 		i--;
 	}
@@ -244,34 +263,77 @@ out_of_memory:
 	return -ENOMEM;
 }
 
-static struct grant *get_grant(grant_ref_t *gref_head,
-                               unsigned long pfn,
-                               struct blkfront_info *info)
+static struct grant *get_free_grant(struct blkfront_info *info)
 {
 	struct grant *gnt_list_entry;
-	unsigned long buffer_gfn;
 
 	BUG_ON(list_empty(&info->grants));
 	gnt_list_entry = list_first_entry(&info->grants, struct grant,
-	                                  node);
+					  node);
 	list_del(&gnt_list_entry->node);
 
-	if (gnt_list_entry->gref != GRANT_INVALID_REF) {
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		info->persistent_gnts_c--;
+
+	return gnt_list_entry;
+}
+
+static inline void grant_foreign_access(const struct grant *gnt_list_entry,
+					const struct blkfront_info *info)
+{
+	gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
+						 info->xbdev->otherend_id,
+						 gnt_list_entry->page,
+						 0);
+}
+
+static struct grant *get_grant(grant_ref_t *gref_head,
+			       unsigned long gfn,
+			       struct blkfront_info *info)
+{
+	struct grant *gnt_list_entry = get_free_grant(info);
+
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		return gnt_list_entry;
+
+	/* Assign a gref to this page */
+	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
+	BUG_ON(gnt_list_entry->gref == -ENOSPC);
+	if (info->feature_persistent)
+		grant_foreign_access(gnt_list_entry, info);
+	else {
+		/* Grant access to the GFN passed by the caller */
+		gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
+						info->xbdev->otherend_id,
+						gfn, 0);
 	}
 
+	return gnt_list_entry;
+}
+
+static struct grant *get_indirect_grant(grant_ref_t *gref_head,
+					struct blkfront_info *info)
+{
+	struct grant *gnt_list_entry = get_free_grant(info);
+
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
+		return gnt_list_entry;
+
 	/* Assign a gref to this page */
 	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
 	BUG_ON(gnt_list_entry->gref == -ENOSPC);
 	if (!info->feature_persistent) {
-		BUG_ON(!pfn);
-		gnt_list_entry->pfn = pfn;
+		struct page *indirect_page;
+
+		/* Fetch a pre-allocated page to use for indirect grefs */
+		BUG_ON(list_empty(&info->indirect_pages));
+		indirect_page = list_first_entry(&info->indirect_pages,
+						 struct page, lru);
+		list_del(&indirect_page->lru);
+		gnt_list_entry->page = indirect_page;
 	}
-	buffer_gfn = pfn_to_gfn(gnt_list_entry->pfn);
-	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
-	                                info->xbdev->otherend_id,
-	                                buffer_gfn, 0);
+	grant_foreign_access(gnt_list_entry, info);
+
 	return gnt_list_entry;
 }
 
@@ -394,20 +456,128 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
 	return 0;
 }
 
-/*
- * Generate a Xen blkfront IO request from a blk layer request.  Reads
- * and writes are handled as expected.
- *
- * @req: a request struct
- */
-static int blkif_queue_request(struct request *req)
+static int blkif_queue_discard_req(struct request *req)
 {
 	struct blkfront_info *info = req->rq_disk->private_data;
 	struct blkif_request *ring_req;
 	unsigned long id;
+
+	/* Fill out a communications ring structure. */
+	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+	id = get_id_from_freelist(info);
+	info->shadow[id].request = req;
+
+	ring_req->operation = BLKIF_OP_DISCARD;
+	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+	ring_req->u.discard.id = id;
+	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
+	if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+	else
+		ring_req->u.discard.flag = 0;
+
+	info->ring.req_prod_pvt++;
+
+	/* Keep a private copy so we can reissue requests when recovering. */
+	info->shadow[id].req = *ring_req;
+
+	return 0;
+}
+
+struct setup_rw_req {
+	unsigned int grant_idx;
+	struct blkif_request_segment *segments;
+	struct blkfront_info *info;
+	struct blkif_request *ring_req;
+	grant_ref_t gref_head;
+	unsigned int id;
+	/* Only used when persistent grant is used and it's a read request */
+	bool need_copy;
+	unsigned int bvec_off;
+	char *bvec_data;
+};
+
+static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
+				     unsigned int len, void *data)
+{
+	struct setup_rw_req *setup = data;
+	int n, ref;
+	struct grant *gnt_list_entry;
 	unsigned int fsect, lsect;
-	int i, ref, n;
-	struct blkif_request_segment *segments = NULL;
+	/* Convenient aliases */
+	unsigned int grant_idx = setup->grant_idx;
+	struct blkif_request *ring_req = setup->ring_req;
+	struct blkfront_info *info = setup->info;
+	struct blk_shadow *shadow = &info->shadow[setup->id];
+
+	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
+		if (setup->segments)
+			kunmap_atomic(setup->segments);
+
+		n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
+		gnt_list_entry = get_indirect_grant(&setup->gref_head, info);
+		shadow->indirect_grants[n] = gnt_list_entry;
+		setup->segments = kmap_atomic(gnt_list_entry->page);
+		ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+	}
+
+	gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
+	ref = gnt_list_entry->gref;
+	shadow->grants_used[grant_idx] = gnt_list_entry;
+
+	if (setup->need_copy) {
+		void *shared_data;
+
+		shared_data = kmap_atomic(gnt_list_entry->page);
+		/*
+		 * this does not wipe data stored outside the
+		 * range sg->offset..sg->offset+sg->length.
+		 * Therefore, blkback *could* see data from
+		 * previous requests. This is OK as long as
+		 * persistent grants are shared with just one
+		 * domain. It may need refactoring if this
+		 * changes
+		 */
+		memcpy(shared_data + offset,
+		       setup->bvec_data + setup->bvec_off,
+		       len);
+
+		kunmap_atomic(shared_data);
+		setup->bvec_off += len;
+	}
+
+	fsect = offset >> 9;
+	lsect = fsect + (len >> 9) - 1;
+	if (ring_req->operation != BLKIF_OP_INDIRECT) {
+		ring_req->u.rw.seg[grant_idx] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	} else {
+		setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	}
+
+	(setup->grant_idx)++;
+}
+
+static int blkif_queue_rw_req(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+	struct blkif_request *ring_req;
+	unsigned long id;
+	int i;
+	struct setup_rw_req setup = {
+		.grant_idx = 0,
+		.segments = NULL,
+		.info = info,
+		.need_copy = rq_data_dir(req) && info->feature_persistent,
+	};
 
 	/*
 	 * Used to store if we are able to queue the request by just using
@@ -415,28 +585,23 @@ static int blkif_queue_request(struct request *req)
 	 * as there are not sufficiently many free.
 	 */
 	bool new_persistent_gnts;
-	grant_ref_t gref_head;
-	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
-	int nseg, max_grefs;
+	int num_sg, max_grefs, num_grant;
 
-	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
-		return 1;
-
-	max_grefs = req->nr_phys_segments;
+	max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
 	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
 		/*
 		 * If we are using indirect segments we need to account
 		 * for the indirect grefs used in the request.
 		 */
-		max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
+		max_grefs += INDIRECT_GREFS(max_grefs);
 
 	/* Check if we have enough grants to allocate a request */
 	if (info->persistent_gnts_c < max_grefs) {
 		new_persistent_gnts = 1;
 		if (gnttab_alloc_grant_references(
 		    max_grefs - info->persistent_gnts_c,
-		    &gref_head) < 0) {
+		    &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
 				&info->callback,
 				blkif_restart_queue_callback,
@@ -452,139 +617,82 @@ static int blkif_queue_request(struct request *req)
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
-		ring_req->operation = BLKIF_OP_DISCARD;
-		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
-		ring_req->u.discard.id = id;
-		ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
-		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
-			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
-		else
-			ring_req->u.discard.flag = 0;
+	BUG_ON(info->max_indirect_segments == 0 &&
+	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	BUG_ON(info->max_indirect_segments &&
+	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
+
+	num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+	num_grant = 0;
+	/* Calculate the number of grants used */
+	for_each_sg(info->shadow[id].sg, sg, num_sg, i)
+	       num_grant += gnttab_count_grant(sg->offset, sg->length);
+
+	ring_req->u.rw.id = id;
+	info->shadow[id].num_sg = num_sg;
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+		/*
+		 * The indirect operation can only be a BLKIF_OP_READ or
+		 * BLKIF_OP_WRITE
+		 */
+		BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+		ring_req->operation = BLKIF_OP_INDIRECT;
+		ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.indirect.handle = info->handle;
+		ring_req->u.indirect.nr_segments = num_grant;
 	} else {
-		BUG_ON(info->max_indirect_segments == 0 &&
-		       req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-		BUG_ON(info->max_indirect_segments &&
-		       req->nr_phys_segments > info->max_indirect_segments);
-		nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
-		ring_req->u.rw.id = id;
-		if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+		ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.rw.handle = info->handle;
+		ring_req->operation = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
 			/*
-			 * The indirect operation can only be a BLKIF_OP_READ or
-			 * BLKIF_OP_WRITE
+			 * Ideally we can do an unordered flush-to-disk.
+			 * In case the backend only supports barriers, use that.
+			 * A barrier request is a superset of FUA, so we can
+			 * implement it the same way.  (It's also a FLUSH+FUA,
+			 * since it is guaranteed ordered WRT previous writes.)
 			 */
-			BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
-			ring_req->operation = BLKIF_OP_INDIRECT;
-			ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
-				BLKIF_OP_WRITE : BLKIF_OP_READ;
-			ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
-			ring_req->u.indirect.handle = info->handle;
-			ring_req->u.indirect.nr_segments = nseg;
-		} else {
-			ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-			ring_req->u.rw.handle = info->handle;
-			ring_req->operation = rq_data_dir(req) ?
-				BLKIF_OP_WRITE : BLKIF_OP_READ;
-			if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
-				/*
-				 * Ideally we can do an unordered flush-to-disk. In case the
-				 * backend onlysupports barriers, use that. A barrier request
-				 * a superset of FUA, so we can implement it the same
-				 * way.  (It's also a FLUSH+FUA, since it is
-				 * guaranteed ordered WRT previous writes.)
-				 */
-				switch (info->feature_flush &
-					((REQ_FLUSH|REQ_FUA))) {
-				case REQ_FLUSH|REQ_FUA:
-					ring_req->operation =
-						BLKIF_OP_WRITE_BARRIER;
-					break;
-				case REQ_FLUSH:
-					ring_req->operation =
-						BLKIF_OP_FLUSH_DISKCACHE;
-					break;
-				default:
-					ring_req->operation = 0;
-				}
+			switch (info->feature_flush &
+				((REQ_FLUSH|REQ_FUA))) {
+			case REQ_FLUSH|REQ_FUA:
+				ring_req->operation =
+					BLKIF_OP_WRITE_BARRIER;
+				break;
+			case REQ_FLUSH:
+				ring_req->operation =
+					BLKIF_OP_FLUSH_DISKCACHE;
+				break;
+			default:
+				ring_req->operation = 0;
 			}
-			ring_req->u.rw.nr_segments = nseg;
 		}
-		for_each_sg(info->shadow[id].sg, sg, nseg, i) {
-			fsect = sg->offset >> 9;
-			lsect = fsect + (sg->length >> 9) - 1;
-
-			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
-			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
-				unsigned long uninitialized_var(pfn);
-
-				if (segments)
-					kunmap_atomic(segments);
-
-				n = i / SEGS_PER_INDIRECT_FRAME;
-				if (!info->feature_persistent) {
-					struct page *indirect_page;
-
-					/* Fetch a pre-allocated page to use for indirect grefs */
-					BUG_ON(list_empty(&info->indirect_pages));
-					indirect_page = list_first_entry(&info->indirect_pages,
-					                                 struct page, lru);
-					list_del(&indirect_page->lru);
-					pfn = page_to_pfn(indirect_page);
-				}
-				gnt_list_entry = get_grant(&gref_head, pfn, info);
-				info->shadow[id].indirect_grants[n] = gnt_list_entry;
-				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
-				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
-			}
-
-			gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
-			ref = gnt_list_entry->gref;
-
-			info->shadow[id].grants_used[i] = gnt_list_entry;
-
-			if (rq_data_dir(req) && info->feature_persistent) {
-				char *bvec_data;
-				void *shared_data;
+		ring_req->u.rw.nr_segments = num_grant;
+	}
 
-				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+	setup.ring_req = ring_req;
+	setup.id = id;
+	for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
+		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
-				shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
-				bvec_data = kmap_atomic(sg_page(sg));
+		if (setup.need_copy) {
+			setup.bvec_off = sg->offset;
+			setup.bvec_data = kmap_atomic(sg_page(sg));
+		}
 
-				/*
-				 * this does not wipe data stored outside the
-				 * range sg->offset..sg->offset+sg->length.
-				 * Therefore, blkback *could* see data from
-				 * previous requests. This is OK as long as
-				 * persistent grants are shared with just one
-				 * domain. It may need refactoring if this
-				 * changes
-				 */
-				memcpy(shared_data + sg->offset,
-				       bvec_data   + sg->offset,
-				       sg->length);
+		gnttab_foreach_grant_in_range(sg_page(sg),
+					      sg->offset,
+					      sg->length,
+					      blkif_setup_rw_req_grant,
+					      &setup);
 
-				kunmap_atomic(bvec_data);
-				kunmap_atomic(shared_data);
-			}
-			if (ring_req->operation != BLKIF_OP_INDIRECT) {
-				ring_req->u.rw.seg[i] =
-						(struct blkif_request_segment) {
-							.gref       = ref,
-							.first_sect = fsect,
-							.last_sect  = lsect };
-			} else {
-				n = i % SEGS_PER_INDIRECT_FRAME;
-				segments[n] =
-					(struct blkif_request_segment) {
-							.gref       = ref,
-							.first_sect = fsect,
-							.last_sect  = lsect };
-			}
-		}
-		if (segments)
-			kunmap_atomic(segments);
+		if (setup.need_copy)
+			kunmap_atomic(setup.bvec_data);
 	}
+	if (setup.segments)
+		kunmap_atomic(setup.segments);
 
 	info->ring.req_prod_pvt++;
 
@@ -592,11 +700,29 @@ static int blkif_queue_request(struct request *req)
 	info->shadow[id].req = *ring_req;
 
 	if (new_persistent_gnts)
-		gnttab_free_grant_references(gref_head);
+		gnttab_free_grant_references(setup.gref_head);
 
 	return 0;
 }
 
+/*
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
+ *
+ * @req: a request struct
+ */
+static int blkif_queue_request(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
+		return blkif_queue_discard_req(req);
+	else
+		return blkif_queue_rw_req(req);
+}
 
 static inline void flush_requests(struct blkfront_info *info)
 {
@@ -691,14 +817,14 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
 	blk_queue_physical_block_size(rq, physical_sector_size);
-	blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
+	blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_segments(rq, segments);
+	blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
@@ -972,7 +1098,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 				info->persistent_gnts_c--;
 			}
 			if (info->feature_persistent)
-				__free_page(pfn_to_page(persistent_gnt->pfn));
+				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 	}
@@ -1007,7 +1133,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 			persistent_gnt = info->shadow[i].grants_used[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
 			if (info->feature_persistent)
-				__free_page(pfn_to_page(persistent_gnt->pfn));
+				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 
@@ -1021,7 +1147,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
 			persistent_gnt = info->shadow[i].indirect_grants[j];
 			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
-			__free_page(pfn_to_page(persistent_gnt->pfn));
+			__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 
@@ -1057,33 +1183,65 @@ free_shadow:
 
 }
 
+struct copy_from_grant {
+	const struct blk_shadow *s;
+	unsigned int grant_idx;
+	unsigned int bvec_offset;
+	char *bvec_data;
+};
+
+static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
+				  unsigned int len, void *data)
+{
+	struct copy_from_grant *info = data;
+	char *shared_data;
+	/* Convenient aliases */
+	const struct blk_shadow *s = info->s;
+
+	shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);
+
+	memcpy(info->bvec_data + info->bvec_offset,
+	       shared_data + offset, len);
+
+	info->bvec_offset += len;
+	info->grant_idx++;
+
+	kunmap_atomic(shared_data);
+}
+
 static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
-	char *bvec_data;
-	void *shared_data;
-	int nseg;
+	int num_sg, num_grant;
+	struct copy_from_grant data = {
+		.s = s,
+		.grant_idx = 0,
+	};
 
-	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
 		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
-		for_each_sg(s->sg, sg, nseg, i) {
+		for_each_sg(s->sg, sg, num_sg, i) {
 			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
-			shared_data = kmap_atomic(
-				pfn_to_page(s->grants_used[i]->pfn));
-			bvec_data = kmap_atomic(sg_page(sg));
-			memcpy(bvec_data   + sg->offset,
-			       shared_data + sg->offset,
-			       sg->length);
-			kunmap_atomic(bvec_data);
-			kunmap_atomic(shared_data);
+
+			data.bvec_offset = sg->offset;
+			data.bvec_data = kmap_atomic(sg_page(sg));
+
+			gnttab_foreach_grant_in_range(sg_page(sg),
+						      sg->offset,
+						      sg->length,
+						      blkif_copy_from_grant,
+						      &data);
+
+			kunmap_atomic(data.bvec_data);
 		}
 	}
 	/* Add the persistent grant into the list of free grants */
-	for (i = 0; i < nseg; i++) {
+	for (i = 0; i < num_grant; i++) {
 		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
 			/*
 			 * If the grant is still mapped by the backend (the
@@ -1109,7 +1267,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 		}
 	}
 	if (s->req.operation == BLKIF_OP_INDIRECT) {
-		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+		for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
 			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
 				if (!info->feature_persistent)
 					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
@@ -1125,7 +1283,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 				 * available pages for indirect grefs.
 				 */
 				if (!info->feature_persistent) {
-					indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
+					indirect_page = s->indirect_grants[i]->page;
 					list_add(&indirect_page->lru, &info->indirect_pages);
 				}
 				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
@@ -1254,8 +1412,8 @@ static int setup_blkring(struct xenbus_device *dev,
 {
 	struct blkif_sring *sring;
 	int err, i;
-	unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
-	grant_ref_t gref[XENBUS_MAX_RING_PAGES];
+	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
+	grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
 
 	for (i = 0; i < info->nr_ring_pages; i++)
 		info->ring_ref[i] = GRANT_INVALID_REF;
@@ -1583,8 +1741,8 @@ static int blkif_recover(struct blkfront_info *info)
 			atomic_set(&split_bio->pending, pending);
 			split_bio->bio = bio;
 			for (i = 0; i < pending; i++) {
-				offset = (i * segs * PAGE_SIZE) >> 9;
-				size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+				offset = (i * segs * XEN_PAGE_SIZE) >> 9;
+				size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9,
 					   (unsigned int)bio_sectors(bio) - offset);
 				cloned_bio = bio_clone(bio, GFP_NOIO);
 				BUG_ON(cloned_bio == NULL);
@@ -1695,15 +1853,17 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 
 static int blkfront_setup_indirect(struct blkfront_info *info)
 {
-	unsigned int segs;
+	unsigned int psegs, grants;
 	int err, i;
 
 	if (info->max_indirect_segments == 0)
-		segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
 	else
-		segs = info->max_indirect_segments;
+		grants = info->max_indirect_segments;
+	psegs = grants / GRANTS_PER_PSEG;
 
-	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE(info));
+	err = fill_grant_buffer(info,
+				(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
 	if (err)
 		goto out_of_memory;
 
@@ -1713,7 +1873,7 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 		 * grants, we need to allocate a set of pages that can be
 		 * used for mapping indirect grefs
 		 */
-		int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE(info);
+		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
 
 		BUG_ON(!list_empty(&info->indirect_pages));
 		for (i = 0; i < num; i++) {
@@ -1726,20 +1886,20 @@ static int blkfront_setup_indirect(struct blkfront_info *info)
 
 	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		info->shadow[i].grants_used = kzalloc(
-			sizeof(info->shadow[i].grants_used[0]) * segs,
+			sizeof(info->shadow[i].grants_used[0]) * grants,
 			GFP_NOIO);
-		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
 		if (info->max_indirect_segments)
 			info->shadow[i].indirect_grants = kzalloc(
 				sizeof(info->shadow[i].indirect_grants[0]) *
-				INDIRECT_GREFS(segs),
+				INDIRECT_GREFS(grants),
 				GFP_NOIO);
 		if ((info->shadow[i].grants_used == NULL) ||
 			(info->shadow[i].sg == NULL) ||
 		     (info->max_indirect_segments &&
 		     (info->shadow[i].indirect_grants == NULL)))
 			goto out_of_memory;
-		sg_init_table(info->shadow[i].sg, segs);
+		sg_init_table(info->shadow[i].sg, psegs);
 	}
 
 
@@ -2125,9 +2285,9 @@ static int __init xlblk_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
-	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
 		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
-			xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
 		xen_blkif_max_ring_order = 0;
 	}
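
blkfront's setup and completion paths (and netback further down) now walk buffers with gnttab_foreach_grant_in_range(), the iteration helper this series adds to drivers/xen/grant-table.c. A simplified sketch of its shape, assuming a Linux page's 4KB frames are contiguous:

/* Invoke fn() once per grant-sized chunk of [offset, offset + len). */
typedef void (*xen_grant_fn_t)(unsigned long gfn, unsigned int offset,
			       unsigned int len, void *data);

static void foreach_grant_in_range(struct page *page, unsigned int offset,
				   unsigned int len, xen_grant_fn_t fn,
				   void *data)
{
	unsigned int goffset, glen;
	unsigned long xen_pfn;

	/* Clamp to this page and locate the first 4KB frame. */
	len = min_t(unsigned int, PAGE_SIZE - offset, len);
	goffset = xen_offset_in_page(offset);
	xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset);

	while (len) {
		glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len);
		fn(pfn_to_gfn(xen_pfn), goffset, glen, data);

		goffset = 0;	/* later chunks start on a frame boundary */
		xen_pfn++;
		len -= glen;
	}
}

On a 4KB guest the loop body runs exactly once per call, so the rewritten drivers degenerate to the old one-grant-per-segment behaviour.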
 

+ 11 - 5
drivers/net/xen-netback/common.h

@@ -44,6 +44,7 @@
 #include <xen/interface/grant_table.h>
 #include <xen/grant_table.h>
 #include <xen/xenbus.h>
+#include <xen/page.h>
 #include <linux/debugfs.h>
 
 typedef unsigned int pending_ring_idx_t;
@@ -64,8 +65,8 @@ struct pending_tx_info {
 	struct ubuf_info callback_struct;
 };
 
-#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
-#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
+#define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE)
+#define XEN_NETIF_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE)
 
 struct xenvif_rx_meta {
 	int id;
@@ -80,16 +81,21 @@ struct xenvif_rx_meta {
 /* Discriminate from any valid pending_idx value. */
 #define INVALID_PENDING_IDX 0xFFFF
 
-#define MAX_BUFFER_OFFSET PAGE_SIZE
+#define MAX_BUFFER_OFFSET XEN_PAGE_SIZE
 
 #define MAX_PENDING_REQS XEN_NETIF_TX_RING_SIZE
 
+/* The maximum number of frags is derived from the size of a grant (same
+ * as a Xen page size for now).
+ */
+#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1)
+
 /* It's possible for an skb to have a maximal number of frags
  * but still be less than MAX_BUFFER_OFFSET in size. Thus the
- * worst-case number of copy operations is MAX_SKB_FRAGS per
+ * worst-case number of copy operations is MAX_XEN_SKB_FRAGS per
  * ring slot.
  */
-#define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
+#define MAX_GRANT_COPY_OPS (MAX_XEN_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
 
 #define NETBACK_INVALID_HANDLE -1
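
The new bound is easy to sanity-check by hand; the "+ 1" mirrors the formula used for MAX_SKB_FRAGS, applied at grant granularity:

/*
 * MAX_XEN_SKB_FRAGS == 65536 / 4096 + 1 == 17 on every guest.
 *
 * On a 64KB guest a single Linux frag can hold an entire 64KB payload,
 * yet copying it to the ring still takes up to 17 grant-sized copy
 * operations, so MAX_GRANT_COPY_OPS must be sized against
 * MAX_XEN_SKB_FRAGS rather than MAX_SKB_FRAGS.
 */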
 

+ 105 - 62
drivers/net/xen-netback/netback.c

@@ -152,9 +152,9 @@ static inline pending_ring_idx_t pending_index(unsigned i)
 static int xenvif_rx_ring_slots_needed(struct xenvif *vif)
 {
 	if (vif->gso_mask)
-		return DIV_ROUND_UP(vif->dev->gso_max_size, PAGE_SIZE) + 1;
+		return DIV_ROUND_UP(vif->dev->gso_max_size, XEN_PAGE_SIZE) + 1;
 	else
-		return DIV_ROUND_UP(vif->dev->mtu, PAGE_SIZE);
+		return DIV_ROUND_UP(vif->dev->mtu, XEN_PAGE_SIZE);
 }
 
 static bool xenvif_rx_ring_slots_available(struct xenvif_queue *queue)
@@ -274,6 +274,80 @@ static struct xenvif_rx_meta *get_next_rx_buffer(struct xenvif_queue *queue,
 	return meta;
 }
 
+struct gop_frag_copy {
+	struct xenvif_queue *queue;
+	struct netrx_pending_operations *npo;
+	struct xenvif_rx_meta *meta;
+	int head;
+	int gso_type;
+
+	struct page *page;
+};
+
+static void xenvif_setup_copy_gop(unsigned long gfn,
+				  unsigned int offset,
+				  unsigned int *len,
+				  struct gop_frag_copy *info)
+{
+	struct gnttab_copy *copy_gop;
+	struct xen_page_foreign *foreign;
+	/* Convenient aliases */
+	struct xenvif_queue *queue = info->queue;
+	struct netrx_pending_operations *npo = info->npo;
+	struct page *page = info->page;
+
+	BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
+
+	if (npo->copy_off == MAX_BUFFER_OFFSET)
+		info->meta = get_next_rx_buffer(queue, npo);
+
+	if (npo->copy_off + *len > MAX_BUFFER_OFFSET)
+		*len = MAX_BUFFER_OFFSET - npo->copy_off;
+
+	copy_gop = npo->copy + npo->copy_prod++;
+	copy_gop->flags = GNTCOPY_dest_gref;
+	copy_gop->len = *len;
+
+	foreign = xen_page_foreign(page);
+	if (foreign) {
+		copy_gop->source.domid = foreign->domid;
+		copy_gop->source.u.ref = foreign->gref;
+		copy_gop->flags |= GNTCOPY_source_gref;
+	} else {
+		copy_gop->source.domid = DOMID_SELF;
+		copy_gop->source.u.gmfn = gfn;
+	}
+	copy_gop->source.offset = offset;
+
+	copy_gop->dest.domid = queue->vif->domid;
+	copy_gop->dest.offset = npo->copy_off;
+	copy_gop->dest.u.ref = npo->copy_gref;
+
+	npo->copy_off += *len;
+	info->meta->size += *len;
+
+	/* Leave a gap for the GSO descriptor. */
+	if (info->head && ((1 << info->gso_type) & queue->vif->gso_mask))
+		queue->rx.req_cons++;
+
+	info->head = 0; /* There must be something in this buffer now */
+}
+
+static void xenvif_gop_frag_copy_grant(unsigned long gfn,
+				       unsigned offset,
+				       unsigned int len,
+				       void *data)
+{
+	unsigned int bytes;
+
+	while (len) {
+		bytes = len;
+		xenvif_setup_copy_gop(gfn, offset, &bytes, data);
+		offset += bytes;
+		len -= bytes;
+	}
+}
+
 /*
  * Set up the grant operations for this fragment. If it's a flipping
  * interface, we also set up the unmap request from here.
@@ -283,83 +357,52 @@ static void xenvif_gop_frag_copy(struct xenvif_queue *queue, struct sk_buff *skb
 				 struct page *page, unsigned long size,
 				 unsigned long offset, int *head)
 {
-	struct gnttab_copy *copy_gop;
-	struct xenvif_rx_meta *meta;
+	struct gop_frag_copy info = {
+		.queue = queue,
+		.npo = npo,
+		.head = *head,
+		.gso_type = XEN_NETIF_GSO_TYPE_NONE,
+	};
 	unsigned long bytes;
-	int gso_type = XEN_NETIF_GSO_TYPE_NONE;
+
+	if (skb_is_gso(skb)) {
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
+			info.gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
+		else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
+			info.gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
+	}
 
 	/* Data must not cross a page boundary. */
 	BUG_ON(size + offset > PAGE_SIZE<<compound_order(page));
 
-	meta = npo->meta + npo->meta_prod - 1;
+	info.meta = npo->meta + npo->meta_prod - 1;
 
 	/* Skip unused frames from start of page */
 	page += offset >> PAGE_SHIFT;
 	offset &= ~PAGE_MASK;
 
 	while (size > 0) {
-		struct xen_page_foreign *foreign;
-
 		BUG_ON(offset >= PAGE_SIZE);
-		BUG_ON(npo->copy_off > MAX_BUFFER_OFFSET);
-
-		if (npo->copy_off == MAX_BUFFER_OFFSET)
-			meta = get_next_rx_buffer(queue, npo);
 
 		bytes = PAGE_SIZE - offset;
 		if (bytes > size)
 			bytes = size;
 
-		if (npo->copy_off + bytes > MAX_BUFFER_OFFSET)
-			bytes = MAX_BUFFER_OFFSET - npo->copy_off;
-
-		copy_gop = npo->copy + npo->copy_prod++;
-		copy_gop->flags = GNTCOPY_dest_gref;
-		copy_gop->len = bytes;
-
-		foreign = xen_page_foreign(page);
-		if (foreign) {
-			copy_gop->source.domid = foreign->domid;
-			copy_gop->source.u.ref = foreign->gref;
-			copy_gop->flags |= GNTCOPY_source_gref;
-		} else {
-			copy_gop->source.domid = DOMID_SELF;
-			copy_gop->source.u.gmfn =
-				virt_to_gfn(page_address(page));
-		}
-		copy_gop->source.offset = offset;
-
-		copy_gop->dest.domid = queue->vif->domid;
-		copy_gop->dest.offset = npo->copy_off;
-		copy_gop->dest.u.ref = npo->copy_gref;
-
-		npo->copy_off += bytes;
-		meta->size += bytes;
-
-		offset += bytes;
+		info.page = page;
+		gnttab_foreach_grant_in_range(page, offset, bytes,
+					      xenvif_gop_frag_copy_grant,
+					      &info);
 		size -= bytes;
+		offset = 0;
 
-		/* Next frame */
-		if (offset == PAGE_SIZE && size) {
+		/* Next page */
+		if (size) {
 			BUG_ON(!PageCompound(page));
 			page++;
-			offset = 0;
 		}
-
-		/* Leave a gap for the GSO descriptor. */
-		if (skb_is_gso(skb)) {
-			if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
-				gso_type = XEN_NETIF_GSO_TYPE_TCPV4;
-			else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
-				gso_type = XEN_NETIF_GSO_TYPE_TCPV6;
-		}
-
-		if (*head && ((1 << gso_type) & queue->vif->gso_mask))
-			queue->rx.req_cons++;
-
-		*head = 0; /* There must be something in this buffer now. */
-
 	}
+
+	*head = info.head;
 }
 
 /*
@@ -758,7 +801,7 @@ static int xenvif_count_requests(struct xenvif_queue *queue,
 		first->size -= txp->size;
 		slots++;
 
-		if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) {
+		if (unlikely((txp->offset + txp->size) > XEN_PAGE_SIZE)) {
 			netdev_err(queue->vif->dev, "Cross page boundary, txp->offset: %u, size: %u\n",
 				 txp->offset, txp->size);
 			xenvif_fatal_tx_err(queue->vif);
@@ -1339,11 +1382,11 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
 		}
 
 		/* No crossing a page as the payload mustn't fragment. */
-		if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) {
+		if (unlikely((txreq.offset + txreq.size) > XEN_PAGE_SIZE)) {
 			netdev_err(queue->vif->dev,
 				   "txreq.offset: %u, size: %u, end: %lu\n",
 				   txreq.offset, txreq.size,
-				   (unsigned long)(txreq.offset&~PAGE_MASK) + txreq.size);
+				   (unsigned long)(txreq.offset&~XEN_PAGE_MASK) + txreq.size);
 			xenvif_fatal_tx_err(queue->vif);
 			break;
 		}
@@ -1409,7 +1452,7 @@ static void xenvif_tx_build_gops(struct xenvif_queue *queue,
 			virt_to_gfn(skb->data);
 		queue->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF;
 		queue->tx_copy_ops[*copy_ops].dest.offset =
-			offset_in_page(skb->data);
+			offset_in_page(skb->data) & ~XEN_PAGE_MASK;
 
 		queue->tx_copy_ops[*copy_ops].len = data_len;
 		queue->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref;
@@ -1894,7 +1937,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue,
 		goto err;
 
 	txs = (struct xen_netif_tx_sring *)addr;
-	BACK_RING_INIT(&queue->tx, txs, PAGE_SIZE);
+	BACK_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE);
 
 	err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(queue->vif),
 				     &rx_ring_ref, 1, &addr);
@@ -1902,7 +1945,7 @@ int xenvif_map_frontend_rings(struct xenvif_queue *queue,
 		goto err;
 
 	rxs = (struct xen_netif_rx_sring *)addr;
-	BACK_RING_INIT(&queue->rx, rxs, PAGE_SIZE);
+	BACK_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE);
 
 	return 0;
 

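The rework above replaces netback's open-coded RX copy loop with a per-grant callback: gnttab_foreach_grant_in_range() slices each fragment into 4KB chunks and calls xenvif_setup_copy_gop() once per chunk. A standalone sketch of that slicing follows, with illustrative constants (64KB Linux pages, 4KB Xen grants) and the simplifying assumption that a page's 4KB frames have consecutive GFNs; the real helper translates each frame through pfn_to_gfn().

#include <stdio.h>

#define XEN_PAGE_SIZE 4096u

typedef void (*grant_fn)(unsigned long gfn, unsigned int offset,
			 unsigned int len, void *data);

/* Loosely mirrors gnttab_foreach_grant_in_range(): one callback per
 * grant-sized chunk. Consecutive GFNs are assumed for simplicity. */
static void foreach_grant_in_range(unsigned long first_gfn,
				   unsigned int offset, unsigned int len,
				   grant_fn fn, void *data)
{
	unsigned long gfn = first_gfn + offset / XEN_PAGE_SIZE;
	unsigned int goffset = offset % XEN_PAGE_SIZE;

	while (len) {
		unsigned int glen = XEN_PAGE_SIZE - goffset;

		if (glen > len)
			glen = len;
		fn(gfn, goffset, glen, data);	/* one copy op per grant */
		goffset = 0;
		gfn++;
		len -= glen;
	}
}

static void emit_copy_op(unsigned long gfn, unsigned int offset,
			 unsigned int len, void *data)
{
	unsigned int *ops = data;

	printf("copy op %u: gfn=%lu offset=%u len=%u\n",
	       (*ops)++, gfn, offset, len);
}

int main(void)
{
	unsigned int ops = 0;

	/* a 10000-byte fragment starting 3000 bytes into its page */
	foreach_grant_in_range(0x100, 3000, 10000, emit_copy_op, &ops);
	return 0;
}

Four copy operations result (1096 + 4096 + 4096 + 712 bytes), where the removed loop emitted one operation per Linux-page chunk.
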
+ 86 - 36
drivers/net/xen-netfront.c

@@ -74,8 +74,8 @@ struct netfront_cb {
 
 #define GRANT_INVALID_REF	0
 
-#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
-#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, PAGE_SIZE)
+#define NET_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, XEN_PAGE_SIZE)
+#define NET_RX_RING_SIZE __CONST_RING_SIZE(xen_netif_rx, XEN_PAGE_SIZE)
 
 /* Minimum number of Rx slots (includes slot for GSO metadata). */
 #define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1)
@@ -291,7 +291,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
 		struct sk_buff *skb;
 		unsigned short id;
 		grant_ref_t ref;
-		unsigned long gfn;
+		struct page *page;
 		struct xen_netif_rx_request *req;
 
 		skb = xennet_alloc_one_rx_buffer(queue);
@@ -307,14 +307,13 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue)
 		BUG_ON((signed short)ref < 0);
 		queue->grant_rx_ref[id] = ref;
 
-		gfn = xen_page_to_gfn(skb_frag_page(&skb_shinfo(skb)->frags[0]));
+		page = skb_frag_page(&skb_shinfo(skb)->frags[0]);
 
 		req = RING_GET_REQUEST(&queue->rx, req_prod);
-		gnttab_grant_foreign_access_ref(ref,
-						queue->info->xbdev->otherend_id,
-						gfn,
-						0);
-
+		gnttab_page_grant_foreign_access_ref_one(ref,
+							 queue->info->xbdev->otherend_id,
+							 page,
+							 0);
 		req->id = id;
 		req->gref = ref;
 	}
@@ -415,25 +414,33 @@ static void xennet_tx_buf_gc(struct netfront_queue *queue)
 	xennet_maybe_wake_tx(queue);
 }
 
-static struct xen_netif_tx_request *xennet_make_one_txreq(
-	struct netfront_queue *queue, struct sk_buff *skb,
-	struct page *page, unsigned int offset, unsigned int len)
+struct xennet_gnttab_make_txreq {
+	struct netfront_queue *queue;
+	struct sk_buff *skb;
+	struct page *page;
+	struct xen_netif_tx_request *tx; /* Last request */
+	unsigned int size;
+};
+
+static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset,
+				  unsigned int len, void *data)
 {
+	struct xennet_gnttab_make_txreq *info = data;
 	unsigned int id;
 	struct xen_netif_tx_request *tx;
 	grant_ref_t ref;
-
-	len = min_t(unsigned int, PAGE_SIZE - offset, len);
+	/* convenient aliases */
+	struct page *page = info->page;
+	struct netfront_queue *queue = info->queue;
+	struct sk_buff *skb = info->skb;
 
 	id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs);
 	tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++);
 	ref = gnttab_claim_grant_reference(&queue->gref_tx_head);
 	BUG_ON((signed short)ref < 0);
 
-	gnttab_grant_foreign_access_ref(ref,
-					queue->info->xbdev->otherend_id,
-					xen_page_to_gfn(page),
-					GNTMAP_readonly);
+	gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id,
+					gfn, GNTMAP_readonly);
 
 	queue->tx_skbs[id].skb = skb;
 	queue->grant_tx_page[id] = page;
@@ -445,7 +452,34 @@ static struct xen_netif_tx_request *xennet_make_one_txreq(
 	tx->size = len;
 	tx->flags = 0;
 
-	return tx;
+	info->tx = tx;
+	info->size += tx->size;
+}
+
+static struct xen_netif_tx_request *xennet_make_first_txreq(
+	struct netfront_queue *queue, struct sk_buff *skb,
+	struct page *page, unsigned int offset, unsigned int len)
+{
+	struct xennet_gnttab_make_txreq info = {
+		.queue = queue,
+		.skb = skb,
+		.page = page,
+		.size = 0,
+	};
+
+	gnttab_for_one_grant(page, offset, len, xennet_tx_setup_grant, &info);
+
+	return info.tx;
+}
+
+static void xennet_make_one_txreq(unsigned long gfn, unsigned int offset,
+				  unsigned int len, void *data)
+{
+	struct xennet_gnttab_make_txreq *info = data;
+
+	info->tx->flags |= XEN_NETTXF_more_data;
+	skb_get(info->skb);
+	xennet_tx_setup_grant(gfn, offset, len, data);
 }
 
 static struct xen_netif_tx_request *xennet_make_txreqs(
@@ -453,20 +487,30 @@ static struct xen_netif_tx_request *xennet_make_txreqs(
 	struct sk_buff *skb, struct page *page,
 	unsigned int offset, unsigned int len)
 {
+	struct xennet_gnttab_make_txreq info = {
+		.queue = queue,
+		.skb = skb,
+		.tx = tx,
+	};
+
 	/* Skip unused frames from start of page */
 	page += offset >> PAGE_SHIFT;
 	offset &= ~PAGE_MASK;
 
 	while (len) {
-		tx->flags |= XEN_NETTXF_more_data;
-		tx = xennet_make_one_txreq(queue, skb_get(skb),
-					   page, offset, len);
+		info.page = page;
+		info.size = 0;
+
+		gnttab_foreach_grant_in_range(page, offset, len,
+					      xennet_make_one_txreq,
+					      &info);
+
 		page++;
 		offset = 0;
-		len -= tx->size;
+		len -= info.size;
 	}
 
-	return tx;
+	return info.tx;
 }
 
 /*
@@ -476,9 +520,10 @@ static struct xen_netif_tx_request *xennet_make_txreqs(
 static int xennet_count_skb_slots(struct sk_buff *skb)
 {
 	int i, frags = skb_shinfo(skb)->nr_frags;
-	int pages;
+	int slots;
 
-	pages = PFN_UP(offset_in_page(skb->data) + skb_headlen(skb));
+	slots = gnttab_count_grant(offset_in_page(skb->data),
+				   skb_headlen(skb));
 
 	for (i = 0; i < frags; i++) {
 		skb_frag_t *frag = skb_shinfo(skb)->frags + i;
@@ -488,10 +533,10 @@ static int xennet_count_skb_slots(struct sk_buff *skb)
 		/* Skip unused frames from start of page */
 		offset &= ~PAGE_MASK;
 
-		pages += PFN_UP(offset + size);
+		slots += gnttab_count_grant(offset, size);
 	}
 
-	return pages;
+	return slots;
 }
 
 static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb,
@@ -512,6 +557,8 @@ static u16 xennet_select_queue(struct net_device *dev, struct sk_buff *skb,
 	return queue_idx;
 }
 
+#define MAX_XEN_SKB_FRAGS (65536 / XEN_PAGE_SIZE + 1)
+
 static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct netfront_info *np = netdev_priv(dev);
@@ -546,7 +593,7 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	slots = xennet_count_skb_slots(skb);
-	if (unlikely(slots > MAX_SKB_FRAGS + 1)) {
+	if (unlikely(slots > MAX_XEN_SKB_FRAGS + 1)) {
 		net_dbg_ratelimited("xennet: skb rides the rocket: %d slots, %d bytes\n",
 				    slots, skb->len);
 		if (skb_linearize(skb))
@@ -567,10 +614,13 @@ static int xennet_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	/* First request for the linear area. */
-	first_tx = tx = xennet_make_one_txreq(queue, skb,
-					      page, offset, len);
-	page++;
-	offset = 0;
+	first_tx = tx = xennet_make_first_txreq(queue, skb,
+						page, offset, len);
+	offset += tx->size;
+	if (offset == PAGE_SIZE) {
+		page++;
+		offset = 0;
+	}
 	len -= tx->size;
 
 	if (skb->ip_summed == CHECKSUM_PARTIAL)
@@ -732,7 +782,7 @@ static int xennet_get_responses(struct netfront_queue *queue,
 
 	for (;;) {
 		if (unlikely(rx->status < 0 ||
-			     rx->offset + rx->status > PAGE_SIZE)) {
+			     rx->offset + rx->status > XEN_PAGE_SIZE)) {
 			if (net_ratelimit())
 				dev_warn(dev, "rx->offset: %u, size: %d\n",
 					 rx->offset, rx->status);
@@ -1496,7 +1546,7 @@ static int setup_netfront(struct xenbus_device *dev,
 		goto fail;
 	}
 	SHARED_RING_INIT(txs);
-	FRONT_RING_INIT(&queue->tx, txs, PAGE_SIZE);
+	FRONT_RING_INIT(&queue->tx, txs, XEN_PAGE_SIZE);
 
 	err = xenbus_grant_ring(dev, txs, 1, &gref);
 	if (err < 0)
@@ -1510,7 +1560,7 @@ static int setup_netfront(struct xenbus_device *dev,
 		goto alloc_rx_ring_fail;
 	}
 	SHARED_RING_INIT(rxs);
-	FRONT_RING_INIT(&queue->rx, rxs, PAGE_SIZE);
+	FRONT_RING_INIT(&queue->rx, rxs, XEN_PAGE_SIZE);
 
 	err = xenbus_grant_ring(dev, rxs, 1, &gref);
 	if (err < 0)

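Slot accounting in netfront is now grant-granular: on a 64KB-page kernel a single fragment can occupy up to 16 ring slots, so the old MAX_SKB_FRAGS + 1 bound no longer fits and MAX_XEN_SKB_FRAGS takes its place. A quick standalone check of the arithmetic (4KB grants assumed, matching XEN_PAGE_SIZE):

#include <stdio.h>

#define XEN_PAGE_SIZE  4096u
#define XEN_PAGE_SHIFT 12

/* the same ceil-by-4KB count that gnttab_count_grant() performs */
static unsigned int count_grants(unsigned int start, unsigned int len)
{
	unsigned int off = start & (XEN_PAGE_SIZE - 1);

	return (off + len + XEN_PAGE_SIZE - 1) >> XEN_PAGE_SHIFT;
}

int main(void)
{
	/* 65536 / 4096 + 1 = 17, the new MAX_XEN_SKB_FRAGS bound */
	printf("MAX_XEN_SKB_FRAGS = %u\n", 65536u / XEN_PAGE_SIZE + 1);

	/* 2000-byte linear area starting 3000 bytes into a page: 2 slots */
	printf("slots(3000, 2000) = %u\n", count_grants(3000, 2000));

	/* one full 64KB fragment: 16 slots by itself */
	printf("slots(0, 65536)   = %u\n", count_grants(0, 65536));
	return 0;
}
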
+ 2 - 2
drivers/tty/hvc/hvc_xen.c

@@ -230,7 +230,7 @@ static int xen_hvm_console_init(void)
 	if (r < 0 || v == 0)
 		goto err;
 	gfn = v;
-	info->intf = xen_remap(gfn << PAGE_SHIFT, PAGE_SIZE);
+	info->intf = xen_remap(gfn << XEN_PAGE_SHIFT, XEN_PAGE_SIZE);
 	if (info->intf == NULL)
 		goto err;
 	info->vtermno = HVC_COOKIE;
@@ -472,7 +472,7 @@ static int xencons_resume(struct xenbus_device *dev)
 	struct xencons_info *info = dev_get_drvdata(&dev->dev);
 
 	xencons_disconnect_backend(info);
-	memset(info->intf, 0, PAGE_SIZE);
+	memset(info->intf, 0, XEN_PAGE_SIZE);
 	return xencons_connect_backend(dev, info);
 }
 

+ 0 - 2
drivers/xen/Makefile

@@ -1,6 +1,4 @@
-ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),)
 obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o
-endif
 obj-$(CONFIG_X86)			+= fallback.o
 obj-y	+= grant-table.o features.o balloon.o manage.o preempt.o
 obj-y	+= events/

+ 224 - 117
drivers/xen/balloon.c

@@ -54,6 +54,8 @@
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 #include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -70,16 +72,64 @@
 #include <xen/features.h>
 #include <xen/page.h>
 
+static int xen_hotplug_unpopulated;
+
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
+
+static int zero;
+static int one = 1;
+
+static struct ctl_table balloon_table[] = {
+	{
+		.procname	= "hotplug_unpopulated",
+		.data		= &xen_hotplug_unpopulated,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1         = &zero,
+		.extra2         = &one,
+	},
+	{ }
+};
+
+static struct ctl_table balloon_root[] = {
+	{
+		.procname	= "balloon",
+		.mode		= 0555,
+		.child		= balloon_table,
+	},
+	{ }
+};
+
+static struct ctl_table xen_root[] = {
+	{
+		.procname	= "xen",
+		.mode		= 0555,
+		.child		= balloon_root,
+	},
+	{ }
+};
+
+#endif
+
+/*
+ * Use one extent per PAGE_SIZE to avoid breaking down the page into
+ * multiple frames.
+ */
+#define EXTENT_ORDER (fls(XEN_PFN_PER_PAGE) - 1)
+
 /*
  * balloon_process() state:
  *
  * BP_DONE: done or nothing to do,
+ * BP_WAIT: wait to be rescheduled,
  * BP_EAGAIN: error, go to sleep,
  * BP_ECANCELED: error, balloon operation canceled.
  */
 
 enum bp_state {
 	BP_DONE,
+	BP_WAIT,
 	BP_EAGAIN,
 	BP_ECANCELED
 };
@@ -91,11 +141,12 @@ struct balloon_stats balloon_stats;
 EXPORT_SYMBOL_GPL(balloon_stats);
 
 /* We increase/decrease in batches which fit in a page */
-static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)];
+static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];
 
 
 /* List of ballooned pages, threaded through the mem_map array. */
 static LIST_HEAD(ballooned_pages);
+static DECLARE_WAIT_QUEUE_HEAD(balloon_wq);
 
 /* Main work function, always executed in process context. */
 static void balloon_process(struct work_struct *work);
@@ -124,6 +175,7 @@ static void __balloon_append(struct page *page)
 		list_add(&page->lru, &ballooned_pages);
 		balloon_stats.balloon_low++;
 	}
+	wake_up(&balloon_wq);
 }
 
 static void balloon_append(struct page *page)
@@ -133,17 +185,16 @@ static void balloon_append(struct page *page)
 }
 
 /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */
-static struct page *balloon_retrieve(bool prefer_highmem)
+static struct page *balloon_retrieve(bool require_lowmem)
 {
 	struct page *page;
 
 	if (list_empty(&ballooned_pages))
 		return NULL;
 
-	if (prefer_highmem)
-		page = list_entry(ballooned_pages.prev, struct page, lru);
-	else
-		page = list_entry(ballooned_pages.next, struct page, lru);
+	page = list_entry(ballooned_pages.next, struct page, lru);
+	if (require_lowmem && PageHighMem(page))
+		return NULL;
 	list_del(&page->lru);
 
 	if (PageHighMem(page))
@@ -166,6 +217,9 @@ static struct page *balloon_next_page(struct page *page)
 
 static enum bp_state update_schedule(enum bp_state state)
 {
+	if (state == BP_WAIT)
+		return BP_WAIT;
+
 	if (state == BP_ECANCELED)
 		return BP_ECANCELED;
 
@@ -193,43 +247,75 @@ static enum bp_state update_schedule(enum bp_state state)
 }
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-static long current_credit(void)
+static struct resource *additional_memory_resource(phys_addr_t size)
 {
-	return balloon_stats.target_pages - balloon_stats.current_pages -
-		balloon_stats.hotplug_pages;
+	struct resource *res;
+	int ret;
+
+	res = kzalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return NULL;
+
+	res->name = "System RAM";
+	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+
+	ret = allocate_resource(&iomem_resource, res,
+				size, 0, -1,
+				PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
+	if (ret < 0) {
+		pr_err("Cannot allocate new System RAM resource\n");
+		kfree(res);
+		return NULL;
+	}
+
+	return res;
 }
 
-static bool balloon_is_inflated(void)
+static void release_memory_resource(struct resource *resource)
 {
-	if (balloon_stats.balloon_low || balloon_stats.balloon_high ||
-			balloon_stats.balloon_hotplug)
-		return true;
-	else
-		return false;
-}
+	if (!resource)
+		return;
 
-/*
- * reserve_additional_memory() adds memory region of size >= credit above
- * max_pfn. New region is section aligned and size is modified to be multiple
- * of section size. Those features allow optimal use of address space and
- * establish proper alignment when this function is called first time after
- * boot (last section not fully populated at boot time contains unused memory
- * pages with PG_reserved bit not set; online_pages_range() does not allow page
- * onlining in whole range if first onlined page does not have PG_reserved
- * bit set). Real size of added memory is established at page onlining stage.
- */
+	/*
+	 * No need to reset the region to identity mapped since we now
+	 * know that no I/O can be in this region
+	 */
+	release_resource(resource);
+	kfree(resource);
+}
 
-static enum bp_state reserve_additional_memory(long credit)
+static enum bp_state reserve_additional_memory(void)
 {
+	long credit;
+	struct resource *resource;
 	int nid, rc;
-	u64 hotplug_start_paddr;
-	unsigned long balloon_hotplug = credit;
+	unsigned long balloon_hotplug;
+
+	credit = balloon_stats.target_pages + balloon_stats.target_unpopulated
+		- balloon_stats.total_pages;
+
+	/*
+	 * Already hotplugged enough pages?  Wait for them to be
+	 * onlined.
+	 */
+	if (credit <= 0)
+		return BP_WAIT;
 
-	hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn));
-	balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION);
-	nid = memory_add_physaddr_to_nid(hotplug_start_paddr);
+	balloon_hotplug = round_up(credit, PAGES_PER_SECTION);
+
+	resource = additional_memory_resource(balloon_hotplug * PAGE_SIZE);
+	if (!resource)
+		goto err;
+
+	nid = memory_add_physaddr_to_nid(resource->start);
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
+	/*
+	 * We don't support PV MMU when Linux and Xen are using
+	 * different page granularities.
+	 */
+	BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
         /*
          * add_memory() will build page tables for the new memory so
          * the p2m must contain invalid entries so the correct
@@ -242,29 +328,28 @@ static enum bp_state reserve_additional_memory(long credit)
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
 		unsigned long pfn, i;
 
-		pfn = PFN_DOWN(hotplug_start_paddr);
+		pfn = PFN_DOWN(resource->start);
 		for (i = 0; i < balloon_hotplug; i++) {
 			if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) {
 				pr_warn("set_phys_to_machine() failed, no memory added\n");
-				return BP_ECANCELED;
+				goto err;
 			}
                 }
 	}
 #endif
 
-	rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT);
-
+	rc = add_memory_resource(nid, resource);
 	if (rc) {
 		pr_warn("Cannot add additional memory (%i)\n", rc);
-		return BP_ECANCELED;
+		goto err;
 	}
 
-	balloon_hotplug -= credit;
+	balloon_stats.total_pages += balloon_hotplug;
 
-	balloon_stats.hotplug_pages += credit;
-	balloon_stats.balloon_hotplug = balloon_hotplug;
-
-	return BP_DONE;
+	return BP_WAIT;
+  err:
+	release_memory_resource(resource);
+	return BP_ECANCELED;
 }
 
 static void xen_online_page(struct page *page)
@@ -275,11 +360,6 @@ static void xen_online_page(struct page *page)
 
 	__balloon_append(page);
 
-	if (balloon_stats.hotplug_pages)
-		--balloon_stats.hotplug_pages;
-	else
-		--balloon_stats.balloon_hotplug;
-
 	mutex_unlock(&balloon_mutex);
 }
 
@@ -296,53 +376,34 @@ static struct notifier_block xen_memory_nb = {
 	.priority = 0
 };
 #else
-static long current_credit(void)
+static enum bp_state reserve_additional_memory(void)
 {
-	unsigned long target = balloon_stats.target_pages;
-
-	target = min(target,
-		     balloon_stats.current_pages +
-		     balloon_stats.balloon_low +
-		     balloon_stats.balloon_high);
-
-	return target - balloon_stats.current_pages;
+	balloon_stats.target_pages = balloon_stats.current_pages;
+	return BP_ECANCELED;
 }
+#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
 
-static bool balloon_is_inflated(void)
+static long current_credit(void)
 {
-	if (balloon_stats.balloon_low || balloon_stats.balloon_high)
-		return true;
-	else
-		return false;
+	return balloon_stats.target_pages - balloon_stats.current_pages;
 }
 
-static enum bp_state reserve_additional_memory(long credit)
+static bool balloon_is_inflated(void)
 {
-	balloon_stats.target_pages = balloon_stats.current_pages;
-	return BP_DONE;
+	return balloon_stats.balloon_low || balloon_stats.balloon_high;
 }
-#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */
 
 static enum bp_state increase_reservation(unsigned long nr_pages)
 {
 	int rc;
-	unsigned long  pfn, i;
+	unsigned long i;
 	struct page   *page;
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
-		.extent_order = 0,
+		.extent_order = EXTENT_ORDER,
 		.domid        = DOMID_SELF
 	};
 
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-	if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) {
-		nr_pages = min(nr_pages, balloon_stats.balloon_hotplug);
-		balloon_stats.hotplug_pages += nr_pages;
-		balloon_stats.balloon_hotplug -= nr_pages;
-		return BP_DONE;
-	}
-#endif
-
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
 
@@ -352,7 +413,11 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 			nr_pages = i;
 			break;
 		}
-		frame_list[i] = page_to_pfn(page);
+
+		/* XENMEM_populate_physmap requires a PFN based on Xen
+		 * granularity.
+		 */
+		frame_list[i] = page_to_xen_pfn(page);
 		page = balloon_next_page(page);
 	}
 
@@ -366,10 +431,16 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 		page = balloon_retrieve(false);
 		BUG_ON(page == NULL);
 
-		pfn = page_to_pfn(page);
-
 #ifdef CONFIG_XEN_HAVE_PVMMU
+		/*
+		 * We don't support PV MMU when Linux and Xen are using
+		 * different page granularities.
+		 */
+		BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			unsigned long pfn = page_to_pfn(page);
+
 			set_phys_to_machine(pfn, frame_list[i]);
 
 			/* Link back into the page tables if not highmem. */
@@ -396,23 +467,15 @@ static enum bp_state increase_reservation(unsigned long nr_pages)
 static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 {
 	enum bp_state state = BP_DONE;
-	unsigned long  pfn, i;
-	struct page   *page;
+	unsigned long i;
+	struct page *page, *tmp;
 	int ret;
 	struct xen_memory_reservation reservation = {
 		.address_bits = 0,
-		.extent_order = 0,
+		.extent_order = EXTENT_ORDER,
 		.domid        = DOMID_SELF
 	};
-
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-	if (balloon_stats.hotplug_pages) {
-		nr_pages = min(nr_pages, balloon_stats.hotplug_pages);
-		balloon_stats.hotplug_pages -= nr_pages;
-		balloon_stats.balloon_hotplug += nr_pages;
-		return BP_DONE;
-	}
-#endif
+	LIST_HEAD(pages);
 
 	if (nr_pages > ARRAY_SIZE(frame_list))
 		nr_pages = ARRAY_SIZE(frame_list);
@@ -425,8 +488,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 			break;
 		}
 		scrub_page(page);
-
-		frame_list[i] = page_to_pfn(page);
+		list_add(&page->lru, &pages);
 	}
 
 	/*
@@ -438,14 +500,25 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 	 */
 	kmap_flush_unused();
 
-	/* Update direct mapping, invalidate P2M, and add to balloon. */
-	for (i = 0; i < nr_pages; i++) {
-		pfn = frame_list[i];
-		frame_list[i] = pfn_to_gfn(pfn);
-		page = pfn_to_page(pfn);
+	/*
+	 * Set up the frame, update the direct mapping, invalidate the P2M,
+	 * and add to balloon.
+	 */
+	i = 0;
+	list_for_each_entry_safe(page, tmp, &pages, lru) {
+		/* XENMEM_decrease_reservation requires a GFN */
+		frame_list[i++] = xen_page_to_gfn(page);
 
 #ifdef CONFIG_XEN_HAVE_PVMMU
+		/*
+		 * We don't support PV MMU when Linux and Xen are using
+		 * different page granularities.
+		 */
+		BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
 		if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+			unsigned long pfn = page_to_pfn(page);
+
 			if (!PageHighMem(page)) {
 				ret = HYPERVISOR_update_va_mapping(
 						(unsigned long)__va(pfn << PAGE_SHIFT),
@@ -455,6 +528,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp)
 			__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
 		}
 #endif
+		list_del(&page->lru);
 
 		balloon_append(page);
 	}
@@ -492,7 +566,7 @@ static void balloon_process(struct work_struct *work)
 			if (balloon_is_inflated())
 				state = increase_reservation(credit);
 			else
-				state = reserve_additional_memory(credit);
+				state = reserve_additional_memory();
 		}
 
 		if (credit < 0)
@@ -520,41 +594,71 @@ void balloon_set_new_target(unsigned long target)
 }
 EXPORT_SYMBOL_GPL(balloon_set_new_target);
 
+static int add_ballooned_pages(int nr_pages)
+{
+	enum bp_state st;
+
+	if (xen_hotplug_unpopulated) {
+		st = reserve_additional_memory();
+		if (st != BP_ECANCELED) {
+			mutex_unlock(&balloon_mutex);
+			wait_event(balloon_wq,
+				   !list_empty(&ballooned_pages));
+			mutex_lock(&balloon_mutex);
+			return 0;
+		}
+	}
+
+	st = decrease_reservation(nr_pages, GFP_USER);
+	if (st != BP_DONE)
+		return -ENOMEM;
+
+	return 0;
+}
+
 /**
  * alloc_xenballooned_pages - get pages that have been ballooned out
  * @nr_pages: Number of pages to get
  * @pages: pages returned
- * @highmem: allow highmem pages
  * @return 0 on success, error otherwise
  */
-int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem)
+int alloc_xenballooned_pages(int nr_pages, struct page **pages)
 {
 	int pgno = 0;
 	struct page *page;
+	int ret;
+
 	mutex_lock(&balloon_mutex);
+
+	balloon_stats.target_unpopulated += nr_pages;
+
 	while (pgno < nr_pages) {
-		page = balloon_retrieve(highmem);
-		if (page && (highmem || !PageHighMem(page))) {
+		page = balloon_retrieve(true);
+		if (page) {
 			pages[pgno++] = page;
+#ifdef CONFIG_XEN_HAVE_PVMMU
+			/*
+			 * We don't support PV MMU when Linux and Xen are using
+			 * different page granularities.
+			 */
+			BUILD_BUG_ON(XEN_PAGE_SIZE != PAGE_SIZE);
+
+			ret = xen_alloc_p2m_entry(page_to_pfn(page));
+			if (ret < 0)
+				goto out_undo;
+#endif
 		} else {
-			enum bp_state st;
-			if (page)
-				balloon_append(page);
-			st = decrease_reservation(nr_pages - pgno,
-					highmem ? GFP_HIGHUSER : GFP_USER);
-			if (st != BP_DONE)
+			ret = add_ballooned_pages(nr_pages - pgno);
+			if (ret < 0)
 				goto out_undo;
 		}
 	}
 	mutex_unlock(&balloon_mutex);
 	return 0;
  out_undo:
-	while (pgno)
-		balloon_append(pages[--pgno]);
-	/* Free the memory back to the kernel soon */
-	schedule_delayed_work(&balloon_worker, 0);
 	mutex_unlock(&balloon_mutex);
-	return -ENOMEM;
+	free_xenballooned_pages(pgno, pages);
+	return ret;
 }
 EXPORT_SYMBOL(alloc_xenballooned_pages);
 
@@ -574,6 +678,8 @@ void free_xenballooned_pages(int nr_pages, struct page **pages)
 			balloon_append(pages[i]);
 	}
 
+	balloon_stats.target_unpopulated -= nr_pages;
+
 	/* The balloon may be too large now. Shrink it if needed. */
 	if (current_credit())
 		schedule_delayed_work(&balloon_worker, 0);
@@ -602,6 +708,8 @@ static void __init balloon_add_region(unsigned long start_pfn,
 		   don't subtract from it. */
 		__balloon_append(page);
 	}
+
+	balloon_stats.total_pages += extra_pfn_end - start_pfn;
 }
 
 static int __init balloon_init(void)
@@ -619,6 +727,7 @@ static int __init balloon_init(void)
 	balloon_stats.target_pages  = balloon_stats.current_pages;
 	balloon_stats.balloon_low   = 0;
 	balloon_stats.balloon_high  = 0;
+	balloon_stats.total_pages   = balloon_stats.current_pages;
 
 	balloon_stats.schedule_delay = 1;
 	balloon_stats.max_schedule_delay = 32;
@@ -626,11 +735,9 @@ static int __init balloon_init(void)
 	balloon_stats.max_retry_count = RETRY_UNLIMITED;
 
 #ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-	balloon_stats.hotplug_pages = 0;
-	balloon_stats.balloon_hotplug = 0;
-
 	set_online_page_callback(&xen_online_page);
 	register_memory_notifier(&xen_memory_nb);
+	register_sysctl_table(xen_root);
 #endif
 
 	/*

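Two user-visible consequences of the balloon rework: alloc_xenballooned_pages() loses its highmem flag, and writing 1 to the new /proc/sys/xen/balloon/hotplug_unpopulated knob makes requests for ballooned pages hotplug unpopulated memory sections (sleeping on balloon_wq until they are onlined) instead of ballooning out existing RAM. A hedged, non-authoritative sketch of a caller against the new two-argument signature:

#include <linux/mm.h>
#include <xen/balloon.h>

/* Sketch only: take four ballooned pages with the new API. With
 * xen.balloon.hotplug_unpopulated = 1 this may sleep on balloon_wq
 * until hotplugged memory has been onlined. */
static int demo_use_ballooned_pages(void)
{
	struct page *pages[4];
	int ret;

	ret = alloc_xenballooned_pages(4, pages);
	if (ret < 0)
		return ret;

	/* ... back foreign grant mappings with these pages ... */

	free_xenballooned_pages(4, pages);
	return 0;
}
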
+ 8 - 0
drivers/xen/biomerge.c

@@ -6,10 +6,18 @@
 bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
 			       const struct bio_vec *vec2)
 {
+#if XEN_PAGE_SIZE == PAGE_SIZE
 	unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page));
 	unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page));
 
 	return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&
 		((bfn1 == bfn2) || ((bfn1+1) == bfn2));
+#else
+	/*
+	 * XXX: Add support for merging bio_vec when Xen and Linux use
+	 * different page sizes.
+	 */
+	return 0;
+#endif
 }
 EXPORT_SYMBOL(xen_biovec_phys_mergeable);

+ 11 - 3
drivers/xen/cpu_hotplug.c

@@ -11,15 +11,20 @@
 static void enable_hotplug_cpu(int cpu)
 {
 	if (!cpu_present(cpu))
-		arch_register_cpu(cpu);
+		xen_arch_register_cpu(cpu);
 
 	set_cpu_present(cpu, true);
 }
 
 static void disable_hotplug_cpu(int cpu)
 {
+	if (cpu_online(cpu)) {
+		lock_device_hotplug();
+		device_offline(get_cpu_device(cpu));
+		unlock_device_hotplug();
+	}
 	if (cpu_present(cpu))
-		arch_unregister_cpu(cpu);
+		xen_arch_unregister_cpu(cpu);
 
 	set_cpu_present(cpu, false);
 }
@@ -55,7 +60,6 @@ static void vcpu_hotplug(unsigned int cpu)
 		enable_hotplug_cpu(cpu);
 		break;
 	case 0:
-		(void)cpu_down(cpu);
 		disable_hotplug_cpu(cpu);
 		break;
 	default:
@@ -102,7 +106,11 @@ static int __init setup_vcpu_hotplug_event(void)
 	static struct notifier_block xsn_cpu = {
 		.notifier_call = setup_cpu_watcher };
 
+#ifdef CONFIG_X86
 	if (!xen_pv_domain())
+#else
+	if (!xen_domain())
+#endif
 		return -ENODEV;
 
 	register_xenstore_notifier(&xsn_cpu);

+ 1 - 1
drivers/xen/events/events_base.c

@@ -40,11 +40,11 @@
 #include <asm/idle.h>
 #include <asm/io_apic.h>
 #include <asm/xen/pci.h>
-#include <xen/page.h>
 #endif
 #include <asm/sync_bitops.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
+#include <xen/page.h>
 
 #include <xen/xen.h>
 #include <xen/hvm.h>

+ 1 - 1
drivers/xen/events/events_fifo.c

@@ -54,7 +54,7 @@
 
 #include "events_internal.h"
 
-#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t))
+#define EVENT_WORDS_PER_PAGE (XEN_PAGE_SIZE / sizeof(event_word_t))
 #define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE)
 
 struct evtchn_fifo_queue {

+ 52 - 4
drivers/xen/grant-table.c

@@ -642,7 +642,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
 	if (xen_auto_xlat_grant_frames.count)
 		return -EINVAL;
 
-	vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes);
+	vaddr = xen_remap(addr, XEN_PAGE_SIZE * max_nr_gframes);
 	if (vaddr == NULL) {
 		pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n",
 			&addr);
@@ -654,7 +654,7 @@ int gnttab_setup_auto_xlat_frames(phys_addr_t addr)
 		return -ENOMEM;
 	}
 	for (i = 0; i < max_nr_gframes; i++)
-		pfn[i] = PFN_DOWN(addr) + i;
+		pfn[i] = XEN_PFN_DOWN(addr) + i;
 
 	xen_auto_xlat_grant_frames.vaddr = vaddr;
 	xen_auto_xlat_grant_frames.pfn = pfn;
@@ -687,7 +687,7 @@ int gnttab_alloc_pages(int nr_pages, struct page **pages)
 	int i;
 	int ret;
 
-	ret = alloc_xenballooned_pages(nr_pages, pages, false);
+	ret = alloc_xenballooned_pages(nr_pages, pages);
 	if (ret < 0)
 		return ret;
 
@@ -776,6 +776,54 @@ void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count)
 }
 EXPORT_SYMBOL_GPL(gnttab_batch_copy);
 
+void gnttab_foreach_grant_in_range(struct page *page,
+				   unsigned int offset,
+				   unsigned int len,
+				   xen_grant_fn_t fn,
+				   void *data)
+{
+	unsigned int goffset;
+	unsigned int glen;
+	unsigned long xen_pfn;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, len);
+	goffset = xen_offset_in_page(offset);
+
+	xen_pfn = page_to_xen_pfn(page) + XEN_PFN_DOWN(offset);
+
+	while (len) {
+		glen = min_t(unsigned int, XEN_PAGE_SIZE - goffset, len);
+		fn(pfn_to_gfn(xen_pfn), goffset, glen, data);
+
+		goffset = 0;
+		xen_pfn++;
+		len -= glen;
+	}
+}
+EXPORT_SYMBOL_GPL(gnttab_foreach_grant_in_range);
+
+void gnttab_foreach_grant(struct page **pages,
+			  unsigned int nr_grefs,
+			  xen_grant_fn_t fn,
+			  void *data)
+{
+	unsigned int goffset = 0;
+	unsigned long xen_pfn = 0;
+	unsigned int i;
+
+	for (i = 0; i < nr_grefs; i++) {
+		if ((i % XEN_PFN_PER_PAGE) == 0) {
+			xen_pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
+			goffset = 0;
+		}
+
+		fn(pfn_to_gfn(xen_pfn), goffset, XEN_PAGE_SIZE, data);
+
+		goffset += XEN_PAGE_SIZE;
+		xen_pfn++;
+	}
+}
+
 int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops,
 		    struct gnttab_map_grant_ref *kmap_ops,
 		    struct page **pages, unsigned int count)
@@ -978,7 +1026,7 @@ static void gnttab_request_version(void)
 {
 	/* Only version 1 is used, which will always be available. */
 	grant_table_version = 1;
-	grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1);
+	grefs_per_grant_frame = XEN_PAGE_SIZE / sizeof(struct grant_entry_v1);
 	gnttab_interface = &gnttab_v1_ops;
 
 	pr_info("Grant tables using version %d layout\n", grant_table_version);

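gnttab_foreach_grant(), the array-of-pages variant added above and used by xenbus below, fires the callback once per full 4KB grant and steps to the next Linux page every XEN_PFN_PER_PAGE grants. A standalone sketch of the iteration order, with 64KB pages assumed so that XEN_PFN_PER_PAGE = 16:

#include <stdio.h>

#define XEN_PAGE_SIZE    4096u
#define XEN_PFN_PER_PAGE 16u	/* assumed: 64KB Linux page / 4KB grant */

/* stand-in for page_to_xen_pfn(): in this toy model each "page" handle
 * owns 16 consecutive 4KB frames */
static unsigned long page_to_xen_pfn(unsigned long page_handle)
{
	return page_handle * XEN_PFN_PER_PAGE;
}

int main(void)
{
	unsigned long pages[2] = { 7, 42 };	/* toy page handles */
	unsigned int nr_grefs = 20;		/* spills into the 2nd page */
	unsigned long xen_pfn = 0;
	unsigned int goffset = 0, i;

	/* the same stepping gnttab_foreach_grant() performs */
	for (i = 0; i < nr_grefs; i++) {
		if ((i % XEN_PFN_PER_PAGE) == 0) {
			xen_pfn = page_to_xen_pfn(pages[i / XEN_PFN_PER_PAGE]);
			goffset = 0;
		}
		printf("grant %2u: xen_pfn=%3lu goffset=%5u\n",
		       i, xen_pfn, goffset);
		goffset += XEN_PAGE_SIZE;
		xen_pfn++;
	}
	return 0;
}
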
+ 6 - 4
drivers/xen/privcmd.c

@@ -401,7 +401,7 @@ static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs)
 	if (pages == NULL)
 		return -ENOMEM;
 
-	rc = alloc_xenballooned_pages(numpgs, pages, 0);
+	rc = alloc_xenballooned_pages(numpgs, pages);
 	if (rc != 0) {
 		pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__,
 			numpgs, rc);
@@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 		return -EINVAL;
 	}
 
-	nr_pages = m.num;
+	nr_pages = DIV_ROUND_UP(m.num, XEN_PFN_PER_PAGE);
 	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
 		return -EINVAL;
 
@@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 			goto out_unlock;
 		}
 		if (xen_feature(XENFEAT_auto_translated_physmap)) {
-			ret = alloc_empty_pages(vma, m.num);
+			ret = alloc_empty_pages(vma, nr_pages);
 			if (ret < 0)
 				goto out_unlock;
 		} else
@@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	state.global_error  = 0;
 	state.version       = version;
 
+	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
 	/* mmap_batch_fn guarantees ret == 0 */
 	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
 				    &pagelist, mmap_batch_fn, &state));
@@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma)
 {
 	struct page **pages = vma->vm_private_data;
 	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+	int numgfns = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
 	int rc;
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
 		return;
 
-	rc = xen_unmap_domain_gfn_range(vma, numpgs, pages);
+	rc = xen_unmap_domain_gfn_range(vma, numgfns, pages);
 	if (rc == 0)
 		free_xenballooned_pages(numpgs, pages);
 	else

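privcmd now distinguishes the two units explicitly: m.num and VMA offsets count 4KB GFNs, while ballooned backing memory counts Linux pages. The conversions in both directions, with a 64KB PAGE_SIZE assumed for illustration:

#include <stdio.h>

#define PAGE_SHIFT       16	/* assumed: 64KB Linux pages */
#define XEN_PAGE_SHIFT   12	/* 4KB Xen frames */
#define XEN_PFN_PER_PAGE (1u << (PAGE_SHIFT - XEN_PAGE_SHIFT))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long vm_start = 0x100000, vm_end = 0x200000; /* 1MB VMA */
	unsigned int m_num = 100;	/* GFNs requested by userspace */

	/* privcmd_ioctl_mmap_batch(): pages needed to back m.num GFNs */
	printf("nr_pages = %u\n", DIV_ROUND_UP(m_num, XEN_PFN_PER_PAGE));

	/* privcmd_close(): the same VMA measured in both units */
	printf("numpgs   = %lu\n", (vm_end - vm_start) >> PAGE_SHIFT);
	printf("numgfns  = %lu\n", (vm_end - vm_start) >> XEN_PAGE_SHIFT);
	return 0;
}
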
+ 21 - 22
drivers/xen/swiotlb-xen.c

@@ -76,27 +76,27 @@ static unsigned long xen_io_tlb_nslabs;
 static u64 start_dma_addr;
 
 /*
- * Both of these functions should avoid PFN_PHYS because phys_addr_t
+ * Both of these functions should avoid XEN_PFN_PHYS because phys_addr_t
  * can be 32bit when dma_addr_t is 64bit leading to a loss in
  * information if the shift is done before casting to 64bit.
  */
 static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr)
 {
-	unsigned long bfn = pfn_to_bfn(PFN_DOWN(paddr));
-	dma_addr_t dma = (dma_addr_t)bfn << PAGE_SHIFT;
+	unsigned long bfn = pfn_to_bfn(XEN_PFN_DOWN(paddr));
+	dma_addr_t dma = (dma_addr_t)bfn << XEN_PAGE_SHIFT;
 
-	dma |= paddr & ~PAGE_MASK;
+	dma |= paddr & ~XEN_PAGE_MASK;
 
 	return dma;
 }
 
 static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr)
 {
-	unsigned long pfn = bfn_to_pfn(PFN_DOWN(baddr));
-	dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT;
+	unsigned long xen_pfn = bfn_to_pfn(XEN_PFN_DOWN(baddr));
+	dma_addr_t dma = (dma_addr_t)xen_pfn << XEN_PAGE_SHIFT;
 	phys_addr_t paddr = dma;
 
-	paddr |= baddr & ~PAGE_MASK;
+	paddr |= baddr & ~XEN_PAGE_MASK;
 
 	return paddr;
 }
@@ -106,7 +106,7 @@ static inline dma_addr_t xen_virt_to_bus(void *address)
 	return xen_phys_to_bus(virt_to_phys(address));
 }
 
-static int check_pages_physically_contiguous(unsigned long pfn,
+static int check_pages_physically_contiguous(unsigned long xen_pfn,
 					     unsigned int offset,
 					     size_t length)
 {
@@ -114,11 +114,11 @@ static int check_pages_physically_contiguous(unsigned long pfn,
 	int i;
 	int nr_pages;
 
-	next_bfn = pfn_to_bfn(pfn);
-	nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT;
+	next_bfn = pfn_to_bfn(xen_pfn);
+	nr_pages = (offset + length + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT;
 
 	for (i = 1; i < nr_pages; i++) {
-		if (pfn_to_bfn(++pfn) != ++next_bfn)
+		if (pfn_to_bfn(++xen_pfn) != ++next_bfn)
 			return 0;
 	}
 	return 1;
@@ -126,28 +126,27 @@ static int check_pages_physically_contiguous(unsigned long pfn,
 
 static inline int range_straddles_page_boundary(phys_addr_t p, size_t size)
 {
-	unsigned long pfn = PFN_DOWN(p);
-	unsigned int offset = p & ~PAGE_MASK;
+	unsigned long xen_pfn = XEN_PFN_DOWN(p);
+	unsigned int offset = p & ~XEN_PAGE_MASK;
 
-	if (offset + size <= PAGE_SIZE)
+	if (offset + size <= XEN_PAGE_SIZE)
 		return 0;
-	if (check_pages_physically_contiguous(pfn, offset, size))
+	if (check_pages_physically_contiguous(xen_pfn, offset, size))
 		return 0;
 	return 1;
 }
 
 static int is_xen_swiotlb_buffer(dma_addr_t dma_addr)
 {
-	unsigned long bfn = PFN_DOWN(dma_addr);
-	unsigned long pfn = bfn_to_local_pfn(bfn);
-	phys_addr_t paddr;
+	unsigned long bfn = XEN_PFN_DOWN(dma_addr);
+	unsigned long xen_pfn = bfn_to_local_pfn(bfn);
+	phys_addr_t paddr = XEN_PFN_PHYS(xen_pfn);
 
 	/* If the address is outside our domain, it CAN
 	 * have the same virtual address as another address
 	 * in our domain. Therefore _only_ check address within our domain.
 	 */
-	if (pfn_valid(pfn)) {
-		paddr = PFN_PHYS(pfn);
+	if (pfn_valid(PFN_DOWN(paddr))) {
 		return paddr >= virt_to_phys(xen_io_tlb_start) &&
 		       paddr < virt_to_phys(xen_io_tlb_end);
 	}
@@ -392,7 +391,7 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page,
 	 */
 	if (dma_capable(dev, dev_addr, size) &&
 	    !range_straddles_page_boundary(phys, size) &&
-		!xen_arch_need_swiotlb(dev, PFN_DOWN(phys), PFN_DOWN(dev_addr)) &&
+		!xen_arch_need_swiotlb(dev, phys, dev_addr) &&
 		!swiotlb_force) {
 		/* we are not interested in the dma_addr returned by
 		 * xen_dma_map_page, only in the potential cache flushes executed
@@ -551,7 +550,7 @@ xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 		dma_addr_t dev_addr = xen_phys_to_bus(paddr);
 
 		if (swiotlb_force ||
-		    xen_arch_need_swiotlb(hwdev, PFN_DOWN(paddr), PFN_DOWN(dev_addr)) ||
+		    xen_arch_need_swiotlb(hwdev, paddr, dev_addr) ||
 		    !dma_capable(hwdev, dev_addr, sg->length) ||
 		    range_straddles_page_boundary(paddr, sg->length)) {
 			phys_addr_t map = swiotlb_tbl_map_single(hwdev,

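The phys/bus helpers now translate in 4KB frame units regardless of PAGE_SIZE, and the comment above about casting before shifting still applies. A standalone sketch of the forward translation, with an illustrative pfn_to_bfn() standing in for the real p2m lookup:

#include <stdio.h>
#include <stdint.h>

#define XEN_PAGE_SHIFT 12
#define XEN_PAGE_SIZE  (1u << XEN_PAGE_SHIFT)
#define XEN_PAGE_MASK  (~((uint64_t)XEN_PAGE_SIZE - 1))

/* illustrative p2m: pretend every 4KB frame maps 0x1000 frames higher */
static uint64_t pfn_to_bfn(uint64_t xen_pfn)
{
	return xen_pfn + 0x1000;
}

/* Mirrors xen_phys_to_bus(): the shift happens after widening to 64 bits
 * so a 32-bit phys_addr_t cannot truncate the bus address. */
static uint64_t xen_phys_to_bus(uint64_t paddr)
{
	uint64_t bfn = pfn_to_bfn(paddr >> XEN_PAGE_SHIFT);
	uint64_t dma = bfn << XEN_PAGE_SHIFT;

	return dma | (paddr & ~XEN_PAGE_MASK);
}

int main(void)
{
	uint64_t paddr = 0x12345678;

	printf("bus = 0x%llx\n", (unsigned long long)xen_phys_to_bus(paddr));
	return 0;
}
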
+ 87 - 41
drivers/xen/xenbus/xenbus_client.c

@@ -49,6 +49,10 @@
 
 #include "xenbus_probe.h"
 
+#define XENBUS_PAGES(_grants)	(DIV_ROUND_UP(_grants, XEN_PFN_PER_PAGE))
+
+#define XENBUS_MAX_RING_PAGES	(XENBUS_PAGES(XENBUS_MAX_RING_GRANTS))
+
 struct xenbus_map_node {
 	struct list_head next;
 	union {
@@ -57,10 +61,11 @@ struct xenbus_map_node {
 		} pv;
 		struct {
 			struct page *pages[XENBUS_MAX_RING_PAGES];
+			unsigned long addrs[XENBUS_MAX_RING_GRANTS];
 			void *addr;
 		} hvm;
 	};
-	grant_handle_t handles[XENBUS_MAX_RING_PAGES];
+	grant_handle_t handles[XENBUS_MAX_RING_GRANTS];
 	unsigned int   nr_handles;
 };
 
@@ -388,7 +393,7 @@ int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
 		}
 		grefs[i] = err;
 
-		vaddr = vaddr + PAGE_SIZE;
+		vaddr = vaddr + XEN_PAGE_SIZE;
 	}
 
 	return 0;
@@ -479,12 +484,12 @@ static int __xenbus_map_ring(struct xenbus_device *dev,
 			     unsigned int flags,
 			     bool *leaked)
 {
-	struct gnttab_map_grant_ref map[XENBUS_MAX_RING_PAGES];
-	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES];
+	struct gnttab_map_grant_ref map[XENBUS_MAX_RING_GRANTS];
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
 	int i, j;
 	int err = GNTST_okay;
 
-	if (nr_grefs > XENBUS_MAX_RING_PAGES)
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
 		return -EINVAL;
 
 	for (i = 0; i < nr_grefs; i++) {
@@ -540,22 +545,22 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
 {
 	struct xenbus_map_node *node;
 	struct vm_struct *area;
-	pte_t *ptes[XENBUS_MAX_RING_PAGES];
-	phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES];
+	pte_t *ptes[XENBUS_MAX_RING_GRANTS];
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
 	int err = GNTST_okay;
 	int i;
 	bool leaked;
 
 	*vaddr = NULL;
 
-	if (nr_grefs > XENBUS_MAX_RING_PAGES)
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
 		return -EINVAL;
 
 	node = kzalloc(sizeof(*node), GFP_KERNEL);
 	if (!node)
 		return -ENOMEM;
 
-	area = alloc_vm_area(PAGE_SIZE * nr_grefs, ptes);
+	area = alloc_vm_area(XEN_PAGE_SIZE * nr_grefs, ptes);
 	if (!area) {
 		kfree(node);
 		return -ENOMEM;
@@ -591,21 +596,44 @@ failed:
 	return err;
 }
 
+struct map_ring_valloc_hvm
+{
+	unsigned int idx;
+
+	/* Why do we need two arrays? See the comment in __xenbus_map_ring */
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
+	unsigned long addrs[XENBUS_MAX_RING_GRANTS];
+};
+
+static void xenbus_map_ring_setup_grant_hvm(unsigned long gfn,
+					    unsigned int goffset,
+					    unsigned int len,
+					    void *data)
+{
+	struct map_ring_valloc_hvm *info = data;
+	unsigned long vaddr = (unsigned long)gfn_to_virt(gfn);
+
+	info->phys_addrs[info->idx] = vaddr;
+	info->addrs[info->idx] = vaddr;
+
+	info->idx++;
+}
+
 static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
 				      grant_ref_t *gnt_ref,
 				      unsigned int nr_grefs,
 				      void **vaddr)
 {
 	struct xenbus_map_node *node;
-	int i;
 	int err;
 	void *addr;
 	bool leaked = false;
-	/* Why do we need two arrays? See comment of __xenbus_map_ring */
-	phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES];
-	unsigned long addrs[XENBUS_MAX_RING_PAGES];
+	struct map_ring_valloc_hvm info = {
+		.idx = 0,
+	};
+	unsigned int nr_pages = XENBUS_PAGES(nr_grefs);
 
-	if (nr_grefs > XENBUS_MAX_RING_PAGES)
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
 		return -EINVAL;
 
 	*vaddr = NULL;
@@ -614,25 +642,22 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
 	if (!node)
 		return -ENOMEM;
 
-	err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages,
-				       false /* lowmem */);
+	err = alloc_xenballooned_pages(nr_pages, node->hvm.pages);
 	if (err)
 		goto out_err;
 
-	for (i = 0; i < nr_grefs; i++) {
-		unsigned long pfn = page_to_pfn(node->hvm.pages[i]);
-		phys_addrs[i] = (unsigned long)pfn_to_kaddr(pfn);
-		addrs[i] = (unsigned long)pfn_to_kaddr(pfn);
-	}
+	gnttab_foreach_grant(node->hvm.pages, nr_grefs,
+			     xenbus_map_ring_setup_grant_hvm,
+			     &info);
 
 	err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles,
-				phys_addrs, GNTMAP_host_map, &leaked);
+				info.phys_addrs, GNTMAP_host_map, &leaked);
 	node->nr_handles = nr_grefs;
 
 	if (err)
 		goto out_free_ballooned_pages;
 
-	addr = vmap(node->hvm.pages, nr_grefs, VM_MAP | VM_IOREMAP,
+	addr = vmap(node->hvm.pages, nr_pages, VM_MAP | VM_IOREMAP,
 		    PAGE_KERNEL);
 	if (!addr) {
 		err = -ENOMEM;
@@ -650,14 +675,13 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
 
  out_xenbus_unmap_ring:
 	if (!leaked)
-		xenbus_unmap_ring(dev, node->handles, node->nr_handles,
-				  addrs);
+		xenbus_unmap_ring(dev, node->handles, nr_grefs, info.addrs);
 	else
 		pr_alert("leaking %p size %u page(s)",
-			 addr, nr_grefs);
+			 addr, nr_pages);
  out_free_ballooned_pages:
 	if (!leaked)
-		free_xenballooned_pages(nr_grefs, node->hvm.pages);
+		free_xenballooned_pages(nr_pages, node->hvm.pages);
  out_err:
 	kfree(node);
 	return err;
@@ -687,10 +711,10 @@ int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs,
 		    unsigned int nr_grefs, grant_handle_t *handles,
 		    unsigned long *vaddrs, bool *leaked)
 {
-	phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES];
+	phys_addr_t phys_addrs[XENBUS_MAX_RING_GRANTS];
 	int i;
 
-	if (nr_grefs > XENBUS_MAX_RING_PAGES)
+	if (nr_grefs > XENBUS_MAX_RING_GRANTS)
 		return -EINVAL;
 
 	for (i = 0; i < nr_grefs; i++)
@@ -723,7 +747,7 @@ EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
 static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
 {
 	struct xenbus_map_node *node;
-	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES];
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
 	unsigned int level;
 	int i;
 	bool leaked = false;
@@ -750,7 +774,7 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
 		unsigned long addr;
 
 		memset(&unmap[i], 0, sizeof(unmap[i]));
-		addr = (unsigned long)vaddr + (PAGE_SIZE * i);
+		addr = (unsigned long)vaddr + (XEN_PAGE_SIZE * i);
 		unmap[i].host_addr = arbitrary_virt_to_machine(
 			lookup_address(addr, &level)).maddr;
 		unmap[i].dev_bus_addr = 0;
@@ -783,13 +807,33 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
 	return err;
 }
 
+struct unmap_ring_vfree_hvm
+{
+	unsigned int idx;
+	unsigned long addrs[XENBUS_MAX_RING_GRANTS];
+};
+
+static void xenbus_unmap_ring_setup_grant_hvm(unsigned long gfn,
+					      unsigned int goffset,
+					      unsigned int len,
+					      void *data)
+{
+	struct unmap_ring_vfree_hvm *info = data;
+
+	info->addrs[info->idx] = (unsigned long)gfn_to_virt(gfn);
+
+	info->idx++;
+}
+
 static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
 {
 	int rv;
 	struct xenbus_map_node *node;
 	void *addr;
-	unsigned long addrs[XENBUS_MAX_RING_PAGES];
-	int i;
+	struct unmap_ring_vfree_hvm info = {
+		.idx = 0,
+	};
+	unsigned int nr_pages;
 
 	spin_lock(&xenbus_valloc_lock);
 	list_for_each_entry(node, &xenbus_valloc_pages, next) {
@@ -809,18 +853,20 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
 		return GNTST_bad_virt_addr;
 	}
 
-	for (i = 0; i < node->nr_handles; i++)
-		addrs[i] = (unsigned long)pfn_to_kaddr(page_to_pfn(node->hvm.pages[i]));
+	nr_pages = XENBUS_PAGES(node->nr_handles);
+
+	gnttab_foreach_grant(node->hvm.pages, node->nr_handles,
+			     xenbus_unmap_ring_setup_grant_hvm,
+			     &info);
 
 	rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles,
-			       addrs);
+			       info.addrs);
 	if (!rv) {
 		vunmap(vaddr);
-		free_xenballooned_pages(node->nr_handles, node->hvm.pages);
+		free_xenballooned_pages(nr_pages, node->hvm.pages);
 	}
 	else
-		WARN(1, "Leaking %p, size %u page(s)\n", vaddr,
-		     node->nr_handles);
+		WARN(1, "Leaking %p, size %u page(s)\n", vaddr, nr_pages);
 
 	kfree(node);
 	return rv;
@@ -841,11 +887,11 @@ int xenbus_unmap_ring(struct xenbus_device *dev,
 		      grant_handle_t *handles, unsigned int nr_handles,
 		      unsigned long *vaddrs)
 {
-	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES];
+	struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_GRANTS];
 	int i;
 	int err;
 
-	if (nr_handles > XENBUS_MAX_RING_PAGES)
+	if (nr_handles > XENBUS_MAX_RING_GRANTS)
 		return -EINVAL;
 
 	for (i = 0; i < nr_handles; i++)

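XENBUS_PAGES() converts a grant count into the number of Linux pages needed to back it: on a 4KB kernel the two are equal, while on a 64KB kernel one page holds 16 grants. A standalone check of the arithmetic (XEN_PFN_PER_PAGE = 16 assumed; the value of XENBUS_MAX_RING_GRANTS is defined in xenbus.h, outside this hunk):

#include <stdio.h>

#define XEN_PFN_PER_PAGE 16u	/* assumed: 64KB PAGE_SIZE / 4KB grants */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define XENBUS_PAGES(grants) DIV_ROUND_UP(grants, XEN_PFN_PER_PAGE)

int main(void)
{
	printf("pages for  8 grants: %u\n", XENBUS_PAGES(8u));	/* 1 */
	printf("pages for 32 grants: %u\n", XENBUS_PAGES(32u));	/* 2 */
	return 0;
}
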
+ 2 - 1
drivers/xen/xenbus/xenbus_probe.c

@@ -802,7 +802,8 @@ static int __init xenbus_init(void)
 			goto out_error;
 		xen_store_gfn = (unsigned long)v;
 		xen_store_interface =
-			xen_remap(xen_store_gfn << PAGE_SHIFT, PAGE_SIZE);
+			xen_remap(xen_store_gfn << XEN_PAGE_SHIFT,
+				  XEN_PAGE_SIZE);
 		break;
 	default:
 		pr_warn("Xenstore state unknown\n");

+ 84 - 40
drivers/xen/xlate_mmu.c

@@ -38,31 +38,28 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/memory.h>
 
-/* map fgfn of domid to lpfn in the current domain */
-static int map_foreign_page(unsigned long lpfn, unsigned long fgfn,
-			    unsigned int domid)
-{
-	int rc;
-	struct xen_add_to_physmap_range xatp = {
-		.domid = DOMID_SELF,
-		.foreign_domid = domid,
-		.size = 1,
-		.space = XENMAPSPACE_gmfn_foreign,
-	};
-	xen_ulong_t idx = fgfn;
-	xen_pfn_t gpfn = lpfn;
-	int err = 0;
+typedef void (*xen_gfn_fn_t)(unsigned long gfn, void *data);
 
-	set_xen_guest_handle(xatp.idxs, &idx);
-	set_xen_guest_handle(xatp.gpfns, &gpfn);
-	set_xen_guest_handle(xatp.errs, &err);
+/* Break down the pages into 4KB chunks and call fn for each gfn */
+static void xen_for_each_gfn(struct page **pages, unsigned nr_gfn,
+			     xen_gfn_fn_t fn, void *data)
+{
+	unsigned long xen_pfn = 0;
+	struct page *page;
+	int i;
 
-	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
-	return rc < 0 ? rc : err;
+	for (i = 0; i < nr_gfn; i++) {
+		if ((i % XEN_PFN_PER_PAGE) == 0) {
+			page = pages[i / XEN_PFN_PER_PAGE];
+			xen_pfn = page_to_xen_pfn(page);
+		}
+		fn(pfn_to_gfn(xen_pfn++), data);
+	}
 }
 
 struct remap_data {
 	xen_pfn_t *fgfn; /* foreign domain's gfn */
+	int nr_fgfn; /* Number of foreign gfn left to map */
 	pgprot_t prot;
 	domid_t  domid;
 	struct vm_area_struct *vma;
@@ -71,24 +68,71 @@ struct remap_data {
 	struct xen_remap_gfn_info *info;
 	int *err_ptr;
 	int mapped;
+
+	/* Hypercall parameters */
+	int h_errs[XEN_PFN_PER_PAGE];
+	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
+	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
+
+	int h_iter;	/* Iterator */
 };
 
+static void setup_hparams(unsigned long gfn, void *data)
+{
+	struct remap_data *info = data;
+
+	info->h_idxs[info->h_iter] = *info->fgfn;
+	info->h_gpfns[info->h_iter] = gfn;
+	info->h_errs[info->h_iter] = 0;
+
+	info->h_iter++;
+	info->fgfn++;
+}
+
 static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
 			void *data)
 {
 	struct remap_data *info = data;
 	struct page *page = info->pages[info->index++];
-	unsigned long pfn = page_to_pfn(page);
-	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
-	int rc;
+	pte_t pte = pte_mkspecial(pfn_pte(page_to_pfn(page), info->prot));
+	int rc, nr_gfn;
+	uint32_t i;
+	struct xen_add_to_physmap_range xatp = {
+		.domid = DOMID_SELF,
+		.foreign_domid = info->domid,
+		.space = XENMAPSPACE_gmfn_foreign,
+	};
 
-	rc = map_foreign_page(pfn, *info->fgfn, info->domid);
-	*info->err_ptr++ = rc;
-	if (!rc) {
-		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
-		info->mapped++;
+	nr_gfn = min_t(typeof(info->nr_fgfn), XEN_PFN_PER_PAGE, info->nr_fgfn);
+	info->nr_fgfn -= nr_gfn;
+
+	info->h_iter = 0;
+	xen_for_each_gfn(&page, nr_gfn, setup_hparams, info);
+	BUG_ON(info->h_iter != nr_gfn);
+
+	set_xen_guest_handle(xatp.idxs, info->h_idxs);
+	set_xen_guest_handle(xatp.gpfns, info->h_gpfns);
+	set_xen_guest_handle(xatp.errs, info->h_errs);
+	xatp.size = nr_gfn;
+
+	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+
+	/* info->err_ptr expects one error status per Xen PFN */
+	for (i = 0; i < nr_gfn; i++) {
+		int err = (rc < 0) ? rc : info->h_errs[i];
+
+		*(info->err_ptr++) = err;
+		if (!err)
+			info->mapped++;
 	}
-	info->fgfn++;
+
+	/*
+	 * Note: The hypercall will return 0 in most cases, even if not
+	 * all of the fgmfns were mapped. We still have to update the pte
+	 * as userspace may decide to continue.
+	 */
+	if (!rc)
+		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
 
 	return 0;
 }
@@ -102,13 +146,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 {
 	int err;
 	struct remap_data data;
-	unsigned long range = nr << PAGE_SHIFT;
+	unsigned long range = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE) << PAGE_SHIFT;
 
 	/* Kept here for the purpose of making sure code doesn't break
 	   x86 PVOPS */
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
 	data.fgfn = gfn;
+	data.nr_fgfn = nr;
 	data.prot  = prot;
 	data.domid = domid;
 	data.vma   = vma;
@@ -123,21 +168,20 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
 
-int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
-			      int nr, struct page **pages)
+static void unmap_gfn(unsigned long gfn, void *data)
 {
-	int i;
+	struct xen_remove_from_physmap xrp;
 
-	for (i = 0; i < nr; i++) {
-		struct xen_remove_from_physmap xrp;
-		unsigned long pfn;
+	xrp.domid = DOMID_SELF;
+	xrp.gpfn = gfn;
+	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+}
 
-		pfn = page_to_pfn(pages[i]);
+int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+			      int nr, struct page **pages)
+{
+	xen_for_each_gfn(pages, nr, unmap_gfn, NULL);
 
-		xrp.domid = DOMID_SELF;
-		xrp.gpfn = pfn;
-		(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
-	}
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);

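remap_pte_fn() now batches up to XEN_PFN_PER_PAGE foreign GFNs into a single XENMEM_add_to_physmap_range hypercall per Linux page, instead of issuing one hypercall per 4KB frame as the removed map_foreign_page() did. A standalone sketch of the batching shape, with the hypercall stubbed out and array sizes as in struct remap_data:

#include <stdio.h>

#define XEN_PFN_PER_PAGE 16u	/* assumed: 64KB PAGE_SIZE / 4KB frames */

/* stub standing in for HYPERVISOR_memory_op(XENMEM_add_to_physmap_range) */
static int map_batch(const unsigned long *idxs, const unsigned long *gpfns,
		     int *errs, unsigned int size)
{
	unsigned int i;

	for (i = 0; i < size; i++)
		errs[i] = 0;	/* pretend every frame mapped */
	return 0;
}

int main(void)
{
	unsigned int nr_fgfn = 36;	/* foreign frames left to map */
	unsigned long next_fgfn = 0x8000, next_gfn = 0x100;
	unsigned int done = 0;

	while (done < nr_fgfn) {
		unsigned long idxs[XEN_PFN_PER_PAGE], gpfns[XEN_PFN_PER_PAGE];
		int errs[XEN_PFN_PER_PAGE];
		unsigned int i, batch = nr_fgfn - done;

		if (batch > XEN_PFN_PER_PAGE)
			batch = XEN_PFN_PER_PAGE;
		for (i = 0; i < batch; i++) {
			idxs[i] = next_fgfn++;	/* foreign gfn to map */
			gpfns[i] = next_gfn++;	/* local frame to use */
		}
		printf("hypercall: %u frames in one batch\n", batch);
		map_batch(idxs, gpfns, errs, batch);
		done += batch;
	}
	return 0;
}

The 36 frames go out as batches of 16, 16 and 4, matching the min_t() clamp in remap_pte_fn().
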
+ 2 - 0
include/linux/memory_hotplug.h

@@ -11,6 +11,7 @@ struct zone;
 struct pglist_data;
 struct mem_section;
 struct memory_block;
+struct resource;
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 
@@ -266,6 +267,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
+extern int add_memory_resource(int nid, struct resource *resource);
 extern int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
 		bool for_device);
 extern int arch_add_memory(int nid, u64 start, u64 size, bool for_device);

+ 12 - 10
include/uapi/xen/gntalloc.h

@@ -11,6 +11,8 @@
 #ifndef __LINUX_PUBLIC_GNTALLOC_H__
 #define __LINUX_PUBLIC_GNTALLOC_H__
 
+#include <linux/types.h>
+
 /*
  * Allocates a new page and creates a new grant reference.
  */
@@ -19,17 +21,17 @@ _IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref))
 struct ioctl_gntalloc_alloc_gref {
 	/* IN parameters */
 	/* The ID of the domain to be given access to the grants. */
-	uint16_t domid;
+	__u16 domid;
 	/* Flags for this mapping */
-	uint16_t flags;
+	__u16 flags;
 	/* Number of pages to map */
-	uint32_t count;
+	__u32 count;
 	/* OUT parameters */
 	/* The offset to be used on a subsequent call to mmap(). */
-	uint64_t index;
+	__u64 index;
 	/* The grant references of the newly created grant, one per page */
 	/* Variable size, depending on count */
-	uint32_t gref_ids[1];
+	__u32 gref_ids[1];
 };
 
 #define GNTALLOC_FLAG_WRITABLE 1
@@ -43,9 +45,9 @@ _IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref))
 struct ioctl_gntalloc_dealloc_gref {
 	/* IN parameters */
 	/* The offset returned in the map operation */
-	uint64_t index;
+	__u64 index;
 	/* Number of references to unmap */
-	uint32_t count;
+	__u32 count;
 };
 
 /*
@@ -67,11 +69,11 @@ struct ioctl_gntalloc_unmap_notify {
 	 * be cleared. Otherwise, it can be any byte in the page whose
 	 * notification we are adjusting.
 	 */
-	uint64_t index;
+	__u64 index;
 	/* Action(s) to take on unmap */
-	uint32_t action;
+	__u32 action;
 	/* Event channel to notify */
-	uint32_t event_channel_port;
+	__u32 event_channel_port;
 };
 
 /* Clear (set to zero) the byte specified by index */

+ 18 - 16
include/uapi/xen/gntdev.h

@@ -33,11 +33,13 @@
 #ifndef __LINUX_PUBLIC_GNTDEV_H__
 #define __LINUX_PUBLIC_GNTDEV_H__
 
+#include <linux/types.h>
+
 struct ioctl_gntdev_grant_ref {
 	/* The domain ID of the grant to be mapped. */
-	uint32_t domid;
+	__u32 domid;
 	/* The grant reference of the grant to be mapped. */
-	uint32_t ref;
+	__u32 ref;
 };
 
 /*
@@ -50,11 +52,11 @@ _IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref))
 struct ioctl_gntdev_map_grant_ref {
 	/* IN parameters */
 	/* The number of grants to be mapped. */
-	uint32_t count;
-	uint32_t pad;
+	__u32 count;
+	__u32 pad;
 	/* OUT parameters */
 	/* The offset to be used on a subsequent call to mmap(). */
-	uint64_t index;
+	__u64 index;
 	/* Variable IN parameter. */
 	/* Array of grant references, of size @count. */
 	struct ioctl_gntdev_grant_ref refs[1];
@@ -70,10 +72,10 @@ _IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref))
 struct ioctl_gntdev_unmap_grant_ref {
 	/* IN parameters */
 	/* The offset was returned by the corresponding map operation. */
-	uint64_t index;
+	__u64 index;
 	/* The number of pages to be unmapped. */
-	uint32_t count;
-	uint32_t pad;
+	__u32 count;
+	__u32 pad;
 };
 
 /*
@@ -93,13 +95,13 @@ _IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr))
 struct ioctl_gntdev_get_offset_for_vaddr {
 	/* IN parameters */
 	/* The virtual address of the first mapped page in a range. */
-	uint64_t vaddr;
+	__u64 vaddr;
 	/* OUT parameters */
 	/* The offset that was used in the initial mmap() operation. */
-	uint64_t offset;
+	__u64 offset;
 	/* The number of pages mapped in the VM area that begins at @vaddr. */
-	uint32_t count;
-	uint32_t pad;
+	__u32 count;
+	__u32 pad;
 };
 
 /*
@@ -113,7 +115,7 @@ _IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants))
 struct ioctl_gntdev_set_max_grants {
 	/* IN parameter */
 	/* The maximum number of grants that may be mapped at once. */
-	uint32_t count;
+	__u32 count;
 };
 
 /*
@@ -135,11 +137,11 @@ struct ioctl_gntdev_unmap_notify {
 	 * be cleared. Otherwise, it can be any byte in the page whose
 	 * notification we are adjusting.
 	 */
-	uint64_t index;
+	__u64 index;
 	/* Action(s) to take on unmap */
-	uint32_t action;
+	__u32 action;
 	/* Event channel to notify */
-	uint32_t event_channel_port;
+	__u32 event_channel_port;
 };
 
 /* Clear (set to zero) the byte specified by index */

+ 3 - 9
include/xen/balloon.h

@@ -8,30 +8,24 @@ struct balloon_stats {
 	/* We aim for 'current allocation' == 'target allocation'. */
 	unsigned long current_pages;
 	unsigned long target_pages;
+	unsigned long target_unpopulated;
 	/* Number of pages in high- and low-memory balloons. */
 	unsigned long balloon_low;
 	unsigned long balloon_high;
+	unsigned long total_pages;
 	unsigned long schedule_delay;
 	unsigned long max_schedule_delay;
 	unsigned long retry_count;
 	unsigned long max_retry_count;
-#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG
-	unsigned long hotplug_pages;
-	unsigned long balloon_hotplug;
-#endif
 };
 
 extern struct balloon_stats balloon_stats;
 
 void balloon_set_new_target(unsigned long target);
 
-int alloc_xenballooned_pages(int nr_pages, struct page **pages,
-		bool highmem);
+int alloc_xenballooned_pages(int nr_pages, struct page **pages);
 void free_xenballooned_pages(int nr_pages, struct page **pages);
 
-struct page *get_balloon_scratch_page(void);
-void put_balloon_scratch_page(void);
-
 struct device;
 #ifdef CONFIG_XEN_SELFBALLOONING
 extern int register_xen_selfballooning(struct device *dev);
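
alloc_xenballooned_pages() drops its highmem argument and the scratch-page helpers go away; callers simply request a number of ballooned pages, which may now be backed by unpopulated hotplugged memory where that is enabled. A minimal sketch of the updated calling convention, assuming a caller-owned page array (the wrapper names are illustrative):

/* Sketch of the new calling convention (no highmem argument). */
#include <linux/slab.h>
#include <xen/balloon.h>

static struct page **get_foreign_scratch_pages(int nr)
{
	struct page **pages;
	int rc;

	pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return NULL;

	rc = alloc_xenballooned_pages(nr, pages);	/* was (nr, pages, highmem) */
	if (rc) {
		kfree(pages);
		return NULL;
	}
	return pages;
}

static void put_foreign_scratch_pages(struct page **pages, int nr)
{
	free_xenballooned_pages(nr, pages);
	kfree(pages);
}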

+ 57 - 0
include/xen/grant_table.h

@@ -45,8 +45,10 @@
 #include <asm/xen/hypervisor.h>
 
 #include <xen/features.h>
+#include <xen/page.h>
 #include <linux/mm_types.h>
 #include <linux/page-flags.h>
+#include <linux/kernel.h>
 
 #define GNTTAB_RESERVED_XENSTORE 1
 
@@ -129,6 +131,15 @@ void gnttab_cancel_free_callback(struct gnttab_free_callback *callback);
 void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid,
 				     unsigned long frame, int readonly);
 
+/* Give access to the first 4K of the page */
+static inline void gnttab_page_grant_foreign_access_ref_one(
+	grant_ref_t ref, domid_t domid,
+	struct page *page, int readonly)
+{
+	gnttab_grant_foreign_access_ref(ref, domid, xen_page_to_gfn(page),
+					readonly);
+}
+
 void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid,
 				       unsigned long pfn);
 
@@ -224,4 +235,50 @@ static inline struct xen_page_foreign *xen_page_foreign(struct page *page)
 #endif
 }
 
+/* Split a Linux page into grant-sized chunks and call fn on each chunk
+ *
+ * Parameters of fn:
+ *	gfn: guest frame number
+ *	offset: offset in the grant
+ *	len: length of the data in the grant
+ *	data: opaque pointer passed by the caller
+ */
+typedef void (*xen_grant_fn_t)(unsigned long gfn, unsigned int offset,
+			       unsigned int len, void *data);
+
+void gnttab_foreach_grant_in_range(struct page *page,
+				   unsigned int offset,
+				   unsigned int len,
+				   xen_grant_fn_t fn,
+				   void *data);
+
+/* Helper to call fn only on the first "grant chunk" */
+static inline void gnttab_for_one_grant(struct page *page, unsigned int offset,
+					unsigned len, xen_grant_fn_t fn,
+					void *data)
+{
+	/* The first request is limited to the size of one grant */
+	len = min_t(unsigned int, XEN_PAGE_SIZE - (offset & ~XEN_PAGE_MASK),
+		    len);
+
+	gnttab_foreach_grant_in_range(page, offset, len, fn, data);
+}
+
+/* Get @nr_grefs grants from an array of pages and call fn for each grant */
+void gnttab_foreach_grant(struct page **pages,
+			  unsigned int nr_grefs,
+			  xen_grant_fn_t fn,
+			  void *data);
+
+/* Get the number of grants in a specified region
+ *
+ * start: Offset from the beginning of the first page
+ * len: total length of the data (may cross multiple pages)
+ */
+static inline unsigned int gnttab_count_grant(unsigned int start,
+					      unsigned int len)
+{
+	return XEN_PFN_UP(xen_offset_in_page(start) + len);
+}
+
 #endif /* __ASM_GNTTAB_H__ */
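
The new helpers let a driver keep handling buffers in Linux-page units while granting in 4KB units: gnttab_count_grant() returns how many grant references a region needs, and gnttab_foreach_grant_in_range() calls back once per 4KB chunk of a single Linux page. A hedged sketch of the intended pattern; the context structure, the wrapper names and the read/write policy are illustrative assumptions:

#include <xen/grant_table.h>
#include <xen/page.h>

struct grant_ctx {
	grant_ref_t *refs;	/* pre-allocated grant references */
	unsigned int next;	/* next unused reference */
	domid_t otherend;
};

/* Called once per 4KB chunk of the buffer. The whole 4KB frame is
 * granted; offset/len only describe which part of it carries data. */
static void grant_one_chunk(unsigned long gfn, unsigned int offset,
			    unsigned int len, void *data)
{
	struct grant_ctx *ctx = data;

	gnttab_grant_foreign_access_ref(ctx->refs[ctx->next++],
					ctx->otherend, gfn,
					0 /* read-write */);
}

static void grant_buffer(struct page *page, unsigned int offset,
			 unsigned int len, struct grant_ctx *ctx)
{
	/* ctx->refs must hold at least gnttab_count_grant(offset, len)
	 * references; offset + len must stay within this one page. */
	gnttab_foreach_grant_in_range(page, offset, len, grant_one_chunk, ctx);
}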

+ 26 - 1
include/xen/page.h

@@ -1,11 +1,36 @@
 #ifndef _XEN_PAGE_H
 #define _XEN_PAGE_H
 
+#include <asm/page.h>
+
+/* The hypercall interface supports only 4KB pages */
+#define XEN_PAGE_SHIFT	12
+#define XEN_PAGE_SIZE	(_AC(1, UL) << XEN_PAGE_SHIFT)
+#define XEN_PAGE_MASK	(~(XEN_PAGE_SIZE-1))
+#define xen_offset_in_page(p)	((unsigned long)(p) & ~XEN_PAGE_MASK)
+
+/*
+ * We assume that PAGE_SIZE is a multiple of XEN_PAGE_SIZE
+ * XXX: Add a BUILD_BUG_ON?
+ */
+
+#define xen_pfn_to_page(xen_pfn)	\
+	((pfn_to_page(((unsigned long)(xen_pfn) << XEN_PAGE_SHIFT) >> PAGE_SHIFT)))
+#define page_to_xen_pfn(page)		\
+	(((page_to_pfn(page)) << PAGE_SHIFT) >> XEN_PAGE_SHIFT)
+
+#define XEN_PFN_PER_PAGE	(PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define XEN_PFN_DOWN(x)	((x) >> XEN_PAGE_SHIFT)
+#define XEN_PFN_UP(x)	(((x) + XEN_PAGE_SIZE-1) >> XEN_PAGE_SHIFT)
+#define XEN_PFN_PHYS(x)	((phys_addr_t)(x) << XEN_PAGE_SHIFT)
+
 #include <asm/xen/page.h>
 
+/* Return the GFN associated with the first 4KB of the page */
 static inline unsigned long xen_page_to_gfn(struct page *page)
 {
-	return pfn_to_gfn(page_to_pfn(page));
+	return pfn_to_gfn(page_to_xen_pfn(page));
 }
 
 struct xen_memory_region {
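
These macros separate the Xen ABI frame size (always 4KB) from the Linux page size, so on arm64 with 64KB pages XEN_PFN_PER_PAGE is 16 and a single struct page covers 16 consecutive Xen frames starting at page_to_xen_pfn(page). A short sketch of the resulting iteration idiom (the loop body is a placeholder):

#include <linux/mm.h>
#include <xen/page.h>

/* Visit every 4KB Xen frame backing one (possibly 64KB) Linux page. */
static void for_each_xen_frame(struct page *page)
{
	unsigned long first_xen_pfn = page_to_xen_pfn(page);
	unsigned int i;

	for (i = 0; i < XEN_PFN_PER_PAGE; i++) {
		unsigned long xen_pfn = first_xen_pfn + i;

		/* e.g. convert to a GFN and hand it to a hypercall */
		(void)xen_pfn;
	}
}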

+ 2 - 2
include/xen/xenbus.h

@@ -46,8 +46,8 @@
 #include <xen/interface/io/xenbus.h>
 #include <xen/interface/io/xs_wire.h>
 
-#define XENBUS_MAX_RING_PAGE_ORDER 4
-#define XENBUS_MAX_RING_PAGES      (1U << XENBUS_MAX_RING_PAGE_ORDER)
+#define XENBUS_MAX_RING_GRANT_ORDER 4
+#define XENBUS_MAX_RING_GRANTS      (1U << XENBUS_MAX_RING_GRANT_ORDER)
 #define INVALID_GRANT_HANDLE       (~0U)
 
 /* Register callback to watch this node. */
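
For scale, the limit itself is unchanged: with XENBUS_MAX_RING_GRANT_ORDER still 4, a multi-grant ring spans at most 1 << 4 = 16 grants of 4KB each, i.e. 64KB of ring space. On a 64KB-page arm64 guest that is a single Linux page, so the constants are now expressed in grants rather than pages.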

+ 21 - 8
mm/memory_hotplug.c

@@ -1232,23 +1232,21 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default,
 }
 
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory(int nid, u64 start, u64 size)
+int __ref add_memory_resource(int nid, struct resource *res)
 {
+	u64 start, size;
 	pg_data_t *pgdat = NULL;
 	bool new_pgdat;
 	bool new_node;
-	struct resource *res;
 	int ret;
 
+	start = res->start;
+	size = resource_size(res);
+
 	ret = check_hotplug_memory_range(start, size);
 	if (ret)
 		return ret;
 
-	res = register_memory_resource(start, size);
-	ret = -EEXIST;
-	if (!res)
-		return ret;
-
 	{	/* Stupid hack to suppress address-never-null warning */
 		void *p = NODE_DATA(nid);
 		new_pgdat = !p;
@@ -1300,13 +1298,28 @@ error:
 	/* rollback pgdat allocation and others */
 	if (new_pgdat)
 		rollback_node_hotadd(nid, pgdat);
-	release_memory_resource(res);
 	memblock_remove(start, size);
 
 out:
 	mem_hotplug_done();
 	return ret;
 }
+EXPORT_SYMBOL_GPL(add_memory_resource);
+
+int __ref add_memory(int nid, u64 start, u64 size)
+{
+	struct resource *res;
+	int ret;
+
+	res = register_memory_resource(start, size);
+	if (!res)
+		return -EEXIST;
+
+	ret = add_memory_resource(nid, res);
+	if (ret < 0)
+		release_memory_resource(res);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(add_memory);
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
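
Splitting add_memory() this way lets a caller such as the Xen balloon driver choose and own the hotplug region itself (for example, placing it above all existing RAM) before handing it to the core, instead of having register_memory_resource() pick the range. A hedged sketch of that calling pattern; the sizing, alignment and flag choices below are illustrative and not the balloon driver's exact code:

#include <linux/ioport.h>
#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>
#include <linux/slab.h>

/* Reserve a caller-owned "System RAM" resource, then hot-add it. */
static int hotadd_region(int nid, resource_size_t size)
{
	struct resource *res;
	int rc;

	res = kzalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return -ENOMEM;

	res->name = "System RAM";
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;

	/* Let the resource tree pick a free, section-aligned physical range. */
	rc = allocate_resource(&iomem_resource, res, size, 0, -1,
			       PAGES_PER_SECTION * PAGE_SIZE, NULL, NULL);
	if (rc < 0)
		goto err_free;

	rc = add_memory_resource(nid, res);	/* caller keeps ownership of res */
	if (rc < 0)
		goto err_release;

	return 0;

err_release:
	release_resource(res);
err_free:
	kfree(res);
	return rc;
}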