Bläddra i källkod

Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull Xen updates from David Vrabel:
 "Features and fixes:

   - Add pvscsi frontend and backend drivers.
   - Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
   - Try and keep memory contiguous during PV memory setup (reduces
     SWIOTLB usage).
   - Allow front/back drivers to use threaded irqs.
   - Support large initrds in PV guests.
   - Fix PVH guests in preparation for Xen 4.5"

* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
  xen: remove DEFINE_XENBUS_DRIVER() macro
  xen/xenbus: Remove BUG_ON() when error string trucated
  xen/xenbus: Correct the comments for xenbus_grant_ring()
  x86/xen: Set EFER.NX and EFER.SCE in PVH guests
  xen: eliminate scalability issues from initrd handling
  xen: sync some headers with xen tree
  xen: make pvscsi frontend dependant on xenbus frontend
  arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
  xen-scsifront: don't deadlock if the ring becomes full
  x86: remove the Xen-specific _PAGE_IOMAP PTE flag
  x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
  x86: skip check for spurious faults for non-present faults
  xen/efi: Directly include needed headers
  xen-scsiback: clean up a type issue in scsiback_make_tpg()
  xen-scsifront: use GFP_ATOMIC under spin_lock
  MAINTAINERS: Add xen pvscsi maintainer
  xen-scsiback: Add Xen PV SCSI backend driver
  xen-scsifront: Add Xen PV SCSI frontend driver
  xen: Add Xen pvSCSI protocol description
  xen/events: support threaded irqs for interdomain event channels
  ...
Linus Torvalds 11 år sedan
förälder
incheckning
81ae31d782
46 ändrade filer med 4214 tillägg och 263 borttagningar
  1. 9 0
      MAINTAINERS
  2. 1 1
      arch/arm/Kconfig
  3. 1 1
      arch/arm64/Kconfig
  4. 5 6
      arch/x86/include/asm/pgtable_types.h
  5. 20 2
      arch/x86/mm/fault.c
  6. 1 1
      arch/x86/mm/init_32.c
  7. 1 1
      arch/x86/mm/init_64.c
  8. 0 2
      arch/x86/pci/i386.c
  9. 2 0
      arch/x86/xen/efi.c
  10. 15 4
      arch/x86/xen/enlighten.c
  11. 4 44
      arch/x86/xen/mmu.c
  12. 7 16
      arch/x86/xen/p2m.c
  13. 15 0
      arch/x86/xen/p2m.h
  14. 292 78
      arch/x86/xen/setup.c
  15. 18 11
      arch/x86/xen/smp.c
  16. 8 0
      arch/x86/xen/smp.h
  17. 36 0
      arch/x86/xen/xen-head.S
  18. 3 8
      drivers/block/xen-blkback/xenbus.c
  19. 3 2
      drivers/block/xen-blkfront.c
  20. 7 6
      drivers/char/tpm/xen-tpmfront.c
  21. 3 2
      drivers/input/misc/xen-kbdfront.c
  22. 3 7
      drivers/net/xen-netback/xenbus.c
  23. 8 8
      drivers/net/xen-netfront.c
  24. 4 2
      drivers/pci/xen-pcifront.c
  25. 10 0
      drivers/scsi/Kconfig
  26. 1 0
      drivers/scsi/Makefile
  27. 1026 0
      drivers/scsi/xen-scsifront.c
  28. 4 5
      drivers/tty/hvc/hvc_xen.c
  29. 3 2
      drivers/video/fbdev/xen-fbfront.c
  30. 9 0
      drivers/xen/Kconfig
  31. 1 0
      drivers/xen/Makefile
  32. 2 0
      drivers/xen/efi.c
  33. 3 2
      drivers/xen/events/events_base.c
  34. 1 1
      drivers/xen/grant-table.c
  35. 4 2
      drivers/xen/xen-pciback/xenbus.c
  36. 2126 0
      drivers/xen/xen-scsiback.c
  37. 3 6
      drivers/xen/xenbus/xenbus_client.c
  38. 5 1
      drivers/xen/xenbus/xenbus_probe.c
  39. 3 1
      drivers/xen/xenbus/xenbus_probe.h
  40. 5 3
      drivers/xen/xenbus/xenbus_probe_backend.c
  41. 5 3
      drivers/xen/xenbus/xenbus_probe_frontend.c
  42. 2 0
      include/xen/events.h
  43. 46 2
      include/xen/interface/elfnote.h
  44. 229 0
      include/xen/interface/io/vscsiif.h
  45. 248 24
      include/xen/interface/xen.h
  46. 12 9
      include/xen/xenbus.h

+ 9 - 0
MAINTAINERS

@@ -10268,6 +10268,15 @@ S:	Supported
 F:	drivers/block/xen-blkback/*
 F:	drivers/block/xen*
 
+XEN PVSCSI DRIVERS
+M:	Juergen Gross <jgross@suse.com>
+L:	xen-devel@lists.xenproject.org (moderated for non-subscribers)
+L:	linux-scsi@vger.kernel.org
+S:	Supported
+F:	drivers/scsi/xen-scsifront.c
+F:	drivers/xen/xen-scsiback.c
+F:	include/xen/interface/io/vscsiif.h
+
 XEN SWIOTLB SUBSYSTEM
 M:	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
 L:	xen-devel@lists.xenproject.org (moderated for non-subscribers)

+ 1 - 1
arch/arm/Kconfig

@@ -1779,7 +1779,7 @@ config XEN_DOM0
 	depends on XEN
 
 config XEN
-	bool "Xen guest support on ARM (EXPERIMENTAL)"
+	bool "Xen guest support on ARM"
 	depends on ARM && AEABI && OF
 	depends on CPU_V7 && !CPU_V6
 	depends on !GENERIC_ATOMIC64

+ 1 - 1
arch/arm64/Kconfig

@@ -349,7 +349,7 @@ config XEN_DOM0
 	depends on XEN
 
 config XEN
-	bool "Xen guest support on ARM64 (EXPERIMENTAL)"
+	bool "Xen guest support on ARM64"
 	depends on ARM64 && OF
 	select SWIOTLB_XEN
 	help

+ 5 - 6
arch/x86/include/asm/pgtable_types.h

@@ -23,7 +23,6 @@
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_SPLITTING	_PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
-#define _PAGE_BIT_IOMAP		_PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
 #define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@ -52,7 +51,7 @@
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
-#define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@@ -168,10 +167,10 @@
 #define __PAGE_KERNEL_LARGE_NOCACHE	(__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
 
-#define __PAGE_KERNEL_IO		(__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS	(__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC		(__PAGE_KERNEL_WC | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO		(__PAGE_KERNEL)
+#define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE)
+#define __PAGE_KERNEL_IO_UC_MINUS	(__PAGE_KERNEL_UC_MINUS)
+#define __PAGE_KERNEL_IO_WC		(__PAGE_KERNEL_WC)
 
 #define PAGE_KERNEL			__pgprot(__PAGE_KERNEL)
 #define PAGE_KERNEL_RO			__pgprot(__PAGE_KERNEL_RO)

+ 20 - 2
arch/x86/mm/fault.c

@@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * cross-processor TLB flush, even if no stale TLB entries exist
  * on other processors.
  *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
  */
 static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
@@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	pte_t *pte;
 	int ret;
 
-	/* Reserved-bit violation or user access to kernel space? */
-	if (error_code & (PF_USER | PF_RSVD))
+	/*
+	 * Only writes to RO or instruction fetches from NX may cause
+	 * spurious faults.
+	 *
+	 * These could be from user or supervisor accesses but the TLB
+	 * is only lazily flushed after a kernel mapping protection
+	 * change, so user accesses are not expected to cause spurious
+	 * faults.
+	 */
+	if (error_code != (PF_WRITE | PF_PROT)
+	    && error_code != (PF_INSTR | PF_PROT))
 		return 0;
 
 	pgd = init_mm.pgd + pgd_index(address);

+ 1 - 1
arch/x86/mm/init_32.c

@@ -537,7 +537,7 @@ static void __init pagetable_init(void)
 	permanent_kmaps_init(pgd_base);
 }
 
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /* user-defined highmem size */

+ 1 - 1
arch/x86/mm/init_64.c

@@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+pteval_t __supported_pte_mask __read_mostly = ~0;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 int force_personality32;

+ 0 - 2
arch/x86/pci/i386.c

@@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		 */
 		prot |= _PAGE_CACHE_UC_MINUS;
 
-	prot |= _PAGE_IOMAP;	/* creating a mapping for IO */
-
 	vma->vm_page_prot = __pgprot(prot);
 
 	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,

+ 2 - 0
arch/x86/xen/efi.c

@@ -15,12 +15,14 @@
  * with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/bitops.h>
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/string.h>
 
 #include <xen/xen-ops.h>
 
+#include <asm/page.h>
 #include <asm/setup.h>
 
 void __init xen_efi_init(void)

+ 15 - 4
arch/x86/xen/enlighten.c

@@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
+#ifdef CONFIG_XEN_PVH
 /*
  * A PV guest starts with default flags that are not set for PVH, set them
  * here asap.
@@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void)
 		return;
 
 	xen_have_vector_callback = 1;
+
+	xen_pvh_early_cpu_init(0, false);
 	xen_pvh_set_cr_flags(0);
 
 #ifdef CONFIG_X86_32
 	BUG(); /* PVH: Implement proper support. */
 #endif
 }
+#endif    /* CONFIG_XEN_PVH */
 
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
 	struct physdev_set_iopl set_iopl;
+	unsigned long initrd_start = 0;
 	int rc;
 
 	if (!xen_start_info)
@@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	xen_domain_type = XEN_PV_DOMAIN;
 
 	xen_setup_features();
+#ifdef CONFIG_XEN_PVH
 	xen_pvh_early_guest_init();
+#endif
 	xen_setup_machphys_mapping();
 
 	/* Install Xen paravirt ops */
@@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #endif
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
-	__supported_pte_mask |= _PAGE_IOMAP;
-
 	/*
 	 * Prevent page tables from being allocated in highmem, even
 	 * if CONFIG_HIGHPTE is enabled.
@@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	new_cpu_data.x86_capability[0] = cpuid_edx(1);
 #endif
 
+	if (xen_start_info->mod_start) {
+	    if (xen_start_info->flags & SIF_MOD_START_PFN)
+		initrd_start = PFN_PHYS(xen_start_info->mod_start);
+	    else
+		initrd_start = __pa(xen_start_info->mod_start);
+	}
+
 	/* Poke various useful things into boot_params */
 	boot_params.hdr.type_of_loader = (9 << 4) | 0;
-	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
-		? __pa(xen_start_info->mod_start) : 0;
+	boot_params.hdr.ramdisk_image = initrd_start;
 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
 	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 

+ 4 - 44
arch/x86/xen/mmu.c

@@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
-		} else {
-			/*
-			 * Paramount to do this test _after_ the
-			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
-			 * IDENTITY_FRAME_BIT resolves to true.
-			 */
-			mfn &= ~FOREIGN_FRAME_BIT;
-			if (mfn & IDENTITY_FRAME_BIT) {
-				mfn &= ~IDENTITY_FRAME_BIT;
-				flags |= _PAGE_IOMAP;
-			}
-		}
+		} else
+			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}
 
 	return val;
 }
 
-static pteval_t iomap_pte(pteval_t val)
-{
-	if (val & _PAGE_PRESENT) {
-		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
-		pteval_t flags = val & PTE_FLAGS_MASK;
-
-		/* We assume the pte frame number is a MFN, so
-		   just use it as-is. */
-		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
-	}
-
-	return val;
-}
-
 __visible pteval_t xen_pte_val(pte_t pte)
 {
 	pteval_t pteval = pte.pte;
@@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte)
 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 	}
 #endif
-	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
-		return pteval;
-
 	return pte_mfn_to_pfn(pteval);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -481,7 +454,6 @@ void xen_set_pat(u64 pat)
 
 __visible pte_t xen_make_pte(pteval_t pte)
 {
-	phys_addr_t addr = (pte & PTE_PFN_MASK);
 #if 0
 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 	 * If _PAGE_PAT is set, then it probably means it is really
@@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte)
 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 	}
 #endif
-	/*
-	 * Unprivileged domains are allowed to do IOMAPpings for
-	 * PCI passthrough, but not map ISA space.  The ISA
-	 * mappings are just dummy local mappings to keep other
-	 * parts of the kernel happy.
-	 */
-	if (unlikely(pte & _PAGE_IOMAP) &&
-	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-		pte = iomap_pte(pte);
-	} else {
-		pte &= ~_PAGE_IOMAP;
-		pte = pte_pfn_to_mfn(pte);
-	}
+	pte = pte_pfn_to_mfn(pte);
 
 	return native_make_pte(pte);
 }
@@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
 	default:
 		/* By default, set_fixmap is used for hardware mappings */
-		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+		pte = mfn_pte(phys, prot);
 		break;
 	}
 

+ 7 - 16
arch/x86/xen/p2m.c

@@ -173,6 +173,7 @@
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
+#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
@@ -180,12 +181,6 @@ static void __init m2p_override_init(void);
 
 unsigned long xen_max_p2m_pfn __read_mostly;
 
-#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
 /* Placeholders for holes in the address space */
 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
@@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
-/* We might hit two boundary violations at the start and end, at max each
- * boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
-
-/* When we populate back during bootup, the amount of pages can vary. The
- * max we have is seen is 395979, but that does not mean it can't be more.
- * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
- * it can re-use Xen provided mfn_list array, so we only need to allocate at
- * most three P2M top nodes. */
-RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+/* For each I/O range remapped we may lose up to two leaf pages for the boundary
+ * violations and three mid pages to cover up to 3GB. With
+ * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
+ * remapped region.
+ */
+RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {

+ 15 - 0
arch/x86/xen/p2m.h

@@ -0,0 +1,15 @@
+#ifndef _XEN_P2M_H
+#define _XEN_P2M_H
+
+#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+#define MAX_REMAP_RANGES    10
+
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+                                      unsigned long pfn_e);
+
+#endif  /* _XEN_P2M_H */

+ 292 - 78
arch/x86/xen/setup.c

@@ -29,6 +29,7 @@
 #include <xen/features.h>
 #include "xen-ops.h"
 #include "vdso.h"
+#include "p2m.h"
 
 /* These are code, but not functions.  Defined in entry.S */
 extern const char xen_hypervisor_callback[];
@@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* Buffer used to remap identity mapped pages */
+unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+
 /* 
  * The maximum amount of extra memory compared to the base size.  The
  * main scaling factor is the size of struct page.  At extreme ratios
@@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 	return len;
 }
 
-static unsigned long __init xen_release_chunk(unsigned long start,
-					      unsigned long end)
-{
-	return xen_do_chunk(start, end, true);
-}
-
-static unsigned long __init xen_populate_chunk(
+/*
+ * Finds the next RAM pfn available in the E820 map after min_pfn.
+ * This function updates min_pfn with the pfn found and returns
+ * the size of that range or zero if not found.
+ */
+static unsigned long __init xen_find_pfn_range(
 	const struct e820entry *list, size_t map_size,
-	unsigned long max_pfn, unsigned long *last_pfn,
-	unsigned long credits_left)
+	unsigned long *min_pfn)
 {
 	const struct e820entry *entry;
 	unsigned int i;
 	unsigned long done = 0;
-	unsigned long dest_pfn;
 
 	for (i = 0, entry = list; i < map_size; i++, entry++) {
 		unsigned long s_pfn;
 		unsigned long e_pfn;
-		unsigned long pfns;
-		long capacity;
-
-		if (credits_left <= 0)
-			break;
 
 		if (entry->type != E820_RAM)
 			continue;
 
 		e_pfn = PFN_DOWN(entry->addr + entry->size);
 
-		/* We only care about E820 after the xen_start_info->nr_pages */
-		if (e_pfn <= max_pfn)
+		/* We only care about E820 after this */
+		if (e_pfn < *min_pfn)
 			continue;
 
 		s_pfn = PFN_UP(entry->addr);
-		/* If the E820 falls within the nr_pages, we want to start
-		 * at the nr_pages PFN.
-		 * If that would mean going past the E820 entry, skip it
+
+		/* If min_pfn falls within the E820 entry, we want to start
+		 * at the min_pfn PFN.
 		 */
-		if (s_pfn <= max_pfn) {
-			capacity = e_pfn - max_pfn;
-			dest_pfn = max_pfn;
+		if (s_pfn <= *min_pfn) {
+			done = e_pfn - *min_pfn;
 		} else {
-			capacity = e_pfn - s_pfn;
-			dest_pfn = s_pfn;
+			done = e_pfn - s_pfn;
+			*min_pfn = s_pfn;
 		}
+		break;
+	}
 
-		if (credits_left < capacity)
-			capacity = credits_left;
+	return done;
+}
 
-		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
-		done += pfns;
-		*last_pfn = (dest_pfn + pfns);
-		if (pfns < capacity)
-			break;
-		credits_left -= pfns;
+/*
+ * This releases a chunk of memory and then does the identity map. It's used as
+ * as a fallback if the remapping fails.
+ */
+static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
+	unsigned long *released)
+{
+	WARN_ON(start_pfn > end_pfn);
+
+	/* Need to release pages first */
+	*released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+	*identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+/*
+ * Helper function to update both the p2m and m2p tables.
+ */
+static unsigned long __init xen_update_mem_tables(unsigned long pfn,
+						  unsigned long mfn)
+{
+	struct mmu_update update = {
+		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+		.val = pfn
+	};
+
+	/* Update p2m */
+	if (!early_set_phys_to_machine(pfn, mfn)) {
+		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+		     pfn, mfn);
+		return false;
 	}
-	return done;
+
+	/* Update m2p */
+	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
+		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
+		     mfn, pfn);
+		return false;
+	}
+
+	return true;
 }
 
-static void __init xen_set_identity_and_release_chunk(
-	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
-	unsigned long *released, unsigned long *identity)
+/*
+ * This function updates the p2m and m2p tables with an identity map from
+ * start_pfn to start_pfn+size and remaps the underlying RAM of the original
+ * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
+ * to not exhaust the reserved brk space. Doing it in properly aligned blocks
+ * ensures we only allocate the minimum required leaf pages in the p2m table. It
+ * copies the existing mfns from the p2m table under the 1:1 map, overwrites
+ * them with the identity map and then updates the p2m and m2p tables with the
+ * remapped memory.
+ */
+static unsigned long __init xen_do_set_identity_and_remap_chunk(
+        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
 {
-	unsigned long pfn;
+	unsigned long ident_pfn_iter, remap_pfn_iter;
+	unsigned long ident_start_pfn_align, remap_start_pfn_align;
+	unsigned long ident_end_pfn_align, remap_end_pfn_align;
+	unsigned long ident_boundary_pfn, remap_boundary_pfn;
+	unsigned long ident_cnt = 0;
+	unsigned long remap_cnt = 0;
+	unsigned long left = size;
+	unsigned long mod;
+	int i;
+
+	WARN_ON(size == 0);
+
+	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
 
 	/*
-	 * If the PFNs are currently mapped, clear the mappings
-	 * (except for the ISA region which must be 1:1 mapped) to
-	 * release the refcounts (in Xen) on the original frames.
+	 * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
+	 * blocks. We need to keep track of both the existing pfn mapping and
+	 * the new pfn remapping.
 	 */
-	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
-		pte_t pte = __pte_ma(0);
+	mod = start_pfn % P2M_PER_PAGE;
+	ident_start_pfn_align =
+		mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
+	mod = remap_pfn % P2M_PER_PAGE;
+	remap_start_pfn_align =
+		mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
+	mod = (start_pfn + size) % P2M_PER_PAGE;
+	ident_end_pfn_align = start_pfn + size - mod;
+	mod = (remap_pfn + size) % P2M_PER_PAGE;
+	remap_end_pfn_align = remap_pfn + size - mod;
+
+	/* Iterate over each p2m leaf node in each range */
+	for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
+	     ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
+	     ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
+		/* Check we aren't past the end */
+		BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
+		BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
+
+		/* Save p2m mappings */
+		for (i = 0; i < P2M_PER_PAGE; i++)
+			xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
+
+		/* Set identity map which will free a p2m leaf */
+		ident_cnt += set_phys_range_identity(ident_pfn_iter,
+			ident_pfn_iter + P2M_PER_PAGE);
+
+#ifdef DEBUG
+		/* Helps verify a p2m leaf has been freed */
+		for (i = 0; i < P2M_PER_PAGE; i++) {
+			unsigned int pfn = ident_pfn_iter + i;
+			BUG_ON(pfn_to_mfn(pfn) != pfn);
+		}
+#endif
+		/* Now remap memory */
+		for (i = 0; i < P2M_PER_PAGE; i++) {
+			unsigned long mfn = xen_remap_buf[i];
+
+			/* This will use the p2m leaf freed above */
+			if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
+				WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+					remap_pfn_iter + i, mfn);
+				return 0;
+			}
+
+			remap_cnt++;
+		}
 
-		if (pfn < PFN_UP(ISA_END_ADDRESS))
-			pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+		left -= P2M_PER_PAGE;
+	}
 
-		(void)HYPERVISOR_update_va_mapping(
-			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+	/* Max boundary space possible */
+	BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+
+	/* Now handle the boundary conditions */
+	ident_boundary_pfn = start_pfn;
+	remap_boundary_pfn = remap_pfn;
+	for (i = 0; i < left; i++) {
+		unsigned long mfn;
+
+		/* These two checks move from the start to end boundaries */
+		if (ident_boundary_pfn == ident_start_pfn_align)
+			ident_boundary_pfn = ident_pfn_iter;
+		if (remap_boundary_pfn == remap_start_pfn_align)
+			remap_boundary_pfn = remap_pfn_iter;
+
+		/* Check we aren't past the end */
+		BUG_ON(ident_boundary_pfn >= start_pfn + size);
+		BUG_ON(remap_boundary_pfn >= remap_pfn + size);
+
+		mfn = pfn_to_mfn(ident_boundary_pfn);
+
+		if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
+			WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+				remap_pfn_iter + i, mfn);
+			return 0;
+		}
+		remap_cnt++;
+
+		ident_boundary_pfn++;
+		remap_boundary_pfn++;
 	}
 
-	if (start_pfn < nr_pages)
-		*released += xen_release_chunk(
-			start_pfn, min(end_pfn, nr_pages));
+	/* Finish up the identity map */
+	if (ident_start_pfn_align >= ident_end_pfn_align) {
+		/*
+                 * In this case we have an identity range which does not span an
+                 * aligned block so everything needs to be identity mapped here.
+                 * If we didn't check this we might remap too many pages since
+                 * the align boundaries are not meaningful in this case.
+	         */
+		ident_cnt += set_phys_range_identity(start_pfn,
+			start_pfn + size);
+	} else {
+		/* Remapped above so check each end of the chunk */
+		if (start_pfn < ident_start_pfn_align)
+			ident_cnt += set_phys_range_identity(start_pfn,
+				ident_start_pfn_align);
+		if (start_pfn + size > ident_pfn_iter)
+			ident_cnt += set_phys_range_identity(ident_pfn_iter,
+				start_pfn + size);
+	}
 
-	*identity += set_phys_range_identity(start_pfn, end_pfn);
+	BUG_ON(ident_cnt != size);
+	BUG_ON(remap_cnt != size);
+
+	return size;
 }
 
-static unsigned long __init xen_set_identity_and_release(
-	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+/*
+ * This function takes a contiguous pfn range that needs to be identity mapped
+ * and:
+ *
+ *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
+ *  2) Calls the do_ function to actually do the mapping/remapping work.
+ *
+ * The goal is to not allocate additional memory but to remap the existing
+ * pages. In the case of an error the underlying memory is simply released back
+ * to Xen and not remapped.
+ */
+static unsigned long __init xen_set_identity_and_remap_chunk(
+        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
+	unsigned long *identity, unsigned long *remapped,
+	unsigned long *released)
+{
+	unsigned long pfn;
+	unsigned long i = 0;
+	unsigned long n = end_pfn - start_pfn;
+
+	while (i < n) {
+		unsigned long cur_pfn = start_pfn + i;
+		unsigned long left = n - i;
+		unsigned long size = left;
+		unsigned long remap_range_size;
+
+		/* Do not remap pages beyond the current allocation */
+		if (cur_pfn >= nr_pages) {
+			/* Identity map remaining pages */
+			*identity += set_phys_range_identity(cur_pfn,
+				cur_pfn + size);
+			break;
+		}
+		if (cur_pfn + size > nr_pages)
+			size = nr_pages - cur_pfn;
+
+		remap_range_size = xen_find_pfn_range(list, map_size,
+						      &remap_pfn);
+		if (!remap_range_size) {
+			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
+			xen_set_identity_and_release_chunk(cur_pfn,
+				cur_pfn + left, nr_pages, identity, released);
+			break;
+		}
+		/* Adjust size to fit in current e820 RAM region */
+		if (size > remap_range_size)
+			size = remap_range_size;
+
+		if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
+			WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
+				cur_pfn, size, remap_pfn);
+			xen_set_identity_and_release_chunk(cur_pfn,
+				cur_pfn + left, nr_pages, identity, released);
+			break;
+		}
+
+		/* Update variables to reflect new mappings. */
+		i += size;
+		remap_pfn += size;
+		*identity += size;
+		*remapped += size;
+	}
+
+	/*
+	 * If the PFNs are currently mapped, the VA mapping also needs
+	 * to be updated to be 1:1.
+	 */
+	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+		(void)HYPERVISOR_update_va_mapping(
+			(unsigned long)__va(pfn << PAGE_SHIFT),
+			mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+	return remap_pfn;
+}
+
+static unsigned long __init xen_set_identity_and_remap(
+	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
+	unsigned long *released)
 {
 	phys_addr_t start = 0;
-	unsigned long released = 0;
 	unsigned long identity = 0;
+	unsigned long remapped = 0;
+	unsigned long last_pfn = nr_pages;
 	const struct e820entry *entry;
+	unsigned long num_released = 0;
 	int i;
 
 	/*
 	 * Combine non-RAM regions and gaps until a RAM region (or the
 	 * end of the map) is reached, then set the 1:1 map and
-	 * release the pages (if available) in those non-RAM regions.
+	 * remap the memory in those non-RAM regions.
 	 *
 	 * The combined non-RAM regions are rounded to a whole number
 	 * of pages so any partial pages are accessible via the 1:1
@@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release(
 				end_pfn = PFN_UP(entry->addr);
 
 			if (start_pfn < end_pfn)
-				xen_set_identity_and_release_chunk(
-					start_pfn, end_pfn, nr_pages,
-					&released, &identity);
-
+				last_pfn = xen_set_identity_and_remap_chunk(
+						list, map_size, start_pfn,
+						end_pfn, nr_pages, last_pfn,
+						&identity, &remapped,
+						&num_released);
 			start = end;
 		}
 	}
 
-	if (released)
-		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
-	if (identity)
-		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+	*released = num_released;
 
-	return released;
-}
+	pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
+	pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
+		last_pfn);
+	pr_info("Released %ld page(s)\n", num_released);
 
+	return last_pfn;
+}
 static unsigned long __init xen_get_max_pages(void)
 {
 	unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -347,7 +571,6 @@ char * __init xen_memory_setup(void)
 	unsigned long max_pages;
 	unsigned long last_pfn = 0;
 	unsigned long extra_pages = 0;
-	unsigned long populated;
 	int i;
 	int op;
 
@@ -392,20 +615,11 @@ char * __init xen_memory_setup(void)
 		extra_pages += max_pages - max_pfn;
 
 	/*
-	 * Set P2M for all non-RAM pages and E820 gaps to be identity
-	 * type PFNs.  Any RAM pages that would be made inaccesible by
-	 * this are first released.
+	 * Set identity map on non-RAM pages and remap the underlying RAM.
 	 */
-	xen_released_pages = xen_set_identity_and_release(
-		map, memmap.nr_entries, max_pfn);
-
-	/*
-	 * Populate back the non-RAM pages and E820 gaps that had been
-	 * released. */
-	populated = xen_populate_chunk(map, memmap.nr_entries,
-			max_pfn, &last_pfn, xen_released_pages);
+	last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+					      &xen_released_pages);
 
-	xen_released_pages -= populated;
 	extra_pages += xen_released_pages;
 
 	if (last_pfn > max_pfn) {

+ 18 - 11
arch/x86/xen/smp.c

@@ -37,6 +37,7 @@
 #include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "mmu.h"
+#include "smp.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -99,10 +100,14 @@ static void cpu_bringup(void)
 	wmb();			/* make sure everything is out */
 }
 
-/* Note: cpu parameter is only relevant for PVH */
-static void cpu_bringup_and_idle(int cpu)
+/*
+ * Note: cpu parameter is only relevant for PVH. The reason for passing it
+ * is we can't do smp_processor_id until the percpu segments are loaded, for
+ * which we need the cpu number! So we pass it in rdi as first parameter.
+ */
+asmlinkage __visible void cpu_bringup_and_idle(int cpu)
 {
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PVH
 	if (xen_feature(XENFEAT_auto_translated_physmap) &&
 	    xen_feature(XENFEAT_supervisor_mode_kernel))
 		xen_pvh_secondary_vcpu_init(cpu);
@@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
 	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #endif
-	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 		ctxt->flags = VGCF_IN_KERNEL;
 		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 		ctxt->user_regs.ds = __USER_DS;
@@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 					(unsigned long)xen_failsafe_callback;
 		ctxt->user_regs.cs = __KERNEL_CS;
 		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-#ifdef CONFIG_X86_32
 	}
-#else
-	} else
-		/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
-		 * %rdi having the cpu number - which means are passing in
-		 * as the first parameter the cpu. Subtle!
+#ifdef CONFIG_XEN_PVH
+	else {
+		/*
+		 * The vcpu comes on kernel page tables which have the NX pte
+		 * bit set. This means before DS/SS is touched, NX in
+		 * EFER must be set. Hence the following assembly glue code.
 		 */
+		ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
 		ctxt->user_regs.rdi = cpu;
+		ctxt->user_regs.rsi = true;  /* entry == true */
+	}
 #endif
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));

+ 8 - 0
arch/x86/xen/smp.h

@@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
 
+#ifdef CONFIG_XEN_PVH
+extern void xen_pvh_early_cpu_init(int cpu, bool entry);
+#else
+static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
+{
+}
+#endif
+
 #endif

+ 36 - 0
arch/x86/xen/xen-head.S

@@ -47,6 +47,41 @@ ENTRY(startup_xen)
 
 	__FINIT
 
+#ifdef CONFIG_XEN_PVH
+/*
+ * xen_pvh_early_cpu_init() - early PVH VCPU initialization
+ * @cpu:   this cpu number (%rdi)
+ * @entry: true if this is a secondary vcpu coming up on this entry
+ *         point, false if this is the boot CPU being initialized for
+ *         the first time (%rsi)
+ *
+ * Note: This is called as a function on the boot CPU, and is the entry point
+ *       on the secondary CPU.
+ */
+ENTRY(xen_pvh_early_cpu_init)
+	mov     %rsi, %r11
+
+	/* Gather features to see if NX implemented. */
+	mov     $0x80000001, %eax
+	cpuid
+	mov     %edx, %esi
+
+	mov     $MSR_EFER, %ecx
+	rdmsr
+	bts     $_EFER_SCE, %eax
+
+	bt      $20, %esi
+	jnc     1f      	/* No NX, skip setting it */
+	bts     $_EFER_NX, %eax
+1:	wrmsr
+#ifdef CONFIG_SMP
+	cmp     $0, %r11b
+	jne     cpu_bringup_and_idle
+#endif
+	ret
+
+#endif /* CONFIG_XEN_PVH */
+
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
@@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+	ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
 	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
 

+ 3 - 8
drivers/block/xen-blkback/xenbus.c

@@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be)
 	return 0;
 }
 
-
-/* ** Driver Registration ** */
-
-
 static const struct xenbus_device_id xen_blkbk_ids[] = {
 	{ "vbd" },
 	{ "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
+static struct xenbus_driver xen_blkbk_driver = {
+	.ids  = xen_blkbk_ids,
 	.probe = xen_blkbk_probe,
 	.remove = xen_blkbk_remove,
 	.otherend_changed = frontend_changed
-);
-
+};
 
 int xen_blkif_xenbus_init(void)
 {

+ 3 - 2
drivers/block/xen-blkfront.c

@@ -2055,13 +2055,14 @@ static const struct xenbus_device_id blkfront_ids[] = {
 	{ "" }
 };
 
-static DEFINE_XENBUS_DRIVER(blkfront, ,
+static struct xenbus_driver blkfront_driver = {
+	.ids  = blkfront_ids,
 	.probe = blkfront_probe,
 	.remove = blkfront_remove,
 	.resume = blkfront_resume,
 	.otherend_changed = blkback_changed,
 	.is_ready = blkfront_is_ready,
-);
+};
 
 static int __init xlblk_init(void)
 {

+ 7 - 6
drivers/char/tpm/xen-tpmfront.c

@@ -367,12 +367,13 @@ static const struct xenbus_device_id tpmfront_ids[] = {
 };
 MODULE_ALIAS("xen:vtpm");
 
-static DEFINE_XENBUS_DRIVER(tpmfront, ,
-		.probe = tpmfront_probe,
-		.remove = tpmfront_remove,
-		.resume = tpmfront_resume,
-		.otherend_changed = backend_changed,
-	);
+static struct xenbus_driver tpmfront_driver = {
+	.ids = tpmfront_ids,
+	.probe = tpmfront_probe,
+	.remove = tpmfront_remove,
+	.resume = tpmfront_resume,
+	.otherend_changed = backend_changed,
+};
 
 static int __init xen_tpmfront_init(void)
 {

+ 3 - 2
drivers/input/misc/xen-kbdfront.c

@@ -365,12 +365,13 @@ static const struct xenbus_device_id xenkbd_ids[] = {
 	{ "" }
 };
 
-static DEFINE_XENBUS_DRIVER(xenkbd, ,
+static struct xenbus_driver xenkbd_driver = {
+	.ids = xenkbd_ids,
 	.probe = xenkbd_probe,
 	.remove = xenkbd_remove,
 	.resume = xenkbd_resume,
 	.otherend_changed = xenkbd_backend_changed,
-);
+};
 
 static int __init xenkbd_init(void)
 {

+ 3 - 7
drivers/net/xen-netback/xenbus.c

@@ -937,22 +937,18 @@ static int read_xenbus_vif_flags(struct backend_info *be)
 	return 0;
 }
 
-
-/* ** Driver Registration ** */
-
-
 static const struct xenbus_device_id netback_ids[] = {
 	{ "vif" },
 	{ "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(netback, ,
+static struct xenbus_driver netback_driver = {
+	.ids = netback_ids,
 	.probe = netback_probe,
 	.remove = netback_remove,
 	.uevent = netback_uevent,
 	.otherend_changed = frontend_changed,
-);
+};
 
 int xenvif_xenbus_init(void)
 {

+ 8 - 8
drivers/net/xen-netfront.c

@@ -2300,12 +2300,6 @@ static void xennet_sysfs_delif(struct net_device *netdev)
 
 #endif /* CONFIG_SYSFS */
 
-static const struct xenbus_device_id netfront_ids[] = {
-	{ "vif" },
-	{ "" }
-};
-
-
 static int xennet_remove(struct xenbus_device *dev)
 {
 	struct netfront_info *info = dev_get_drvdata(&dev->dev);
@@ -2338,12 +2332,18 @@ static int xennet_remove(struct xenbus_device *dev)
 	return 0;
 }
 
-static DEFINE_XENBUS_DRIVER(netfront, ,
+static const struct xenbus_device_id netfront_ids[] = {
+	{ "vif" },
+	{ "" }
+};
+
+static struct xenbus_driver netfront_driver = {
+	.ids = netfront_ids,
 	.probe = netfront_probe,
 	.remove = xennet_remove,
 	.resume = netfront_resume,
 	.otherend_changed = netback_changed,
-);
+};
 
 static int __init netif_init(void)
 {

+ 4 - 2
drivers/pci/xen-pcifront.c

@@ -1136,11 +1136,13 @@ static const struct xenbus_device_id xenpci_ids[] = {
 	{""},
 };
 
-static DEFINE_XENBUS_DRIVER(xenpci, "pcifront",
+static struct xenbus_driver xenpci_driver = {
+	.name			= "pcifront",
+	.ids			= xenpci_ids,
 	.probe			= pcifront_xenbus_probe,
 	.remove			= pcifront_xenbus_remove,
 	.otherend_changed	= pcifront_backend_changed,
-);
+};
 
 static int __init pcifront_init(void)
 {

+ 10 - 0
drivers/scsi/Kconfig

@@ -587,6 +587,16 @@ config VMWARE_PVSCSI
 	  To compile this driver as a module, choose M here: the
 	  module will be called vmw_pvscsi.
 
+config XEN_SCSI_FRONTEND
+	tristate "XEN SCSI frontend driver"
+	depends on SCSI && XEN
+	select XEN_XENBUS_FRONTEND
+	help
+	  The XEN SCSI frontend driver allows the kernel to access SCSI Devices
+	  within another guest OS (usually Dom0).
+	  Only needed if the kernel is running in a XEN guest and generic
+	  SCSI access to a device is needed.
+
 config HYPERV_STORAGE
 	tristate "Microsoft Hyper-V virtual storage driver"
 	depends on SCSI && HYPERV

+ 1 - 0
drivers/scsi/Makefile

@@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_ESAS2R)	+= esas2r/
 obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
 obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
 obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
+obj-$(CONFIG_XEN_SCSI_FRONTEND)	+= xen-scsifront.o
 obj-$(CONFIG_HYPERV_STORAGE)	+= hv_storvsc.o
 
 obj-$(CONFIG_ARM)		+= arm/

+ 1026 - 0
drivers/scsi/xen-scsifront.c

@@ -0,0 +1,1026 @@
+/*
+ * Xen SCSI frontend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/wait.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/blkdev.h>
+#include <linux/pfn.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/vscsiif.h>
+#include <xen/interface/io/protocols.h>
+
+#include <asm/xen/hypervisor.h>
+
+
+#define GRANT_INVALID_REF	0
+
+#define VSCSIFRONT_OP_ADD_LUN	1
+#define VSCSIFRONT_OP_DEL_LUN	2
+
+/* Tuning point. */
+#define VSCSIIF_DEFAULT_CMD_PER_LUN 10
+#define VSCSIIF_MAX_TARGET          64
+#define VSCSIIF_MAX_LUN             255
+
+#define VSCSIIF_RING_SIZE	__CONST_RING_SIZE(vscsiif, PAGE_SIZE)
+#define VSCSIIF_MAX_REQS	VSCSIIF_RING_SIZE
+
+#define vscsiif_grants_sg(_sg)	(PFN_UP((_sg) *		\
+				sizeof(struct scsiif_request_segment)))
+
+struct vscsifrnt_shadow {
+	/* command between backend and frontend */
+	unsigned char act;
+	uint16_t rqid;
+
+	unsigned int nr_grants;		/* number of grants in gref[] */
+	struct scsiif_request_segment *sg;	/* scatter/gather elements */
+
+	/* Do reset or abort function. */
+	wait_queue_head_t wq_reset;	/* reset work queue           */
+	int wait_reset;			/* reset work queue condition */
+	int32_t rslt_reset;		/* reset response status:     */
+					/* SUCCESS or FAILED or:      */
+#define RSLT_RESET_WAITING	0
+#define RSLT_RESET_ERR		-1
+
+	/* Requested struct scsi_cmnd is stored from kernel. */
+	struct scsi_cmnd *sc;
+	int gref[vscsiif_grants_sg(SG_ALL) + SG_ALL];
+};
+
+struct vscsifrnt_info {
+	struct xenbus_device *dev;
+
+	struct Scsi_Host *host;
+	int host_active;
+
+	unsigned int evtchn;
+	unsigned int irq;
+
+	grant_ref_t ring_ref;
+	struct vscsiif_front_ring ring;
+	struct vscsiif_response	ring_rsp;
+
+	spinlock_t shadow_lock;
+	DECLARE_BITMAP(shadow_free_bitmap, VSCSIIF_MAX_REQS);
+	struct vscsifrnt_shadow *shadow[VSCSIIF_MAX_REQS];
+
+	wait_queue_head_t wq_sync;
+	unsigned int wait_ring_available:1;
+
+	char dev_state_path[64];
+	struct task_struct *curr;
+};
+
+static DEFINE_MUTEX(scsifront_mutex);
+
+static void scsifront_wake_up(struct vscsifrnt_info *info)
+{
+	info->wait_ring_available = 0;
+	wake_up(&info->wq_sync);
+}
+
+static int scsifront_get_rqid(struct vscsifrnt_info *info)
+{
+	unsigned long flags;
+	int free;
+
+	spin_lock_irqsave(&info->shadow_lock, flags);
+
+	free = find_first_bit(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+	__clear_bit(free, info->shadow_free_bitmap);
+
+	spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+	return free;
+}
+
+static int _scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id)
+{
+	int empty = bitmap_empty(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+
+	__set_bit(id, info->shadow_free_bitmap);
+	info->shadow[id] = NULL;
+
+	return empty || info->wait_ring_available;
+}
+
+static void scsifront_put_rqid(struct vscsifrnt_info *info, uint32_t id)
+{
+	unsigned long flags;
+	int kick;
+
+	spin_lock_irqsave(&info->shadow_lock, flags);
+	kick = _scsifront_put_rqid(info, id);
+	spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+	if (kick)
+		scsifront_wake_up(info);
+}
+
+static struct vscsiif_request *scsifront_pre_req(struct vscsifrnt_info *info)
+{
+	struct vscsiif_front_ring *ring = &(info->ring);
+	struct vscsiif_request *ring_req;
+	uint32_t id;
+
+	id = scsifront_get_rqid(info);	/* use id in response */
+	if (id >= VSCSIIF_MAX_REQS)
+		return NULL;
+
+	ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt);
+
+	ring->req_prod_pvt++;
+
+	ring_req->rqid = (uint16_t)id;
+
+	return ring_req;
+}
+
+static void scsifront_do_request(struct vscsifrnt_info *info)
+{
+	struct vscsiif_front_ring *ring = &(info->ring);
+	int notify;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify);
+	if (notify)
+		notify_remote_via_irq(info->irq);
+}
+
+static void scsifront_gnttab_done(struct vscsifrnt_info *info, uint32_t id)
+{
+	struct vscsifrnt_shadow *s = info->shadow[id];
+	int i;
+
+	if (s->sc->sc_data_direction == DMA_NONE)
+		return;
+
+	for (i = 0; i < s->nr_grants; i++) {
+		if (unlikely(gnttab_query_foreign_access(s->gref[i]) != 0)) {
+			shost_printk(KERN_ALERT, info->host, KBUILD_MODNAME
+				     "grant still in use by backend\n");
+			BUG();
+		}
+		gnttab_end_foreign_access(s->gref[i], 0, 0UL);
+	}
+
+	kfree(s->sg);
+}
+
+static void scsifront_cdb_cmd_done(struct vscsifrnt_info *info,
+				   struct vscsiif_response *ring_rsp)
+{
+	struct scsi_cmnd *sc;
+	uint32_t id;
+	uint8_t sense_len;
+
+	id = ring_rsp->rqid;
+	sc = info->shadow[id]->sc;
+
+	BUG_ON(sc == NULL);
+
+	scsifront_gnttab_done(info, id);
+	scsifront_put_rqid(info, id);
+
+	sc->result = ring_rsp->rslt;
+	scsi_set_resid(sc, ring_rsp->residual_len);
+
+	sense_len = min_t(uint8_t, VSCSIIF_SENSE_BUFFERSIZE,
+			  ring_rsp->sense_len);
+
+	if (sense_len)
+		memcpy(sc->sense_buffer, ring_rsp->sense_buffer, sense_len);
+
+	sc->scsi_done(sc);
+}
+
+static void scsifront_sync_cmd_done(struct vscsifrnt_info *info,
+				    struct vscsiif_response *ring_rsp)
+{
+	uint16_t id = ring_rsp->rqid;
+	unsigned long flags;
+	struct vscsifrnt_shadow *shadow = info->shadow[id];
+	int kick;
+
+	spin_lock_irqsave(&info->shadow_lock, flags);
+	shadow->wait_reset = 1;
+	switch (shadow->rslt_reset) {
+	case RSLT_RESET_WAITING:
+		shadow->rslt_reset = ring_rsp->rslt;
+		break;
+	case RSLT_RESET_ERR:
+		kick = _scsifront_put_rqid(info, id);
+		spin_unlock_irqrestore(&info->shadow_lock, flags);
+		kfree(shadow);
+		if (kick)
+			scsifront_wake_up(info);
+		return;
+	default:
+		shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+			     "bad reset state %d, possibly leaking %u\n",
+			     shadow->rslt_reset, id);
+		break;
+	}
+	spin_unlock_irqrestore(&info->shadow_lock, flags);
+
+	wake_up(&shadow->wq_reset);
+}
+
+static int scsifront_cmd_done(struct vscsifrnt_info *info)
+{
+	struct vscsiif_response *ring_rsp;
+	RING_IDX i, rp;
+	int more_to_do = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(info->host->host_lock, flags);
+
+	rp = info->ring.sring->rsp_prod;
+	rmb();	/* ordering required respective to dom0 */
+	for (i = info->ring.rsp_cons; i != rp; i++) {
+
+		ring_rsp = RING_GET_RESPONSE(&info->ring, i);
+
+		if (WARN(ring_rsp->rqid >= VSCSIIF_MAX_REQS ||
+			 test_bit(ring_rsp->rqid, info->shadow_free_bitmap),
+			 "illegal rqid %u returned by backend!\n",
+			 ring_rsp->rqid))
+			continue;
+
+		if (info->shadow[ring_rsp->rqid]->act == VSCSIIF_ACT_SCSI_CDB)
+			scsifront_cdb_cmd_done(info, ring_rsp);
+		else
+			scsifront_sync_cmd_done(info, ring_rsp);
+	}
+
+	info->ring.rsp_cons = i;
+
+	if (i != info->ring.req_prod_pvt)
+		RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+	else
+		info->ring.sring->rsp_event = i + 1;
+
+	info->wait_ring_available = 0;
+
+	spin_unlock_irqrestore(info->host->host_lock, flags);
+
+	wake_up(&info->wq_sync);
+
+	return more_to_do;
+}
+
+static irqreturn_t scsifront_irq_fn(int irq, void *dev_id)
+{
+	struct vscsifrnt_info *info = dev_id;
+
+	while (scsifront_cmd_done(info))
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+
+	return IRQ_HANDLED;
+}
+
+static int map_data_for_request(struct vscsifrnt_info *info,
+				struct scsi_cmnd *sc,
+				struct vscsiif_request *ring_req,
+				struct vscsifrnt_shadow *shadow)
+{
+	grant_ref_t gref_head;
+	struct page *page;
+	int err, ref, ref_cnt = 0;
+	int grant_ro = (sc->sc_data_direction == DMA_TO_DEVICE);
+	unsigned int i, off, len, bytes;
+	unsigned int data_len = scsi_bufflen(sc);
+	unsigned int data_grants = 0, seg_grants = 0;
+	struct scatterlist *sg;
+	unsigned long mfn;
+	struct scsiif_request_segment *seg;
+
+	ring_req->nr_segments = 0;
+	if (sc->sc_data_direction == DMA_NONE || !data_len)
+		return 0;
+
+	scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i)
+		data_grants += PFN_UP(sg->offset + sg->length);
+
+	if (data_grants > VSCSIIF_SG_TABLESIZE) {
+		if (data_grants > info->host->sg_tablesize) {
+			shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+			     "Unable to map request_buffer for command!\n");
+			return -E2BIG;
+		}
+		seg_grants = vscsiif_grants_sg(data_grants);
+		shadow->sg = kcalloc(data_grants,
+			sizeof(struct scsiif_request_segment), GFP_ATOMIC);
+		if (!shadow->sg)
+			return -ENOMEM;
+	}
+	seg = shadow->sg ? : ring_req->seg;
+
+	err = gnttab_alloc_grant_references(seg_grants + data_grants,
+					    &gref_head);
+	if (err) {
+		kfree(shadow->sg);
+		shost_printk(KERN_ERR, info->host, KBUILD_MODNAME
+			     "gnttab_alloc_grant_references() error\n");
+		return -ENOMEM;
+	}
+
+	if (seg_grants) {
+		page = virt_to_page(seg);
+		off = (unsigned long)seg & ~PAGE_MASK;
+		len = sizeof(struct scsiif_request_segment) * data_grants;
+		while (len > 0) {
+			bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			mfn = pfn_to_mfn(page_to_pfn(page));
+			gnttab_grant_foreign_access_ref(ref,
+				info->dev->otherend_id, mfn, 1);
+			shadow->gref[ref_cnt] = ref;
+			ring_req->seg[ref_cnt].gref   = ref;
+			ring_req->seg[ref_cnt].offset = (uint16_t)off;
+			ring_req->seg[ref_cnt].length = (uint16_t)bytes;
+
+			page++;
+			len -= bytes;
+			off = 0;
+			ref_cnt++;
+		}
+		BUG_ON(seg_grants < ref_cnt);
+		seg_grants = ref_cnt;
+	}
+
+	scsi_for_each_sg(sc, sg, scsi_sg_count(sc), i) {
+		page = sg_page(sg);
+		off = sg->offset;
+		len = sg->length;
+
+		while (len > 0 && data_len > 0) {
+			/*
+			 * sg sends a scatterlist that is larger than
+			 * the data_len it wants transferred for certain
+			 * IO sizes.
+			 */
+			bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+			bytes = min(bytes, data_len);
+
+			ref = gnttab_claim_grant_reference(&gref_head);
+			BUG_ON(ref == -ENOSPC);
+
+			mfn = pfn_to_mfn(page_to_pfn(page));
+			gnttab_grant_foreign_access_ref(ref,
+				info->dev->otherend_id, mfn, grant_ro);
+
+			shadow->gref[ref_cnt] = ref;
+			seg->gref   = ref;
+			seg->offset = (uint16_t)off;
+			seg->length = (uint16_t)bytes;
+
+			page++;
+			seg++;
+			len -= bytes;
+			data_len -= bytes;
+			off = 0;
+			ref_cnt++;
+		}
+	}
+
+	if (seg_grants)
+		ring_req->nr_segments = VSCSIIF_SG_GRANT | seg_grants;
+	else
+		ring_req->nr_segments = (uint8_t)ref_cnt;
+	shadow->nr_grants = ref_cnt;
+
+	return 0;
+}
+
+static struct vscsiif_request *scsifront_command2ring(
+		struct vscsifrnt_info *info, struct scsi_cmnd *sc,
+		struct vscsifrnt_shadow *shadow)
+{
+	struct vscsiif_request *ring_req;
+
+	memset(shadow, 0, sizeof(*shadow));
+
+	ring_req = scsifront_pre_req(info);
+	if (!ring_req)
+		return NULL;
+
+	info->shadow[ring_req->rqid] = shadow;
+	shadow->rqid = ring_req->rqid;
+
+	ring_req->id      = sc->device->id;
+	ring_req->lun     = sc->device->lun;
+	ring_req->channel = sc->device->channel;
+	ring_req->cmd_len = sc->cmd_len;
+
+	BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
+
+	memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
+
+	ring_req->sc_data_direction   = (uint8_t)sc->sc_data_direction;
+	ring_req->timeout_per_command = sc->request->timeout / HZ;
+
+	return ring_req;
+}
+
+static int scsifront_queuecommand(struct Scsi_Host *shost,
+				  struct scsi_cmnd *sc)
+{
+	struct vscsifrnt_info *info = shost_priv(shost);
+	struct vscsiif_request *ring_req;
+	struct vscsifrnt_shadow *shadow = scsi_cmd_priv(sc);
+	unsigned long flags;
+	int err;
+	uint16_t rqid;
+
+	spin_lock_irqsave(shost->host_lock, flags);
+	if (RING_FULL(&info->ring))
+		goto busy;
+
+	ring_req = scsifront_command2ring(info, sc, shadow);
+	if (!ring_req)
+		goto busy;
+
+	sc->result = 0;
+
+	rqid = ring_req->rqid;
+	ring_req->act = VSCSIIF_ACT_SCSI_CDB;
+
+	shadow->sc  = sc;
+	shadow->act = VSCSIIF_ACT_SCSI_CDB;
+
+	err = map_data_for_request(info, sc, ring_req, shadow);
+	if (err < 0) {
+		pr_debug("%s: err %d\n", __func__, err);
+		scsifront_put_rqid(info, rqid);
+		spin_unlock_irqrestore(shost->host_lock, flags);
+		if (err == -ENOMEM)
+			return SCSI_MLQUEUE_HOST_BUSY;
+		sc->result = DID_ERROR << 16;
+		sc->scsi_done(sc);
+		return 0;
+	}
+
+	scsifront_do_request(info);
+	spin_unlock_irqrestore(shost->host_lock, flags);
+
+	return 0;
+
+busy:
+	spin_unlock_irqrestore(shost->host_lock, flags);
+	pr_debug("%s: busy\n", __func__);
+	return SCSI_MLQUEUE_HOST_BUSY;
+}
+
+/*
+ * Any exception handling (reset or abort) must be forwarded to the backend.
+ * We have to wait until an answer is returned. This answer contains the
+ * result to be returned to the requestor.
+ */
+static int scsifront_action_handler(struct scsi_cmnd *sc, uint8_t act)
+{
+	struct Scsi_Host *host = sc->device->host;
+	struct vscsifrnt_info *info = shost_priv(host);
+	struct vscsifrnt_shadow *shadow, *s = scsi_cmd_priv(sc);
+	struct vscsiif_request *ring_req;
+	int err = 0;
+
+	shadow = kmalloc(sizeof(*shadow), GFP_NOIO);
+	if (!shadow)
+		return FAILED;
+
+	spin_lock_irq(host->host_lock);
+
+	for (;;) {
+		if (!RING_FULL(&info->ring)) {
+			ring_req = scsifront_command2ring(info, sc, shadow);
+			if (ring_req)
+				break;
+		}
+		if (err) {
+			spin_unlock_irq(host->host_lock);
+			kfree(shadow);
+			return FAILED;
+		}
+		info->wait_ring_available = 1;
+		spin_unlock_irq(host->host_lock);
+		err = wait_event_interruptible(info->wq_sync,
+					       !info->wait_ring_available);
+		spin_lock_irq(host->host_lock);
+	}
+
+	ring_req->act = act;
+	ring_req->ref_rqid = s->rqid;
+
+	shadow->act = act;
+	shadow->rslt_reset = RSLT_RESET_WAITING;
+	init_waitqueue_head(&shadow->wq_reset);
+
+	ring_req->nr_segments = 0;
+
+	scsifront_do_request(info);
+
+	spin_unlock_irq(host->host_lock);
+	err = wait_event_interruptible(shadow->wq_reset, shadow->wait_reset);
+	spin_lock_irq(host->host_lock);
+
+	if (!err) {
+		err = shadow->rslt_reset;
+		scsifront_put_rqid(info, shadow->rqid);
+		kfree(shadow);
+	} else {
+		spin_lock(&info->shadow_lock);
+		shadow->rslt_reset = RSLT_RESET_ERR;
+		spin_unlock(&info->shadow_lock);
+		err = FAILED;
+	}
+
+	spin_unlock_irq(host->host_lock);
+	return err;
+}
+
+static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
+{
+	pr_debug("%s\n", __func__);
+	return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_ABORT);
+}
+
+static int scsifront_dev_reset_handler(struct scsi_cmnd *sc)
+{
+	pr_debug("%s\n", __func__);
+	return scsifront_action_handler(sc, VSCSIIF_ACT_SCSI_RESET);
+}
+
+static int scsifront_sdev_configure(struct scsi_device *sdev)
+{
+	struct vscsifrnt_info *info = shost_priv(sdev->host);
+
+	if (info && current == info->curr)
+		xenbus_printf(XBT_NIL, info->dev->nodename,
+			      info->dev_state_path, "%d", XenbusStateConnected);
+
+	return 0;
+}
+
+static void scsifront_sdev_destroy(struct scsi_device *sdev)
+{
+	struct vscsifrnt_info *info = shost_priv(sdev->host);
+
+	if (info && current == info->curr)
+		xenbus_printf(XBT_NIL, info->dev->nodename,
+			      info->dev_state_path, "%d", XenbusStateClosed);
+}
+
+static struct scsi_host_template scsifront_sht = {
+	.module			= THIS_MODULE,
+	.name			= "Xen SCSI frontend driver",
+	.queuecommand		= scsifront_queuecommand,
+	.eh_abort_handler	= scsifront_eh_abort_handler,
+	.eh_device_reset_handler = scsifront_dev_reset_handler,
+	.slave_configure	= scsifront_sdev_configure,
+	.slave_destroy		= scsifront_sdev_destroy,
+	.cmd_per_lun		= VSCSIIF_DEFAULT_CMD_PER_LUN,
+	.can_queue		= VSCSIIF_MAX_REQS,
+	.this_id		= -1,
+	.cmd_size		= sizeof(struct vscsifrnt_shadow),
+	.sg_tablesize		= VSCSIIF_SG_TABLESIZE,
+	.use_clustering		= DISABLE_CLUSTERING,
+	.proc_name		= "scsifront",
+};
+
+static int scsifront_alloc_ring(struct vscsifrnt_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	struct vscsiif_sring *sring;
+	int err = -ENOMEM;
+
+	/***** Frontend to Backend ring start *****/
+	sring = (struct vscsiif_sring *)__get_free_page(GFP_KERNEL);
+	if (!sring) {
+		xenbus_dev_fatal(dev, err,
+			"fail to allocate shared ring (Front to Back)");
+		return err;
+	}
+	SHARED_RING_INIT(sring);
+	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	err = xenbus_grant_ring(dev, virt_to_mfn(sring));
+	if (err < 0) {
+		free_page((unsigned long)sring);
+		xenbus_dev_fatal(dev, err,
+			"fail to grant shared ring (Front to Back)");
+		return err;
+	}
+	info->ring_ref = err;
+
+	err = xenbus_alloc_evtchn(dev, &info->evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "xenbus_alloc_evtchn");
+		goto free_gnttab;
+	}
+
+	err = bind_evtchn_to_irq(info->evtchn);
+	if (err <= 0) {
+		xenbus_dev_fatal(dev, err, "bind_evtchn_to_irq");
+		goto free_gnttab;
+	}
+
+	info->irq = err;
+
+	err = request_threaded_irq(info->irq, NULL, scsifront_irq_fn,
+				   IRQF_ONESHOT, "scsifront", info);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "request_threaded_irq");
+		goto free_irq;
+	}
+
+	return 0;
+
+/* free resource */
+free_irq:
+	unbind_from_irqhandler(info->irq, info);
+free_gnttab:
+	gnttab_end_foreign_access(info->ring_ref, 0,
+				  (unsigned long)info->ring.sring);
+
+	return err;
+}
+
+static int scsifront_init_ring(struct vscsifrnt_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	struct xenbus_transaction xbt;
+	int err;
+
+	pr_debug("%s\n", __func__);
+
+	err = scsifront_alloc_ring(info);
+	if (err)
+		return err;
+	pr_debug("%s: %u %u\n", __func__, info->ring_ref, info->evtchn);
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err)
+		xenbus_dev_fatal(dev, err, "starting transaction");
+
+	err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
+			    info->ring_ref);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "%s", "writing ring-ref");
+		goto fail;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
+			    info->evtchn);
+
+	if (err) {
+		xenbus_dev_fatal(dev, err, "%s", "writing event-channel");
+		goto fail;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto free_sring;
+	}
+
+	return 0;
+
+fail:
+	xenbus_transaction_end(xbt, 1);
+free_sring:
+	unbind_from_irqhandler(info->irq, info);
+	gnttab_end_foreign_access(info->ring_ref, 0,
+				  (unsigned long)info->ring.sring);
+
+	return err;
+}
+
+
+static int scsifront_probe(struct xenbus_device *dev,
+			   const struct xenbus_device_id *id)
+{
+	struct vscsifrnt_info *info;
+	struct Scsi_Host *host;
+	int err = -ENOMEM;
+	char name[TASK_COMM_LEN];
+
+	host = scsi_host_alloc(&scsifront_sht, sizeof(*info));
+	if (!host) {
+		xenbus_dev_fatal(dev, err, "fail to allocate scsi host");
+		return err;
+	}
+	info = (struct vscsifrnt_info *)host->hostdata;
+
+	dev_set_drvdata(&dev->dev, info);
+	info->dev = dev;
+
+	bitmap_fill(info->shadow_free_bitmap, VSCSIIF_MAX_REQS);
+
+	err = scsifront_init_ring(info);
+	if (err) {
+		scsi_host_put(host);
+		return err;
+	}
+
+	init_waitqueue_head(&info->wq_sync);
+	spin_lock_init(&info->shadow_lock);
+
+	snprintf(name, TASK_COMM_LEN, "vscsiif.%d", host->host_no);
+
+	host->max_id      = VSCSIIF_MAX_TARGET;
+	host->max_channel = 0;
+	host->max_lun     = VSCSIIF_MAX_LUN;
+	host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512;
+	host->max_cmd_len = VSCSIIF_MAX_COMMAND_SIZE;
+
+	err = scsi_add_host(host, &dev->dev);
+	if (err) {
+		dev_err(&dev->dev, "fail to add scsi host %d\n", err);
+		goto free_sring;
+	}
+	info->host = host;
+	info->host_active = 1;
+
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+free_sring:
+	unbind_from_irqhandler(info->irq, info);
+	gnttab_end_foreign_access(info->ring_ref, 0,
+				  (unsigned long)info->ring.sring);
+	scsi_host_put(host);
+	return err;
+}
+
+static int scsifront_remove(struct xenbus_device *dev)
+{
+	struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+	pr_debug("%s: %s removed\n", __func__, dev->nodename);
+
+	mutex_lock(&scsifront_mutex);
+	if (info->host_active) {
+		/* Scsi_host not yet removed */
+		scsi_remove_host(info->host);
+		info->host_active = 0;
+	}
+	mutex_unlock(&scsifront_mutex);
+
+	gnttab_end_foreign_access(info->ring_ref, 0,
+				  (unsigned long)info->ring.sring);
+	unbind_from_irqhandler(info->irq, info);
+
+	scsi_host_put(info->host);
+
+	return 0;
+}
+
+static void scsifront_disconnect(struct vscsifrnt_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	struct Scsi_Host *host = info->host;
+
+	pr_debug("%s: %s disconnect\n", __func__, dev->nodename);
+
+	/*
+	 * When this function is executed, all devices of
+	 * Frontend have been deleted.
+	 * Therefore, it need not block I/O before remove_host.
+	 */
+
+	mutex_lock(&scsifront_mutex);
+	if (info->host_active) {
+		scsi_remove_host(host);
+		info->host_active = 0;
+	}
+	mutex_unlock(&scsifront_mutex);
+
+	xenbus_frontend_closed(dev);
+}
+
+static void scsifront_do_lun_hotplug(struct vscsifrnt_info *info, int op)
+{
+	struct xenbus_device *dev = info->dev;
+	int i, err = 0;
+	char str[64];
+	char **dir;
+	unsigned int dir_n = 0;
+	unsigned int device_state;
+	unsigned int hst, chn, tgt, lun;
+	struct scsi_device *sdev;
+
+	dir = xenbus_directory(XBT_NIL, dev->otherend, "vscsi-devs", &dir_n);
+	if (IS_ERR(dir))
+		return;
+
+	/* mark current task as the one allowed to modify device states */
+	BUG_ON(info->curr);
+	info->curr = current;
+
+	for (i = 0; i < dir_n; i++) {
+		/* read status */
+		snprintf(str, sizeof(str), "vscsi-devs/%s/state", dir[i]);
+		err = xenbus_scanf(XBT_NIL, dev->otherend, str, "%u",
+				   &device_state);
+		if (XENBUS_EXIST_ERR(err))
+			continue;
+
+		/* virtual SCSI device */
+		snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", dir[i]);
+		err = xenbus_scanf(XBT_NIL, dev->otherend, str,
+				   "%u:%u:%u:%u", &hst, &chn, &tgt, &lun);
+		if (XENBUS_EXIST_ERR(err))
+			continue;
+
+		/*
+		 * Front device state path, used in slave_configure called
+		 * on successfull scsi_add_device, and in slave_destroy called
+		 * on remove of a device.
+		 */
+		snprintf(info->dev_state_path, sizeof(info->dev_state_path),
+			 "vscsi-devs/%s/state", dir[i]);
+
+		switch (op) {
+		case VSCSIFRONT_OP_ADD_LUN:
+			if (device_state != XenbusStateInitialised)
+				break;
+
+			if (scsi_add_device(info->host, chn, tgt, lun)) {
+				dev_err(&dev->dev, "scsi_add_device\n");
+				xenbus_printf(XBT_NIL, dev->nodename,
+					      info->dev_state_path,
+					      "%d", XenbusStateClosed);
+			}
+			break;
+		case VSCSIFRONT_OP_DEL_LUN:
+			if (device_state != XenbusStateClosing)
+				break;
+
+			sdev = scsi_device_lookup(info->host, chn, tgt, lun);
+			if (sdev) {
+				scsi_remove_device(sdev);
+				scsi_device_put(sdev);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	info->curr = NULL;
+
+	kfree(dir);
+}
+
+static void scsifront_read_backend_params(struct xenbus_device *dev,
+					  struct vscsifrnt_info *info)
+{
+	unsigned int sg_grant;
+	int ret;
+	struct Scsi_Host *host = info->host;
+
+	ret = xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg-grant", "%u",
+			   &sg_grant);
+	if (ret == 1 && sg_grant) {
+		sg_grant = min_t(unsigned int, sg_grant, SG_ALL);
+		sg_grant = max_t(unsigned int, sg_grant, VSCSIIF_SG_TABLESIZE);
+		host->sg_tablesize = min_t(unsigned int, sg_grant,
+			VSCSIIF_SG_TABLESIZE * PAGE_SIZE /
+			sizeof(struct scsiif_request_segment));
+		host->max_sectors = (host->sg_tablesize - 1) * PAGE_SIZE / 512;
+	}
+	dev_info(&dev->dev, "using up to %d SG entries\n", host->sg_tablesize);
+}
+
+static void scsifront_backend_changed(struct xenbus_device *dev,
+				      enum xenbus_state backend_state)
+{
+	struct vscsifrnt_info *info = dev_get_drvdata(&dev->dev);
+
+	pr_debug("%s: %p %u %u\n", __func__, dev, dev->state, backend_state);
+
+	switch (backend_state) {
+	case XenbusStateUnknown:
+	case XenbusStateInitialising:
+	case XenbusStateInitWait:
+	case XenbusStateInitialised:
+		break;
+
+	case XenbusStateConnected:
+		scsifront_read_backend_params(dev, info);
+		if (xenbus_read_driver_state(dev->nodename) ==
+		    XenbusStateInitialised)
+			scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+
+		if (dev->state != XenbusStateConnected)
+			xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
+			break;
+		/* Missed the backend's Closing state -- fallthrough */
+	case XenbusStateClosing:
+		scsifront_disconnect(info);
+		break;
+
+	case XenbusStateReconfiguring:
+		scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_DEL_LUN);
+		xenbus_switch_state(dev, XenbusStateReconfiguring);
+		break;
+
+	case XenbusStateReconfigured:
+		scsifront_do_lun_hotplug(info, VSCSIFRONT_OP_ADD_LUN);
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+	}
+}
+
+static const struct xenbus_device_id scsifront_ids[] = {
+	{ "vscsi" },
+	{ "" }
+};
+
+static struct xenbus_driver scsifront_driver = {
+	.ids			= scsifront_ids,
+	.probe			= scsifront_probe,
+	.remove			= scsifront_remove,
+	.otherend_changed	= scsifront_backend_changed,
+};
+
+static int __init scsifront_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	return xenbus_register_frontend(&scsifront_driver);
+}
+module_init(scsifront_init);
+
+static void __exit scsifront_exit(void)
+{
+	xenbus_unregister_driver(&scsifront_driver);
+}
+module_exit(scsifront_exit);
+
+MODULE_DESCRIPTION("Xen SCSI frontend driver");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("xen:vscsi");
+MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");

+ 4 - 5
drivers/tty/hvc/hvc_xen.c

@@ -347,8 +347,6 @@ static int xen_console_remove(struct xencons_info *info)
 }
 
 #ifdef CONFIG_HVC_XEN_FRONTEND
-static struct xenbus_driver xencons_driver;
-
 static int xencons_remove(struct xenbus_device *dev)
 {
 	return xen_console_remove(dev_get_drvdata(&dev->dev));
@@ -499,13 +497,14 @@ static const struct xenbus_device_id xencons_ids[] = {
 	{ "" }
 };
 
-
-static DEFINE_XENBUS_DRIVER(xencons, "xenconsole",
+static struct xenbus_driver xencons_driver = {
+	.name = "xenconsole",
+	.ids = xencons_ids,
 	.probe = xencons_probe,
 	.remove = xencons_remove,
 	.resume = xencons_resume,
 	.otherend_changed = xencons_backend_changed,
-);
+};
 #endif /* CONFIG_HVC_XEN_FRONTEND */
 
 static int __init xen_hvc_init(void)

+ 3 - 2
drivers/video/fbdev/xen-fbfront.c

@@ -684,12 +684,13 @@ static const struct xenbus_device_id xenfb_ids[] = {
 	{ "" }
 };
 
-static DEFINE_XENBUS_DRIVER(xenfb, ,
+static struct xenbus_driver xenfb_driver = {
+	.ids = xenfb_ids,
 	.probe = xenfb_probe,
 	.remove = xenfb_remove,
 	.resume = xenfb_resume,
 	.otherend_changed = xenfb_backend_changed,
-);
+};
 
 static int __init xenfb_init(void)
 {

+ 9 - 0
drivers/xen/Kconfig

@@ -172,6 +172,15 @@ config XEN_PCIDEV_BACKEND
 
 	  If in doubt, say m.
 
+config XEN_SCSI_BACKEND
+	tristate "XEN SCSI backend driver"
+	depends on XEN && XEN_BACKEND && TARGET_CORE
+	help
+	  The SCSI backend driver allows the kernel to export its SCSI Devices
+	  to other guests via a high-performance shared-memory interface.
+	  Only needed for systems running as XEN driver domains (e.g. Dom0) and
+	  if guests need generic access to SCSI devices.
+
 config XEN_PRIVCMD
 	tristate
 	depends on XEN

+ 1 - 0
drivers/xen/Makefile

@@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY)	+= xen-acpi-memhotplug.o
 obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)	+= xen-acpi-cpuhotplug.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
 obj-$(CONFIG_XEN_EFI)			+= efi.o
+obj-$(CONFIG_XEN_SCSI_BACKEND)		+= xen-scsiback.o
 xen-evtchn-y				:= evtchn.o
 xen-gntdev-y				:= gntdev.o
 xen-gntalloc-y				:= gntalloc.o

+ 2 - 0
drivers/xen/efi.c

@@ -27,6 +27,8 @@
 #include <xen/interface/platform.h>
 #include <xen/xen.h>
 
+#include <asm/page.h>
+
 #include <asm/xen/hypercall.h>
 
 #define INIT_EFI_OP(name) \

+ 3 - 2
drivers/xen/events/events_base.c

@@ -900,8 +900,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
 	return irq;
 }
 
-static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
-					  unsigned int remote_port)
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+				   unsigned int remote_port)
 {
 	struct evtchn_bind_interdomain bind_interdomain;
 	int err;
@@ -914,6 +914,7 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
 
 	return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
 }
+EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
 
 static int find_virq(unsigned int virq, unsigned int cpu)
 {

+ 1 - 1
drivers/xen/grant-table.c

@@ -592,7 +592,7 @@ static int grow_gnttab_list(unsigned int more_frames)
 	return 0;
 
 grow_nomem:
-	for ( ; i >= nr_glist_frames; i--)
+	while (i-- > nr_glist_frames)
 		free_page((unsigned long) gnttab_list[i]);
 	return -ENOMEM;
 }

+ 4 - 2
drivers/xen/xen-pciback/xenbus.c

@@ -719,11 +719,13 @@ static const struct xenbus_device_id xen_pcibk_ids[] = {
 	{""},
 };
 
-static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME,
+static struct xenbus_driver xen_pcibk_driver = {
+	.name                   = DRV_NAME,
+	.ids                    = xen_pcibk_ids,
 	.probe			= xen_pcibk_xenbus_probe,
 	.remove			= xen_pcibk_xenbus_remove,
 	.otherend_changed	= xen_pcibk_frontend_changed,
-);
+};
 
 const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;
 

+ 2126 - 0
drivers/xen/xen-scsiback.c

@@ -0,0 +1,2126 @@
+/*
+ * Xen SCSI backend driver
+ *
+ * Copyright (c) 2008, FUJITSU Limited
+ *
+ * Based on the blkback driver code.
+ * Adaption to kernel taget core infrastructure taken from vhost/scsi.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdarg.h>
+
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/gfp.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/configfs.h>
+
+#include <generated/utsrelease.h>
+
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
+
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+#include <target/target_core_configfs.h>
+#include <target/target_core_fabric_configfs.h>
+
+#include <asm/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/balloon.h>
+#include <xen/events.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/vscsiif.h>
+
+#define DPRINTK(_f, _a...)			\
+	pr_debug("(file=%s, line=%d) " _f, __FILE__ , __LINE__ , ## _a)
+
+#define VSCSI_VERSION	"v0.1"
+#define VSCSI_NAMELEN	32
+
+struct ids_tuple {
+	unsigned int hst;		/* host    */
+	unsigned int chn;		/* channel */
+	unsigned int tgt;		/* target  */
+	unsigned int lun;		/* LUN     */
+};
+
+struct v2p_entry {
+	struct ids_tuple v;		/* translate from */
+	struct scsiback_tpg *tpg;	/* translate to   */
+	unsigned int lun;
+	struct kref kref;
+	struct list_head l;
+};
+
+struct vscsibk_info {
+	struct xenbus_device *dev;
+
+	domid_t domid;
+	unsigned int irq;
+
+	struct vscsiif_back_ring ring;
+	int ring_error;
+
+	spinlock_t ring_lock;
+	atomic_t nr_unreplied_reqs;
+
+	spinlock_t v2p_lock;
+	struct list_head v2p_entry_lists;
+
+	wait_queue_head_t waiting_to_free;
+};
+
+/* theoretical maximum of grants for one request */
+#define VSCSI_MAX_GRANTS	(SG_ALL + VSCSIIF_SG_TABLESIZE)
+
+/*
+ * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one
+ * call to map/unmap grants. Don't choose it too large, as there are arrays
+ * with VSCSI_GRANT_BATCH elements allocated on the stack.
+ */
+#define VSCSI_GRANT_BATCH	16
+
+struct vscsibk_pend {
+	uint16_t rqid;
+
+	uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];
+	uint8_t cmd_len;
+
+	uint8_t sc_data_direction;
+	uint16_t n_sg;		/* real length of SG list */
+	uint16_t n_grants;	/* SG pages and potentially SG list */
+	uint32_t data_len;
+	uint32_t result;
+
+	struct vscsibk_info *info;
+	struct v2p_entry *v2p;
+	struct scatterlist *sgl;
+
+	uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+
+	grant_handle_t grant_handles[VSCSI_MAX_GRANTS];
+	struct page *pages[VSCSI_MAX_GRANTS];
+
+	struct se_cmd se_cmd;
+};
+
+struct scsiback_tmr {
+	atomic_t tmr_complete;
+	wait_queue_head_t tmr_wait;
+};
+
+struct scsiback_nexus {
+	/* Pointer to TCM session for I_T Nexus */
+	struct se_session *tvn_se_sess;
+};
+
+struct scsiback_tport {
+	/* SCSI protocol the tport is providing */
+	u8 tport_proto_id;
+	/* Binary World Wide unique Port Name for pvscsi Target port */
+	u64 tport_wwpn;
+	/* ASCII formatted WWPN for pvscsi Target port */
+	char tport_name[VSCSI_NAMELEN];
+	/* Returned by scsiback_make_tport() */
+	struct se_wwn tport_wwn;
+};
+
+struct scsiback_tpg {
+	/* scsiback port target portal group tag for TCM */
+	u16 tport_tpgt;
+	/* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */
+	int tv_tpg_port_count;
+	/* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */
+	int tv_tpg_fe_count;
+	/* list for scsiback_list */
+	struct list_head tv_tpg_list;
+	/* Used to protect access for tpg_nexus */
+	struct mutex tv_tpg_mutex;
+	/* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */
+	struct scsiback_nexus *tpg_nexus;
+	/* Pointer back to scsiback_tport */
+	struct scsiback_tport *tport;
+	/* Returned by scsiback_make_tpg() */
+	struct se_portal_group se_tpg;
+	/* alias used in xenstore */
+	char param_alias[VSCSI_NAMELEN];
+	/* list of info structures related to this target portal group */
+	struct list_head info_list;
+};
+
+#define SCSIBACK_INVALID_HANDLE (~0)
+
+static bool log_print_stat;
+module_param(log_print_stat, bool, 0644);
+
+static int scsiback_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in backend buffer");
+
+static struct kmem_cache *scsiback_cachep;
+static DEFINE_SPINLOCK(free_pages_lock);
+static int free_pages_num;
+static LIST_HEAD(scsiback_free_pages);
+
+/* Global spinlock to protect scsiback TPG list */
+static DEFINE_MUTEX(scsiback_mutex);
+static LIST_HEAD(scsiback_list);
+
+/* Local pointer to allocated TCM configfs fabric module */
+static struct target_fabric_configfs *scsiback_fabric_configfs;
+
+static void scsiback_get(struct vscsibk_info *info)
+{
+	atomic_inc(&info->nr_unreplied_reqs);
+}
+
+static void scsiback_put(struct vscsibk_info *info)
+{
+	if (atomic_dec_and_test(&info->nr_unreplied_reqs))
+		wake_up(&info->waiting_to_free);
+}
+
+static void put_free_pages(struct page **page, int num)
+{
+	unsigned long flags;
+	int i = free_pages_num + num, n = num;
+
+	if (num == 0)
+		return;
+	if (i > scsiback_max_buffer_pages) {
+		n = min(num, i - scsiback_max_buffer_pages);
+		free_xenballooned_pages(n, page + num - n);
+		n = num - n;
+	}
+	spin_lock_irqsave(&free_pages_lock, flags);
+	for (i = 0; i < n; i++)
+		list_add(&page[i]->lru, &scsiback_free_pages);
+	free_pages_num += n;
+	spin_unlock_irqrestore(&free_pages_lock, flags);
+}
+
+static int get_free_page(struct page **page)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&free_pages_lock, flags);
+	if (list_empty(&scsiback_free_pages)) {
+		spin_unlock_irqrestore(&free_pages_lock, flags);
+		return alloc_xenballooned_pages(1, page, false);
+	}
+	page[0] = list_first_entry(&scsiback_free_pages, struct page, lru);
+	list_del(&page[0]->lru);
+	free_pages_num--;
+	spin_unlock_irqrestore(&free_pages_lock, flags);
+	return 0;
+}
+
+static unsigned long vaddr_page(struct page *page)
+{
+	unsigned long pfn = page_to_pfn(page);
+
+	return (unsigned long)pfn_to_kaddr(pfn);
+}
+
+static unsigned long vaddr(struct vscsibk_pend *req, int seg)
+{
+	return vaddr_page(req->pages[seg]);
+}
+
+static void scsiback_print_status(char *sense_buffer, int errors,
+					struct vscsibk_pend *pending_req)
+{
+	struct scsiback_tpg *tpg = pending_req->v2p->tpg;
+
+	pr_err("xen-pvscsi[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n",
+	       tpg->tport->tport_name, pending_req->v2p->lun,
+	       pending_req->cmnd[0], status_byte(errors), msg_byte(errors),
+	       host_byte(errors), driver_byte(errors));
+
+	if (CHECK_CONDITION & status_byte(errors))
+		__scsi_print_sense("xen-pvscsi", sense_buffer,
+				   SCSI_SENSE_BUFFERSIZE);
+}
+
+static void scsiback_fast_flush_area(struct vscsibk_pend *req)
+{
+	struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH];
+	struct page *pages[VSCSI_GRANT_BATCH];
+	unsigned int i, invcount = 0;
+	grant_handle_t handle;
+	int err;
+
+	kfree(req->sgl);
+	req->sgl = NULL;
+	req->n_sg = 0;
+
+	if (!req->n_grants)
+		return;
+
+	for (i = 0; i < req->n_grants; i++) {
+		handle = req->grant_handles[i];
+		if (handle == SCSIBACK_INVALID_HANDLE)
+			continue;
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
+				    GNTMAP_host_map, handle);
+		req->grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+		pages[invcount] = req->pages[i];
+		put_page(pages[invcount]);
+		invcount++;
+		if (invcount < VSCSI_GRANT_BATCH)
+			continue;
+		err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+		BUG_ON(err);
+		invcount = 0;
+	}
+
+	if (invcount) {
+		err = gnttab_unmap_refs(unmap, NULL, pages, invcount);
+		BUG_ON(err);
+	}
+
+	put_free_pages(req->pages, req->n_grants);
+	req->n_grants = 0;
+}
+
+static void scsiback_free_translation_entry(struct kref *kref)
+{
+	struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref);
+	struct scsiback_tpg *tpg = entry->tpg;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_fe_count--;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	kfree(entry);
+}
+
+static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result,
+			uint32_t resid, struct vscsibk_pend *pending_req)
+{
+	struct vscsiif_response *ring_res;
+	struct vscsibk_info *info = pending_req->info;
+	int notify;
+	struct scsi_sense_hdr sshdr;
+	unsigned long flags;
+	unsigned len;
+
+	spin_lock_irqsave(&info->ring_lock, flags);
+
+	ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt);
+	info->ring.rsp_prod_pvt++;
+
+	ring_res->rslt   = result;
+	ring_res->rqid   = pending_req->rqid;
+
+	if (sense_buffer != NULL &&
+	    scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE,
+				 &sshdr)) {
+		len = min_t(unsigned, 8 + sense_buffer[7],
+			    VSCSIIF_SENSE_BUFFERSIZE);
+		memcpy(ring_res->sense_buffer, sense_buffer, len);
+		ring_res->sense_len = len;
+	} else {
+		ring_res->sense_len = 0;
+	}
+
+	ring_res->residual_len = resid;
+
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify);
+	spin_unlock_irqrestore(&info->ring_lock, flags);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+
+	if (pending_req->v2p)
+		kref_put(&pending_req->v2p->kref,
+			 scsiback_free_translation_entry);
+}
+
+static void scsiback_cmd_done(struct vscsibk_pend *pending_req)
+{
+	struct vscsibk_info *info = pending_req->info;
+	unsigned char *sense_buffer;
+	unsigned int resid;
+	int errors;
+
+	sense_buffer = pending_req->sense_buffer;
+	resid        = pending_req->se_cmd.residual_count;
+	errors       = pending_req->result;
+
+	if (errors && log_print_stat)
+		scsiback_print_status(sense_buffer, errors, pending_req);
+
+	scsiback_fast_flush_area(pending_req);
+	scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req);
+	scsiback_put(info);
+}
+
+static void scsiback_cmd_exec(struct vscsibk_pend *pending_req)
+{
+	struct se_cmd *se_cmd = &pending_req->se_cmd;
+	struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess;
+	int rc;
+
+	memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE);
+
+	memset(se_cmd, 0, sizeof(*se_cmd));
+
+	scsiback_get(pending_req->info);
+	rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd,
+			pending_req->sense_buffer, pending_req->v2p->lun,
+			pending_req->data_len, 0,
+			pending_req->sc_data_direction, 0,
+			pending_req->sgl, pending_req->n_sg,
+			NULL, 0, NULL, 0);
+	if (rc < 0) {
+		transport_send_check_condition_and_sense(se_cmd,
+				TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0);
+		transport_generic_free_cmd(se_cmd, 0);
+	}
+}
+
+static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map,
+	struct page **pg, grant_handle_t *grant, int cnt)
+{
+	int err, i;
+
+	if (!cnt)
+		return 0;
+
+	err = gnttab_map_refs(map, NULL, pg, cnt);
+	BUG_ON(err);
+	for (i = 0; i < cnt; i++) {
+		if (unlikely(map[i].status != GNTST_okay)) {
+			pr_err("xen-pvscsi: invalid buffer -- could not remap it\n");
+			map[i].handle = SCSIBACK_INVALID_HANDLE;
+			err = -ENOMEM;
+		} else {
+			get_page(pg[i]);
+		}
+		grant[i] = map[i].handle;
+	}
+	return err;
+}
+
+static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req,
+			struct scsiif_request_segment *seg, struct page **pg,
+			grant_handle_t *grant, int cnt, u32 flags)
+{
+	int mapcount = 0, i, err = 0;
+	struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH];
+	struct vscsibk_info *info = pending_req->info;
+
+	for (i = 0; i < cnt; i++) {
+		if (get_free_page(pg + mapcount)) {
+			put_free_pages(pg, mapcount);
+			pr_err("xen-pvscsi: no grant page\n");
+			return -ENOMEM;
+		}
+		gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]),
+				  flags, seg[i].gref, info->domid);
+		mapcount++;
+		if (mapcount < VSCSI_GRANT_BATCH)
+			continue;
+		err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+		pg += mapcount;
+		grant += mapcount;
+		pending_req->n_grants += mapcount;
+		if (err)
+			return err;
+		mapcount = 0;
+	}
+	err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount);
+	pending_req->n_grants += mapcount;
+	return err;
+}
+
+static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req,
+					struct vscsibk_pend *pending_req)
+{
+	u32 flags;
+	int i, err, n_segs, i_seg = 0;
+	struct page **pg;
+	struct scsiif_request_segment *seg;
+	unsigned long end_seg = 0;
+	unsigned int nr_segments = (unsigned int)ring_req->nr_segments;
+	unsigned int nr_sgl = 0;
+	struct scatterlist *sg;
+	grant_handle_t *grant;
+
+	pending_req->n_sg = 0;
+	pending_req->n_grants = 0;
+	pending_req->data_len = 0;
+
+	nr_segments &= ~VSCSIIF_SG_GRANT;
+	if (!nr_segments)
+		return 0;
+
+	if (nr_segments > VSCSIIF_SG_TABLESIZE) {
+		DPRINTK("xen-pvscsi: invalid parameter nr_seg = %d\n",
+			ring_req->nr_segments);
+		return -EINVAL;
+	}
+
+	if (ring_req->nr_segments & VSCSIIF_SG_GRANT) {
+		err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg,
+			pending_req->pages, pending_req->grant_handles,
+			nr_segments, GNTMAP_host_map | GNTMAP_readonly);
+		if (err)
+			return err;
+		nr_sgl = nr_segments;
+		nr_segments = 0;
+		for (i = 0; i < nr_sgl; i++) {
+			n_segs = ring_req->seg[i].length /
+				 sizeof(struct scsiif_request_segment);
+			if ((unsigned)ring_req->seg[i].offset +
+			    (unsigned)ring_req->seg[i].length > PAGE_SIZE ||
+			    n_segs * sizeof(struct scsiif_request_segment) !=
+			    ring_req->seg[i].length)
+				return -EINVAL;
+			nr_segments += n_segs;
+		}
+		if (nr_segments > SG_ALL) {
+			DPRINTK("xen-pvscsi: invalid nr_seg = %d\n",
+				nr_segments);
+			return -EINVAL;
+		}
+	}
+
+	/* free of (sgl) in fast_flush_area()*/
+	pending_req->sgl = kmalloc_array(nr_segments,
+					sizeof(struct scatterlist), GFP_KERNEL);
+	if (!pending_req->sgl)
+		return -ENOMEM;
+
+	sg_init_table(pending_req->sgl, nr_segments);
+	pending_req->n_sg = nr_segments;
+
+	flags = GNTMAP_host_map;
+	if (pending_req->sc_data_direction == DMA_TO_DEVICE)
+		flags |= GNTMAP_readonly;
+
+	pg = pending_req->pages + nr_sgl;
+	grant = pending_req->grant_handles + nr_sgl;
+	if (!nr_sgl) {
+		seg = ring_req->seg;
+		err = scsiback_gnttab_data_map_list(pending_req, seg,
+			pg, grant, nr_segments, flags);
+		if (err)
+			return err;
+	} else {
+		for (i = 0; i < nr_sgl; i++) {
+			seg = (struct scsiif_request_segment *)(
+			      vaddr(pending_req, i) + ring_req->seg[i].offset);
+			n_segs = ring_req->seg[i].length /
+				 sizeof(struct scsiif_request_segment);
+			err = scsiback_gnttab_data_map_list(pending_req, seg,
+				pg, grant, n_segs, flags);
+			if (err)
+				return err;
+			pg += n_segs;
+			grant += n_segs;
+		}
+		end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset;
+		seg = (struct scsiif_request_segment *)end_seg;
+		end_seg += ring_req->seg[0].length;
+		pg = pending_req->pages + nr_sgl;
+	}
+
+	for_each_sg(pending_req->sgl, sg, nr_segments, i) {
+		sg_set_page(sg, pg[i], seg->length, seg->offset);
+		pending_req->data_len += seg->length;
+		seg++;
+		if (nr_sgl && (unsigned long)seg >= end_seg) {
+			i_seg++;
+			end_seg = vaddr(pending_req, i_seg) +
+				  ring_req->seg[i_seg].offset;
+			seg = (struct scsiif_request_segment *)end_seg;
+			end_seg += ring_req->seg[i_seg].length;
+		}
+		if (sg->offset >= PAGE_SIZE ||
+		    sg->length > PAGE_SIZE ||
+		    sg->offset + sg->length > PAGE_SIZE)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void scsiback_disconnect(struct vscsibk_info *info)
+{
+	wait_event(info->waiting_to_free,
+		atomic_read(&info->nr_unreplied_reqs) == 0);
+
+	unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+	xenbus_unmap_ring_vfree(info->dev, info->ring.sring);
+}
+
+static void scsiback_device_action(struct vscsibk_pend *pending_req,
+	enum tcm_tmreq_table act, int tag)
+{
+	int rc, err = FAILED;
+	struct scsiback_tpg *tpg = pending_req->v2p->tpg;
+	struct se_cmd *se_cmd = &pending_req->se_cmd;
+	struct scsiback_tmr *tmr;
+
+	tmr = kzalloc(sizeof(struct scsiback_tmr), GFP_KERNEL);
+	if (!tmr)
+		goto out;
+
+	init_waitqueue_head(&tmr->tmr_wait);
+
+	transport_init_se_cmd(se_cmd, tpg->se_tpg.se_tpg_tfo,
+		tpg->tpg_nexus->tvn_se_sess, 0, DMA_NONE, MSG_SIMPLE_TAG,
+		&pending_req->sense_buffer[0]);
+
+	rc = core_tmr_alloc_req(se_cmd, tmr, act, GFP_KERNEL);
+	if (rc < 0)
+		goto out;
+
+	se_cmd->se_tmr_req->ref_task_tag = tag;
+
+	if (transport_lookup_tmr_lun(se_cmd, pending_req->v2p->lun) < 0)
+		goto out;
+
+	transport_generic_handle_tmr(se_cmd);
+	wait_event(tmr->tmr_wait, atomic_read(&tmr->tmr_complete));
+
+	err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ?
+		SUCCESS : FAILED;
+
+out:
+	if (tmr) {
+		transport_generic_free_cmd(&pending_req->se_cmd, 1);
+		kfree(tmr);
+	}
+
+	scsiback_do_resp_with_sense(NULL, err, 0, pending_req);
+
+	kmem_cache_free(scsiback_cachep, pending_req);
+}
+
+/*
+  Perform virtual to physical translation
+*/
+static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info,
+			struct ids_tuple *v)
+{
+	struct v2p_entry *entry;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+	list_for_each_entry(entry, head, l) {
+		if ((entry->v.chn == v->chn) &&
+		    (entry->v.tgt == v->tgt) &&
+		    (entry->v.lun == v->lun)) {
+			kref_get(&entry->kref);
+			goto out;
+		}
+	}
+	entry = NULL;
+
+out:
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+	return entry;
+}
+
+static int prepare_pending_reqs(struct vscsibk_info *info,
+				struct vscsiif_request *ring_req,
+				struct vscsibk_pend *pending_req)
+{
+	struct v2p_entry *v2p;
+	struct ids_tuple vir;
+
+	pending_req->rqid       = ring_req->rqid;
+	pending_req->info       = info;
+
+	vir.chn = ring_req->channel;
+	vir.tgt = ring_req->id;
+	vir.lun = ring_req->lun;
+
+	v2p = scsiback_do_translation(info, &vir);
+	if (!v2p) {
+		pending_req->v2p = NULL;
+		DPRINTK("xen-pvscsi: doesn't exist.\n");
+		return -ENODEV;
+	}
+	pending_req->v2p = v2p;
+
+	/* request range check from frontend */
+	pending_req->sc_data_direction = ring_req->sc_data_direction;
+	if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) &&
+		(pending_req->sc_data_direction != DMA_TO_DEVICE) &&
+		(pending_req->sc_data_direction != DMA_FROM_DEVICE) &&
+		(pending_req->sc_data_direction != DMA_NONE)) {
+		DPRINTK("xen-pvscsi: invalid parameter data_dir = %d\n",
+			pending_req->sc_data_direction);
+		return -EINVAL;
+	}
+
+	pending_req->cmd_len = ring_req->cmd_len;
+	if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) {
+		DPRINTK("xen-pvscsi: invalid parameter cmd_len = %d\n",
+			pending_req->cmd_len);
+		return -EINVAL;
+	}
+	memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len);
+
+	return 0;
+}
+
+static int scsiback_do_cmd_fn(struct vscsibk_info *info)
+{
+	struct vscsiif_back_ring *ring = &info->ring;
+	struct vscsiif_request *ring_req;
+	struct vscsibk_pend *pending_req;
+	RING_IDX rc, rp;
+	int err, more_to_do;
+	uint32_t result;
+	uint8_t act;
+
+	rc = ring->req_cons;
+	rp = ring->sring->req_prod;
+	rmb();	/* guest system is accessing ring, too */
+
+	if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) {
+		rc = ring->rsp_prod_pvt;
+		pr_warn("xen-pvscsi: Dom%d provided bogus ring requests (%#x - %#x = %u). Halting ring processing\n",
+			   info->domid, rp, rc, rp - rc);
+		info->ring_error = 1;
+		return 0;
+	}
+
+	while ((rc != rp)) {
+		if (RING_REQUEST_CONS_OVERFLOW(ring, rc))
+			break;
+		pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL);
+		if (!pending_req)
+			return 1;
+
+		ring_req = RING_GET_REQUEST(ring, rc);
+		ring->req_cons = ++rc;
+
+		act = ring_req->act;
+		err = prepare_pending_reqs(info, ring_req, pending_req);
+		if (err) {
+			switch (err) {
+			case -ENODEV:
+				result = DID_NO_CONNECT;
+				break;
+			default:
+				result = DRIVER_ERROR;
+				break;
+			}
+			scsiback_do_resp_with_sense(NULL, result << 24, 0,
+						    pending_req);
+			kmem_cache_free(scsiback_cachep, pending_req);
+			return 1;
+		}
+
+		switch (act) {
+		case VSCSIIF_ACT_SCSI_CDB:
+			if (scsiback_gnttab_data_map(ring_req, pending_req)) {
+				scsiback_fast_flush_area(pending_req);
+				scsiback_do_resp_with_sense(NULL,
+					DRIVER_ERROR << 24, 0, pending_req);
+				kmem_cache_free(scsiback_cachep, pending_req);
+			} else {
+				scsiback_cmd_exec(pending_req);
+			}
+			break;
+		case VSCSIIF_ACT_SCSI_ABORT:
+			scsiback_device_action(pending_req, TMR_ABORT_TASK,
+				ring_req->ref_rqid);
+			break;
+		case VSCSIIF_ACT_SCSI_RESET:
+			scsiback_device_action(pending_req, TMR_LUN_RESET, 0);
+			break;
+		default:
+			pr_err_ratelimited("xen-pvscsi: invalid request\n");
+			scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24,
+						    0, pending_req);
+			kmem_cache_free(scsiback_cachep, pending_req);
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+
+	RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do);
+	return more_to_do;
+}
+
+static irqreturn_t scsiback_irq_fn(int irq, void *dev_id)
+{
+	struct vscsibk_info *info = dev_id;
+
+	if (info->ring_error)
+		return IRQ_HANDLED;
+
+	while (scsiback_do_cmd_fn(info))
+		cond_resched();
+
+	return IRQ_HANDLED;
+}
+
+static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref,
+			evtchn_port_t evtchn)
+{
+	void *area;
+	struct vscsiif_sring *sring;
+	int err;
+
+	if (info->irq)
+		return -1;
+
+	err = xenbus_map_ring_valloc(info->dev, ring_ref, &area);
+	if (err)
+		return err;
+
+	sring = (struct vscsiif_sring *)area;
+	BACK_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+	err = bind_interdomain_evtchn_to_irq(info->domid, evtchn);
+	if (err < 0)
+		goto unmap_page;
+
+	info->irq = err;
+
+	err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn,
+				   IRQF_ONESHOT, "vscsiif-backend", info);
+	if (err)
+		goto free_irq;
+
+	return 0;
+
+free_irq:
+	unbind_from_irqhandler(info->irq, info);
+	info->irq = 0;
+unmap_page:
+	xenbus_unmap_ring_vfree(info->dev, area);
+
+	return err;
+}
+
+static int scsiback_map(struct vscsibk_info *info)
+{
+	struct xenbus_device *dev = info->dev;
+	unsigned int ring_ref, evtchn;
+	int err;
+
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+			"ring-ref", "%u", &ring_ref,
+			"event-channel", "%u", &evtchn, NULL);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend);
+		return err;
+	}
+
+	return scsiback_init_sring(info, ring_ref, evtchn);
+}
+
+/*
+  Add a new translation entry
+*/
+static int scsiback_add_translation_entry(struct vscsibk_info *info,
+					  char *phy, struct ids_tuple *v)
+{
+	int err = 0;
+	struct v2p_entry *entry;
+	struct v2p_entry *new;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+	char *lunp;
+	unsigned int lun;
+	struct scsiback_tpg *tpg_entry, *tpg = NULL;
+	char *error = "doesn't exist";
+
+	lunp = strrchr(phy, ':');
+	if (!lunp) {
+		pr_err("xen-pvscsi: illegal format of physical device %s\n",
+			phy);
+		return -EINVAL;
+	}
+	*lunp = 0;
+	lunp++;
+	if (kstrtouint(lunp, 10, &lun) || lun >= TRANSPORT_MAX_LUNS_PER_TPG) {
+		pr_err("xen-pvscsi: lun number not valid: %s\n", lunp);
+		return -EINVAL;
+	}
+
+	mutex_lock(&scsiback_mutex);
+	list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) {
+		if (!strcmp(phy, tpg_entry->tport->tport_name) ||
+		    !strcmp(phy, tpg_entry->param_alias)) {
+			spin_lock(&tpg_entry->se_tpg.tpg_lun_lock);
+			if (tpg_entry->se_tpg.tpg_lun_list[lun]->lun_status ==
+			    TRANSPORT_LUN_STATUS_ACTIVE) {
+				if (!tpg_entry->tpg_nexus)
+					error = "nexus undefined";
+				else
+					tpg = tpg_entry;
+			}
+			spin_unlock(&tpg_entry->se_tpg.tpg_lun_lock);
+			break;
+		}
+	}
+	if (tpg) {
+		mutex_lock(&tpg->tv_tpg_mutex);
+		tpg->tv_tpg_fe_count++;
+		mutex_unlock(&tpg->tv_tpg_mutex);
+	}
+	mutex_unlock(&scsiback_mutex);
+
+	if (!tpg) {
+		pr_err("xen-pvscsi: %s:%d %s\n", phy, lun, error);
+		return -ENODEV;
+	}
+
+	new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL);
+	if (new == NULL) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+
+	/* Check double assignment to identical virtual ID */
+	list_for_each_entry(entry, head, l) {
+		if ((entry->v.chn == v->chn) &&
+		    (entry->v.tgt == v->tgt) &&
+		    (entry->v.lun == v->lun)) {
+			pr_warn("xen-pvscsi: Virtual ID is already used. Assignment was not performed.\n");
+			err = -EEXIST;
+			goto out;
+		}
+
+	}
+
+	/* Create a new translation entry and add to the list */
+	kref_init(&new->kref);
+	new->v = *v;
+	new->tpg = tpg;
+	new->lun = lun;
+	list_add_tail(&new->l, head);
+
+out:
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+
+out_free:
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_fe_count--;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	if (err)
+		kfree(new);
+
+	return err;
+}
+
+static void __scsiback_del_translation_entry(struct v2p_entry *entry)
+{
+	list_del(&entry->l);
+	kref_put(&entry->kref, scsiback_free_translation_entry);
+}
+
+/*
+  Delete the translation entry specfied
+*/
+static int scsiback_del_translation_entry(struct vscsibk_info *info,
+					  struct ids_tuple *v)
+{
+	struct v2p_entry *entry;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+	/* Find out the translation entry specified */
+	list_for_each_entry(entry, head, l) {
+		if ((entry->v.chn == v->chn) &&
+		    (entry->v.tgt == v->tgt) &&
+		    (entry->v.lun == v->lun)) {
+			goto found;
+		}
+	}
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+	return 1;
+
+found:
+	/* Delete the translation entry specfied */
+	__scsiback_del_translation_entry(entry);
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+	return 0;
+}
+
+static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state,
+				char *phy, struct ids_tuple *vir)
+{
+	if (!scsiback_add_translation_entry(info, phy, vir)) {
+		if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
+				  "%d", XenbusStateInitialised)) {
+			pr_err("xen-pvscsi: xenbus_printf error %s\n", state);
+			scsiback_del_translation_entry(info, vir);
+		}
+	} else {
+		xenbus_printf(XBT_NIL, info->dev->nodename, state,
+			      "%d", XenbusStateClosed);
+	}
+}
+
+static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state,
+				struct ids_tuple *vir)
+{
+	if (!scsiback_del_translation_entry(info, vir)) {
+		if (xenbus_printf(XBT_NIL, info->dev->nodename, state,
+				  "%d", XenbusStateClosed))
+			pr_err("xen-pvscsi: xenbus_printf error %s\n", state);
+	}
+}
+
+#define VSCSIBACK_OP_ADD_OR_DEL_LUN	1
+#define VSCSIBACK_OP_UPDATEDEV_STATE	2
+
+static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op,
+				     char *ent)
+{
+	int err;
+	struct ids_tuple vir;
+	char *val;
+	int device_state;
+	char phy[VSCSI_NAMELEN];
+	char str[64];
+	char state[64];
+	struct xenbus_device *dev = info->dev;
+
+	/* read status */
+	snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent);
+	err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state);
+	if (XENBUS_EXIST_ERR(err))
+		return;
+
+	/* physical SCSI device */
+	snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent);
+	val = xenbus_read(XBT_NIL, dev->nodename, str, NULL);
+	if (IS_ERR(val)) {
+		xenbus_printf(XBT_NIL, dev->nodename, state,
+			      "%d", XenbusStateClosed);
+		return;
+	}
+	strlcpy(phy, val, VSCSI_NAMELEN);
+	kfree(val);
+
+	/* virtual SCSI device */
+	snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent);
+	err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u",
+			   &vir.hst, &vir.chn, &vir.tgt, &vir.lun);
+	if (XENBUS_EXIST_ERR(err)) {
+		xenbus_printf(XBT_NIL, dev->nodename, state,
+			      "%d", XenbusStateClosed);
+		return;
+	}
+
+	switch (op) {
+	case VSCSIBACK_OP_ADD_OR_DEL_LUN:
+		if (device_state == XenbusStateInitialising)
+			scsiback_do_add_lun(info, state, phy, &vir);
+		if (device_state == XenbusStateClosing)
+			scsiback_do_del_lun(info, state, &vir);
+		break;
+
+	case VSCSIBACK_OP_UPDATEDEV_STATE:
+		if (device_state == XenbusStateInitialised) {
+			/* modify vscsi-devs/dev-x/state */
+			if (xenbus_printf(XBT_NIL, dev->nodename, state,
+					  "%d", XenbusStateConnected)) {
+				pr_err("xen-pvscsi: xenbus_printf error %s\n",
+				       str);
+				scsiback_del_translation_entry(info, &vir);
+				xenbus_printf(XBT_NIL, dev->nodename, state,
+					      "%d", XenbusStateClosed);
+			}
+		}
+		break;
+	/*When it is necessary, processing is added here.*/
+	default:
+		break;
+	}
+}
+
+static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op)
+{
+	int i;
+	char **dir;
+	unsigned int ndir = 0;
+
+	dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs",
+			       &ndir);
+	if (IS_ERR(dir))
+		return;
+
+	for (i = 0; i < ndir; i++)
+		scsiback_do_1lun_hotplug(info, op, dir[i]);
+
+	kfree(dir);
+}
+
+static void scsiback_frontend_changed(struct xenbus_device *dev,
+					enum xenbus_state frontend_state)
+{
+	struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		break;
+
+	case XenbusStateInitialised:
+		if (scsiback_map(info))
+			break;
+
+		scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateConnected:
+		scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE);
+
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosing:
+		if (info->irq)
+			scsiback_disconnect(info);
+
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		/* fall through if not online */
+	case XenbusStateUnknown:
+		device_unregister(&dev->dev);
+		break;
+
+	case XenbusStateReconfiguring:
+		scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN);
+		xenbus_switch_state(dev, XenbusStateReconfigured);
+
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+					frontend_state);
+		break;
+	}
+}
+
+/*
+  Release the translation entry specfied
+*/
+static void scsiback_release_translation_entry(struct vscsibk_info *info)
+{
+	struct v2p_entry *entry, *tmp;
+	struct list_head *head = &(info->v2p_entry_lists);
+	unsigned long flags;
+
+	spin_lock_irqsave(&info->v2p_lock, flags);
+
+	list_for_each_entry_safe(entry, tmp, head, l)
+		__scsiback_del_translation_entry(entry);
+
+	spin_unlock_irqrestore(&info->v2p_lock, flags);
+}
+
+static int scsiback_remove(struct xenbus_device *dev)
+{
+	struct vscsibk_info *info = dev_get_drvdata(&dev->dev);
+
+	if (info->irq)
+		scsiback_disconnect(info);
+
+	scsiback_release_translation_entry(info);
+
+	dev_set_drvdata(&dev->dev, NULL);
+
+	return 0;
+}
+
+static int scsiback_probe(struct xenbus_device *dev,
+			   const struct xenbus_device_id *id)
+{
+	int err;
+
+	struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info),
+					    GFP_KERNEL);
+
+	DPRINTK("%p %d\n", dev, dev->otherend_id);
+
+	if (!info) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure");
+		return -ENOMEM;
+	}
+	info->dev = dev;
+	dev_set_drvdata(&dev->dev, info);
+
+	info->domid = dev->otherend_id;
+	spin_lock_init(&info->ring_lock);
+	info->ring_error = 0;
+	atomic_set(&info->nr_unreplied_reqs, 0);
+	init_waitqueue_head(&info->waiting_to_free);
+	info->dev = dev;
+	info->irq = 0;
+	INIT_LIST_HEAD(&info->v2p_entry_lists);
+	spin_lock_init(&info->v2p_lock);
+
+	err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u",
+			    SG_ALL);
+	if (err)
+		xenbus_dev_error(dev, err, "writing feature-sg-grant");
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_warn("xen-pvscsi: %s failed\n", __func__);
+	scsiback_remove(dev);
+
+	return err;
+}
+
+static char *scsiback_dump_proto_id(struct scsiback_tport *tport)
+{
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return "SAS";
+	case SCSI_PROTOCOL_FCP:
+		return "FCP";
+	case SCSI_PROTOCOL_ISCSI:
+		return "iSCSI";
+	default:
+		break;
+	}
+
+	return "Unknown";
+}
+
+static u8 scsiback_get_fabric_proto_ident(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport = tpg->tport;
+
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return sas_get_fabric_proto_ident(se_tpg);
+	case SCSI_PROTOCOL_FCP:
+		return fc_get_fabric_proto_ident(se_tpg);
+	case SCSI_PROTOCOL_ISCSI:
+		return iscsi_get_fabric_proto_ident(se_tpg);
+	default:
+		pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+			tport->tport_proto_id);
+		break;
+	}
+
+	return sas_get_fabric_proto_ident(se_tpg);
+}
+
+static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport = tpg->tport;
+
+	return &tport->tport_name[0];
+}
+
+static u16 scsiback_get_tag(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	return tpg->tport_tpgt;
+}
+
+static u32 scsiback_get_default_depth(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static u32
+scsiback_get_pr_transport_id(struct se_portal_group *se_tpg,
+			      struct se_node_acl *se_nacl,
+			      struct t10_pr_registration *pr_reg,
+			      int *format_code,
+			      unsigned char *buf)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport = tpg->tport;
+
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+					format_code, buf);
+	case SCSI_PROTOCOL_FCP:
+		return fc_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+					format_code, buf);
+	case SCSI_PROTOCOL_ISCSI:
+		return iscsi_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+					format_code, buf);
+	default:
+		pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+			tport->tport_proto_id);
+		break;
+	}
+
+	return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg,
+			format_code, buf);
+}
+
+static u32
+scsiback_get_pr_transport_id_len(struct se_portal_group *se_tpg,
+				  struct se_node_acl *se_nacl,
+				  struct t10_pr_registration *pr_reg,
+				  int *format_code)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport = tpg->tport;
+
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+					format_code);
+	case SCSI_PROTOCOL_FCP:
+		return fc_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+					format_code);
+	case SCSI_PROTOCOL_ISCSI:
+		return iscsi_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+					format_code);
+	default:
+		pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+			tport->tport_proto_id);
+		break;
+	}
+
+	return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg,
+			format_code);
+}
+
+static char *
+scsiback_parse_pr_out_transport_id(struct se_portal_group *se_tpg,
+				    const char *buf,
+				    u32 *out_tid_len,
+				    char **port_nexus_ptr)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport = tpg->tport;
+
+	switch (tport->tport_proto_id) {
+	case SCSI_PROTOCOL_SAS:
+		return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+					port_nexus_ptr);
+	case SCSI_PROTOCOL_FCP:
+		return fc_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+					port_nexus_ptr);
+	case SCSI_PROTOCOL_ISCSI:
+		return iscsi_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+					port_nexus_ptr);
+	default:
+		pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n",
+			tport->tport_proto_id);
+		break;
+	}
+
+	return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len,
+			port_nexus_ptr);
+}
+
+static struct se_wwn *
+scsiback_make_tport(struct target_fabric_configfs *tf,
+		     struct config_group *group,
+		     const char *name)
+{
+	struct scsiback_tport *tport;
+	char *ptr;
+	u64 wwpn = 0;
+	int off = 0;
+
+	tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL);
+	if (!tport)
+		return ERR_PTR(-ENOMEM);
+
+	tport->tport_wwpn = wwpn;
+	/*
+	 * Determine the emulated Protocol Identifier and Target Port Name
+	 * based on the incoming configfs directory name.
+	 */
+	ptr = strstr(name, "naa.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_SAS;
+		goto check_len;
+	}
+	ptr = strstr(name, "fc.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_FCP;
+		off = 3; /* Skip over "fc." */
+		goto check_len;
+	}
+	ptr = strstr(name, "iqn.");
+	if (ptr) {
+		tport->tport_proto_id = SCSI_PROTOCOL_ISCSI;
+		goto check_len;
+	}
+
+	pr_err("Unable to locate prefix for emulated Target Port: %s\n", name);
+	kfree(tport);
+	return ERR_PTR(-EINVAL);
+
+check_len:
+	if (strlen(name) >= VSCSI_NAMELEN) {
+		pr_err("Emulated %s Address: %s, exceeds max: %d\n", name,
+			scsiback_dump_proto_id(tport), VSCSI_NAMELEN);
+		kfree(tport);
+		return ERR_PTR(-EINVAL);
+	}
+	snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]);
+
+	pr_debug("xen-pvscsi: Allocated emulated Target %s Address: %s\n",
+		 scsiback_dump_proto_id(tport), name);
+
+	return &tport->tport_wwn;
+}
+
+static void scsiback_drop_tport(struct se_wwn *wwn)
+{
+	struct scsiback_tport *tport = container_of(wwn,
+				struct scsiback_tport, tport_wwn);
+
+	pr_debug("xen-pvscsi: Deallocating emulated Target %s Address: %s\n",
+		 scsiback_dump_proto_id(tport), tport->tport_name);
+
+	kfree(tport);
+}
+
+static struct se_node_acl *
+scsiback_alloc_fabric_acl(struct se_portal_group *se_tpg)
+{
+	return kzalloc(sizeof(struct se_node_acl), GFP_KERNEL);
+}
+
+static void
+scsiback_release_fabric_acl(struct se_portal_group *se_tpg,
+			     struct se_node_acl *se_nacl)
+{
+	kfree(se_nacl);
+}
+
+static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static int scsiback_check_stop_free(struct se_cmd *se_cmd)
+{
+	/*
+	 * Do not release struct se_cmd's containing a valid TMR
+	 * pointer.  These will be released directly in scsiback_device_action()
+	 * with transport_generic_free_cmd().
+	 */
+	if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB)
+		return 0;
+
+	transport_generic_free_cmd(se_cmd, 0);
+	return 1;
+}
+
+static void scsiback_release_cmd(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	kmem_cache_free(scsiback_cachep, pending_req);
+}
+
+static int scsiback_shutdown_session(struct se_session *se_sess)
+{
+	return 0;
+}
+
+static void scsiback_close_session(struct se_session *se_sess)
+{
+}
+
+static u32 scsiback_sess_get_index(struct se_session *se_sess)
+{
+	return 0;
+}
+
+static int scsiback_write_pending(struct se_cmd *se_cmd)
+{
+	/* Go ahead and process the write immediately */
+	target_execute_cmd(se_cmd);
+
+	return 0;
+}
+
+static int scsiback_write_pending_status(struct se_cmd *se_cmd)
+{
+	return 0;
+}
+
+static void scsiback_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+static u32 scsiback_get_task_tag(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	return pending_req->rqid;
+}
+
+static int scsiback_get_cmd_state(struct se_cmd *se_cmd)
+{
+	return 0;
+}
+
+static int scsiback_queue_data_in(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	pending_req->result = SAM_STAT_GOOD;
+	scsiback_cmd_done(pending_req);
+	return 0;
+}
+
+static int scsiback_queue_status(struct se_cmd *se_cmd)
+{
+	struct vscsibk_pend *pending_req = container_of(se_cmd,
+				struct vscsibk_pend, se_cmd);
+
+	if (se_cmd->sense_buffer &&
+	    ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ||
+	     (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE)))
+		pending_req->result = (DRIVER_SENSE << 24) |
+				      SAM_STAT_CHECK_CONDITION;
+	else
+		pending_req->result = se_cmd->scsi_status;
+
+	scsiback_cmd_done(pending_req);
+	return 0;
+}
+
+static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd)
+{
+	struct se_tmr_req *se_tmr = se_cmd->se_tmr_req;
+	struct scsiback_tmr *tmr = se_tmr->fabric_tmr_ptr;
+
+	atomic_set(&tmr->tmr_complete, 1);
+	wake_up(&tmr->tmr_wait);
+}
+
+static void scsiback_aborted_task(struct se_cmd *se_cmd)
+{
+}
+
+static ssize_t scsiback_tpg_param_show_alias(struct se_portal_group *se_tpg,
+					     char *page)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg,
+						se_tpg);
+	ssize_t rb;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias);
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return rb;
+}
+
+static ssize_t scsiback_tpg_param_store_alias(struct se_portal_group *se_tpg,
+					      const char *page, size_t count)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg,
+						se_tpg);
+	int len;
+
+	if (strlen(page) >= VSCSI_NAMELEN) {
+		pr_err("param alias: %s, exceeds max: %d\n", page,
+			VSCSI_NAMELEN);
+		return -EINVAL;
+	}
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page);
+	if (tpg->param_alias[len - 1] == '\n')
+		tpg->param_alias[len - 1] = '\0';
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return count;
+}
+
+TF_TPG_PARAM_ATTR(scsiback, alias, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *scsiback_param_attrs[] = {
+	&scsiback_tpg_param_alias.attr,
+	NULL,
+};
+
+static int scsiback_make_nexus(struct scsiback_tpg *tpg,
+				const char *name)
+{
+	struct se_portal_group *se_tpg;
+	struct se_session *se_sess;
+	struct scsiback_nexus *tv_nexus;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	if (tpg->tpg_nexus) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_debug("tpg->tpg_nexus already exists\n");
+		return -EEXIST;
+	}
+	se_tpg = &tpg->se_tpg;
+
+	tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL);
+	if (!tv_nexus) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		return -ENOMEM;
+	}
+	/*
+	 *  Initialize the struct se_session pointer
+	 */
+	tv_nexus->tvn_se_sess = transport_init_session(TARGET_PROT_NORMAL);
+	if (IS_ERR(tv_nexus->tvn_se_sess)) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		kfree(tv_nexus);
+		return -ENOMEM;
+	}
+	se_sess = tv_nexus->tvn_se_sess;
+	/*
+	 * Since we are running in 'demo mode' this call with generate a
+	 * struct se_node_acl for the scsiback struct se_portal_group with
+	 * the SCSI Initiator port name of the passed configfs group 'name'.
+	 */
+	tv_nexus->tvn_se_sess->se_node_acl = core_tpg_check_initiator_node_acl(
+				se_tpg, (unsigned char *)name);
+	if (!tv_nexus->tvn_se_sess->se_node_acl) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_debug("core_tpg_check_initiator_node_acl() failed for %s\n",
+			 name);
+		goto out;
+	}
+	/*
+	 * Now register the TCM pvscsi virtual I_T Nexus as active with the
+	 * call to __transport_register_session()
+	 */
+	__transport_register_session(se_tpg, tv_nexus->tvn_se_sess->se_node_acl,
+			tv_nexus->tvn_se_sess, tv_nexus);
+	tpg->tpg_nexus = tv_nexus;
+
+	mutex_unlock(&tpg->tv_tpg_mutex);
+	return 0;
+
+out:
+	transport_free_session(se_sess);
+	kfree(tv_nexus);
+	return -ENOMEM;
+}
+
+static int scsiback_drop_nexus(struct scsiback_tpg *tpg)
+{
+	struct se_session *se_sess;
+	struct scsiback_nexus *tv_nexus;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tv_nexus = tpg->tpg_nexus;
+	if (!tv_nexus) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		return -ENODEV;
+	}
+
+	se_sess = tv_nexus->tvn_se_sess;
+	if (!se_sess) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		return -ENODEV;
+	}
+
+	if (tpg->tv_tpg_port_count != 0) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n",
+			tpg->tv_tpg_port_count);
+		return -EBUSY;
+	}
+
+	if (tpg->tv_tpg_fe_count != 0) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n",
+			tpg->tv_tpg_fe_count);
+		return -EBUSY;
+	}
+
+	pr_debug("xen-pvscsi: Removing I_T Nexus to emulated %s Initiator Port: %s\n",
+		scsiback_dump_proto_id(tpg->tport),
+		tv_nexus->tvn_se_sess->se_node_acl->initiatorname);
+
+	/*
+	 * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port
+	 */
+	transport_deregister_session(tv_nexus->tvn_se_sess);
+	tpg->tpg_nexus = NULL;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	kfree(tv_nexus);
+	return 0;
+}
+
+static ssize_t scsiback_tpg_show_nexus(struct se_portal_group *se_tpg,
+					char *page)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_nexus *tv_nexus;
+	ssize_t ret;
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tv_nexus = tpg->tpg_nexus;
+	if (!tv_nexus) {
+		mutex_unlock(&tpg->tv_tpg_mutex);
+		return -ENODEV;
+	}
+	ret = snprintf(page, PAGE_SIZE, "%s\n",
+			tv_nexus->tvn_se_sess->se_node_acl->initiatorname);
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return ret;
+}
+
+static ssize_t scsiback_tpg_store_nexus(struct se_portal_group *se_tpg,
+					 const char *page,
+					 size_t count)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+	struct scsiback_tport *tport_wwn = tpg->tport;
+	unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr;
+	int ret;
+	/*
+	 * Shutdown the active I_T nexus if 'NULL' is passed..
+	 */
+	if (!strncmp(page, "NULL", 4)) {
+		ret = scsiback_drop_nexus(tpg);
+		return (!ret) ? count : ret;
+	}
+	/*
+	 * Otherwise make sure the passed virtual Initiator port WWN matches
+	 * the fabric protocol_id set in scsiback_make_tport(), and call
+	 * scsiback_make_nexus().
+	 */
+	if (strlen(page) >= VSCSI_NAMELEN) {
+		pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n",
+			page, VSCSI_NAMELEN);
+		return -EINVAL;
+	}
+	snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page);
+
+	ptr = strstr(i_port, "naa.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) {
+			pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[0];
+		goto check_newline;
+	}
+	ptr = strstr(i_port, "fc.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) {
+			pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[3]; /* Skip over "fc." */
+		goto check_newline;
+	}
+	ptr = strstr(i_port, "iqn.");
+	if (ptr) {
+		if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) {
+			pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n",
+				i_port, scsiback_dump_proto_id(tport_wwn));
+			return -EINVAL;
+		}
+		port_ptr = &i_port[0];
+		goto check_newline;
+	}
+	pr_err("Unable to locate prefix for emulated Initiator Port: %s\n",
+		i_port);
+	return -EINVAL;
+	/*
+	 * Clear any trailing newline for the NAA WWN
+	 */
+check_newline:
+	if (i_port[strlen(i_port) - 1] == '\n')
+		i_port[strlen(i_port) - 1] = '\0';
+
+	ret = scsiback_make_nexus(tpg, port_ptr);
+	if (ret < 0)
+		return ret;
+
+	return count;
+}
+
+TF_TPG_BASE_ATTR(scsiback, nexus, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *scsiback_tpg_attrs[] = {
+	&scsiback_tpg_nexus.attr,
+	NULL,
+};
+
+static ssize_t
+scsiback_wwn_show_attr_version(struct target_fabric_configfs *tf,
+				char *page)
+{
+	return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on "
+		UTS_RELEASE"\n",
+		VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+}
+
+TF_WWN_ATTR_RO(scsiback, version);
+
+static struct configfs_attribute *scsiback_wwn_attrs[] = {
+	&scsiback_wwn_version.attr,
+	NULL,
+};
+
+static char *scsiback_get_fabric_name(void)
+{
+	return "xen-pvscsi";
+}
+
+static int scsiback_port_link(struct se_portal_group *se_tpg,
+			       struct se_lun *lun)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_port_count++;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+
+	return 0;
+}
+
+static void scsiback_port_unlink(struct se_portal_group *se_tpg,
+				  struct se_lun *lun)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+
+	mutex_lock(&tpg->tv_tpg_mutex);
+	tpg->tv_tpg_port_count--;
+	mutex_unlock(&tpg->tv_tpg_mutex);
+}
+
+static struct se_portal_group *
+scsiback_make_tpg(struct se_wwn *wwn,
+		   struct config_group *group,
+		   const char *name)
+{
+	struct scsiback_tport *tport = container_of(wwn,
+			struct scsiback_tport, tport_wwn);
+
+	struct scsiback_tpg *tpg;
+	u16 tpgt;
+	int ret;
+
+	if (strstr(name, "tpgt_") != name)
+		return ERR_PTR(-EINVAL);
+	ret = kstrtou16(name + 5, 10, &tpgt);
+	if (ret)
+		return ERR_PTR(ret);
+
+	tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL);
+	if (!tpg)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&tpg->tv_tpg_mutex);
+	INIT_LIST_HEAD(&tpg->tv_tpg_list);
+	INIT_LIST_HEAD(&tpg->info_list);
+	tpg->tport = tport;
+	tpg->tport_tpgt = tpgt;
+
+	ret = core_tpg_register(&scsiback_fabric_configfs->tf_ops, wwn,
+				&tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL);
+	if (ret < 0) {
+		kfree(tpg);
+		return NULL;
+	}
+	mutex_lock(&scsiback_mutex);
+	list_add_tail(&tpg->tv_tpg_list, &scsiback_list);
+	mutex_unlock(&scsiback_mutex);
+
+	return &tpg->se_tpg;
+}
+
+static void scsiback_drop_tpg(struct se_portal_group *se_tpg)
+{
+	struct scsiback_tpg *tpg = container_of(se_tpg,
+				struct scsiback_tpg, se_tpg);
+
+	mutex_lock(&scsiback_mutex);
+	list_del(&tpg->tv_tpg_list);
+	mutex_unlock(&scsiback_mutex);
+	/*
+	 * Release the virtual I_T Nexus for this xen-pvscsi TPG
+	 */
+	scsiback_drop_nexus(tpg);
+	/*
+	 * Deregister the se_tpg from TCM..
+	 */
+	core_tpg_deregister(se_tpg);
+	kfree(tpg);
+}
+
+static int scsiback_check_true(struct se_portal_group *se_tpg)
+{
+	return 1;
+}
+
+static int scsiback_check_false(struct se_portal_group *se_tpg)
+{
+	return 0;
+}
+
+static struct target_core_fabric_ops scsiback_ops = {
+	.get_fabric_name		= scsiback_get_fabric_name,
+	.get_fabric_proto_ident		= scsiback_get_fabric_proto_ident,
+	.tpg_get_wwn			= scsiback_get_fabric_wwn,
+	.tpg_get_tag			= scsiback_get_tag,
+	.tpg_get_default_depth		= scsiback_get_default_depth,
+	.tpg_get_pr_transport_id	= scsiback_get_pr_transport_id,
+	.tpg_get_pr_transport_id_len	= scsiback_get_pr_transport_id_len,
+	.tpg_parse_pr_out_transport_id	= scsiback_parse_pr_out_transport_id,
+	.tpg_check_demo_mode		= scsiback_check_true,
+	.tpg_check_demo_mode_cache	= scsiback_check_true,
+	.tpg_check_demo_mode_write_protect = scsiback_check_false,
+	.tpg_check_prod_mode_write_protect = scsiback_check_false,
+	.tpg_alloc_fabric_acl		= scsiback_alloc_fabric_acl,
+	.tpg_release_fabric_acl		= scsiback_release_fabric_acl,
+	.tpg_get_inst_index		= scsiback_tpg_get_inst_index,
+	.check_stop_free		= scsiback_check_stop_free,
+	.release_cmd			= scsiback_release_cmd,
+	.put_session			= NULL,
+	.shutdown_session		= scsiback_shutdown_session,
+	.close_session			= scsiback_close_session,
+	.sess_get_index			= scsiback_sess_get_index,
+	.sess_get_initiator_sid		= NULL,
+	.write_pending			= scsiback_write_pending,
+	.write_pending_status		= scsiback_write_pending_status,
+	.set_default_node_attributes	= scsiback_set_default_node_attrs,
+	.get_task_tag			= scsiback_get_task_tag,
+	.get_cmd_state			= scsiback_get_cmd_state,
+	.queue_data_in			= scsiback_queue_data_in,
+	.queue_status			= scsiback_queue_status,
+	.queue_tm_rsp			= scsiback_queue_tm_rsp,
+	.aborted_task			= scsiback_aborted_task,
+	/*
+	 * Setup callers for generic logic in target_core_fabric_configfs.c
+	 */
+	.fabric_make_wwn		= scsiback_make_tport,
+	.fabric_drop_wwn		= scsiback_drop_tport,
+	.fabric_make_tpg		= scsiback_make_tpg,
+	.fabric_drop_tpg		= scsiback_drop_tpg,
+	.fabric_post_link		= scsiback_port_link,
+	.fabric_pre_unlink		= scsiback_port_unlink,
+	.fabric_make_np			= NULL,
+	.fabric_drop_np			= NULL,
+#if 0
+	.fabric_make_nodeacl		= scsiback_make_nodeacl,
+	.fabric_drop_nodeacl		= scsiback_drop_nodeacl,
+#endif
+};
+
+static int scsiback_register_configfs(void)
+{
+	struct target_fabric_configfs *fabric;
+	int ret;
+
+	pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n",
+		 VSCSI_VERSION, utsname()->sysname, utsname()->machine);
+	/*
+	 * Register the top level struct config_item_type with TCM core
+	 */
+	fabric = target_fabric_configfs_init(THIS_MODULE, "xen-pvscsi");
+	if (IS_ERR(fabric))
+		return PTR_ERR(fabric);
+
+	/*
+	 * Setup fabric->tf_ops from our local scsiback_ops
+	 */
+	fabric->tf_ops = scsiback_ops;
+	/*
+	 * Setup default attribute lists for various fabric->tf_cit_tmpl
+	 */
+	fabric->tf_cit_tmpl.tfc_wwn_cit.ct_attrs = scsiback_wwn_attrs;
+	fabric->tf_cit_tmpl.tfc_tpg_base_cit.ct_attrs = scsiback_tpg_attrs;
+	fabric->tf_cit_tmpl.tfc_tpg_attrib_cit.ct_attrs = NULL;
+	fabric->tf_cit_tmpl.tfc_tpg_param_cit.ct_attrs = scsiback_param_attrs;
+	fabric->tf_cit_tmpl.tfc_tpg_np_base_cit.ct_attrs = NULL;
+	fabric->tf_cit_tmpl.tfc_tpg_nacl_base_cit.ct_attrs = NULL;
+	fabric->tf_cit_tmpl.tfc_tpg_nacl_attrib_cit.ct_attrs = NULL;
+	fabric->tf_cit_tmpl.tfc_tpg_nacl_auth_cit.ct_attrs = NULL;
+	fabric->tf_cit_tmpl.tfc_tpg_nacl_param_cit.ct_attrs = NULL;
+	/*
+	 * Register the fabric for use within TCM
+	 */
+	ret = target_fabric_configfs_register(fabric);
+	if (ret < 0) {
+		target_fabric_configfs_free(fabric);
+		return ret;
+	}
+	/*
+	 * Setup our local pointer to *fabric
+	 */
+	scsiback_fabric_configfs = fabric;
+	pr_debug("xen-pvscsi: Set fabric -> scsiback_fabric_configfs\n");
+	return 0;
+};
+
+static void scsiback_deregister_configfs(void)
+{
+	if (!scsiback_fabric_configfs)
+		return;
+
+	target_fabric_configfs_deregister(scsiback_fabric_configfs);
+	scsiback_fabric_configfs = NULL;
+	pr_debug("xen-pvscsi: Cleared scsiback_fabric_configfs\n");
+};
+
+static const struct xenbus_device_id scsiback_ids[] = {
+	{ "vscsi" },
+	{ "" }
+};
+
+static struct xenbus_driver scsiback_driver = {
+	.ids			= scsiback_ids,
+	.probe			= scsiback_probe,
+	.remove			= scsiback_remove,
+	.otherend_changed	= scsiback_frontend_changed
+};
+
+static void scsiback_init_pend(void *p)
+{
+	struct vscsibk_pend *pend = p;
+	int i;
+
+	memset(pend, 0, sizeof(*pend));
+	for (i = 0; i < VSCSI_MAX_GRANTS; i++)
+		pend->grant_handles[i] = SCSIBACK_INVALID_HANDLE;
+}
+
+static int __init scsiback_init(void)
+{
+	int ret;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	scsiback_cachep = kmem_cache_create("vscsiif_cache",
+		sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend);
+	if (!scsiback_cachep)
+		return -ENOMEM;
+
+	ret = xenbus_register_backend(&scsiback_driver);
+	if (ret)
+		goto out_cache_destroy;
+
+	ret = scsiback_register_configfs();
+	if (ret)
+		goto out_unregister_xenbus;
+
+	return 0;
+
+out_unregister_xenbus:
+	xenbus_unregister_driver(&scsiback_driver);
+out_cache_destroy:
+	kmem_cache_destroy(scsiback_cachep);
+	pr_err("xen-pvscsi: %s: error %d\n", __func__, ret);
+	return ret;
+}
+
+static void __exit scsiback_exit(void)
+{
+	struct page *page;
+
+	while (free_pages_num) {
+		if (get_free_page(&page))
+			BUG();
+		free_xenballooned_pages(1, &page);
+	}
+	scsiback_deregister_configfs();
+	xenbus_unregister_driver(&scsiback_driver);
+	kmem_cache_destroy(scsiback_cachep);
+}
+
+module_init(scsiback_init);
+module_exit(scsiback_exit);
+
+MODULE_DESCRIPTION("Xen SCSI backend driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vscsi");
+MODULE_AUTHOR("Juergen Gross <jgross@suse.com>");

+ 3 - 6
drivers/xen/xenbus/xenbus_client.c

@@ -259,7 +259,6 @@ static char *error_path(struct xenbus_device *dev)
 static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
 				const char *fmt, va_list ap)
 {
-	int ret;
 	unsigned int len;
 	char *printf_buffer = NULL;
 	char *path_buffer = NULL;
@@ -270,9 +269,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
 		goto fail;
 
 	len = sprintf(printf_buffer, "%i ", -err);
-	ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
-
-	BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
+	vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
 
 	dev_err(&dev->dev, "%s\n", printf_buffer);
 
@@ -361,8 +358,8 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
  * @ring_mfn: mfn of ring to grant
 
  * Grant access to the given @ring_mfn to the peer of the given device.  Return
- * 0 on success, or -errno on error.  On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * a grant reference on success, or -errno on error. On error, the device will
+ * switch to XenbusStateClosing, and the error will be saved in the store.
  */
 int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
 {

+ 5 - 1
drivers/xen/xenbus/xenbus_probe.c

@@ -297,9 +297,13 @@ void xenbus_dev_shutdown(struct device *_dev)
 EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
 
 int xenbus_register_driver_common(struct xenbus_driver *drv,
-				  struct xen_bus_type *bus)
+				  struct xen_bus_type *bus,
+				  struct module *owner, const char *mod_name)
 {
+	drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype;
 	drv->driver.bus = &bus->bus;
+	drv->driver.owner = owner;
+	drv->driver.mod_name = mod_name;
 
 	return driver_register(&drv->driver);
 }

+ 3 - 1
drivers/xen/xenbus/xenbus_probe.h

@@ -60,7 +60,9 @@ extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
 extern int xenbus_dev_probe(struct device *_dev);
 extern int xenbus_dev_remove(struct device *_dev);
 extern int xenbus_register_driver_common(struct xenbus_driver *drv,
-					 struct xen_bus_type *bus);
+					 struct xen_bus_type *bus,
+					 struct module *owner,
+					 const char *mod_name);
 extern int xenbus_probe_node(struct xen_bus_type *bus,
 			     const char *type,
 			     const char *nodename);

+ 5 - 3
drivers/xen/xenbus/xenbus_probe_backend.c

@@ -234,13 +234,15 @@ int xenbus_dev_is_online(struct xenbus_device *dev)
 }
 EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
 
-int xenbus_register_backend(struct xenbus_driver *drv)
+int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner,
+			      const char *mod_name)
 {
 	drv->read_otherend_details = read_frontend_details;
 
-	return xenbus_register_driver_common(drv, &xenbus_backend);
+	return xenbus_register_driver_common(drv, &xenbus_backend,
+					     owner, mod_name);
 }
-EXPORT_SYMBOL_GPL(xenbus_register_backend);
+EXPORT_SYMBOL_GPL(__xenbus_register_backend);
 
 static int backend_probe_and_watch(struct notifier_block *notifier,
 				   unsigned long event,

+ 5 - 3
drivers/xen/xenbus/xenbus_probe_frontend.c

@@ -317,13 +317,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv)
 			 print_device_status);
 }
 
-int xenbus_register_frontend(struct xenbus_driver *drv)
+int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner,
+			       const char *mod_name)
 {
 	int ret;
 
 	drv->read_otherend_details = read_backend_details;
 
-	ret = xenbus_register_driver_common(drv, &xenbus_frontend);
+	ret = xenbus_register_driver_common(drv, &xenbus_frontend,
+					    owner, mod_name);
 	if (ret)
 		return ret;
 
@@ -332,7 +334,7 @@ int xenbus_register_frontend(struct xenbus_driver *drv)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(xenbus_register_frontend);
+EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
 
 static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
 static int backend_state;

+ 2 - 0
include/xen/events.h

@@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
 			   unsigned long irqflags,
 			   const char *devname,
 			   void *dev_id);
+int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
+				   unsigned int remote_port);
 int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
 					  unsigned int remote_port,
 					  irq_handler_t handler,

+ 46 - 2
include/xen/interface/elfnote.h

@@ -3,6 +3,24 @@
  *
  * Definitions used for the Xen ELF notes.
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2006, Ian Campbell, XenSource Ltd.
  */
 
@@ -18,12 +36,13 @@
  *
  * LEGACY indicated the fields in the legacy __xen_guest string which
  * this a note type replaces.
+ *
+ * String values (for non-legacy) are NULL terminated ASCII, also known
+ * as ASCIZ type.
  */
 
 /*
  * NAME=VALUE pair (string).
- *
- * LEGACY: FEATURES and PAE
  */
 #define XEN_ELFNOTE_INFO           0
 
@@ -137,9 +156,29 @@
 
 /*
  * Whether or not the guest supports cooperative suspend cancellation.
+ * This is a numeric value.
+ *
+ * Default is 0
  */
 #define XEN_ELFNOTE_SUSPEND_CANCEL 14
 
+/*
+ * The (non-default) location the initial phys-to-machine map should be
+ * placed at by the hypervisor (Dom0) or the tools (DomU).
+ * The kernel must be prepared for this mapping to be established using
+ * large pages, despite such otherwise not being available to guests.
+ * The kernel must also be able to handle the page table pages used for
+ * this mapping not being accessible through the initial mapping.
+ * (Only x86-64 supports this at present.)
+ */
+#define XEN_ELFNOTE_INIT_P2M      15
+
+/*
+ * Whether or not the guest can deal with being passed an initrd not
+ * mapped through its initial page tables.
+ */
+#define XEN_ELFNOTE_MOD_START_PFN 16
+
 /*
  * The features supported by this kernel (numeric).
  *
@@ -153,6 +192,11 @@
  */
 #define XEN_ELFNOTE_SUPPORTED_FEATURES 17
 
+/*
+ * The number of the highest elfnote defined.
+ */
+#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
+
 #endif /* __XEN_PUBLIC_ELFNOTE_H__ */
 
 /*

+ 229 - 0
include/xen/interface/io/vscsiif.h

@@ -0,0 +1,229 @@
+/******************************************************************************
+ * vscsiif.h
+ *
+ * Based on the blkif.h code.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright(c) FUJITSU Limited 2008.
+ */
+
+#ifndef __XEN__PUBLIC_IO_SCSI_H__
+#define __XEN__PUBLIC_IO_SCSI_H__
+
+#include "ring.h"
+#include "../grant_table.h"
+
+/*
+ * Feature and Parameter Negotiation
+ * =================================
+ * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to
+ * communicate capabilities and to negotiate operating parameters.  This
+ * section enumerates these nodes which reside in the respective front and
+ * backend portions of the XenStore, following the XenBus convention.
+ *
+ * Any specified default value is in effect if the corresponding XenBus node
+ * is not present in the XenStore.
+ *
+ * XenStore nodes in sections marked "PRIVATE" are solely for use by the
+ * driver side whose XenBus tree contains them.
+ *
+ *****************************************************************************
+ *                            Backend XenBus Nodes
+ *****************************************************************************
+ *
+ *------------------ Backend Device Identification (PRIVATE) ------------------
+ *
+ * p-devname
+ *      Values:         string
+ *
+ *      A free string used to identify the physical device (e.g. a disk name).
+ *
+ * p-dev
+ *      Values:         string
+ *
+ *      A string specifying the backend device: either a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers), or a WWN (e.g.
+ *      "naa.60014054ac780582").
+ *
+ * v-dev
+ *      Values:         string
+ *
+ *      A string specifying the frontend device in form of a 4-tuple "h:c:t:l"
+ *      (host, controller, target, lun, all integers).
+ *
+ *--------------------------------- Features ---------------------------------
+ *
+ * feature-sg-grant
+ *      Values:         unsigned [VSCSIIF_SG_TABLESIZE...65535]
+ *      Default Value:  0
+ *
+ *      Specifies the maximum number of scatter/gather elements in grant pages
+ *      supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE
+ *      SG elements specified directly in the request.
+ *
+ *****************************************************************************
+ *                            Frontend XenBus Nodes
+ *****************************************************************************
+ *
+ *----------------------- Request Transport Parameters -----------------------
+ *
+ * event-channel
+ *      Values:         unsigned
+ *
+ *      The identifier of the Xen event channel used to signal activity
+ *      in the ring buffer.
+ *
+ * ring-ref
+ *      Values:         unsigned
+ *
+ *      The Xen grant reference granting permission for the backend to map
+ *      the sole page in a single page sized ring buffer.
+ *
+ * protocol
+ *      Values:         string (XEN_IO_PROTO_ABI_*)
+ *      Default Value:  XEN_IO_PROTO_ABI_NATIVE
+ *
+ *      The machine ABI rules governing the format of all ring request and
+ *      response structures.
+ */
+
+/* Requests from the frontend to the backend */
+
+/*
+ * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd.
+ * The target is specified via channel, id and lun.
+ *
+ * The operation to be performed is specified via a CDB in cmnd[], the length
+ * of the CDB is in cmd_len. sc_data_direction specifies the direction of data
+ * (to the device, from the device, or none at all).
+ *
+ * If data is to be transferred to or from the device the buffer(s) in the
+ * guest memory is/are specified via one or multiple scsiif_request_segment
+ * descriptors each specifying a memory page via a grant_ref_t, a offset into
+ * the page and the length of the area in that page. All scsiif_request_segment
+ * areas concatenated form the resulting data buffer used by the operation.
+ * If the number of scsiif_request_segment areas is not too large (less than
+ * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the
+ * seg[] array and the number of valid scsiif_request_segment elements is to be
+ * set in nr_segments.
+ *
+ * If "feature-sg-grant" in the Xenstore is set it is possible to specify more
+ * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection.
+ * The maximum number of allowed scsiif_request_segment elements is the value
+ * of the "feature-sg-grant" entry from Xenstore. When using indirection the
+ * seg[] array doesn't contain specifications of the data buffers, but
+ * references to scsiif_request_segment arrays, which in turn reference the
+ * data buffers. While nr_segments holds the number of populated seg[] entries
+ * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment
+ * elements referencing the target data buffers is calculated from the lengths
+ * of the seg[] elements (the sum of all valid seg[].length divided by the
+ * size of one scsiif_request_segment structure).
+ */
+#define VSCSIIF_ACT_SCSI_CDB		1
+
+/*
+ * Request abort of a running operation for the specified target given by
+ * channel, id, lun and the operation's rqid in ref_rqid.
+ */
+#define VSCSIIF_ACT_SCSI_ABORT		2
+
+/*
+ * Request a device reset of the specified target (channel and id).
+ */
+#define VSCSIIF_ACT_SCSI_RESET		3
+
+/*
+ * Preset scatter/gather elements for a following request. Deprecated.
+ * Keeping the define only to avoid usage of the value "4" for other actions.
+ */
+#define VSCSIIF_ACT_SCSI_SG_PRESET	4
+
+/*
+ * Maximum scatter/gather segments per request.
+ *
+ * Considering balance between allocating at least 16 "vscsiif_request"
+ * structures on one page (4096 bytes) and the number of scatter/gather
+ * elements needed, we decided to use 26 as a magic number.
+ *
+ * If "feature-sg-grant" is set, more scatter/gather elements can be specified
+ * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages.
+ * In this case the vscsiif_request seg elements don't contain references to
+ * the user data, but to the SG elements referencing the user data.
+ */
+#define VSCSIIF_SG_TABLESIZE		26
+
+/*
+ * based on Linux kernel 2.6.18, still valid
+ * Changing these values requires support of multiple protocols via the rings
+ * as "old clients" will blindly use these values and the resulting structure
+ * sizes.
+ */
+#define VSCSIIF_MAX_COMMAND_SIZE	16
+#define VSCSIIF_SENSE_BUFFERSIZE	96
+
+struct scsiif_request_segment {
+	grant_ref_t gref;
+	uint16_t offset;
+	uint16_t length;
+};
+
+#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment))
+
+/* Size of one request is 252 bytes */
+struct vscsiif_request {
+	uint16_t rqid;		/* private guest value, echoed in resp  */
+	uint8_t act;		/* command between backend and frontend */
+	uint8_t cmd_len;	/* valid CDB bytes */
+
+	uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE];	/* the CDB */
+	uint16_t timeout_per_command;	/* deprecated */
+	uint16_t channel, id, lun;	/* (virtual) device specification */
+	uint16_t ref_rqid;		/* command abort reference */
+	uint8_t sc_data_direction;	/* for DMA_TO_DEVICE(1)
+					   DMA_FROM_DEVICE(2)
+					   DMA_NONE(3) requests */
+	uint8_t nr_segments;		/* Number of pieces of scatter-gather */
+/*
+ * flag in nr_segments: SG elements via grant page
+ *
+ * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number
+ * of grant pages containing SG elements. Usable if "feature-sg-grant" set.
+ */
+#define VSCSIIF_SG_GRANT	0x80
+
+	struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE];
+	uint32_t reserved[3];
+};
+
+/* Size of one response is 252 bytes */
+struct vscsiif_response {
+	uint16_t rqid;		/* identifies request */
+	uint8_t padding;
+	uint8_t sense_len;
+	uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
+	int32_t rslt;
+	uint32_t residual_len;	/* request bufflen -
+				   return the value from physical device */
+	uint32_t reserved[36];
+};
+
+DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
+
+#endif /*__XEN__PUBLIC_IO_SCSI_H__*/

+ 248 - 24
include/xen/interface/xen.h

@@ -3,6 +3,24 @@
  *
  * Guest OS interface to Xen.
  *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
  * Copyright (c) 2004, K A Fraser
  */
 
@@ -73,13 +91,23 @@
  * VIRTUAL INTERRUPTS
  *
  * Virtual interrupts that a guest OS may receive from Xen.
+ * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
+ * global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
+ * The latter can be allocated only once per guest: they must initially be
+ * allocated to VCPU0 but can subsequently be re-bound.
  */
-#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
-#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
-#define VIRQ_CONSOLE    2  /* (DOM0) Bytes received on emergency console. */
-#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
-#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
-#define VIRQ_PCPU_STATE 9  /* (DOM0) PCPU state changed                   */
+#define VIRQ_TIMER      0  /* V. Timebase update, and/or requested timeout.  */
+#define VIRQ_DEBUG      1  /* V. Request guest to dump debug info.           */
+#define VIRQ_CONSOLE    2  /* G. (DOM0) Bytes received on emergency console. */
+#define VIRQ_DOM_EXC    3  /* G. (DOM0) Exceptional event for some domain.   */
+#define VIRQ_TBUF       4  /* G. (DOM0) Trace buffer has records available.  */
+#define VIRQ_DEBUGGER   6  /* G. (DOM0) A domain has paused for debugging.   */
+#define VIRQ_XENOPROF   7  /* V. XenOprofile interrupt: new sample available */
+#define VIRQ_CON_RING   8  /* G. (DOM0) Bytes received on console            */
+#define VIRQ_PCPU_STATE 9  /* G. (DOM0) PCPU state changed                   */
+#define VIRQ_MEM_EVENT  10 /* G. (DOM0) A memory event has occured           */
+#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient                     */
+#define VIRQ_ENOMEM     12 /* G. (DOM0) Low on heap memory       */
 
 /* Architecture-specific VIRQ definitions. */
 #define VIRQ_ARCH_0    16
@@ -92,24 +120,68 @@
 #define VIRQ_ARCH_7    23
 
 #define NR_VIRQS       24
+
 /*
- * MMU-UPDATE REQUESTS
- *
- * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
- * A foreigndom (FD) can be specified (or DOMID_SELF for none).
- * Where the FD has some effect, it is described below.
- * ptr[1:0] specifies the appropriate MMU_* command.
+ * enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[],
+ *                                         unsigned count, unsigned *done_out,
+ *                                         unsigned foreigndom)
+ * @reqs is an array of mmu_update_t structures ((ptr, val) pairs).
+ * @count is the length of the above array.
+ * @pdone is an output parameter indicating number of completed operations
+ * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this
+ *                    hypercall invocation. Can be DOMID_SELF.
+ * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced
+ *                     in this hypercall invocation. The value of this field
+ *                     (x) encodes the PFD as follows:
+ *                     x == 0 => PFD == DOMID_SELF
+ *                     x != 0 => PFD == x - 1
  *
+ * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command.
+ * -------------
  * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
- * Updates an entry in a page table. If updating an L1 table, and the new
- * table entry is valid/present, the mapped frame must belong to the FD, if
- * an FD has been specified. If attempting to map an I/O page then the
- * caller assumes the privilege of the FD.
+ * Updates an entry in a page table belonging to PFD. If updating an L1 table,
+ * and the new table entry is valid/present, the mapped frame must belong to
+ * FD. If attempting to map an I/O page then the caller assumes the privilege
+ * of the FD.
  * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
  * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
  * ptr[:2]  -- Machine address of the page-table entry to modify.
  * val      -- Value to write.
  *
+ * There also certain implicit requirements when using this hypercall. The
+ * pages that make up a pagetable must be mapped read-only in the guest.
+ * This prevents uncontrolled guest updates to the pagetable. Xen strictly
+ * enforces this, and will disallow any pagetable update which will end up
+ * mapping pagetable page RW, and will disallow using any writable page as a
+ * pagetable. In practice it means that when constructing a page table for a
+ * process, thread, etc, we MUST be very dilligient in following these rules:
+ *  1). Start with top-level page (PGD or in Xen language: L4). Fill out
+ *      the entries.
+ *  2). Keep on going, filling out the upper (PUD or L3), and middle (PMD
+ *      or L2).
+ *  3). Start filling out the PTE table (L1) with the PTE entries. Once
+ *      done, make sure to set each of those entries to RO (so writeable bit
+ *      is unset). Once that has been completed, set the PMD (L2) for this
+ *      PTE table as RO.
+ *  4). When completed with all of the PMD (L2) entries, and all of them have
+ *      been set to RO, make sure to set RO the PUD (L3). Do the same
+ *      operation on PGD (L4) pagetable entries that have a PUD (L3) entry.
+ *  5). Now before you can use those pages (so setting the cr3), you MUST also
+ *      pin them so that the hypervisor can verify the entries. This is done
+ *      via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame
+ *      number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op(
+ *      MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be
+ *      issued.
+ * For 32-bit guests, the L4 is not used (as there is less pagetables), so
+ * instead use L3.
+ * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE
+ * hypercall. Also if so desired the OS can also try to write to the PTE
+ * and be trapped by the hypervisor (as the PTE entry is RO).
+ *
+ * To deallocate the pages, the operations are the reverse of the steps
+ * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the
+ * pagetable MUST not be in use (meaning that the cr3 is not set to it).
+ *
  * ptr[1:0] == MMU_MACHPHYS_UPDATE:
  * Updates an entry in the machine->pseudo-physical mapping table.
  * ptr[:2]  -- Machine address within the frame whose mapping to modify.
@@ -119,6 +191,72 @@
  * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
  * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
  * with those in @val.
+ *
+ * @val is usually the machine frame number along with some attributes.
+ * The attributes by default follow the architecture defined bits. Meaning that
+ * if this is a X86_64 machine and four page table layout is used, the layout
+ * of val is:
+ *  - 63 if set means No execute (NX)
+ *  - 46-13 the machine frame number
+ *  - 12 available for guest
+ *  - 11 available for guest
+ *  - 10 available for guest
+ *  - 9 available for guest
+ *  - 8 global
+ *  - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages)
+ *  - 6 dirty
+ *  - 5 accessed
+ *  - 4 page cached disabled
+ *  - 3 page write through
+ *  - 2 userspace accessible
+ *  - 1 writeable
+ *  - 0 present
+ *
+ *  The one bits that does not fit with the default layout is the PAGE_PSE
+ *  also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the
+ *  HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB
+ *  (or 2MB) instead of using the PAGE_PSE bit.
+ *
+ *  The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen
+ *  using it as the Page Attribute Table (PAT) bit - for details on it please
+ *  refer to Intel SDM 10.12. The PAT allows to set the caching attributes of
+ *  pages instead of using MTRRs.
+ *
+ *  The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits):
+ *                    PAT4                 PAT0
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WC | WB | UC | UC- | WC | WB |  <= Linux
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | UC  | UC- | WT | WB | UC | UC- | WT | WB |  <= BIOS (default when machine boots)
+ *  +-----+-----+----+----+----+-----+----+----+
+ *  | rsv | rsv | WP | WC | UC | UC- | WT | WB |  <= Xen
+ *  +-----+-----+----+----+----+-----+----+----+
+ *
+ *  The lookup of this index table translates to looking up
+ *  Bit 7, Bit 4, and Bit 3 of val entry:
+ *
+ *  PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3).
+ *
+ *  If all bits are off, then we are using PAT0. If bit 3 turned on,
+ *  then we are using PAT1, if bit 3 and bit 4, then PAT2..
+ *
+ *  As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means
+ *  that if a guest that follows Linux's PAT setup and would like to set Write
+ *  Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is
+ *  set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the
+ *  caching as:
+ *
+ *   WB = none (so PAT0)
+ *   WC = PWT (bit 3 on)
+ *   UC = PWT | PCD (bit 3 and 4 are on).
+ *
+ * To make it work with Xen, it needs to translate the WC bit as so:
+ *
+ *  PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3
+ *
+ * And to translate back it would:
+ *
+ * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7.
  */
 #define MMU_NORMAL_PT_UPDATE      0 /* checked '*ptr = val'. ptr is MA.       */
 #define MMU_MACHPHYS_UPDATE       1 /* ptr = MA of frame to modify entry for  */
@@ -127,7 +265,12 @@
 /*
  * MMU EXTENDED OPERATIONS
  *
- * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
+ * enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[],
+ *                                        unsigned int count,
+ *                                        unsigned int *pdone,
+ *                                        unsigned int foreigndom)
+ */
+/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
  * A foreigndom (FD) can be specified (or DOMID_SELF for none).
  * Where the FD has some effect, it is described below.
  *
@@ -164,9 +307,23 @@
  * cmd: MMUEXT_FLUSH_CACHE
  * No additional arguments. Writes back and flushes cache contents.
  *
+ * cmd: MMUEXT_FLUSH_CACHE_GLOBAL
+ * No additional arguments. Writes back and flushes cache contents
+ * on all CPUs in the system.
+ *
  * cmd: MMUEXT_SET_LDT
  * linear_addr: Linear address of LDT base (NB. must be page-aligned).
  * nr_ents: Number of entries in LDT.
+ *
+ * cmd: MMUEXT_CLEAR_PAGE
+ * mfn: Machine frame number to be cleared.
+ *
+ * cmd: MMUEXT_COPY_PAGE
+ * mfn: Machine frame number of the destination page.
+ * src_mfn: Machine frame number of the source page.
+ *
+ * cmd: MMUEXT_[UN]MARK_SUPER
+ * mfn: Machine frame number of head of superpage to be [un]marked.
  */
 #define MMUEXT_PIN_L1_TABLE      0
 #define MMUEXT_PIN_L2_TABLE      1
@@ -183,12 +340,18 @@
 #define MMUEXT_FLUSH_CACHE      12
 #define MMUEXT_SET_LDT          13
 #define MMUEXT_NEW_USER_BASEPTR 15
+#define MMUEXT_CLEAR_PAGE       16
+#define MMUEXT_COPY_PAGE        17
+#define MMUEXT_FLUSH_CACHE_GLOBAL 18
+#define MMUEXT_MARK_SUPER       19
+#define MMUEXT_UNMARK_SUPER     20
 
 #ifndef __ASSEMBLY__
 struct mmuext_op {
 	unsigned int cmd;
 	union {
-		/* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
+		/* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
+		 * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
 		xen_pfn_t mfn;
 		/* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
 		unsigned long linear_addr;
@@ -198,6 +361,8 @@ struct mmuext_op {
 		unsigned int nr_ents;
 		/* TLB_FLUSH_MULTI, INVLPG_MULTI */
 		void *vcpumask;
+		/* COPY_PAGE */
+		xen_pfn_t src_mfn;
 	} arg2;
 };
 DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
@@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
  */
 #define VMASST_CMD_enable                0
 #define VMASST_CMD_disable               1
+
+/* x86/32 guests: simulate full 4GB segment limits. */
 #define VMASST_TYPE_4gb_segments         0
+
+/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
 #define VMASST_TYPE_4gb_segments_notify  1
+
+/*
+ * x86 guests: support writes to bottom-level PTEs.
+ * NB1. Page-directory entries cannot be written.
+ * NB2. Guest must continue to remove all writable mappings of PTEs.
+ */
 #define VMASST_TYPE_writable_pagetables  2
+
+/* x86/PAE guests: support PDPTs above 4GB. */
 #define VMASST_TYPE_pae_extended_cr3     3
+
 #define MAX_VMASST_TYPE 3
 
 #ifndef __ASSEMBLY__
@@ -260,6 +438,15 @@ typedef uint16_t domid_t;
  */
 #define DOMID_XEN  (0x7FF2U)
 
+/* DOMID_COW is used as the owner of sharable pages */
+#define DOMID_COW  (0x7FF3U)
+
+/* DOMID_INVALID is used to identify pages with unknown owner. */
+#define DOMID_INVALID (0x7FF4U)
+
+/* Idle domain. */
+#define DOMID_IDLE (0x7FFFU)
+
 /*
  * Send an array of these to HYPERVISOR_mmu_update().
  * NB. The fields are natural pointer/address size for this architecture.
@@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
 
 /*
  * Send an array of these to HYPERVISOR_multicall().
- * NB. The fields are natural register size for this architecture.
+ * NB. The fields are logically the natural register size for this
+ * architecture. In cases where xen_ulong_t is larger than this then
+ * any unused bits in the upper portion must be zero.
  */
 struct multicall_entry {
     xen_ulong_t op;
@@ -442,8 +631,48 @@ struct start_info {
 	unsigned long mod_start;    /* VIRTUAL address of pre-loaded module.  */
 	unsigned long mod_len;      /* Size (bytes) of pre-loaded module.     */
 	int8_t cmd_line[MAX_GUEST_CMDLINE];
+	/* The pfn range here covers both page table and p->m table frames.   */
+	unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table.    */
+	unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table.  */
 };
 
+/* These flags are passed in the 'flags' field of start_info_t. */
+#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+#define SIF_MULTIBOOT_MOD (1<<2)  /* Is mod_start a multiboot module? */
+#define SIF_MOD_START_PFN (1<<3)  /* Is mod_start a PFN? */
+#define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
+
+/*
+ * A multiboot module is a package containing modules very similar to a
+ * multiboot module array. The only differences are:
+ * - the array of module descriptors is by convention simply at the beginning
+ *   of the multiboot module,
+ * - addresses in the module descriptors are based on the beginning of the
+ *   multiboot module,
+ * - the number of modules is determined by a termination descriptor that has
+ *   mod_start == 0.
+ *
+ * This permits to both build it statically and reference it in a configuration
+ * file, and let the PV guest easily rebase the addresses to virtual addresses
+ * and at the same time count the number of modules.
+ */
+struct xen_multiboot_mod_list {
+	/* Address of first byte of the module */
+	uint32_t mod_start;
+	/* Address of last byte of the module (inclusive) */
+	uint32_t mod_end;
+	/* Address of zero-terminated command line */
+	uint32_t cmdline;
+	/* Unused, must be zero */
+	uint32_t pad;
+};
+/*
+ * The console structure in start_info.console.dom0
+ *
+ * This structure includes a variety of information required to
+ * have a working VGA/VESA console.
+ */
 struct dom0_vga_console_info {
 	uint8_t video_type;
 #define XEN_VGATYPE_TEXT_MODE_3 0x03
@@ -484,11 +713,6 @@ struct dom0_vga_console_info {
 	} u;
 };
 
-/* These flags are passed in the 'flags' field of start_info_t. */
-#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
-#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
-#define SIF_PM_MASK       (0xFF<<8) /* reserve 1 byte for xen-pm options */
-
 typedef uint64_t cpumap_t;
 
 typedef uint8_t xen_domain_handle_t[16];

+ 12 - 9
include/xen/xenbus.h

@@ -86,6 +86,7 @@ struct xenbus_device_id
 
 /* A xenbus driver. */
 struct xenbus_driver {
+	const char *name;       /* defaults to ids[0].devicetype */
 	const struct xenbus_device_id *ids;
 	int (*probe)(struct xenbus_device *dev,
 		     const struct xenbus_device_id *id);
@@ -100,20 +101,22 @@ struct xenbus_driver {
 	int (*is_ready)(struct xenbus_device *dev);
 };
 
-#define DEFINE_XENBUS_DRIVER(var, drvname, methods...)		\
-struct xenbus_driver var ## _driver = {				\
-	.driver.name = drvname + 0 ?: var ## _ids->devicetype,	\
-	.driver.owner = THIS_MODULE,				\
-	.ids = var ## _ids, ## methods				\
-}
-
 static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
 {
 	return container_of(drv, struct xenbus_driver, driver);
 }
 
-int __must_check xenbus_register_frontend(struct xenbus_driver *);
-int __must_check xenbus_register_backend(struct xenbus_driver *);
+int __must_check __xenbus_register_frontend(struct xenbus_driver *drv,
+					    struct module *owner,
+					    const char *mod_name);
+int __must_check __xenbus_register_backend(struct xenbus_driver *drv,
+					   struct module *owner,
+					   const char *mod_name);
+
+#define xenbus_register_frontend(drv) \
+	__xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
+#define xenbus_register_backend(drv) \
+	__xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
 
 void xenbus_unregister_driver(struct xenbus_driver *drv);