Browse Source

Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6

* 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/sparc-2.6: (26 commits)
  [SPARC64]: Fix UP build.
  [SPARC64]: dr-cpu unconfigure support.
  [SERIAL]: Fix console write locking in sparc drivers.
  [SPARC64]: Give more accurate errors in dr_cpu_configure().
  [SPARC64]: Clear cpu_{core,sibling}_map[] in smp_fill_in_sib_core_maps()
  [SPARC64]: Fix leak when DR added cpu does not bootup.
  [SPARC64]: Add ->set_affinity IRQ handlers.
  [SPARC64]: Process dr-cpu events in a kthread instead of workqueue.
  [SPARC64]: More sensible udelay implementation.
  [SPARC64]: SMP build fixes.
  [SPARC64]: mdesc.c needs linux/mm.h
  [SPARC64]: Fix build regressions added by dr-cpu changes.
  [SPARC64]: Unconditionally register vio_bus_type.
  [SPARC64]: Initial LDOM cpu hotplug support.
  [SPARC64]: Fix setting of variables in LDOM guest.
  [SPARC64]: Fix MD property lifetime bugs.
  [SPARC64]: Abstract out mdesc accesses for better MD update handling.
  [SPARC64]: Use more mearningful names for IRQ registry.
  [SPARC64]: Initial domain-services driver.
  [SPARC64]: Export powerd facilities for external entities.
  ...
Linus Torvalds 18 years ago
parent
commit
02b2318e07
45 changed files with 8609 additions and 537 deletions
  1. 15 0
      arch/sparc64/Kconfig
  2. 2 1
      arch/sparc64/kernel/Makefile
  3. 1158 0
      arch/sparc64/kernel/ds.c
  4. 139 0
      arch/sparc64/kernel/hvtramp.S
  5. 84 0
      arch/sparc64/kernel/irq.c
  6. 2373 0
      arch/sparc64/kernel/ldc.c
  7. 398 300
      arch/sparc64/kernel/mdesc.c
  8. 31 23
      arch/sparc64/kernel/power.c
  9. 16 5
      arch/sparc64/kernel/process.c
  10. 1 1
      arch/sparc64/kernel/prom.c
  11. 1 4
      arch/sparc64/kernel/setup.c
  12. 201 50
      arch/sparc64/kernel/smp.c
  13. 2 14
      arch/sparc64/kernel/sparc64_ksyms.c
  14. 0 2
      arch/sparc64/kernel/sysfs.c
  15. 21 7
      arch/sparc64/kernel/time.c
  16. 395 0
      arch/sparc64/kernel/vio.c
  17. 792 0
      arch/sparc64/kernel/viohs.c
  18. 1 1
      arch/sparc64/lib/Makefile
  19. 0 46
      arch/sparc64/lib/delay.c
  20. 13 0
      arch/sparc64/prom/misc.c
  21. 1 0
      arch/sparc64/prom/p1275.c
  22. 11 2
      arch/sparc64/prom/tree.c
  23. 7 0
      drivers/block/Kconfig
  24. 1 0
      drivers/block/Makefile
  25. 972 0
      drivers/block/sunvdc.c
  26. 6 0
      drivers/net/Kconfig
  27. 1 0
      drivers/net/Makefile
  28. 1164 0
      drivers/net/sunvnet.c
  29. 70 0
      drivers/net/sunvnet.h
  30. 26 4
      drivers/serial/sunhv.c
  31. 14 5
      drivers/serial/sunsab.c
  32. 14 0
      drivers/serial/sunsu.c
  33. 14 3
      drivers/serial/sunzilog.c
  34. 0 5
      include/asm-sparc64/bugs.h
  35. 3 2
      include/asm-sparc64/cpudata.h
  36. 6 26
      include/asm-sparc64/delay.h
  37. 37 0
      include/asm-sparc64/hvtramp.h
  38. 1 1
      include/asm-sparc64/hypervisor.h
  39. 2 0
      include/asm-sparc64/irq.h
  40. 138 0
      include/asm-sparc64/ldc.h
  41. 58 30
      include/asm-sparc64/mdesc.h
  42. 3 0
      include/asm-sparc64/mmu_context.h
  43. 7 0
      include/asm-sparc64/power.h
  44. 6 5
      include/asm-sparc64/smp.h
  45. 404 0
      include/asm-sparc64/vio.h

+ 15 - 0
arch/sparc64/Kconfig

@@ -108,6 +108,15 @@ config SECCOMP
 
 
 source kernel/Kconfig.hz
 source kernel/Kconfig.hz
 
 
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs"
+	depends on SMP
+	select HOTPLUG
+	---help---
+	  Say Y here to experiment with turning CPUs off and on.  CPUs
+	  can be controlled through /sys/devices/system/cpu/cpu#.
+	  Say N if you want to disable CPU hotplug.
+
 source "init/Kconfig"
 source "init/Kconfig"
 
 
 config SYSVIPC_COMPAT
 config SYSVIPC_COMPAT
@@ -305,6 +314,12 @@ config SUN_IO
 	bool
 	bool
 	default y
 	default y
 
 
+config SUN_LDOMS
+	bool "Sun Logical Domains support"
+	help
+	  Say Y here if you want to support virtual devices via
+	  Logical Domains.
+
 config PCI
 config PCI
 	bool "PCI support"
 	bool "PCI support"
 	select ARCH_SUPPORTS_MSI
 	select ARCH_SUPPORTS_MSI

+ 2 - 1
arch/sparc64/kernel/Makefile

@@ -18,7 +18,7 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o pci_iommu.o \
 obj-$(CONFIG_PCI)	 += ebus.o isa.o pci_common.o pci_iommu.o \
 			    pci_psycho.o pci_sabre.o pci_schizo.o \
 			    pci_psycho.o pci_sabre.o pci_schizo.o \
 			    pci_sun4v.o pci_sun4v_asm.o pci_fire.o
 			    pci_sun4v.o pci_sun4v_asm.o pci_fire.o
-obj-$(CONFIG_SMP)	 += smp.o trampoline.o
+obj-$(CONFIG_SMP)	 += smp.o trampoline.o hvtramp.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_SPARC32_COMPAT) += sys32.o sys_sparc32.o signal32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
 obj-$(CONFIG_BINFMT_ELF32) += binfmt_elf32.o
 obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o
 obj-$(CONFIG_BINFMT_AOUT32) += binfmt_aout32.o
@@ -26,6 +26,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_US3_FREQ) += us3_cpufreq.o
 obj-$(CONFIG_US3_FREQ) += us3_cpufreq.o
 obj-$(CONFIG_US2E_FREQ) += us2e_cpufreq.o
 obj-$(CONFIG_US2E_FREQ) += us2e_cpufreq.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KPROBES) += kprobes.o
+obj-$(CONFIG_SUN_LDOMS) += ldc.o vio.o viohs.o ds.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDIT)$(CONFIG_SPARC32_COMPAT) += compat_audit.o
 obj-$(CONFIG_AUDIT)$(CONFIG_SPARC32_COMPAT) += compat_audit.o
 obj-y += $(obj-yy)
 obj-y += $(obj-yy)

+ 1158 - 0
arch/sparc64/kernel/ds.c

@@ -0,0 +1,1158 @@
+/* ds.c: Domain Services driver for Logical Domains
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/kthread.h>
+#include <linux/cpu.h>
+
+#include <asm/ldc.h>
+#include <asm/vio.h>
+#include <asm/power.h>
+#include <asm/mdesc.h>
+#include <asm/head.h>
+#include <asm/irq.h>
+
+#define DRV_MODULE_NAME		"ds"
+#define PFX DRV_MODULE_NAME	": "
+#define DRV_MODULE_VERSION	"1.0"
+#define DRV_MODULE_RELDATE	"Jul 11, 2007"
+
+static char version[] __devinitdata =
+	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
+MODULE_DESCRIPTION("Sun LDOM domain services driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+struct ds_msg_tag {
+	__u32			type;
+#define DS_INIT_REQ		0x00
+#define DS_INIT_ACK		0x01
+#define DS_INIT_NACK		0x02
+#define DS_REG_REQ		0x03
+#define DS_REG_ACK		0x04
+#define DS_REG_NACK		0x05
+#define DS_UNREG_REQ		0x06
+#define DS_UNREG_ACK		0x07
+#define DS_UNREG_NACK		0x08
+#define DS_DATA			0x09
+#define DS_NACK			0x0a
+
+	__u32			len;
+};
+
+/* Result codes */
+#define DS_OK			0x00
+#define DS_REG_VER_NACK		0x01
+#define DS_REG_DUP		0x02
+#define DS_INV_HDL		0x03
+#define DS_TYPE_UNKNOWN		0x04
+
+struct ds_version {
+	__u16			major;
+	__u16			minor;
+};
+
+struct ds_ver_req {
+	struct ds_msg_tag	tag;
+	struct ds_version	ver;
+};
+
+struct ds_ver_ack {
+	struct ds_msg_tag	tag;
+	__u16			minor;
+};
+
+struct ds_ver_nack {
+	struct ds_msg_tag	tag;
+	__u16			major;
+};
+
+struct ds_reg_req {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+	__u16			major;
+	__u16			minor;
+	char			svc_id[0];
+};
+
+struct ds_reg_ack {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+	__u16			minor;
+};
+
+struct ds_reg_nack {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+	__u16			major;
+};
+
+struct ds_unreg_req {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+};
+
+struct ds_unreg_ack {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+};
+
+struct ds_unreg_nack {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+};
+
+struct ds_data {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+};
+
+struct ds_data_nack {
+	struct ds_msg_tag	tag;
+	__u64			handle;
+	__u64			result;
+};
+/* Per-service bookkeeping; one instance per entry of ds_states[]. */
+struct ds_cap_state {
+	__u64			handle;	/* upper 32 bits are the ds_states[] index, see find_cap() */
+
+	void			(*data)(struct ldc_channel *lp,
+					struct ds_cap_state *cp,
+					void *buf, int len);	/* called by ds_data() for packets on this handle */
+
+	const char		*service_id;	/* name sent in DS_REG_REQ and matched by find_cap_by_string() */
+
+	u8			state;	/* one of the CAP_STATE_* values below */
+#define CAP_STATE_UNKNOWN	0x00
+#define CAP_STATE_REG_SENT	0x01
+#define CAP_STATE_REGISTERED	0x02
+};
+
+static void md_update_data(struct ldc_channel *lp, struct ds_cap_state *cp,
+			   void *buf, int len);
+static void domain_shutdown_data(struct ldc_channel *lp,
+				 struct ds_cap_state *cp,
+				 void *buf, int len);
+static void domain_panic_data(struct ldc_channel *lp,
+			      struct ds_cap_state *cp,
+			      void *buf, int len);
+#ifdef CONFIG_HOTPLUG_CPU
+static void dr_cpu_data(struct ldc_channel *lp,
+			struct ds_cap_state *cp,
+			void *buf, int len);
+#endif
+static void ds_pri_data(struct ldc_channel *lp,
+			struct ds_cap_state *cp,
+			void *buf, int len);
+static void ds_var_data(struct ldc_channel *lp,
+			struct ds_cap_state *cp,
+			void *buf, int len);
+/* Table of the domain services this driver registers for.
+ * The position of an entry in this array is encoded in the upper 32 bits
+ * of its handle (see ds_init() and register_services()), which is how
+ * find_cap() maps an incoming handle back to its entry.
+ */
+struct ds_cap_state ds_states[] = {
+	{
+		.service_id	= "md-update",
+		.data		= md_update_data,
+	},
+	{
+		.service_id	= "domain-shutdown",
+		.data		= domain_shutdown_data,
+	},
+	{
+		.service_id	= "domain-panic",
+		.data		= domain_panic_data,
+	},
+#ifdef CONFIG_HOTPLUG_CPU
+	{
+		.service_id	= "dr-cpu",
+		.data		= dr_cpu_data,
+	},
+#endif
+	{
+		.service_id	= "pri",
+		.data		= ds_pri_data,
+	},
+	{
+		.service_id	= "var-config",
+		.data		= ds_var_data,
+	},
+	{
+		.service_id	= "var-config-backup",
+		.data		= ds_var_data,
+	},
+};
+
+static DEFINE_SPINLOCK(ds_lock);
+/* State of the domain services port; allocated and filled in by ds_probe(). */
+struct ds_info {
+	struct ldc_channel	*lp;	/* channel returned by ldc_alloc() */
+	u8			hs_state;	/* handshake progress, DS_HS_* below */
+#define DS_HS_START		0x01
+#define DS_HS_DONE		0x02
+
+	void			*rcv_buf;	/* buffer ds_event() reads each message into */
+	int			rcv_buf_len;	/* size of rcv_buf in bytes */
+};
+
+static struct ds_info *ds_info;
+
+/* Map a service handle back to its ds_states[] entry.
+ *
+ * Only the upper 32 bits of the handle are examined; they are used as the
+ * array index.  Returns NULL when that index is out of range.
+ */
+static struct ds_cap_state *find_cap(u64 handle)
+{
+	unsigned int slot = handle >> 32;
+
+	return (slot < ARRAY_SIZE(ds_states)) ? &ds_states[slot] : NULL;
+}
+
+/* Look up a ds_states[] entry by its service_id string.
+ *
+ * Returns the first entry whose service_id compares equal to @name, or
+ * NULL when no entry matches.
+ */
+static struct ds_cap_state *find_cap_by_string(const char *name)
+{
+	struct ds_cap_state *cp = ds_states;
+	struct ds_cap_state *end = ds_states + ARRAY_SIZE(ds_states);
+
+	for (; cp < end; cp++) {
+		if (!strcmp(cp->service_id, name))
+			return cp;
+	}
+	return NULL;
+}
+
+/* Write @len bytes from @data to the LDC channel @lp.
+ *
+ * ldc_write() is retried, with a 1us delay between attempts, for as long
+ * as it reports -EAGAIN, up to 1000 attempts.  The result of the last
+ * ldc_write() call is returned.
+ */
+static int ds_send(struct ldc_channel *lp, void *data, int len)
+{
+	int err = -EINVAL;
+	int tries;
+
+	for (tries = 1000; tries > 0; tries--) {
+		err = ldc_write(lp, data, len);
+		if (err != -EAGAIN)
+			break;
+		udelay(1);
+	}
+
+	return err;
+}
+
+struct ds_md_update_req {
+	__u64				req_num;
+};
+
+struct ds_md_update_res {
+	__u64				req_num;
+	__u32				result;
+};
+/* "md-update" service callback.
+ * Replies with a DS_DATA packet carrying DS_OK and the request's req_num,
+ * then calls mdesc_update().  The reply is sent before the update runs.
+ * NOTE(review): len is not checked before rp->req_num is read -- confirm
+ * the sender always supplies a full ds_md_update_req after the header.
+ */
+static void md_update_data(struct ldc_channel *lp,
+			   struct ds_cap_state *dp,
+			   void *buf, int len)
+{
+	struct ds_data *dpkt = buf;
+	struct ds_md_update_req *rp;
+	struct {
+		struct ds_data		data;
+		struct ds_md_update_res	res;
+	} pkt;
+
+	rp = (struct ds_md_update_req *) (dpkt + 1);	/* request body follows the ds_data header */
+
+	printk(KERN_INFO PFX "Machine description update.\n");
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.data.tag.type = DS_DATA;
+	pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+	pkt.data.handle = dp->handle;
+	pkt.res.req_num = rp->req_num;
+	pkt.res.result = DS_OK;
+
+	ds_send(lp, &pkt, sizeof(pkt));
+
+	mdesc_update();
+}
+
+struct ds_shutdown_req {
+	__u64				req_num;
+	__u32				ms_delay;
+};
+
+struct ds_shutdown_res {
+	__u64				req_num;
+	__u32				result;
+	char				reason[1];
+};
+/* "domain-shutdown" service callback.
+ * Logs the request, replies with DS_OK and an empty reason string echoing
+ * the request's req_num, then calls wake_up_powerd().
+ * The ms_delay field of the request is not consulted here.
+ */
+static void domain_shutdown_data(struct ldc_channel *lp,
+				 struct ds_cap_state *dp,
+				 void *buf, int len)
+{
+	struct ds_data *dpkt = buf;
+	struct ds_shutdown_req *rp;
+	struct {
+		struct ds_data		data;
+		struct ds_shutdown_res	res;
+	} pkt;
+
+	rp = (struct ds_shutdown_req *) (dpkt + 1);	/* request body follows the ds_data header */
+
+	printk(KERN_ALERT PFX "Shutdown request from "
+	       "LDOM manager received.\n");
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.data.tag.type = DS_DATA;
+	pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+	pkt.data.handle = dp->handle;
+	pkt.res.req_num = rp->req_num;
+	pkt.res.result = DS_OK;
+	pkt.res.reason[0] = 0;
+
+	ds_send(lp, &pkt, sizeof(pkt));
+
+	wake_up_powerd();
+}
+
+struct ds_panic_req {
+	__u64				req_num;
+};
+
+struct ds_panic_res {
+	__u64				req_num;
+	__u32				result;
+	char				reason[1];
+};
+/* "domain-panic" service callback.
+ * Logs the request, replies with DS_OK and an empty reason string echoing
+ * the request's req_num, then calls panic().  The reply is sent first, so
+ * it goes out before the panic is raised.
+ */
+static void domain_panic_data(struct ldc_channel *lp,
+			      struct ds_cap_state *dp,
+			      void *buf, int len)
+{
+	struct ds_data *dpkt = buf;
+	struct ds_panic_req *rp;
+	struct {
+		struct ds_data		data;
+		struct ds_panic_res	res;
+	} pkt;
+
+	rp = (struct ds_panic_req *) (dpkt + 1);	/* request body follows the ds_data header */
+
+	printk(KERN_ALERT PFX "Panic request from "
+	       "LDOM manager received.\n");
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.data.tag.type = DS_DATA;
+	pkt.data.tag.len = sizeof(pkt) - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+	pkt.data.handle = dp->handle;
+	pkt.res.req_num = rp->req_num;
+	pkt.res.result = DS_OK;
+	pkt.res.reason[0] = 0;
+
+	ds_send(lp, &pkt, sizeof(pkt));
+
+	panic("PANIC requested by LDOM manager.");
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+struct dr_cpu_tag {
+	__u64				req_num;
+	__u32				type;
+#define DR_CPU_CONFIGURE		0x43
+#define DR_CPU_UNCONFIGURE		0x55
+#define DR_CPU_FORCE_UNCONFIGURE	0x46
+#define DR_CPU_STATUS			0x53
+
+/* Responses */
+#define DR_CPU_OK			0x6f
+#define DR_CPU_ERROR			0x65
+
+	__u32				num_records;
+};
+
+struct dr_cpu_resp_entry {
+	__u32				cpu;
+	__u32				result;
+#define DR_CPU_RES_OK			0x00
+#define DR_CPU_RES_FAILURE		0x01
+#define DR_CPU_RES_BLOCKED		0x02
+#define DR_CPU_RES_CPU_NOT_RESPONDING	0x03
+#define DR_CPU_RES_NOT_IN_MD		0x04
+
+	__u32				stat;
+#define DR_CPU_STAT_NOT_PRESENT		0x00
+#define DR_CPU_STAT_UNCONFIGURED	0x01
+#define DR_CPU_STAT_CONFIGURED		0x02
+
+	__u32				str_off;
+};
+
+/* DR cpu requests get queued onto the work list by the
+ * dr_cpu_data() callback.  The list is protected by
+ * ds_lock, and processed by dr_cpu_process() in order.
+ */
+static LIST_HEAD(dr_cpu_work_list);
+static DECLARE_WAIT_QUEUE_HEAD(dr_cpu_wait);
+
+struct dr_cpu_queue_entry {
+	struct list_head		list;
+	char				req[0];
+};
+/* Send a DR_CPU_ERROR reply with zero records for the request in @data.
+ * The reply echoes the request's req_num and uses @cp's handle; it is sent
+ * on ds_info->lp.  This variant takes no lock itself: dr_cpu_send_error()
+ * wraps it in ds_lock, and dr_cpu_data() calls it directly.
+ */
+static void __dr_cpu_send_error(struct ds_cap_state *cp, struct ds_data *data)
+{
+	struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1);
+	struct ds_info *dp = ds_info;
+	struct {
+		struct ds_data		data;
+		struct dr_cpu_tag	tag;
+	} pkt;
+	int msg_len;
+
+	memset(&pkt, 0, sizeof(pkt));
+	pkt.data.tag.type = DS_DATA;
+	pkt.data.handle = cp->handle;
+	pkt.tag.req_num = tag->req_num;
+	pkt.tag.type = DR_CPU_ERROR;
+	pkt.tag.num_records = 0;
+
+	msg_len = (sizeof(struct ds_data) +
+		   sizeof(struct dr_cpu_tag));
+
+	pkt.data.tag.len = msg_len - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+
+	ds_send(dp->lp, &pkt, msg_len);
+}
+/* Locked wrapper: calls __dr_cpu_send_error() with ds_lock held and IRQs saved. */
+static void dr_cpu_send_error(struct ds_cap_state *cp, struct ds_data *data)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ds_lock, flags);
+	__dr_cpu_send_error(cp, data);
+	spin_unlock_irqrestore(&ds_lock, flags);
+}
+
+#define CPU_SENTINEL	0xffffffff
+
+/* Remove duplicate cpu ids from @list in place.
+ *
+ * The first occurrence of each id is kept; every later occurrence is
+ * overwritten with CPU_SENTINEL.  The list length is unchanged.
+ */
+static void purge_dups(u32 *list, u32 num_ents)
+{
+	u32 *end = list + num_ents;
+	u32 *cur, *scan;
+
+	for (cur = list; cur < end; cur++) {
+		if (*cur != CPU_SENTINEL) {
+			for (scan = cur + 1; scan < end; scan++) {
+				if (*scan == *cur)
+					*scan = CPU_SENTINEL;
+			}
+		}
+	}
+}
+
+/* Size in bytes of a dr-cpu response carrying @ncpus result entries:
+ * the ds_data header, the dr_cpu_tag, then one dr_cpu_resp_entry per cpu.
+ */
+static int dr_cpu_size_response(int ncpus)
+{
+	int len = sizeof(struct ds_data);
+
+	len += sizeof(struct dr_cpu_tag);
+	len += sizeof(struct dr_cpu_resp_entry) * ncpus;
+
+	return len;
+}
+/* Fill in a dr-cpu response buffer laid out as ds_data, dr_cpu_tag, entries.
+ * The tag is set to DR_CPU_OK with @ncpus records, and one entry is written
+ * per cpu in @mask with result DR_CPU_RES_OK and stat @default_stat.
+ * BUG()s if the number of cpus in @mask differs from @ncpus.
+ */
+static void dr_cpu_init_response(struct ds_data *resp, u64 req_num,
+				 u64 handle, int resp_len, int ncpus,
+				 cpumask_t *mask, u32 default_stat)
+{
+	struct dr_cpu_resp_entry *ent;
+	struct dr_cpu_tag *tag;
+	int i, cpu;
+
+	tag = (struct dr_cpu_tag *) (resp + 1);
+	ent = (struct dr_cpu_resp_entry *) (tag + 1);
+
+	resp->tag.type = DS_DATA;
+	resp->tag.len = resp_len - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+	resp->handle = handle;
+	tag->req_num = req_num;
+	tag->type = DR_CPU_OK;
+	tag->num_records = ncpus;
+
+	i = 0;
+	for_each_cpu_mask(cpu, *mask) {
+		ent[i].cpu = cpu;
+		ent[i].result = DR_CPU_RES_OK;
+		ent[i].stat = default_stat;
+		i++;
+	}
+	BUG_ON(i != ncpus);
+}
+
+/* Overwrite the result and stat of @cpu's entry in the response @resp.
+ *
+ * Only the first of the @ncpus entries whose cpu field matches is
+ * updated; nothing happens when @cpu has no entry.
+ */
+static void dr_cpu_mark(struct ds_data *resp, int cpu, int ncpus,
+			u32 res, u32 stat)
+{
+	struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (resp + 1);
+	struct dr_cpu_resp_entry *ent = (struct dr_cpu_resp_entry *) (tag + 1);
+	int i;
+
+	for (i = 0; i < ncpus; i++) {
+		struct dr_cpu_resp_entry *e = &ent[i];
+
+		if (e->cpu == cpu) {
+			e->result = res;
+			e->stat = stat;
+			return;
+		}
+	}
+}
+/* Handle a DR_CPU_CONFIGURE request for the cpus in @mask.
+ * Builds a response defaulting every cpu to OK/CONFIGURED, calls
+ * mdesc_fill_in_cpu_data(), then cpu_up() on each cpu, downgrading the
+ * entry of any cpu that fails.  The response is sent under ds_lock and
+ * fixup_irqs() is called afterwards.
+ * Returns 0, or -ENOMEM if the response buffer cannot be allocated; the
+ * result of ds_send() is not checked.
+ */
+static int dr_cpu_configure(struct ds_cap_state *cp, u64 req_num,
+			    cpumask_t *mask)
+{
+	struct ds_data *resp;
+	int resp_len, ncpus, cpu;
+	unsigned long flags;
+
+	ncpus = cpus_weight(*mask);
+	resp_len = dr_cpu_size_response(ncpus);
+	resp = kzalloc(resp_len, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	dr_cpu_init_response(resp, req_num, cp->handle,
+			     resp_len, ncpus, mask,
+			     DR_CPU_STAT_CONFIGURED);
+
+	mdesc_fill_in_cpu_data(*mask);
+
+	for_each_cpu_mask(cpu, *mask) {
+		int err;
+
+		printk(KERN_INFO PFX "Starting cpu %d...\n", cpu);
+		err = cpu_up(cpu);
+		if (err) {
+			__u32 res = DR_CPU_RES_FAILURE;
+			__u32 stat = DR_CPU_STAT_UNCONFIGURED;
+
+			if (!cpu_present(cpu)) {
+				/* CPU not present in MD */
+				res = DR_CPU_RES_NOT_IN_MD;
+				stat = DR_CPU_STAT_NOT_PRESENT;
+			} else if (err == -ENODEV) {
+				/* CPU did not call in successfully */
+				res = DR_CPU_RES_CPU_NOT_RESPONDING;
+			}
+
+			printk(KERN_INFO PFX "CPU startup failed err=%d\n",
+			       err);
+			dr_cpu_mark(resp, cpu, ncpus, res, stat);
+		}
+	}
+
+	spin_lock_irqsave(&ds_lock, flags);
+	ds_send(ds_info->lp, resp, resp_len);
+	spin_unlock_irqrestore(&ds_lock, flags);
+
+	kfree(resp);
+
+	/* Redistribute IRQs, taking into account the new cpus.  */
+	fixup_irqs();
+
+	return 0;
+}
+/* Handle an unconfigure request for the cpus in @mask.
+ * Builds a response defaulting every cpu to OK/UNCONFIGURED, calls
+ * cpu_down() on each cpu and marks any that fail as FAILURE/CONFIGURED,
+ * then sends the response under ds_lock.
+ * Returns 0, or -ENOMEM if the response buffer cannot be allocated; the
+ * result of ds_send() is not checked.
+ */
+static int dr_cpu_unconfigure(struct ds_cap_state *cp, u64 req_num,
+			      cpumask_t *mask)
+{
+	struct ds_data *resp;
+	int resp_len, ncpus, cpu;
+	unsigned long flags;
+
+	ncpus = cpus_weight(*mask);
+	resp_len = dr_cpu_size_response(ncpus);
+	resp = kzalloc(resp_len, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	dr_cpu_init_response(resp, req_num, cp->handle,
+			     resp_len, ncpus, mask,
+			     DR_CPU_STAT_UNCONFIGURED);
+
+	for_each_cpu_mask(cpu, *mask) {
+		int err;
+
+		printk(KERN_INFO PFX "CPU[%d]: Shutting down cpu %d...\n",
+		       smp_processor_id(), cpu);
+		err = cpu_down(cpu);
+		if (err)
+			dr_cpu_mark(resp, cpu, ncpus,
+				    DR_CPU_RES_FAILURE,
+				    DR_CPU_STAT_CONFIGURED);
+	}
+
+	spin_lock_irqsave(&ds_lock, flags);
+	ds_send(ds_info->lp, resp, resp_len);
+	spin_unlock_irqrestore(&ds_lock, flags);
+
+	kfree(resp);
+
+	return 0;
+}
+/* Drain dr_cpu_work_list and execute each queued dr-cpu request in order.
+ * The list is moved to a private list under ds_lock, then each request is
+ * handled without the lock: unsupported types get an error reply, otherwise
+ * the cpu list is de-duplicated, turned into a cpumask and passed to
+ * dr_cpu_configure() or dr_cpu_unconfigure().  Each entry is freed after
+ * processing.
+ * NOTE(review): tag->num_records is used to walk cpu_list without being
+ * checked against the size of the queued request -- confirm the sender is
+ * trusted to keep them consistent.
+ */
+static void process_dr_cpu_list(struct ds_cap_state *cp)
+{
+	struct dr_cpu_queue_entry *qp, *tmp;
+	unsigned long flags;
+	LIST_HEAD(todo);
+	cpumask_t mask;
+
+	spin_lock_irqsave(&ds_lock, flags);
+	list_splice(&dr_cpu_work_list, &todo);
+	INIT_LIST_HEAD(&dr_cpu_work_list);
+	spin_unlock_irqrestore(&ds_lock, flags);
+
+	list_for_each_entry_safe(qp, tmp, &todo, list) {
+		struct ds_data *data = (struct ds_data *) qp->req;
+		struct dr_cpu_tag *tag = (struct dr_cpu_tag *) (data + 1);
+		u32 *cpu_list = (u32 *) (tag + 1);	/* cpu ids follow the tag */
+		u64 req_num = tag->req_num;
+		unsigned int i;
+		int err;
+
+		switch (tag->type) {
+		case DR_CPU_CONFIGURE:
+		case DR_CPU_UNCONFIGURE:
+		case DR_CPU_FORCE_UNCONFIGURE:
+			break;
+
+		default:
+			dr_cpu_send_error(cp, data);
+			goto next;
+		}
+
+		purge_dups(cpu_list, tag->num_records);
+
+		cpus_clear(mask);
+		for (i = 0; i < tag->num_records; i++) {
+			if (cpu_list[i] == CPU_SENTINEL)
+				continue;
+
+			if (cpu_list[i] < NR_CPUS)	/* ids >= NR_CPUS are silently dropped */
+				cpu_set(cpu_list[i], mask);
+		}
+
+		if (tag->type == DR_CPU_CONFIGURE)
+			err = dr_cpu_configure(cp, req_num, &mask);
+		else	/* UNCONFIGURE and FORCE_UNCONFIGURE share one path */
+			err = dr_cpu_unconfigure(cp, req_num, &mask);
+
+		if (err)
+			dr_cpu_send_error(cp, data);
+
+next:
+		list_del(&qp->list);
+		kfree(qp);
+	}
+}
+/* Kernel thread body (started from ds_init() as "kdrcpud").
+ * Sleeps interruptibly on dr_cpu_wait while dr_cpu_work_list is empty,
+ * and calls process_dr_cpu_list() after each wakeup until
+ * kthread_should_stop() is true.
+ * NOTE(review): the list_empty() check is made without ds_lock -- confirm
+ * the prepare_to_wait()/wake_up() pairing is considered sufficient here.
+ */
+static int dr_cpu_thread(void *__unused)
+{
+	struct ds_cap_state *cp;
+	DEFINE_WAIT(wait);
+
+	cp = find_cap_by_string("dr-cpu");
+
+	while (1) {
+		prepare_to_wait(&dr_cpu_wait, &wait, TASK_INTERRUPTIBLE);
+		if (list_empty(&dr_cpu_work_list))
+			schedule();
+		finish_wait(&dr_cpu_wait, &wait);
+
+		if (kthread_should_stop())
+			break;
+
+		process_dr_cpu_list(cp);
+	}
+
+	return 0;
+}
+/* "dr-cpu" service callback: queue the request for the kdrcpud thread.
+ * The whole packet is copied into a GFP_ATOMIC allocation, appended to
+ * dr_cpu_work_list and dr_cpu_wait is woken.  If the allocation fails an
+ * error reply is sent immediately instead.
+ * This runs from ds_event() via ds_data(), i.e. with ds_lock already held,
+ * which is why the unlocked __dr_cpu_send_error() is used.
+ * The local rp is assigned but not otherwise used.
+ */
+static void dr_cpu_data(struct ldc_channel *lp,
+			struct ds_cap_state *dp,
+			void *buf, int len)
+{
+	struct dr_cpu_queue_entry *qp;
+	struct ds_data *dpkt = buf;
+	struct dr_cpu_tag *rp;
+
+	rp = (struct dr_cpu_tag *) (dpkt + 1);
+
+	qp = kmalloc(sizeof(struct dr_cpu_queue_entry) + len, GFP_ATOMIC);
+	if (!qp) {
+		struct ds_cap_state *cp;
+
+		cp = find_cap_by_string("dr-cpu");
+		__dr_cpu_send_error(cp, dpkt);
+	} else {
+		memcpy(&qp->req, buf, len);
+		list_add_tail(&qp->list, &dr_cpu_work_list);
+		wake_up(&dr_cpu_wait);
+	}
+}
+#endif
+
+struct ds_pri_msg {
+	__u64				req_num;
+	__u64				type;
+#define DS_PRI_REQUEST			0x00
+#define DS_PRI_DATA			0x01
+#define DS_PRI_UPDATE			0x02
+};
+/* "pri" service callback: only logs the request number, type and length.
+ * No reply is sent.
+ */
+static void ds_pri_data(struct ldc_channel *lp,
+			struct ds_cap_state *dp,
+			void *buf, int len)
+{
+	struct ds_data *dpkt = buf;
+	struct ds_pri_msg *rp;
+
+	rp = (struct ds_pri_msg *) (dpkt + 1);	/* message body follows the ds_data header */
+
+	printk(KERN_INFO PFX "PRI REQ [%lx:%lx], len=%d\n",
+	       rp->req_num, rp->type, len);
+}
+
+struct ds_var_hdr {
+	__u32				type;
+#define DS_VAR_SET_REQ			0x00
+#define DS_VAR_DELETE_REQ		0x01
+#define DS_VAR_SET_RESP			0x02
+#define DS_VAR_DELETE_RESP		0x03
+};
+
+struct ds_var_set_msg {
+	struct ds_var_hdr		hdr;
+	char				name_and_value[0];
+};
+
+struct ds_var_delete_msg {
+	struct ds_var_hdr		hdr;
+	char				name[0];
+};
+
+struct ds_var_resp {
+	struct ds_var_hdr		hdr;
+	__u32				result;
+#define DS_VAR_SUCCESS			0x00
+#define DS_VAR_NO_SPACE			0x01
+#define DS_VAR_INVALID_VAR		0x02
+#define DS_VAR_INVALID_VAL		0x03
+#define DS_VAR_NOT_PRESENT		0x04
+};
+
+static DEFINE_MUTEX(ds_var_mutex);
+static int ds_var_doorbell;
+static int ds_var_response;
+/* "var-config" / "var-config-backup" service callback.
+ * For SET and DELETE responses, stores the result code in ds_var_response
+ * and then sets ds_var_doorbell, which ldom_set_var() polls.  Any other
+ * message type is ignored.
+ */
+static void ds_var_data(struct ldc_channel *lp,
+			struct ds_cap_state *dp,
+			void *buf, int len)
+{
+	struct ds_data *dpkt = buf;
+	struct ds_var_resp *rp;
+
+	rp = (struct ds_var_resp *) (dpkt + 1);
+
+	if (rp->hdr.type != DS_VAR_SET_RESP &&
+	    rp->hdr.type != DS_VAR_DELETE_RESP)
+		return;
+
+	ds_var_response = rp->result;
+	wmb();	/* publish the response before ringing the doorbell */
+	ds_var_doorbell = 1;
+}
+
+/* Set an LDOM configuration variable through the domain services channel.
+ *
+ * @var:   NUL-terminated variable name
+ * @value: NUL-terminated value to assign
+ *
+ * The request is sent to the "var-config" service, or to
+ * "var-config-backup" when the former is not registered.  The function
+ * then polls (about 1000 times, 100us apart) for ds_var_data() to ring the
+ * doorbell and logs an error when no successful response arrives.  Nothing
+ * is returned to the caller; failures are only reported via printk().
+ *
+ * Name and value are packed into a fixed 512-byte on-stack packet, so
+ * their combined length is validated first and an oversized request is
+ * rejected rather than overrunning the buffer.
+ */
+void ldom_set_var(const char *var, const char *value)
+{
+	struct ds_info *dp = ds_info;
+	struct ds_cap_state *cp;
+
+	cp = find_cap_by_string("var-config");
+	if (cp->state != CAP_STATE_REGISTERED)
+		cp = find_cap_by_string("var-config-backup");
+
+	if (cp->state == CAP_STATE_REGISTERED) {
+		union {
+			struct {
+				struct ds_data		data;
+				struct ds_var_set_msg	msg;
+			} header;
+			char			all[512];
+		} pkt;
+		unsigned long flags;
+		size_t var_len, value_len;
+		char  *base, *p;
+		int msg_len, loops;
+		int doorbell, response;
+
+		/* Both strings are sent with their terminating NUL. */
+		var_len = strlen(var) + 1;
+		value_len = strlen(value) + 1;
+		if (var_len + value_len >
+		    sizeof(pkt) - sizeof(pkt.header)) {
+			printk(KERN_ERR PFX "var-config [%s:%s] "
+			       "is too long, not sent.\n",
+			       var, value);
+			return;
+		}
+
+		memset(&pkt, 0, sizeof(pkt));
+		pkt.header.data.tag.type = DS_DATA;
+		pkt.header.data.handle = cp->handle;
+		pkt.header.msg.hdr.type = DS_VAR_SET_REQ;
+		base = p = &pkt.header.msg.name_and_value[0];
+		memcpy(p, var, var_len);
+		p += var_len;
+		memcpy(p, value, value_len);
+		p += value_len;
+
+		msg_len = (sizeof(struct ds_data) +
+			   sizeof(struct ds_var_set_msg) +
+			   (p - base));
+		msg_len = (msg_len + 3) & ~3;	/* round up to a multiple of 4 */
+		pkt.header.data.tag.len = msg_len - sizeof(struct ds_msg_tag);
+
+		mutex_lock(&ds_var_mutex);
+
+		spin_lock_irqsave(&ds_lock, flags);
+		ds_var_doorbell = 0;
+		ds_var_response = -1;
+
+		ds_send(dp->lp, &pkt, msg_len);
+		spin_unlock_irqrestore(&ds_lock, flags);
+
+		loops = 1000;
+		while (ds_var_doorbell == 0) {
+			if (loops-- < 0)
+				break;
+			barrier();
+			udelay(100);
+		}
+
+		/* Sample the outcome while still holding the mutex so a
+		 * concurrent caller cannot reset it before we look at it.
+		 */
+		doorbell = ds_var_doorbell;
+		response = ds_var_response;
+
+		mutex_unlock(&ds_var_mutex);
+
+		if (doorbell == 0 ||
+		    response != DS_VAR_SUCCESS)
+			printk(KERN_ERR PFX "var-config [%s:%s] "
+			       "failed, response(%d).\n",
+			       var, value,
+			       response);
+	} else {
+		printk(KERN_ERR PFX "var-config not registered so "
+		       "could not set (%s) variable to (%s).\n",
+		       var, value);
+	}
+}
+
+/* Reboot the domain, optionally recording a boot command first.
+ *
+ * @boot_command: arguments for the next boot, or NULL/empty for none
+ *
+ * When a non-empty command is given, "boot <command>" is stored in the
+ * "reboot-command" variable via ldom_set_var() before sun4v_mach_sir() is
+ * called.  The string is built with snprintf() so that an overlong command
+ * is truncated to the 256-byte buffer instead of overflowing the stack.
+ */
+void ldom_reboot(const char *boot_command)
+{
+	/* Don't bother with any of this if the boot_command
+	 * is empty.
+	 */
+	if (boot_command && strlen(boot_command)) {
+		char full_boot_str[256];
+
+		snprintf(full_boot_str, sizeof(full_boot_str), "boot %s",
+			 boot_command);
+
+		ldom_set_var("reboot-command", full_boot_str);
+	}
+	sun4v_mach_sir();
+}
+/* Power off the domain by calling sun4v_mach_exit() with exit code 0. */
+void ldom_power_off(void)
+{
+	sun4v_mach_exit(0);
+}
+/* Report a connection reset.  Currently this only logs the caller's return
+ * address; no state in @dp is changed.
+ */
+static void ds_conn_reset(struct ds_info *dp)
+{
+	printk(KERN_ERR PFX "ds_conn_reset() from %p\n",
+	       __builtin_return_address(0));
+}
+/* Send a DS_REG_REQ (version 1.0) for every ds_states[] entry that is not
+ * already registered.  Each request gets a fresh handle: the array index in
+ * the upper 32 bits and the low 32 bits of sched_clock() below it.
+ * An entry moves to CAP_STATE_REG_SENT only when ds_send() returns a
+ * positive value.  Always returns 0.
+ */
+static int register_services(struct ds_info *dp)
+{
+	struct ldc_channel *lp = dp->lp;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ds_states); i++) {
+		struct {
+			struct ds_reg_req req;
+			u8 id_buf[256];	/* backing storage for req.svc_id[] */
+		} pbuf;
+		struct ds_cap_state *cp = &ds_states[i];
+		int err, msg_len;
+		u64 new_count;
+
+		if (cp->state == CAP_STATE_REGISTERED)
+			continue;
+
+		new_count = sched_clock() & 0xffffffff;
+		cp->handle = ((u64) i << 32) | new_count;
+
+		msg_len = (sizeof(struct ds_reg_req) +
+			   strlen(cp->service_id));	/* the id's NUL is not counted */
+
+		memset(&pbuf, 0, sizeof(pbuf));
+		pbuf.req.tag.type = DS_REG_REQ;
+		pbuf.req.tag.len = (msg_len - sizeof(struct ds_msg_tag));
+		pbuf.req.handle = cp->handle;
+		pbuf.req.major = 1;
+		pbuf.req.minor = 0;
+		strcpy(pbuf.req.svc_id, cp->service_id);
+
+		err = ds_send(lp, &pbuf, msg_len);
+		if (err > 0)
+			cp->state = CAP_STATE_REG_SENT;
+	}
+	return 0;
+}
+/* Process a control message (any type below DS_DATA).
+ * In DS_HS_START only DS_INIT_ACK is accepted; it completes the handshake
+ * and triggers register_services().  In DS_HS_DONE, DS_REG_ACK and
+ * DS_REG_NACK update the matching service's state; other types are
+ * ignored.  Any other combination reports a connection reset.
+ * Returns 0 (or register_services()'s result), or -ECONNRESET.
+ */
+static int ds_handshake(struct ds_info *dp, struct ds_msg_tag *pkt)
+{
+
+	if (dp->hs_state == DS_HS_START) {
+		if (pkt->type != DS_INIT_ACK)
+			goto conn_reset;
+
+		dp->hs_state = DS_HS_DONE;
+
+		return register_services(dp);
+	}
+
+	if (dp->hs_state != DS_HS_DONE)
+		goto conn_reset;
+
+	if (pkt->type == DS_REG_ACK) {
+		struct ds_reg_ack *ap = (struct ds_reg_ack *) pkt;
+		struct ds_cap_state *cp = find_cap(ap->handle);
+
+		if (!cp) {
+			printk(KERN_ERR PFX "REG ACK for unknown handle %lx\n",
+			       ap->handle);
+			return 0;
+		}
+		printk(KERN_INFO PFX "Registered %s service.\n",
+		       cp->service_id);
+		cp->state = CAP_STATE_REGISTERED;
+	} else if (pkt->type == DS_REG_NACK) {
+		struct ds_reg_nack *np = (struct ds_reg_nack *) pkt;
+		struct ds_cap_state *cp = find_cap(np->handle);
+
+		if (!cp) {
+			printk(KERN_ERR PFX "REG NACK for "
+			       "unknown handle %lx\n",
+			       np->handle);
+			return 0;
+		}
+		printk(KERN_INFO PFX "Could not register %s service\n",
+		       cp->service_id);
+		cp->state = CAP_STATE_UNKNOWN;
+	}
+
+	return 0;
+
+conn_reset:
+	ds_conn_reset(dp);
+	return -ECONNRESET;
+}
+/* Dispatch a message of type DS_DATA or above to its service callback.
+ * The handle in the packet selects the ds_states[] entry via find_cap();
+ * when none matches, a DS_NACK with DS_INV_HDL is sent back instead.
+ * @len is the total message length including the tag.  Always returns 0.
+ */
+static int ds_data(struct ds_info *dp, struct ds_msg_tag *pkt, int len)
+{
+	struct ds_data *dpkt = (struct ds_data *) pkt;
+	struct ds_cap_state *cp = find_cap(dpkt->handle);
+
+	if (!cp) {
+		struct ds_data_nack nack = {
+			.tag = {
+				.type = DS_NACK,
+				.len = (sizeof(struct ds_data_nack) -
+					sizeof(struct ds_msg_tag)),
+			},
+			.handle = dpkt->handle,
+			.result = DS_INV_HDL,
+		};
+
+		printk(KERN_ERR PFX "Data for unknown handle %lu\n",
+		       dpkt->handle);
+		ds_send(dp->lp, &nack, sizeof(nack));
+	} else {
+		cp->data(dp->lp, cp, dpkt, len);
+	}
+	return 0;
+}
+/* Start the handshake when the channel comes up: send a DS_INIT_REQ for
+ * version 1.0 and, if ds_send() returns a positive value, move to
+ * DS_HS_START.  Called from ds_event() with ds_lock held.
+ */
+static void ds_up(struct ds_info *dp)
+{
+	struct ldc_channel *lp = dp->lp;
+	struct ds_ver_req req;
+	int err;
+
+	req.tag.type = DS_INIT_REQ;
+	req.tag.len = sizeof(req) - sizeof(struct ds_msg_tag);	/* tag.len excludes the tag itself */
+	req.ver.major = 1;
+	req.ver.minor = 0;
+
+	err = ds_send(lp, &req, sizeof(req));
+	if (err > 0)
+		dp->hs_state = DS_HS_START;
+}
+
+/* LDC event callback for the domain services channel.
+ *
+ * @arg:   the struct ds_info passed to ldc_alloc() by ds_probe()
+ * @event: LDC event code
+ *
+ * Runs entirely under ds_lock.  LDC_EVENT_UP starts the handshake via
+ * ds_up().  LDC_EVENT_DATA_READY drains the channel: each message is read
+ * as a ds_msg_tag followed by tag->len payload bytes into dp->rcv_buf and
+ * then routed to ds_handshake() (types below DS_DATA) or ds_data().  Any
+ * other event is logged and ignored.
+ *
+ * The payload length comes from the peer, so it is checked against the
+ * receive buffer size before the payload is read; a message that would not
+ * fit is treated as a connection reset instead of overflowing rcv_buf.
+ */
+static void ds_event(void *arg, int event)
+{
+	struct ds_info *dp = arg;
+	struct ldc_channel *lp = dp->lp;
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&ds_lock, flags);
+
+	if (event == LDC_EVENT_UP) {
+		ds_up(dp);
+		spin_unlock_irqrestore(&ds_lock, flags);
+		return;
+	}
+
+	if (event != LDC_EVENT_DATA_READY) {
+		printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event);
+		spin_unlock_irqrestore(&ds_lock, flags);
+		return;
+	}
+
+	err = 0;
+	while (1) {
+		struct ds_msg_tag *tag;
+
+		err = ldc_read(lp, dp->rcv_buf, sizeof(*tag));
+
+		if (unlikely(err < 0)) {
+			if (err == -ECONNRESET)
+				ds_conn_reset(dp);
+			break;
+		}
+		if (err == 0)
+			break;
+
+		tag = dp->rcv_buf;
+
+		/* Never read more payload than rcv_buf can hold. */
+		if (sizeof(*tag) + tag->len > dp->rcv_buf_len) {
+			printk(KERN_ERR PFX "Oversized message, len=%u\n",
+			       tag->len);
+			ds_conn_reset(dp);
+			break;
+		}
+
+		err = ldc_read(lp, tag + 1, tag->len);
+
+		if (unlikely(err < 0)) {
+			if (err == -ECONNRESET)
+				ds_conn_reset(dp);
+			break;
+		}
+		if (err < tag->len)
+			break;
+
+		if (tag->type < DS_DATA)
+			err = ds_handshake(dp, dp->rcv_buf);
+		else
+			err = ds_data(dp, dp->rcv_buf,
+				      sizeof(*tag) + err);
+		if (err == -ECONNRESET)
+			break;
+	}
+
+	spin_unlock_irqrestore(&ds_lock, flags);
+}
+/* Probe routine for a "domain-services-port" VIO device.
+ * Allocates the ds_info and a 4096-byte receive buffer, creates a
+ * stream-mode LDC channel with ds_event() as its callback, binds it as
+ * "DS", publishes the result in the global ds_info and calls
+ * start_powerd().  Returns 0 on success; on failure everything allocated
+ * so far is released and a negative error is returned.
+ */
+static int __devinit ds_probe(struct vio_dev *vdev,
+			      const struct vio_device_id *id)
+{
+	static int ds_version_printed;
+	struct ldc_channel_config ds_cfg = {
+		.event		= ds_event,
+		.mtu		= 4096,
+		.mode		= LDC_MODE_STREAM,
+	};
+	struct ldc_channel *lp;
+	struct ds_info *dp;
+	int err;
+
+	if (ds_version_printed++ == 0)	/* print the banner on the first probe only */
+		printk(KERN_INFO "%s", version);
+
+	dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+	err = -ENOMEM;	/* also covers the rcv_buf allocation below */
+	if (!dp)
+		goto out_err;
+
+	dp->rcv_buf = kzalloc(4096, GFP_KERNEL);
+	if (!dp->rcv_buf)
+		goto out_free_dp;
+
+	dp->rcv_buf_len = 4096;
+
+	ds_cfg.tx_irq = vdev->tx_irq;
+	ds_cfg.rx_irq = vdev->rx_irq;
+
+	lp = ldc_alloc(vdev->channel_id, &ds_cfg, dp);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out_free_rcv_buf;
+	}
+	dp->lp = lp;
+
+	err = ldc_bind(lp, "DS");
+	if (err)
+		goto out_free_ldc;
+
+	ds_info = dp;
+
+	start_powerd();
+
+	return err;	/* err is 0 here */
+
+out_free_ldc:
+	ldc_free(dp->lp);
+
+out_free_rcv_buf:
+	kfree(dp->rcv_buf);
+
+out_free_dp:
+	kfree(dp);
+
+out_err:
+	return err;
+}
+/* Remove routine: a stub that releases nothing and always returns 0. */
+static int ds_remove(struct vio_dev *vdev)
+{
+	return 0;
+}
+
+static struct vio_device_id ds_match[] = {
+	{
+		.type = "domain-services-port",
+	},
+	{},
+};
+
+static struct vio_driver ds_driver = {
+	.id_table	= ds_match,
+	.probe		= ds_probe,
+	.remove		= ds_remove,
+	.driver		= {
+		.name	= "ds",
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Driver initialisation, run as a subsys_initcall.
+ *
+ * Seeds every ds_states[] handle with its array index in the upper 32
+ * bits, starts the "kdrcpud" thread when CPU hotplug is configured, and
+ * registers the VIO driver.  Returns the result of vio_register_driver().
+ *
+ * A failure to start kdrcpud is logged rather than silently ignored; the
+ * driver is still registered because the other services do not need the
+ * thread.
+ */
+static int __init ds_init(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ds_states); i++)
+		ds_states[i].handle = ((u64)i << 32);
+
+#ifdef CONFIG_HOTPLUG_CPU
+	{
+		struct task_struct *dr_task;
+
+		dr_task = kthread_run(dr_cpu_thread, NULL, "kdrcpud");
+		if (IS_ERR(dr_task))
+			printk(KERN_ERR PFX "Could not start kdrcpud, "
+			       "err=%ld\n", PTR_ERR(dr_task));
+	}
+#endif
+
+	return vio_register_driver(&ds_driver);
+}
+
+subsys_initcall(ds_init);

+ 139 - 0
arch/sparc64/kernel/hvtramp.S

@@ -0,0 +1,139 @@
+/* hvtramp.S: Hypervisor start-cpu trampoline code.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <asm/thread_info.h>
+#include <asm/hypervisor.h>
+#include <asm/scratchpad.h>
+#include <asm/spitfire.h>
+#include <asm/hvtramp.h>
+#include <asm/pstate.h>
+#include <asm/ptrace.h>
+#include <asm/asi.h>
+
+	.text
+	.align		8
+	.globl		hv_cpu_startup, hv_cpu_startup_end
+
+	/* This code executes directly out of the hypervisor
+	 * with physical addressing (va==pa).  %o0 contains
+	 * our client argument which for Linux points to
+	 * a descriptor data structure which defines the
+	 * MMU entries we need to load up.
+	 *
+	 * After we set things up we enable the MMU and call
+	 * into the kernel.
+	 *
+	 * First setup basic privileged cpu state.
+	 */
+hv_cpu_startup:
+	/* Reset the privileged register-window, trap-level and
+	 * interrupt-level state.  %pil is set to 15.
+	 */
+	wrpr		%g0, 0, %gl
+	wrpr		%g0, 15, %pil
+	wrpr		%g0, 0, %canrestore
+	wrpr		%g0, 0, %otherwin
+	wrpr		%g0, 6, %cansave
+	wrpr		%g0, 6, %cleanwin
+	wrpr		%g0, 0, %cwp
+	wrpr		%g0, 0, %wstate
+	wrpr		%g0, 0, %tl
+
+	/* Point the trap base address at sparc64_ttable_tl0.  */
+	sethi		%hi(sparc64_ttable_tl0), %g1
+	wrpr		%g1, %tba
+
+	/* Keep the descriptor pointer in %l0; %o0 is reused below
+	 * for hypervisor call arguments and return values.
+	 */
+	mov		%o0, %l0
+
+	/* Store the cpu number from the descriptor into the
+	 * SCRATCHPAD_CPUID scratchpad register.
+	 */
+	lduw		[%l0 + HVTRAMP_DESCR_CPU], %g1
+	mov		SCRATCHPAD_CPUID, %g2
+	stxa		%g1, [%g2] ASI_SCRATCHPAD
+
+	/* Store the fault info virtual address into the scratchpad
+	 * register at offset zero.
+	 */
+	ldx		[%l0 + HVTRAMP_DESCR_FAULT_INFO_VA], %g2
+	stxa		%g2, [%g0] ASI_SCRATCHPAD
+
+	/* Loop setup: %l1 = index, %l2 = number of mappings,
+	 * %l3 = pointer to the current mapping entry.
+	 */
+	mov		0, %l1
+	lduw		[%l0 + HVTRAMP_DESCR_NUM_MAPPINGS], %l2
+	add		%l0, HVTRAMP_DESCR_MAPS, %l3
+
+	/* Install one mapping for both the I-MMU and the D-MMU with
+	 * the HV_FAST_MMU_MAP_PERM_ADDR hypervisor call.
+	 */
+1:	ldx		[%l3 + HVTRAMP_MAPPING_VADDR], %o0
+	clr		%o1
+	ldx		[%l3 + HVTRAMP_MAPPING_TTE], %o2
+	mov		HV_MMU_IMMU | HV_MMU_DMMU, %o3
+	mov		HV_FAST_MMU_MAP_PERM_ADDR, %o5
+	ta		HV_FAST_TRAP
+
+	/* A non-zero %o0 after the call sends us to the spin loop.  */
+	brnz,pn		%o0, 80f
+	 nop
+
+	/* Next mapping.  The branch is annulled, so %l3 only moves
+	 * forward when we actually loop back.
+	 */
+	add		%l1, 1, %l1
+	cmp		%l1, %l2
+	blt,a,pt	%xcc, 1b
+	 add		%l3, HVTRAMP_MAPPING_SIZE, %l3
+
+	/* Pass the fault info physical address to the
+	 * HV_FAST_MMU_FAULT_AREA_CONF hypervisor call.
+	 */
+	ldx		[%l0 + HVTRAMP_DESCR_FAULT_INFO_PA], %o0
+	mov		HV_FAST_MMU_FAULT_AREA_CONF, %o5
+	ta		HV_FAST_TRAP
+
+	brnz,pn		%o0, 80f
+	 nop
+
+	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF), %pstate
+
+	/* Fetch the thread register value while the descriptor can
+	 * still be read through %l0.
+	 */
+	ldx		[%l0 + HVTRAMP_DESCR_THREAD_REG], %l6
+
+	/* HV_FAST_MMU_ENABLE with %o0 = 1 and %o1 = the address of
+	 * the local label below.  Presumably execution resumes there
+	 * with the MMU on; if the trap returns here instead we fall
+	 * into the spin loop.
+	 */
+	mov		1, %o0
+	set		1f, %o1
+	mov		HV_FAST_MMU_ENABLE, %o5
+	ta		HV_FAST_TRAP
+
+	ba,pt		%xcc, 80f
+	 nop
+
+1:
+	wr		%g0, 0, %fprs
+	wr		%g0, ASI_P, %asi
+
+	/* Zero the primary and secondary MMU context registers.  */
+	mov		PRIMARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+
+	mov		SECONDARY_CONTEXT, %g7
+	stxa		%g0, [%g7] ASI_MMU
+	membar		#Sync
+
+	/* %g6 = thread register value from the descriptor,
+	 * %g4 = the word at offset TI_TASK from it.
+	 */
+	mov		%l6, %g6
+	ldx		[%g6 + TI_TASK], %g4
+
+	/* %sp = %g6 + (1 << THREAD_SHIFT) - (STACKFRAME_SZ + STACK_BIAS),
+	 * and %fp is cleared.
+	 */
+	mov		1, %g5
+	sllx		%g5, THREAD_SHIFT, %g5
+	sub		%g5, (STACKFRAME_SZ + STACK_BIAS), %g5
+	add		%g6, %g5, %sp
+	mov		0, %fp
+
+	call		init_irqwork_curcpu
+	 nop
+	call		hard_smp_processor_id
+	 nop
+
+	/* sun4v_init_mondo_queues(0, <hard_smp_processor_id() result>, 0, 1);
+	 * the last argument is loaded in the delay slot.
+	 */
+	mov		%o0, %o1
+	mov		0, %o0
+	mov		0, %o2
+	call		sun4v_init_mondo_queues
+	 mov		1, %o3
+
+	/* init_cur_cpu_trap(%g6) */
+	call		init_cur_cpu_trap
+	 mov		%g6, %o0
+
+	/* Same %pstate as before, now with PSTATE_IE added.  */
+	wrpr		%g0, (PSTATE_PRIV | PSTATE_PEF | PSTATE_IE), %pstate
+
+	call		smp_callin
+	 nop
+	call		cpu_idle
+	 mov		0, %o0
+	call		cpu_panic
+	 nop
+
+	/* Failure path: spin here forever.  */
+80:	ba,pt		%xcc, 80b
+	 nop
+
+	.align		8
+hv_cpu_startup_end:

+ 84 - 0
arch/sparc64/kernel/irq.c

@@ -293,6 +293,11 @@ static void sun4u_irq_enable(unsigned int virt_irq)
 	}
 	}
 }
 }
 
 
+/* ->set_affinity hook of the sun4u irq chips.  The mask argument is not
+ * examined here; the call is simply forwarded to sun4u_irq_enable().
+ */
+static void sun4u_set_affinity(unsigned int virt_irq, cpumask_t mask)
+{
+	sun4u_irq_enable(virt_irq);
+}
+
 static void sun4u_irq_disable(unsigned int virt_irq)
 static void sun4u_irq_disable(unsigned int virt_irq)
 {
 {
 	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
@@ -309,6 +314,10 @@ static void sun4u_irq_disable(unsigned int virt_irq)
 static void sun4u_irq_end(unsigned int virt_irq)
 static void sun4u_irq_end(unsigned int virt_irq)
 {
 {
 	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
 	struct irq_handler_data *data = get_irq_chip_data(virt_irq);
+	struct irq_desc *desc = irq_desc + virt_irq;
+
+	if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
+		return;
 
 
 	if (likely(data))
 	if (likely(data))
 		upa_writeq(ICLR_IDLE, data->iclr);
 		upa_writeq(ICLR_IDLE, data->iclr);
@@ -340,6 +349,24 @@ static void sun4v_irq_enable(unsigned int virt_irq)
 	}
 	}
 }
 }
 
 
+/* ->set_affinity hook of the sun4v irq chips: pick a cpu with
+ * irq_choose_cpu() and retarget the interrupt at it through
+ * sun4v_intr_settarget().  The mask argument is not examined here.
+ * A hypervisor error is logged and otherwise ignored.
+ */
+static void sun4v_set_affinity(unsigned int virt_irq, cpumask_t mask)
+{
+	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
+	unsigned long cpuid;
+	unsigned int ino;
+	int err;
+
+	if (unlikely(!bucket))
+		return;
+
+	ino = bucket - &ivector_table[0];
+	cpuid = irq_choose_cpu(virt_irq);
+
+	err = sun4v_intr_settarget(ino, cpuid);
+	if (err == HV_EOK)
+		return;
+
+	printk("sun4v_intr_settarget(%x,%lu): err(%d)\n",
+	       ino, cpuid, err);
+}
+
 static void sun4v_irq_disable(unsigned int virt_irq)
 static void sun4v_irq_disable(unsigned int virt_irq)
 {
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
@@ -373,6 +400,10 @@ static void sun4v_irq_end(unsigned int virt_irq)
 {
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	unsigned int ino = bucket - &ivector_table[0];
 	unsigned int ino = bucket - &ivector_table[0];
+	struct irq_desc *desc = irq_desc + virt_irq;
+
+	if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
+		return;
 
 
 	if (likely(bucket)) {
 	if (likely(bucket)) {
 		int err;
 		int err;
@@ -418,6 +449,28 @@ static void sun4v_virq_enable(unsigned int virt_irq)
 	}
 	}
 }
 }
 
 
+/* ->set_affinity hook of the sun4v virtual irq chips.  The ino is split
+ * into a device handle (IMAP_IGN bits) and a device ino (IMAP_INO
+ * bits), and the interrupt is retargeted at the cpu chosen by
+ * irq_choose_cpu() through sun4v_vintr_set_target().  The mask argument
+ * is not examined here.  A hypervisor error is logged and otherwise
+ * ignored.
+ */
+static void sun4v_virt_set_affinity(unsigned int virt_irq, cpumask_t mask)
+{
+	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
+	unsigned long cpuid, dev_handle, dev_ino;
+	unsigned int ino;
+	int err;
+
+	if (unlikely(!bucket))
+		return;
+
+	ino = bucket - &ivector_table[0];
+	cpuid = irq_choose_cpu(virt_irq);
+
+	dev_handle = ino & IMAP_IGN;
+	dev_ino = ino & IMAP_INO;
+
+	err = sun4v_vintr_set_target(dev_handle, dev_ino, cpuid);
+	if (err == HV_EOK)
+		return;
+
+	printk("sun4v_vintr_set_target(%lx,%lx,%lu): "
+	       "err(%d)\n",
+	       dev_handle, dev_ino, cpuid, err);
+}
+
 static void sun4v_virq_disable(unsigned int virt_irq)
 static void sun4v_virq_disable(unsigned int virt_irq)
 {
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
@@ -443,6 +496,10 @@ static void sun4v_virq_end(unsigned int virt_irq)
 {
 {
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	struct ino_bucket *bucket = virt_irq_to_bucket(virt_irq);
 	unsigned int ino = bucket - &ivector_table[0];
 	unsigned int ino = bucket - &ivector_table[0];
+	struct irq_desc *desc = irq_desc + virt_irq;
+
+	if (unlikely(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
+		return;
 
 
 	if (likely(bucket)) {
 	if (likely(bucket)) {
 		unsigned long dev_handle, dev_ino;
 		unsigned long dev_handle, dev_ino;
@@ -477,6 +534,7 @@ static struct irq_chip sun4u_irq = {
 	.enable		= sun4u_irq_enable,
 	.enable		= sun4u_irq_enable,
 	.disable	= sun4u_irq_disable,
 	.disable	= sun4u_irq_disable,
 	.end		= sun4u_irq_end,
 	.end		= sun4u_irq_end,
+	.set_affinity	= sun4u_set_affinity,
 };
 };
 
 
 static struct irq_chip sun4u_irq_ack = {
 static struct irq_chip sun4u_irq_ack = {
@@ -485,6 +543,7 @@ static struct irq_chip sun4u_irq_ack = {
 	.disable	= sun4u_irq_disable,
 	.disable	= sun4u_irq_disable,
 	.ack		= run_pre_handler,
 	.ack		= run_pre_handler,
 	.end		= sun4u_irq_end,
 	.end		= sun4u_irq_end,
+	.set_affinity	= sun4u_set_affinity,
 };
 };
 
 
 static struct irq_chip sun4v_irq = {
 static struct irq_chip sun4v_irq = {
@@ -492,6 +551,7 @@ static struct irq_chip sun4v_irq = {
 	.enable		= sun4v_irq_enable,
 	.enable		= sun4v_irq_enable,
 	.disable	= sun4v_irq_disable,
 	.disable	= sun4v_irq_disable,
 	.end		= sun4v_irq_end,
 	.end		= sun4v_irq_end,
+	.set_affinity	= sun4v_set_affinity,
 };
 };
 
 
 static struct irq_chip sun4v_irq_ack = {
 static struct irq_chip sun4v_irq_ack = {
@@ -500,6 +560,7 @@ static struct irq_chip sun4v_irq_ack = {
 	.disable	= sun4v_irq_disable,
 	.disable	= sun4v_irq_disable,
 	.ack		= run_pre_handler,
 	.ack		= run_pre_handler,
 	.end		= sun4v_irq_end,
 	.end		= sun4v_irq_end,
+	.set_affinity	= sun4v_set_affinity,
 };
 };
 
 
 #ifdef CONFIG_PCI_MSI
 #ifdef CONFIG_PCI_MSI
@@ -511,6 +572,7 @@ static struct irq_chip sun4v_msi = {
 	.disable	= sun4v_msi_disable,
 	.disable	= sun4v_msi_disable,
 	.ack		= run_pre_handler,
 	.ack		= run_pre_handler,
 	.end		= sun4v_irq_end,
 	.end		= sun4v_irq_end,
+	.set_affinity	= sun4v_set_affinity,
 };
 };
 #endif
 #endif
 
 
@@ -519,6 +581,7 @@ static struct irq_chip sun4v_virq = {
 	.enable		= sun4v_virq_enable,
 	.enable		= sun4v_virq_enable,
 	.disable	= sun4v_virq_disable,
 	.disable	= sun4v_virq_disable,
 	.end		= sun4v_virq_end,
 	.end		= sun4v_virq_end,
+	.set_affinity	= sun4v_virt_set_affinity,
 };
 };
 
 
 static struct irq_chip sun4v_virq_ack = {
 static struct irq_chip sun4v_virq_ack = {
@@ -527,6 +590,7 @@ static struct irq_chip sun4v_virq_ack = {
 	.disable	= sun4v_virq_disable,
 	.disable	= sun4v_virq_disable,
 	.ack		= run_pre_handler,
 	.ack		= run_pre_handler,
 	.end		= sun4v_virq_end,
 	.end		= sun4v_virq_end,
+	.set_affinity	= sun4v_virt_set_affinity,
 };
 };
 
 
 void irq_install_pre_handler(int virt_irq,
 void irq_install_pre_handler(int virt_irq,
@@ -739,6 +803,26 @@ void handler_irq(int irq, struct pt_regs *regs)
 	set_irq_regs(old_regs);
 	set_irq_regs(old_regs);
 }
 }
 
 
+#ifdef CONFIG_HOTPLUG_CPU
+/* Walk every irq and, for each one that has an action installed and is
+ * not flagged IRQ_PER_CPU, call the chip's ->set_affinity hook (when
+ * the chip provides one) with the irq's current affinity mask.  Each
+ * descriptor's lock is held, with interrupts disabled, around the call.
+ */
+void fixup_irqs(void)
+{
+	unsigned int irq;
+
+	for (irq = 0; irq < NR_IRQS; irq++) {
+		struct irq_desc *desc = irq_desc + irq;
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+
+		if (desc->action &&
+		    !(desc->status & IRQ_PER_CPU) &&
+		    desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, desc->affinity);
+
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+#endif
+
 struct sun5_timer {
 struct sun5_timer {
 	u64	count0;
 	u64	count0;
 	u64	limit0;
 	u64	limit0;

+ 2373 - 0
arch/sparc64/kernel/ldc.c

@@ -0,0 +1,2373 @@
+/* ldc.c: Logical Domain Channel link-layer protocol driver.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/init.h>
+
+#include <asm/hypervisor.h>
+#include <asm/iommu.h>
+#include <asm/page.h>
+#include <asm/ldc.h>
+#include <asm/mdesc.h>
+
+#define DRV_MODULE_NAME		"ldc"
+#define PFX DRV_MODULE_NAME	": "
+#define DRV_MODULE_VERSION	"1.0"
+#define DRV_MODULE_RELDATE	"June 25, 2007"
+
+static char version[] __devinitdata =
+	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+#define LDC_PACKET_SIZE		64
+
+/* Packet header layout for unreliable and reliable mode frames.
+ * When in RAW mode, packets are simply straight 64-byte payloads
+ * with no headers.
+ */
+/* One LDC_PACKET_SIZE (64 byte) queue entry.  The four u8 fields and
+ * seqid make up the first eight bytes; the union overlays the
+ * remaining LDC_PACKET_SIZE - 8 bytes.
+ */
+struct ldc_packet {
+	/* Frame class; ldc_rx() dispatches on this value.  */
+	u8			type;
+#define LDC_CTRL		0x01
+#define LDC_DATA		0x02
+#define LDC_ERR			0x10
+
+	/* INFO / ACK / NACK qualifier of the frame.  */
+	u8			stype;
+#define LDC_INFO		0x01
+#define LDC_ACK			0x02
+#define LDC_NACK		0x04
+
+	/* Control message id; process_control_frame() dispatches on it.  */
+	u8			ctrl;
+#define LDC_VERS		0x01 /* Link Version		*/
+#define LDC_RTS			0x02 /* Request To Send		*/
+#define LDC_RTR			0x03 /* Ready To Receive	*/
+#define LDC_RDX			0x04 /* Ready for Data eXchange	*/
+#define LDC_CTRL_MSK		0x0f
+
+	/* send_rts()/send_rtr() store the channel mode here.  The masks
+	 * below presumably describe length and fragment flags of data
+	 * frames -- TODO confirm against the read/write paths.
+	 */
+	u8			env;
+#define LDC_LEN			0x3f
+#define LDC_FRAG_MASK		0xc0
+#define LDC_START		0x40
+#define LDC_STOP		0x80
+
+	u32			seqid;
+
+	union {
+		/* Whole remainder of the packet as raw bytes.  */
+		u8		u_data[LDC_PACKET_SIZE - 8];
+		/* Layout with an ackid in front of the payload; used by
+		 * send_rdx(), send_data_nack() and process_data_ack().
+		 */
+		struct {
+			u32	pad;
+			u32	ackid;
+			u8	r_data[LDC_PACKET_SIZE - 8 - 8];
+		} r;
+	} u;
+};
+
+struct ldc_version {
+	u16 major;
+	u16 minor;
+};
+
+/* Ordered from largest major to lowest.  */
+static struct ldc_version ver_arr[] = {
+	{ .major = 1, .minor = 0 },
+};
+
+#define LDC_DEFAULT_MTU			(4 * LDC_PACKET_SIZE)
+#define LDC_DEFAULT_NUM_ENTRIES		(PAGE_SIZE / LDC_PACKET_SIZE)
+
+struct ldc_channel;
+
+struct ldc_mode_ops {
+	int (*write)(struct ldc_channel *, const void *, unsigned int);
+	int (*read)(struct ldc_channel *, void *, unsigned int);
+};
+
+static const struct ldc_mode_ops raw_ops;
+static const struct ldc_mode_ops nonraw_ops;
+static const struct ldc_mode_ops stream_ops;
+
+int ldom_domaining_enabled;
+
+struct ldc_iommu {
+	/* Protects arena alloc/free.  */
+	spinlock_t			lock;
+	struct iommu_arena		arena;
+	struct ldc_mtable_entry		*page_table;
+};
+
+/* Per-channel state of one logical domain channel.  */
+struct ldc_channel {
+	/* Protects all operations that depend upon channel state.  */
+	spinlock_t			lock;
+
+	/* Channel id handed to every sun4v_ldc_*() hypervisor call.  */
+	unsigned long			id;
+
+	u8				*mssbuf;
+	u32				mssbuf_len;
+	u32				mssbuf_off;
+
+	/* TX queue.  tx_head and tx_tail are byte offsets into tx_base
+	 * that advance in LDC_PACKET_SIZE steps; tx_ra and
+	 * tx_num_entries are what gets passed to sun4v_ldc_tx_qconf().
+	 */
+	struct ldc_packet		*tx_base;
+	unsigned long			tx_head;
+	unsigned long			tx_tail;
+	unsigned long			tx_num_entries;
+	unsigned long			tx_ra;
+
+	/* Offset of the next TX packet not yet ACKed; used as the
+	 * queue limit in stream mode, see head_for_data().
+	 */
+	unsigned long			tx_acked;
+
+	/* RX queue, same conventions as the TX queue above.  */
+	struct ldc_packet		*rx_base;
+	unsigned long			rx_head;
+	unsigned long			rx_tail;
+	unsigned long			rx_num_entries;
+	unsigned long			rx_ra;
+
+	/* Sequence numbers: rx_seq_ok() expects rcv_nxt + 1 next, and
+	 * send_rdx()/send_data_nack() send snd_nxt + 1.
+	 */
+	u32				rcv_nxt;
+	u32				snd_nxt;
+
+	/* Last channel state reported by sun4v_ldc_{rx,tx}_get_state().  */
+	unsigned long			chan_state;
+
+	struct ldc_channel_config	cfg;
+	/* First argument passed to cfg.event(), see send_events().  */
+	void				*event_arg;
+
+	const struct ldc_mode_ops	*mops;
+
+	struct ldc_iommu		iommu;
+
+	/* Version recorded during the handshake.  */
+	struct ldc_version		ver;
+
+	/* Handshake progress.  */
+	u8				hs_state;
+#define LDC_HS_CLOSED			0x00
+#define LDC_HS_OPEN			0x01
+#define LDC_HS_GOTVERS			0x02
+#define LDC_HS_SENTRTR			0x03
+#define LDC_HS_GOTRTR			0x04
+#define LDC_HS_COMPLETE			0x10
+
+	u8				flags;
+#define LDC_FLAG_ALLOCED_QUEUES		0x01
+#define LDC_FLAG_REGISTERED_QUEUES	0x02
+#define LDC_FLAG_REGISTERED_IRQS	0x04
+#define LDC_FLAG_RESET			0x10
+
+	/* Multiplied by the number of free packets in
+	 * tx_has_space_for() to get the sendable byte count.
+	 */
+	u8				mss;
+	/* One of the LDC_STATE_* values, changed via ldc_set_state().  */
+	u8				state;
+
+#define LDC_IRQ_NAME_MAX		32
+	char				rx_irq_name[LDC_IRQ_NAME_MAX];
+	char				tx_irq_name[LDC_IRQ_NAME_MAX];
+
+	struct hlist_head		mh_list;
+
+	/* Link in ldc_channel_list.  */
+	struct hlist_node		list;
+};
+
+/* Debug printk, emitted only when the LDC_DEBUG_<TYPE> bit is set in
+ * lp->cfg.debug.  The macro refers to a variable named "lp", so it can
+ * only be used where a struct ldc_channel pointer of that name is in
+ * scope.  Every message is prefixed with the channel id.
+ */
+#define ldcdbg(TYPE, f, a...) \
+do {	if (lp->cfg.debug & LDC_DEBUG_##TYPE) \
+		printk(KERN_INFO PFX "ID[%lu] " f, lp->id, ## a); \
+} while (0)
+
+/* Map an LDC_STATE_* value to a printable name; any other value
+ * yields "<UNKNOWN>".
+ */
+static const char *state_to_str(u8 state)
+{
+	const char *name = "<UNKNOWN>";
+
+	switch (state) {
+	case LDC_STATE_INVALID:
+		name = "INVALID";
+		break;
+	case LDC_STATE_INIT:
+		name = "INIT";
+		break;
+	case LDC_STATE_BOUND:
+		name = "BOUND";
+		break;
+	case LDC_STATE_READY:
+		name = "READY";
+		break;
+	case LDC_STATE_CONNECTED:
+		name = "CONNECTED";
+		break;
+	}
+
+	return name;
+}
+
+/* Set lp->state to @state, logging the old and new state names first
+ * when STATE debugging is enabled for the channel.
+ */
+static void ldc_set_state(struct ldc_channel *lp, u8 state)
+{
+	ldcdbg(STATE, "STATE (%s) --> (%s)\n",
+	       state_to_str(lp->state),
+	       state_to_str(state));
+
+	lp->state = state;
+}
+
+/* Step the byte offset @off forward by one packet within a queue of
+ * @num_entries packets, wrapping to 0 when the end is reached.
+ */
+static unsigned long __advance(unsigned long off, unsigned long num_entries)
+{
+	unsigned long next = off + LDC_PACKET_SIZE;
+
+	return (next == (num_entries * LDC_PACKET_SIZE)) ? 0 : next;
+}
+
+/* Advance an RX queue offset by one packet, wrapping at the end of
+ * the channel's RX queue.
+ */
+static unsigned long rx_advance(struct ldc_channel *lp, unsigned long off)
+{
+	return __advance(off, lp->rx_num_entries);
+}
+
+/* Advance a TX queue offset by one packet, wrapping at the end of
+ * the channel's TX queue.
+ */
+static unsigned long tx_advance(struct ldc_channel *lp, unsigned long off)
+{
+	return __advance(off, lp->tx_num_entries);
+}
+
+/* Get the TX slot at the current tail for a handshake frame.  Returns
+ * NULL, leaving *new_tail untouched, when advancing the tail would
+ * run into tx_head.  Otherwise *new_tail receives the advanced tail
+ * offset, which the caller later passes to send_tx_packet().
+ */
+static struct ldc_packet *handshake_get_tx_packet(struct ldc_channel *lp,
+						  unsigned long *new_tail)
+{
+	unsigned long next = tx_advance(lp, lp->tx_tail);
+
+	if (next == lp->tx_head)
+		return NULL;
+
+	*new_tail = next;
+
+	return &lp->tx_base[lp->tx_tail / LDC_PACKET_SIZE];
+}
+
+/* When we are in reliable or stream mode, we have to track the next packet
+ * we haven't gotten an ACK for in the TX queue using tx_acked.  We have
+ * to be careful not to stomp over the queue past that point.  During
+ * the handshake, we don't have TX data packets pending in the queue
+ * and that's why handshake_get_tx_packet() need not be mindful of
+ * lp->tx_acked.
+ */
+static unsigned long head_for_data(struct ldc_channel *lp)
+{
+	/* Stream mode must not reuse slots that are still un-ACKed.  */
+	return (lp->cfg.mode == LDC_MODE_STREAM) ? lp->tx_acked
+						 : lp->tx_head;
+}
+
+/* Return 1 when the TX queue can take @size more bytes of payload,
+ * 0 otherwise.  The free room is counted in whole packets between the
+ * advanced tail and head_for_data(), each worth lp->mss bytes.
+ */
+static int tx_has_space_for(struct ldc_channel *lp, unsigned int size)
+{
+	unsigned long limit = head_for_data(lp);
+	unsigned long next = tx_advance(lp, lp->tx_tail);
+	unsigned long room;
+	unsigned int mss = lp->mss;
+
+	if (next == limit)
+		return 0;
+
+	if (limit > next) {
+		room = limit - next;
+	} else {
+		/* The free region wraps around the end of the queue.  */
+		room = (limit +
+			((lp->tx_num_entries * LDC_PACKET_SIZE) - next));
+	}
+	room /= LDC_PACKET_SIZE;
+
+	return (room * mss < size) ? 0 : 1;
+}
+
+/* Like handshake_get_tx_packet(), but the queue limit comes from
+ * head_for_data().  Returns NULL, leaving *new_tail untouched, when
+ * the queue is full; otherwise returns the slot at the current tail
+ * and stores the advanced tail offset in *new_tail.
+ */
+static struct ldc_packet *data_get_tx_packet(struct ldc_channel *lp,
+					     unsigned long *new_tail)
+{
+	unsigned long limit = head_for_data(lp);
+	unsigned long next = tx_advance(lp, lp->tx_tail);
+
+	if (next == limit)
+		return NULL;
+
+	*new_tail = next;
+
+	return &lp->tx_base[lp->tx_tail / LDC_PACKET_SIZE];
+}
+
+/* Move the TX tail to @tail, in lp->tx_tail as well as in the
+ * hypervisor.  HV_EWOULDBLOCK is retried up to 1000 times with a
+ * udelay(1) between attempts.
+ *
+ * Returns 0 on success, -EINVAL on any other hypervisor error and
+ * -EBUSY when the retries run out; in both failure cases lp->tx_tail
+ * is put back to its previous value.
+ */
+static int set_tx_tail(struct ldc_channel *lp, unsigned long tail)
+{
+	const unsigned long saved_tail = lp->tx_tail;
+	int ret = -EBUSY;
+	int attempt;
+
+	lp->tx_tail = tail;
+
+	for (attempt = 0; attempt < 1000; attempt++) {
+		unsigned long hv_err = sun4v_ldc_tx_set_qtail(lp->id, tail);
+
+		if (!hv_err)
+			return 0;
+
+		if (hv_err != HV_EWOULDBLOCK) {
+			ret = -EINVAL;
+			break;
+		}
+		udelay(1);
+	}
+
+	lp->tx_tail = saved_tail;
+	return ret;
+}
+
+/* This just updates the head value in the hypervisor using
+ * a polling loop with a timeout.  The caller takes care of
+ * updating software state representing the head change, if any.
+ */
+static int __set_rx_head(struct ldc_channel *lp, unsigned long head)
+{
+	int attempt;
+
+	/* Up to 1000 tries, one microsecond apart, while the
+	 * hypervisor keeps answering HV_EWOULDBLOCK.
+	 */
+	for (attempt = 0; attempt < 1000; attempt++) {
+		unsigned long hv_err = sun4v_ldc_rx_set_qhead(lp->id, head);
+
+		if (!hv_err)
+			return 0;
+
+		if (hv_err != HV_EWOULDBLOCK)
+			return -EINVAL;
+
+		udelay(1);
+	}
+
+	return -EBUSY;
+}
+
+/* Hand the packet @p, which must be the slot at the current TX tail
+ * (enforced with BUG_ON), to the hypervisor by moving the tail to
+ * @new_tail.  Returns whatever set_tx_tail() returns.
+ */
+static int send_tx_packet(struct ldc_channel *lp,
+			  struct ldc_packet *p,
+			  unsigned long new_tail)
+{
+	BUG_ON(p != (lp->tx_base + (lp->tx_tail / LDC_PACKET_SIZE)));
+
+	return set_tx_tail(lp, new_tail);
+}
+
+/* Grab the next handshake TX slot and fill it in as a zeroed LDC_CTRL
+ * frame with the given @stype and @ctrl.  When @data is non-NULL,
+ * @dlen bytes of it are copied into the payload area.  Returns the
+ * packet, or NULL when no TX slot is available; *new_tail is set as
+ * described at handshake_get_tx_packet().
+ */
+static struct ldc_packet *handshake_compose_ctrl(struct ldc_channel *lp,
+						 u8 stype, u8 ctrl,
+						 void *data, int dlen,
+						 unsigned long *new_tail)
+{
+	struct ldc_packet *p = handshake_get_tx_packet(lp, new_tail);
+
+	if (!p)
+		return NULL;
+
+	memset(p, 0, sizeof(*p));
+	p->type = LDC_CTRL;
+	p->stype = stype;
+	p->ctrl = ctrl;
+	if (data)
+		memcpy(p->u.u_data, data, dlen);
+
+	return p;
+}
+
+/* Open the handshake by sending a VERS INFO frame that carries the
+ * first ver_arr[] entry.  A successful send clears LDC_FLAG_RESET.
+ * Returns 0 on success, -EBUSY when no TX slot is free, or the error
+ * from send_tx_packet().
+ */
+static int start_handshake(struct ldc_channel *lp)
+{
+	struct ldc_version *ver = &ver_arr[0];
+	unsigned long new_tail;
+	struct ldc_packet *p;
+	int err;
+
+	ldcdbg(HS, "SEND VER INFO maj[%u] min[%u]\n",
+	       ver->major, ver->minor);
+
+	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
+				   ver, sizeof(*ver), &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	err = send_tx_packet(lp, p, new_tail);
+	if (err)
+		return err;
+
+	lp->flags &= ~LDC_FLAG_RESET;
+	return 0;
+}
+
+/* Send a VERS NACK frame carrying @major/@minor.  Returns -EBUSY when
+ * no TX slot is free, otherwise the result of send_tx_packet().
+ */
+static int send_version_nack(struct ldc_channel *lp,
+			     u16 major, u16 minor)
+{
+	struct ldc_version ver = {
+		.major = major,
+		.minor = minor,
+	};
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	p = handshake_compose_ctrl(lp, LDC_NACK, LDC_VERS,
+				   &ver, sizeof(ver), &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	ldcdbg(HS, "SEND VER NACK maj[%u] min[%u]\n",
+	       ver.major, ver.minor);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send a VERS ACK frame carrying *@vp.  Returns -EBUSY when no TX slot
+ * is free, otherwise the result of send_tx_packet().
+ */
+static int send_version_ack(struct ldc_channel *lp,
+			    struct ldc_version *vp)
+{
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	p = handshake_compose_ctrl(lp, LDC_ACK, LDC_VERS,
+				   vp, sizeof(*vp), &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	ldcdbg(HS, "SEND VER ACK maj[%u] min[%u]\n",
+	       vp->major, vp->minor);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RTS INFO frame advertising the channel mode in env, with
+ * seqid 0, and reset lp->rcv_nxt to 0.  Returns -EBUSY when no TX slot
+ * is free, otherwise the result of send_tx_packet().
+ */
+static int send_rts(struct ldc_channel *lp)
+{
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTS, NULL, 0,
+				   &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	p->env = lp->cfg.mode;
+	p->seqid = 0;
+	lp->rcv_nxt = 0;
+
+	ldcdbg(HS, "SEND RTS env[0x%x] seqid[0x%x]\n",
+	       p->env, p->seqid);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RTR INFO frame advertising the channel mode in env, with
+ * seqid 0.  Returns -EBUSY when no TX slot is free, otherwise the
+ * result of send_tx_packet().
+ */
+static int send_rtr(struct ldc_channel *lp)
+{
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RTR, NULL, 0,
+				   &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	p->env = lp->cfg.mode;
+	p->seqid = 0;
+
+	ldcdbg(HS, "SEND RTR env[0x%x] seqid[0x%x]\n",
+	       p->env, p->seqid);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send an RDX INFO frame.  lp->snd_nxt is incremented and used as the
+ * frame's seqid, and lp->rcv_nxt goes out as the ackid.  Returns
+ * -EBUSY when no TX slot is free (snd_nxt is then left alone),
+ * otherwise the result of send_tx_packet().
+ */
+static int send_rdx(struct ldc_channel *lp)
+{
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_RDX, NULL, 0,
+				   &new_tail);
+	if (!p)
+		return -EBUSY;
+
+	p->env = 0;
+	p->seqid = ++lp->snd_nxt;
+	p->u.r.ackid = lp->rcv_nxt;
+
+	ldcdbg(HS, "SEND RDX env[0x%x] seqid[0x%x] ackid[0x%x]\n",
+	       p->env, p->seqid, p->u.r.ackid);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Send a NACK in answer to @data_pkt.  The reply copies the type and
+ * the masked ctrl of the offending packet, uses snd_nxt + 1 as seqid
+ * and rcv_nxt as ackid.  lp->snd_nxt is only bumped once the packet
+ * has actually been queued.  Returns 0, -EBUSY when the TX queue is
+ * full, or the error from send_tx_packet().
+ */
+static int send_data_nack(struct ldc_channel *lp, struct ldc_packet *data_pkt)
+{
+	unsigned long new_tail;
+	struct ldc_packet *nack;
+	int err;
+
+	nack = data_get_tx_packet(lp, &new_tail);
+	if (!nack)
+		return -EBUSY;
+
+	memset(nack, 0, sizeof(*nack));
+	nack->type = data_pkt->type;
+	nack->stype = LDC_NACK;
+	nack->ctrl = data_pkt->ctrl & LDC_CTRL_MSK;
+	nack->seqid = lp->snd_nxt + 1;
+	nack->u.r.ackid = lp->rcv_nxt;
+
+	ldcdbg(HS, "SEND DATA NACK type[0x%x] ctl[0x%x] seq[0x%x] ack[0x%x]\n",
+	       nack->type, nack->ctrl, nack->seqid, nack->u.r.ackid);
+
+	err = send_tx_packet(lp, nack, new_tail);
+	if (err)
+		return err;
+
+	lp->snd_nxt++;
+	return 0;
+}
+
+/* Abort the connection: reconfigure both queues in the hypervisor
+ * with their existing addresses and sizes, and re-read the head, tail
+ * and channel state of each into *lp.  Hypervisor failures are logged
+ * only.
+ *
+ * Always returns -ECONNRESET, so callers can "return ldc_abort(lp);".
+ */
+static int ldc_abort(struct ldc_channel *lp)
+{
+	unsigned long hv_err;
+
+	ldcdbg(STATE, "ABORT\n");
+
+	/* We report but do not act upon the hypervisor errors because
+	 * there really isn't much we can do if they fail at this point.
+	 */
+	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+	if (hv_err)
+		printk(KERN_ERR PFX "ldc_abort: "
+		       "sun4v_ldc_tx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
+		       lp->id, lp->tx_ra, lp->tx_num_entries, hv_err);
+
+	/* Pick up the TX head/tail and channel state as they are now.  */
+	hv_err = sun4v_ldc_tx_get_state(lp->id,
+					&lp->tx_head,
+					&lp->tx_tail,
+					&lp->chan_state);
+	if (hv_err)
+		printk(KERN_ERR PFX "ldc_abort: "
+		       "sun4v_ldc_tx_get_state(%lx,...) failed, err=%lu\n",
+		       lp->id, hv_err);
+
+	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+	if (hv_err)
+		printk(KERN_ERR PFX "ldc_abort: "
+		       "sun4v_ldc_rx_qconf(%lx,%lx,%lx) failed, err=%lu\n",
+		       lp->id, lp->rx_ra, lp->rx_num_entries, hv_err);
+
+	/* Refetch the RX queue state as well, because we could be invoked
+	 * here in the queue processing context.
+	 */
+	hv_err = sun4v_ldc_rx_get_state(lp->id,
+					&lp->rx_head,
+					&lp->rx_tail,
+					&lp->chan_state);
+	if (hv_err)
+		printk(KERN_ERR PFX "ldc_abort: "
+		       "sun4v_ldc_rx_get_state(%lx,...) failed, err=%lu\n",
+		       lp->id, hv_err);
+
+	return -ECONNRESET;
+}
+
+/* Return the first ver_arr[] entry whose major number does not exceed
+ * @major, or NULL when there is none.
+ */
+static struct ldc_version *find_by_major(u16 major)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ver_arr); i++) {
+		if (ver_arr[i].major <= major)
+			return &ver_arr[i];
+	}
+
+	return NULL;
+}
+
+/* Handle a VERS INFO frame from the peer.
+ *
+ * Replies with a NACK of 0.0 when find_by_major() finds nothing, a
+ * NACK naming the version it found when the majors differ, or an ACK
+ * (minor clamped to ours) when they match.  Only a successfully sent
+ * ACK records the version and moves hs_state to LDC_HS_GOTVERS.
+ *
+ * Returns 0, or the result of ldc_abort() when the reply could not be
+ * sent.
+ */
+static int process_ver_info(struct ldc_channel *lp, struct ldc_version *vp)
+{
+	struct ldc_version *vap;
+	int err;
+
+	ldcdbg(HS, "GOT VERSION INFO major[%x] minor[%x]\n",
+	       vp->major, vp->minor);
+
+	/* A version had already been recorded: forget it and go back
+	 * to the OPEN state before looking at the new one.
+	 */
+	if (lp->hs_state == LDC_HS_GOTVERS) {
+		lp->hs_state = LDC_HS_OPEN;
+		memset(&lp->ver, 0, sizeof(lp->ver));
+	}
+
+	vap = find_by_major(vp->major);
+	if (!vap) {
+		err = send_version_nack(lp, 0, 0);
+	} else if (vap->major != vp->major) {
+		err = send_version_nack(lp, vap->major, vap->minor);
+	} else {
+		struct ldc_version ver = *vp;
+		if (ver.minor > vap->minor)
+			ver.minor = vap->minor;
+		err = send_version_ack(lp, &ver);
+		if (!err) {
+			lp->ver = ver;
+			lp->hs_state = LDC_HS_GOTVERS;
+		}
+	}
+	if (err)
+		return ldc_abort(lp);
+
+	return 0;
+}
+
+/* Handle a VERS ACK frame.  If a version was already recorded, the
+ * ACK has to match it exactly or the connection is aborted; otherwise
+ * the ACKed version is recorded and hs_state becomes LDC_HS_GOTVERS.
+ * An RTS is then sent.  Returns 0, or the result of ldc_abort().
+ */
+static int process_ver_ack(struct ldc_channel *lp, struct ldc_version *vp)
+{
+	ldcdbg(HS, "GOT VERSION ACK major[%x] minor[%x]\n",
+	       vp->major, vp->minor);
+
+	if (lp->hs_state != LDC_HS_GOTVERS) {
+		lp->ver = *vp;
+		lp->hs_state = LDC_HS_GOTVERS;
+	} else if (lp->ver.major != vp->major ||
+		   lp->ver.minor != vp->minor) {
+		return ldc_abort(lp);
+	}
+
+	return send_rts(lp) ? ldc_abort(lp) : 0;
+}
+
+/* Handle a VERS NACK frame.  A NACK of 0.0, a major for which
+ * find_by_major() finds nothing, or a full TX queue all abort the
+ * connection.  Otherwise a new VERS INFO with the version found is
+ * sent and the result of send_tx_packet() is returned.
+ */
+static int process_ver_nack(struct ldc_channel *lp, struct ldc_version *vp)
+{
+	struct ldc_version *vap;
+	unsigned long new_tail;
+	struct ldc_packet *p;
+
+	if (vp->major == 0 && vp->minor == 0)
+		return ldc_abort(lp);
+
+	vap = find_by_major(vp->major);
+	if (!vap)
+		return ldc_abort(lp);
+
+	p = handshake_compose_ctrl(lp, LDC_INFO, LDC_VERS,
+				   vap, sizeof(*vap),
+				   &new_tail);
+	if (!p)
+		return ldc_abort(lp);
+
+	return send_tx_packet(lp, p, new_tail);
+}
+
+/* Dispatch a VERS control frame on its stype (INFO, ACK or NACK);
+ * anything else aborts the connection.  The version is read from the
+ * start of the packet's payload area.
+ */
+static int process_version(struct ldc_channel *lp,
+			   struct ldc_packet *p)
+{
+	struct ldc_version *vp = (struct ldc_version *) p->u.u_data;
+
+	if (p->stype == LDC_INFO)
+		return process_ver_info(lp, vp);
+
+	if (p->stype == LDC_ACK)
+		return process_ver_ack(lp, vp);
+
+	if (p->stype == LDC_NACK)
+		return process_ver_nack(lp, vp);
+
+	return ldc_abort(lp);
+}
+
+/* Handle an RTS frame.  It must be an INFO frame, arrive while
+ * hs_state is LDC_HS_GOTVERS, and carry our channel mode in env;
+ * otherwise the connection is aborted.  On acceptance both sequence
+ * counters are seeded from the frame's seqid, hs_state becomes
+ * LDC_HS_SENTRTR and an RTR is sent back.  Returns 0, or the result
+ * of ldc_abort().
+ */
+static int process_rts(struct ldc_channel *lp,
+		       struct ldc_packet *p)
+{
+	int acceptable;
+
+	ldcdbg(HS, "GOT RTS stype[%x] seqid[%x] env[%x]\n",
+	       p->stype, p->seqid, p->env);
+
+	acceptable = (p->stype     == LDC_INFO	     &&
+		      lp->hs_state == LDC_HS_GOTVERS &&
+		      p->env       == lp->cfg.mode);
+	if (!acceptable)
+		return ldc_abort(lp);
+
+	lp->snd_nxt = p->seqid;
+	lp->rcv_nxt = p->seqid;
+	lp->hs_state = LDC_HS_SENTRTR;
+
+	return send_rtr(lp) ? ldc_abort(lp) : 0;
+}
+
+/* Handle an RTR frame.  It must be an INFO frame carrying our channel
+ * mode in env, otherwise the connection is aborted.  On acceptance
+ * snd_nxt is seeded from the frame's seqid, the handshake is marked
+ * complete, the channel becomes LDC_STATE_CONNECTED and an RDX is
+ * sent.  Returns LDC_EVENT_UP, or the result of ldc_abort().
+ */
+static int process_rtr(struct ldc_channel *lp,
+		       struct ldc_packet *p)
+{
+	ldcdbg(HS, "GOT RTR stype[%x] seqid[%x] env[%x]\n",
+	       p->stype, p->seqid, p->env);
+
+	if (p->stype     != LDC_INFO ||
+	    p->env       != lp->cfg.mode)
+		return ldc_abort(lp);
+
+	lp->snd_nxt = p->seqid;
+	lp->hs_state = LDC_HS_COMPLETE;
+	ldc_set_state(lp, LDC_STATE_CONNECTED);
+	/* NOTE(review): the result of send_rdx() is discarded, whereas
+	 * the send_rts()/send_rtr() call sites abort on failure --
+	 * confirm that this is intentional.
+	 */
+	send_rdx(lp);
+
+	return LDC_EVENT_UP;
+}
+
+/* Non-zero when @seqid is exactly one past the last accepted receive
+ * sequence number, lp->rcv_nxt.
+ */
+static int rx_seq_ok(struct ldc_channel *lp, u32 seqid)
+{
+	return lp->rcv_nxt + 1 == seqid;
+}
+
+/* Handle an RDX frame.  An INFO frame with the expected sequence
+ * number completes the handshake: rcv_nxt is updated, the channel
+ * becomes LDC_STATE_CONNECTED and LDC_EVENT_UP is returned.  Anything
+ * else aborts the connection.
+ */
+static int process_rdx(struct ldc_channel *lp,
+		       struct ldc_packet *p)
+{
+	ldcdbg(HS, "GOT RDX stype[%x] seqid[%x] env[%x] ackid[%x]\n",
+	       p->stype, p->seqid, p->env, p->u.r.ackid);
+
+	if (p->stype == LDC_INFO && rx_seq_ok(lp, p->seqid)) {
+		lp->rcv_nxt = p->seqid;
+
+		lp->hs_state = LDC_HS_COMPLETE;
+		ldc_set_state(lp, LDC_STATE_CONNECTED);
+
+		return LDC_EVENT_UP;
+	}
+
+	return ldc_abort(lp);
+}
+
+/* Dispatch an LDC_CTRL frame on its ctrl field (VERS, RTS, RTR or
+ * RDX); any other value aborts the connection.  The handler's return
+ * value is passed through: ldc_rx() treats a negative value as an
+ * error and ORs a positive one into its event mask.
+ */
+static int process_control_frame(struct ldc_channel *lp,
+				 struct ldc_packet *p)
+{
+	switch (p->ctrl) {
+	case LDC_VERS:
+		return process_version(lp, p);
+
+	case LDC_RTS:
+		return process_rts(lp, p);
+
+	case LDC_RTR:
+		return process_rtr(lp, p);
+
+	case LDC_RDX:
+		return process_rdx(lp, p);
+
+	default:
+		return ldc_abort(lp);
+	}
+}
+
+/* An LDC_ERR frame always aborts the connection; the packet contents
+ * are not looked at.
+ */
+static int process_error_frame(struct ldc_channel *lp,
+			       struct ldc_packet *p)
+{
+	return ldc_abort(lp);
+}
+
+/* Process the ackid carried by @ack.  Starting at lp->tx_acked, walk
+ * the TX queue looking for the packet whose seqid equals the ackid.
+ * When found, tx_acked moves just past it and 0 is returned.  If the
+ * walk reaches tx_tail first, the connection is aborted.
+ */
+static int process_data_ack(struct ldc_channel *lp,
+			    struct ldc_packet *ack)
+{
+	const u32 ackid = ack->u.r.ackid;
+	unsigned long off = lp->tx_acked;
+
+	for (;;) {
+		struct ldc_packet *sent = &lp->tx_base[off / LDC_PACKET_SIZE];
+
+		off = tx_advance(lp, off);
+
+		if (sent->seqid == ackid) {
+			lp->tx_acked = off;
+			return 0;
+		}
+		if (off == lp->tx_tail)
+			return ldc_abort(lp);
+	}
+}
+
+/* Invoke the channel's event callback once for every event bit set in
+ * @event_mask, in the fixed order RESET, UP, DATA_READY.
+ */
+static void send_events(struct ldc_channel *lp, unsigned int event_mask)
+{
+	static const unsigned int delivery_order[] = {
+		LDC_EVENT_RESET,
+		LDC_EVENT_UP,
+		LDC_EVENT_DATA_READY,
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(delivery_order); i++) {
+		if (event_mask & delivery_order[i])
+			lp->cfg.event(lp->event_arg, delivery_order[i]);
+	}
+}
+
+/* RX interrupt handler.
+ *
+ * Under lp->lock it refreshes rx_head, rx_tail and chan_state from
+ * the hypervisor.  Until the handshake has completed it also consumes
+ * the frames sitting in the RX queue and feeds control frames to the
+ * handshake state machine.  The collected events are delivered with
+ * send_events() after the lock has been dropped.
+ *
+ * Always returns IRQ_HANDLED.
+ *
+ * NOTE(review): hv_err is stored but never examined.
+ */
+static irqreturn_t ldc_rx(int irq, void *dev_id)
+{
+	struct ldc_channel *lp = dev_id;
+	unsigned long orig_state, hv_err, flags;
+	unsigned int event_mask;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	/* Remember the previous channel state so that a change can be
+	 * detected below.
+	 */
+	orig_state = lp->chan_state;
+	hv_err = sun4v_ldc_rx_get_state(lp->id,
+					&lp->rx_head,
+					&lp->rx_tail,
+					&lp->chan_state);
+
+	ldcdbg(RX, "RX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
+	       orig_state, lp->chan_state, lp->rx_head, lp->rx_tail);
+
+	event_mask = 0;
+
+	/* RAW mode: a channel that the hypervisor reports as up is
+	 * treated as connected right away.
+	 */
+	if (lp->cfg.mode == LDC_MODE_RAW &&
+	    lp->chan_state == LDC_CHANNEL_UP) {
+		lp->hs_state = LDC_HS_COMPLETE;
+		ldc_set_state(lp, LDC_STATE_CONNECTED);
+
+		event_mask |= LDC_EVENT_UP;
+
+		orig_state = lp->chan_state;
+	}
+
+	/* If we are in reset state, flush the RX queue and ignore
+	 * everything.
+	 */
+	if (lp->flags & LDC_FLAG_RESET) {
+		(void) __set_rx_head(lp, lp->rx_tail);
+		goto out;
+	}
+
+	/* Once we finish the handshake, we let the ldc_read()
+	 * paths do all of the control frame and state management.
+	 * Just trigger the callback.
+	 */
+	if (lp->hs_state == LDC_HS_COMPLETE) {
+		/* Also entered by goto from the frame loop below, as
+		 * soon as a control frame completes the handshake.
+		 */
+handshake_complete:
+		if (lp->chan_state != orig_state) {
+			unsigned int event = LDC_EVENT_RESET;
+
+			if (lp->chan_state == LDC_CHANNEL_UP)
+				event = LDC_EVENT_UP;
+
+			event_mask |= event;
+		}
+		if (lp->rx_head != lp->rx_tail)
+			event_mask |= LDC_EVENT_DATA_READY;
+
+		goto out;
+	}
+
+	/* Handshake still in progress: if the channel state changed,
+	 * leave the queue alone.
+	 */
+	if (lp->chan_state != orig_state)
+		goto out;
+
+	/* Consume the queued frames one by one.  */
+	while (lp->rx_head != lp->rx_tail) {
+		struct ldc_packet *p;
+		unsigned long new;
+		int err;
+
+		p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
+
+		switch (p->type) {
+		case LDC_CTRL:
+			/* A positive return value is a set of event
+			 * bits, a negative one is an error.
+			 */
+			err = process_control_frame(lp, p);
+			if (err > 0)
+				event_mask |= err;
+			break;
+
+		case LDC_DATA:
+			event_mask |= LDC_EVENT_DATA_READY;
+			err = 0;
+			break;
+
+		case LDC_ERR:
+			err = process_error_frame(lp, p);
+			break;
+
+		default:
+			err = ldc_abort(lp);
+			break;
+		}
+
+		if (err < 0)
+			break;
+
+		/* Step the head past this frame, wrapping at the end
+		 * of the queue, and tell the hypervisor.
+		 */
+		new = lp->rx_head;
+		new += LDC_PACKET_SIZE;
+		if (new == (lp->rx_num_entries * LDC_PACKET_SIZE))
+			new = 0;
+		lp->rx_head = new;
+
+		err = __set_rx_head(lp, new);
+		if (err < 0) {
+			(void) ldc_abort(lp);
+			break;
+		}
+		if (lp->hs_state == LDC_HS_COMPLETE)
+			goto handshake_complete;
+	}
+
+out:
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	send_events(lp, event_mask);
+
+	return IRQ_HANDLED;
+}
+
+/* TX interrupt handler.
+ *
+ * Under lp->lock it refreshes tx_head, tx_tail and chan_state from
+ * the hypervisor.  For a RAW mode channel that the hypervisor reports
+ * as LDC_CHANNEL_UP it marks the handshake complete, moves the channel
+ * to LDC_STATE_CONNECTED and raises LDC_EVENT_UP.  Events are
+ * delivered after the lock has been dropped.
+ *
+ * Always returns IRQ_HANDLED.
+ *
+ * NOTE(review): hv_err is stored but never examined.
+ */
+static irqreturn_t ldc_tx(int irq, void *dev_id)
+{
+	struct ldc_channel *lp = dev_id;
+	unsigned long flags, hv_err, orig_state;
+	unsigned int event_mask = 0;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	/* orig_state is only used by the debug message below.  */
+	orig_state = lp->chan_state;
+	hv_err = sun4v_ldc_tx_get_state(lp->id,
+					&lp->tx_head,
+					&lp->tx_tail,
+					&lp->chan_state);
+
+	ldcdbg(TX, " TX state[0x%02lx:0x%02lx] head[0x%04lx] tail[0x%04lx]\n",
+	       orig_state, lp->chan_state, lp->tx_head, lp->tx_tail);
+
+	if (lp->cfg.mode == LDC_MODE_RAW &&
+	    lp->chan_state == LDC_CHANNEL_UP) {
+		lp->hs_state = LDC_HS_COMPLETE;
+		ldc_set_state(lp, LDC_STATE_CONNECTED);
+
+		event_mask |= LDC_EVENT_UP;
+	}
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	send_events(lp, event_mask);
+
+	return IRQ_HANDLED;
+}
+
+/* XXX ldc_alloc() and ldc_free() need to run under a mutex so
+ * XXX that addition and removal from the ldc_channel_list has
+ * XXX atomicity, otherwise the __ldc_channel_exists() check is
+ * XXX totally pointless as another thread can slip into ldc_alloc()
+ * XXX and add a channel with the same ID.  There also needs to be
+ * XXX a spinlock for ldc_channel_list.
+ */
+static HLIST_HEAD(ldc_channel_list);
+
+/* Return 1 when a channel with the given @id is on ldc_channel_list,
+ * 0 otherwise.  No lock is taken while walking the list.
+ */
+static int __ldc_channel_exists(unsigned long id)
+{
+	struct ldc_channel *lp;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(lp, n, &ldc_channel_list, list) {
+		if (lp->id == id)
+			return 1;
+	}
+	return 0;
+}
+
+/* Allocate a zeroed, page-granular queue holding @num_entries packets.
+ * On success *base receives its kernel address and *ra the result of
+ * __pa() on that address, and 0 is returned.  On allocation failure an
+ * error mentioning @name is logged and -ENOMEM is returned.
+ */
+static int alloc_queue(const char *name, unsigned long num_entries,
+		       struct ldc_packet **base, unsigned long *ra)
+{
+	const unsigned long size = num_entries * LDC_PACKET_SIZE;
+	const unsigned long order = get_order(size);
+	void *q = (void *) __get_free_pages(GFP_KERNEL, order);
+
+	if (!q) {
+		printk(KERN_ERR PFX "Alloc of %s queue failed with "
+		       "size=%lu order=%lu\n", name, size, order);
+		return -ENOMEM;
+	}
+
+	/* Clear everything that was allocated, not just "size" bytes.  */
+	memset(q, 0, PAGE_SIZE << order);
+
+	*base = q;
+	*ra = __pa(q);
+
+	return 0;
+}
+
+/* Release a queue obtained from alloc_queue(); @num_entries must be
+ * the entry count it was allocated with.  A NULL @q is ignored.
+ */
+static void free_queue(unsigned long num_entries, struct ldc_packet *q)
+{
+	if (q)
+		free_pages((unsigned long)q,
+			   get_order(num_entries * LDC_PACKET_SIZE));
+}
+
+/* XXX Make this configurable... XXX */
+#define LDC_IOTABLE_SIZE	(8 * 1024)
+
+/* Set up the per-channel LDC map table state.
+ *
+ * Allocates an allocation bitmap with one bit per map table entry
+ * (the byte size is rounded up to a multiple of 8) and a zeroed,
+ * page-allocated table of LDC_IOTABLE_SIZE entries, then registers
+ * that table with the hypervisor via sun4v_ldc_set_map_table().
+ *
+ * Returns 0 on success, -ENOMEM if either allocation fails, or
+ * -EINVAL if the hypervisor call reports an error.  Everything
+ * allocated here is freed again on the failure paths.
+ */
+static int ldc_iommu_init(struct ldc_channel *lp)
+{
+	unsigned long sz, num_tsb_entries, tsbsize, order;
+	struct ldc_iommu *iommu = &lp->iommu;
+	struct ldc_mtable_entry *table;
+	unsigned long hv_err;
+	int err;
+
+	num_tsb_entries = LDC_IOTABLE_SIZE;
+	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);
+
+	spin_lock_init(&iommu->lock);
+
+	/* One bit per entry, rounded up to a multiple of 8 bytes. */
+	sz = num_tsb_entries / 8;
+	sz = (sz + 7UL) & ~7UL;
+	iommu->arena.map = kzalloc(sz, GFP_KERNEL);
+	if (!iommu->arena.map) {
+		printk(KERN_ERR PFX "Alloc of arena map failed, sz=%lu\n", sz);
+		return -ENOMEM;
+	}
+
+	iommu->arena.limit = num_tsb_entries;
+
+	order = get_order(tsbsize);
+
+	table = (struct ldc_mtable_entry *)
+		__get_free_pages(GFP_KERNEL, order);
+	err = -ENOMEM;
+	if (!table) {
+		printk(KERN_ERR PFX "Alloc of MTE table failed, "
+		       "size=%lu order=%lu\n", tsbsize, order);
+		goto out_free_map;
+	}
+
+	memset(table, 0, PAGE_SIZE << order);
+
+	iommu->page_table = table;
+
+	/* Hand the physical address of the table to the hypervisor. */
+	hv_err = sun4v_ldc_set_map_table(lp->id, __pa(table),
+					 num_tsb_entries);
+	err = -EINVAL;
+	if (hv_err)
+		goto out_free_table;
+
+	return 0;
+
+out_free_table:
+	free_pages((unsigned long) table, order);
+	iommu->page_table = NULL;
+
+out_free_map:
+	kfree(iommu->arena.map);
+	iommu->arena.map = NULL;
+
+	return err;
+}
+
+/* Undo ldc_iommu_init(): clear the hypervisor's map table
+ * registration for this channel (the result of that call is
+ * deliberately ignored), then free the map table pages and the
+ * allocation bitmap.  The page order is recomputed from
+ * iommu->arena.limit, so it matches the order used at init time.
+ */
+static void ldc_iommu_release(struct ldc_channel *lp)
+{
+	struct ldc_iommu *iommu = &lp->iommu;
+	unsigned long num_tsb_entries, tsbsize, order;
+
+	(void) sun4v_ldc_set_map_table(lp->id, 0, 0);
+
+	num_tsb_entries = iommu->arena.limit;
+	tsbsize = num_tsb_entries * sizeof(struct ldc_mtable_entry);
+	order = get_order(tsbsize);
+
+	free_pages((unsigned long) iommu->page_table, order);
+	iommu->page_table = NULL;
+
+	kfree(iommu->arena.map);
+	iommu->arena.map = NULL;
+}
+
+/* Allocate and initialise an LDC channel object for channel 'id'.
+ *
+ * cfgp supplies the mode, the event callback, the RX/TX IRQs and
+ * optionally the MTU (LDC_DEFAULT_MTU is used when it is zero);
+ * event_arg is stored in the channel for use with events.  The
+ * function sets up the map table state, the STREAM mode reassembly
+ * buffer (STREAM mode only) and the TX/RX queues, then adds the
+ * channel to ldc_channel_list in LDC_STATE_INIT.
+ *
+ * Returns the new channel, or an ERR_PTR():
+ *   -ENODEV  domaining is not enabled, or the hypervisor reports
+ *            HV_ECHANNEL for this ID
+ *   -EINVAL  cfgp is NULL, the mode is unknown, or the event
+ *            callback, event_arg, rx_irq or tx_irq is missing
+ *   -EEXIST  a channel with this ID is already on the list
+ *   -ENOMEM  an allocation failed
+ * or the error returned by ldc_iommu_init().
+ */
+struct ldc_channel *ldc_alloc(unsigned long id,
+			      const struct ldc_channel_config *cfgp,
+			      void *event_arg)
+{
+	struct ldc_channel *lp;
+	const struct ldc_mode_ops *mops;
+	unsigned long dummy1, dummy2, hv_err;
+	u8 mss, *mssbuf;
+	int err;
+
+	err = -ENODEV;
+	if (!ldom_domaining_enabled)
+		goto out_err;
+
+	err = -EINVAL;
+	if (!cfgp)
+		goto out_err;
+
+	/* Pick the read/write operations and the per-packet payload
+	 * size (mss) for the requested mode.
+	 */
+	switch (cfgp->mode) {
+	case LDC_MODE_RAW:
+		mops = &raw_ops;
+		mss = LDC_PACKET_SIZE;
+		break;
+
+	case LDC_MODE_UNRELIABLE:
+		mops = &nonraw_ops;
+		mss = LDC_PACKET_SIZE - 8;
+		break;
+
+	case LDC_MODE_STREAM:
+		mops = &stream_ops;
+		mss = LDC_PACKET_SIZE - 8 - 8;
+		break;
+
+	default:
+		goto out_err;
+	}
+
+	if (!cfgp->event || !event_arg || !cfgp->rx_irq || !cfgp->tx_irq)
+		goto out_err;
+
+	/* Only the HV_ECHANNEL result is examined here; the queue
+	 * info returned in the dummies is not used.
+	 */
+	hv_err = sun4v_ldc_tx_qinfo(id, &dummy1, &dummy2);
+	err = -ENODEV;
+	if (hv_err == HV_ECHANNEL)
+		goto out_err;
+
+	err = -EEXIST;
+	if (__ldc_channel_exists(id))
+		goto out_err;
+
+	mssbuf = NULL;
+
+	lp = kzalloc(sizeof(*lp), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!lp)
+		goto out_err;
+
+	spin_lock_init(&lp->lock);
+
+	lp->id = id;
+
+	err = ldc_iommu_init(lp);
+	if (err)
+		goto out_free_ldc;
+
+	lp->mops = mops;
+	lp->mss = mss;
+
+	lp->cfg = *cfgp;
+	if (!lp->cfg.mtu)
+		lp->cfg.mtu = LDC_DEFAULT_MTU;
+
+	/* STREAM mode reads go through an MTU sized staging buffer. */
+	if (lp->cfg.mode == LDC_MODE_STREAM) {
+		mssbuf = kzalloc(lp->cfg.mtu, GFP_KERNEL);
+		if (!mssbuf) {
+			err = -ENOMEM;
+			goto out_free_iommu;
+		}
+		lp->mssbuf = mssbuf;
+	}
+
+	lp->event_arg = event_arg;
+
+	/* XXX allow setting via ldc_channel_config to override defaults
+	 * XXX or use some formula based upon mtu
+	 */
+	lp->tx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
+	lp->rx_num_entries = LDC_DEFAULT_NUM_ENTRIES;
+
+	err = alloc_queue("TX", lp->tx_num_entries,
+			  &lp->tx_base, &lp->tx_ra);
+	if (err)
+		goto out_free_mssbuf;
+
+	err = alloc_queue("RX", lp->rx_num_entries,
+			  &lp->rx_base, &lp->rx_ra);
+	if (err)
+		goto out_free_txq;
+
+	lp->flags |= LDC_FLAG_ALLOCED_QUEUES;
+
+	lp->hs_state = LDC_HS_CLOSED;
+	ldc_set_state(lp, LDC_STATE_INIT);
+
+	/* No lock is taken around the list insertion here. */
+	INIT_HLIST_NODE(&lp->list);
+	hlist_add_head(&lp->list, &ldc_channel_list);
+
+	INIT_HLIST_HEAD(&lp->mh_list);
+
+	return lp;
+
+	/* Error unwinding, in reverse order of setup. */
+out_free_txq:
+	free_queue(lp->tx_num_entries, lp->tx_base);
+
+out_free_mssbuf:
+	if (mssbuf)
+		kfree(mssbuf);
+
+out_free_iommu:
+	ldc_iommu_release(lp);
+
+out_free_ldc:
+	kfree(lp);
+
+out_err:
+	return ERR_PTR(err);
+}
+EXPORT_SYMBOL(ldc_alloc);
+
+/* Tear down and free a channel created by ldc_alloc().
+ *
+ * Releases, in order: the RX/TX IRQs (if registered), the
+ * hypervisor queue configuration (if registered), the queue memory
+ * (if allocated), the channel's entry on ldc_channel_list, the
+ * STREAM mode staging buffer, the map table state and finally the
+ * channel structure itself.  lp must not be used afterwards.
+ */
+void ldc_free(struct ldc_channel *lp)
+{
+	if (lp->flags & LDC_FLAG_REGISTERED_IRQS) {
+		free_irq(lp->cfg.rx_irq, lp);
+		free_irq(lp->cfg.tx_irq, lp);
+	}
+
+	if (lp->flags & LDC_FLAG_REGISTERED_QUEUES) {
+		sun4v_ldc_tx_qconf(lp->id, 0, 0);
+		sun4v_ldc_rx_qconf(lp->id, 0, 0);
+		lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
+	}
+	if (lp->flags & LDC_FLAG_ALLOCED_QUEUES) {
+		free_queue(lp->tx_num_entries, lp->tx_base);
+		free_queue(lp->rx_num_entries, lp->rx_base);
+		lp->flags &= ~LDC_FLAG_ALLOCED_QUEUES;
+	}
+
+	hlist_del(&lp->list);
+
+	/* kfree(NULL) is a no-op, so no guard is needed for channels
+	 * that never allocated a staging buffer.
+	 */
+	kfree(lp->mssbuf);
+
+	ldc_iommu_release(lp);
+
+	kfree(lp);
+}
+EXPORT_SYMBOL(ldc_free);
+
+/* Bind the channel.  This registers the LDC queues with
+ * the hypervisor and puts the channel into a pseudo-listening
+ * state.  This does not initiate a handshake, ldc_connect() does
+ * that.
+ *
+ * 'name' is used to build the "<name> RX" / "<name> TX" IRQ names.
+ * The channel must be in LDC_STATE_INIT.
+ *
+ * Returns 0 on success with the channel in LDC_STATE_BOUND and the
+ * handshake state LDC_HS_OPEN.  Returns -EINVAL for a NULL name or
+ * wrong state, the request_irq() error if an IRQ cannot be
+ * obtained, -ENODEV if a queue (un)configuration call fails, or
+ * -EBUSY if the TX state cannot be read.  On failure the IRQs and
+ * queue registrations made here are undone.
+ *
+ * NOTE(review): request_irq() and free_irq() are called with
+ * lp->lock held and interrupts disabled; confirm that these calls
+ * are safe in that context.
+ */
+int ldc_bind(struct ldc_channel *lp, const char *name)
+{
+	unsigned long hv_err, flags;
+	int err = -EINVAL;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	if (!name)
+		goto out_err;
+
+	if (lp->state != LDC_STATE_INIT)
+		goto out_err;
+
+	snprintf(lp->rx_irq_name, LDC_IRQ_NAME_MAX, "%s RX", name);
+	snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name);
+
+	err = request_irq(lp->cfg.rx_irq, ldc_rx,
+			  IRQF_SAMPLE_RANDOM | IRQF_SHARED,
+			  lp->rx_irq_name, lp);
+	if (err)
+		goto out_err;
+
+	err = request_irq(lp->cfg.tx_irq, ldc_tx,
+			  IRQF_SAMPLE_RANDOM | IRQF_SHARED,
+			  lp->tx_irq_name, lp);
+	if (err)
+		goto out_free_rx_irq;
+
+
+	lp->flags |= LDC_FLAG_REGISTERED_IRQS;
+
+	/* Each queue is first unconfigured (0, 0) and then configured
+	 * with the real address and entry count.
+	 */
+	err = -ENODEV;
+	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
+	if (hv_err)
+		goto out_free_tx_irq;
+
+	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+	if (hv_err)
+		goto out_free_tx_irq;
+
+	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
+	if (hv_err)
+		goto out_unmap_tx;
+
+	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+	if (hv_err)
+		goto out_unmap_tx;
+
+	lp->flags |= LDC_FLAG_REGISTERED_QUEUES;
+
+	hv_err = sun4v_ldc_tx_get_state(lp->id,
+					&lp->tx_head,
+					&lp->tx_tail,
+					&lp->chan_state);
+	err = -EBUSY;
+	if (hv_err)
+		goto out_unmap_rx;
+
+	/* Start with nothing outstanding: acked position == head. */
+	lp->tx_acked = lp->tx_head;
+
+	lp->hs_state = LDC_HS_OPEN;
+	ldc_set_state(lp, LDC_STATE_BOUND);
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return 0;
+
+out_unmap_rx:
+	lp->flags &= ~LDC_FLAG_REGISTERED_QUEUES;
+	sun4v_ldc_rx_qconf(lp->id, 0, 0);
+
+out_unmap_tx:
+	sun4v_ldc_tx_qconf(lp->id, 0, 0);
+
+out_free_tx_irq:
+	lp->flags &= ~LDC_FLAG_REGISTERED_IRQS;
+	free_irq(lp->cfg.tx_irq, lp);
+
+out_free_rx_irq:
+	free_irq(lp->cfg.rx_irq, lp);
+
+out_err:
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ldc_bind);
+
+/* Start the handshake on a bound channel.
+ *
+ * RAW mode channels are rejected with -EINVAL.  Otherwise, under
+ * lp->lock, the handshake is started via start_handshake() provided
+ * the queues are both allocated and registered and the handshake
+ * state is LDC_HS_OPEN; if any of those conditions does not hold
+ * -EINVAL is returned.  On the success path the return value is
+ * whatever start_handshake() returns.
+ */
+int ldc_connect(struct ldc_channel *lp)
+{
+	unsigned long flags;
+	int err = -EINVAL;
+
+	if (lp->cfg.mode == LDC_MODE_RAW)
+		return -EINVAL;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	if ((lp->flags & LDC_FLAG_ALLOCED_QUEUES) &&
+	    (lp->flags & LDC_FLAG_REGISTERED_QUEUES) &&
+	    lp->hs_state == LDC_HS_OPEN)
+		err = start_handshake(lp);
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ldc_connect);
+
+/* Reset a channel back to the bound state.
+ *
+ * RAW mode channels, and channels whose queues are not both
+ * allocated and registered, are rejected with -EINVAL.  Otherwise
+ * the TX and RX queues are unconfigured and configured again with
+ * the hypervisor, the channel is put into LDC_STATE_BOUND with
+ * handshake state LDC_HS_OPEN, LDC_FLAG_RESET is set and 0 is
+ * returned.
+ *
+ * If any hypervisor call fails, both queues are unconfigured, both
+ * IRQs are freed, the REGISTERED_IRQS/REGISTERED_QUEUES flags are
+ * cleared, the channel drops to LDC_STATE_INIT and -ENODEV is
+ * returned.
+ *
+ * NOTE(review): the flag checks run before lp->lock is taken, and
+ * free_irq() is called with lp->lock held and interrupts disabled;
+ * confirm both are safe.
+ */
+int ldc_disconnect(struct ldc_channel *lp)
+{
+	unsigned long hv_err, flags;
+	int err;
+
+	if (lp->cfg.mode == LDC_MODE_RAW)
+		return -EINVAL;
+
+	if (!(lp->flags & LDC_FLAG_ALLOCED_QUEUES) ||
+	    !(lp->flags & LDC_FLAG_REGISTERED_QUEUES))
+		return -EINVAL;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	err = -ENODEV;
+	hv_err = sun4v_ldc_tx_qconf(lp->id, 0, 0);
+	if (hv_err)
+		goto out_err;
+
+	hv_err = sun4v_ldc_tx_qconf(lp->id, lp->tx_ra, lp->tx_num_entries);
+	if (hv_err)
+		goto out_err;
+
+	hv_err = sun4v_ldc_rx_qconf(lp->id, 0, 0);
+	if (hv_err)
+		goto out_err;
+
+	hv_err = sun4v_ldc_rx_qconf(lp->id, lp->rx_ra, lp->rx_num_entries);
+	if (hv_err)
+		goto out_err;
+
+	ldc_set_state(lp, LDC_STATE_BOUND);
+	lp->hs_state = LDC_HS_OPEN;
+	lp->flags |= LDC_FLAG_RESET;
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return 0;
+
+out_err:
+	/* Full teardown back to the pre-bind state. */
+	sun4v_ldc_tx_qconf(lp->id, 0, 0);
+	sun4v_ldc_rx_qconf(lp->id, 0, 0);
+	free_irq(lp->cfg.tx_irq, lp);
+	free_irq(lp->cfg.rx_irq, lp);
+	lp->flags &= ~(LDC_FLAG_REGISTERED_IRQS |
+		       LDC_FLAG_REGISTERED_QUEUES);
+	ldc_set_state(lp, LDC_STATE_INIT);
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ldc_disconnect);
+
+/* Return the channel's current state (lp->state).  The value is
+ * read without taking lp->lock.
+ */
+int ldc_state(struct ldc_channel *lp)
+{
+	return lp->state;
+}
+EXPORT_SYMBOL(ldc_state);
+
+/* RAW mode write: copy up to one packet's worth of caller data
+ * straight into the next TX packet and send it.
+ *
+ * Returns the number of bytes written (size) on success, -EMSGSIZE
+ * if size exceeds LDC_PACKET_SIZE, -EAGAIN if no TX packet is
+ * available, or the error from send_tx_packet().
+ */
+static int write_raw(struct ldc_channel *lp, const void *buf, unsigned int size)
+{
+	unsigned long new_tail;
+	struct ldc_packet *pkt;
+	int err;
+
+	if (size > LDC_PACKET_SIZE)
+		return -EMSGSIZE;
+
+	pkt = data_get_tx_packet(lp, &new_tail);
+	if (pkt == NULL)
+		return -EAGAIN;
+
+	memcpy(pkt, buf, size);
+
+	err = send_tx_packet(lp, pkt, new_tail);
+	if (err)
+		return err;
+
+	return (int) size;
+}
+
+/* RAW mode read: hand exactly one whole packet to the caller.
+ *
+ * The caller's buffer must hold at least LDC_PACKET_SIZE bytes,
+ * otherwise -EINVAL is returned.  Returns LDC_PACKET_SIZE when a
+ * packet was copied, 0 when the RX queue is empty, -ECONNRESET if
+ * the channel is down/resetting or the new head cannot be set, or
+ * the result of ldc_abort() if the RX state cannot be read.
+ */
+static int read_raw(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+	struct ldc_packet *p;
+	unsigned long hv_err, new;
+	int err;
+
+	if (size < LDC_PACKET_SIZE)
+		return -EINVAL;
+
+	hv_err = sun4v_ldc_rx_get_state(lp->id,
+					&lp->rx_head,
+					&lp->rx_tail,
+					&lp->chan_state);
+	if (hv_err)
+		return ldc_abort(lp);
+
+	if (lp->chan_state == LDC_CHANNEL_DOWN ||
+	    lp->chan_state == LDC_CHANNEL_RESETTING)
+		return -ECONNRESET;
+
+	if (lp->rx_head == lp->rx_tail)
+		return 0;
+
+	/* rx_head is a byte offset into the queue; convert it to a
+	 * packet index.
+	 */
+	p = lp->rx_base + (lp->rx_head / LDC_PACKET_SIZE);
+	memcpy(buf, p, LDC_PACKET_SIZE);
+
+	new = rx_advance(lp, lp->rx_head);
+	lp->rx_head = new;
+
+	err = __set_rx_head(lp, new);
+	if (err < 0)
+		err = -ECONNRESET;
+	else
+		err = LDC_PACKET_SIZE;
+
+	return err;
+}
+
+/* Read/write operations selected by ldc_alloc() for LDC_MODE_RAW. */
+static const struct ldc_mode_ops raw_ops = {
+	.write		=	write_raw,
+	.read		=	read_raw,
+};
+
+/* Write for the non-RAW modes: fragment the caller's buffer into
+ * LDC_DATA/LDC_INFO packets of at most lp->mss payload bytes each.
+ *
+ * Packets are filled in at consecutive TX queue slots starting at
+ * the current tail.  The first fragment carries LDC_START, the last
+ * carries LDC_STOP (a single-packet message carries both), and each
+ * packet gets the next sequence number.  The new tail is published
+ * with set_tx_tail() only after every fragment has been written.
+ *
+ * Returns size on success (lp->snd_nxt is advanced only then),
+ * -EBUSY if the TX state cannot be read, the result of ldc_abort()
+ * if the channel is not up, -EAGAIN if the queue lacks space, or
+ * the error from set_tx_tail().
+ */
+static int write_nonraw(struct ldc_channel *lp, const void *buf,
+			unsigned int size)
+{
+	unsigned long hv_err, tail;
+	unsigned int copied;
+	u32 seq;
+	int err;
+
+	hv_err = sun4v_ldc_tx_get_state(lp->id, &lp->tx_head, &lp->tx_tail,
+					&lp->chan_state);
+	if (unlikely(hv_err))
+		return -EBUSY;
+
+	if (unlikely(lp->chan_state != LDC_CHANNEL_UP))
+		return ldc_abort(lp);
+
+	if (!tx_has_space_for(lp, size))
+		return -EAGAIN;
+
+	/* Work on local copies; channel state is only committed once
+	 * the tail update has succeeded.
+	 */
+	seq = lp->snd_nxt;
+	copied = 0;
+	tail = lp->tx_tail;
+	while (copied < size) {
+		struct ldc_packet *p = lp->tx_base + (tail / LDC_PACKET_SIZE);
+		/* The payload location depends on the mode. */
+		u8 *data = ((lp->cfg.mode == LDC_MODE_UNRELIABLE) ?
+			    p->u.u_data :
+			    p->u.r.r_data);
+		int data_len;
+
+		p->type = LDC_DATA;
+		p->stype = LDC_INFO;
+		p->ctrl = 0;
+
+		data_len = size - copied;
+		if (data_len > lp->mss)
+			data_len = lp->mss;
+
+		BUG_ON(data_len > LDC_LEN);
+
+		/* Envelope: payload length plus START/STOP markers. */
+		p->env = (data_len |
+			  (copied == 0 ? LDC_START : 0) |
+			  (data_len == size - copied ? LDC_STOP : 0));
+
+		p->seqid = ++seq;
+
+		ldcdbg(DATA, "SENT DATA [%02x:%02x:%02x:%02x:%08x]\n",
+		       p->type,
+		       p->stype,
+		       p->ctrl,
+		       p->env,
+		       p->seqid);
+
+		memcpy(data, buf, data_len);
+		buf += data_len;
+		copied += data_len;
+
+		tail = tx_advance(lp, tail);
+	}
+
+	err = set_tx_tail(lp, tail);
+	if (!err) {
+		lp->snd_nxt = seq;
+		err = size;
+	}
+
+	return err;
+}
+
+/* Handle a received packet whose sequence number was rejected.
+ *
+ * If fragments of a message were already accepted, lp->rcv_nxt is
+ * rolled back to just before the first fragment.  A data NACK for
+ * packet p is then sent and the RX head is moved to the tail,
+ * dropping everything currently queued.
+ *
+ * Returns 0 on success, the error from send_data_nack(), or the
+ * result of ldc_abort() if the head cannot be set.
+ */
+static int rx_bad_seq(struct ldc_channel *lp, struct ldc_packet *p,
+		      struct ldc_packet *first_frag)
+{
+	int err;
+
+	if (first_frag)
+		lp->rcv_nxt = first_frag->seqid - 1;
+
+	err = send_data_nack(lp, p);
+	if (err)
+		return err;
+
+	err = __set_rx_head(lp, lp->rx_tail);
+	if (err < 0)
+		return ldc_abort(lp);
+
+	return 0;
+}
+
+/* Process the ACK/NACK bits of a received data packet.
+ *
+ * An ACK is passed to process_data_ack(); a failure there is
+ * returned immediately.  A NACK aborts the channel and the result
+ * of ldc_abort() is returned.  Returns 0 if neither step reports
+ * an error.
+ */
+static int data_ack_nack(struct ldc_channel *lp, struct ldc_packet *p)
+{
+	int err = 0;
+
+	if (p->stype & LDC_ACK)
+		err = process_data_ack(lp, p);
+
+	if (!err && (p->stype & LDC_NACK))
+		err = ldc_abort(lp);
+
+	return err;
+}
+
+/* Busy-wait for more data to arrive on the RX queue.
+ *
+ * Polls the hypervisor RX state up to 1000 times, with a udelay(1)
+ * between polls, until the RX tail moves away from cur_head.  Only
+ * lp->rx_tail and lp->chan_state are refreshed; the head reported
+ * by the hypervisor is read into a scratch variable.
+ *
+ * Returns 0 once new data is visible, -EAGAIN if the poll limit is
+ * reached, -ECONNRESET if the channel is down or resetting, or the
+ * result of ldc_abort() if the state cannot be read.
+ */
+static int rx_data_wait(struct ldc_channel *lp, unsigned long cur_head)
+{
+	unsigned long dummy;
+	int limit = 1000;
+
+	ldcdbg(DATA, "DATA WAIT cur_head[%lx] rx_head[%lx] rx_tail[%lx]\n",
+	       cur_head, lp->rx_head, lp->rx_tail);
+	while (limit-- > 0) {
+		unsigned long hv_err;
+
+		hv_err = sun4v_ldc_rx_get_state(lp->id,
+						&dummy,
+						&lp->rx_tail,
+						&lp->chan_state);
+		if (hv_err)
+			return ldc_abort(lp);
+
+		if (lp->chan_state == LDC_CHANNEL_DOWN ||
+		    lp->chan_state == LDC_CHANNEL_RESETTING)
+			return -ECONNRESET;
+
+		if (cur_head != lp->rx_tail) {
+			ldcdbg(DATA, "DATA WAIT DONE "
+			       "head[%lx] tail[%lx] chan_state[%lx]\n",
+			       dummy, lp->rx_tail, lp->chan_state);
+			return 0;
+		}
+
+		udelay(1);
+	}
+	return -EAGAIN;
+}
+
+/* Move the RX head to 'head'.  The cached lp->rx_head is updated
+ * only when __set_rx_head() succeeds; on failure the channel is
+ * aborted and the result of ldc_abort() is returned.  Returns 0 on
+ * success.
+ */
+static int rx_set_head(struct ldc_channel *lp, unsigned long head)
+{
+	if (__set_rx_head(lp, head) < 0)
+		return ldc_abort(lp);
+
+	lp->rx_head = head;
+
+	return 0;
+}
+
+/* Send a data ACK for everything received so far (lp->rcv_nxt).
+ *
+ * This is best effort: nothing is sent when no TX packet is
+ * available, and lp->snd_nxt only advances when the packet was
+ * actually sent.
+ */
+static void send_data_ack(struct ldc_channel *lp)
+{
+	struct ldc_packet *pkt;
+	unsigned long new_tail;
+
+	pkt = data_get_tx_packet(lp, &new_tail);
+	if (unlikely(!pkt))
+		return;
+
+	memset(pkt, 0, sizeof(*pkt));
+	pkt->type = LDC_DATA;
+	pkt->stype = LDC_ACK;
+	pkt->ctrl = 0;
+	pkt->seqid = lp->snd_nxt + 1;
+	pkt->u.r.ackid = lp->rcv_nxt;
+
+	if (send_tx_packet(lp, pkt, new_tail) == 0)
+		lp->snd_nxt++;
+}
+
+/* Read for the non-RAW modes: reassemble one message from the RX
+ * queue into the caller's buffer.
+ *
+ * Packets are walked from the RX head.  Control frames are handed
+ * to process_control_frame(), ACK/NACK bits to data_ack_nack(), and
+ * INFO data packets are copied out fragment by fragment until a
+ * packet with LDC_STOP is seen.  If the queue runs dry in the
+ * middle, rx_data_wait() polls for more.
+ *
+ * Returns the number of bytes copied on success (0 if the queue was
+ * empty), -ECONNRESET if the channel is down or resetting,
+ * -EMSGSIZE if the caller's buffer is too small for the message,
+ * or another negative error from the helpers.  On error after a
+ * first fragment was accepted, lp->rcv_nxt is rolled back to just
+ * before that fragment.  In modes other than UNRELIABLE a data ACK
+ * is sent after a successful non-empty read.
+ */
+static int read_nonraw(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+	struct ldc_packet *first_frag;
+	unsigned long hv_err, new;
+	int err, copied;
+
+	hv_err = sun4v_ldc_rx_get_state(lp->id,
+					&lp->rx_head,
+					&lp->rx_tail,
+					&lp->chan_state);
+	if (hv_err)
+		return ldc_abort(lp);
+
+	if (lp->chan_state == LDC_CHANNEL_DOWN ||
+	    lp->chan_state == LDC_CHANNEL_RESETTING)
+		return -ECONNRESET;
+
+	if (lp->rx_head == lp->rx_tail)
+		return 0;
+
+	first_frag = NULL;
+	copied = err = 0;
+	/* 'new' is the working head; it is committed at the end. */
+	new = lp->rx_head;
+	while (1) {
+		struct ldc_packet *p;
+		int pkt_len;
+
+		BUG_ON(new == lp->rx_tail);
+		p = lp->rx_base + (new / LDC_PACKET_SIZE);
+
+		ldcdbg(RX, "RX read pkt[%02x:%02x:%02x:%02x:%08x:%08x] "
+		       "rcv_nxt[%08x]\n",
+		       p->type,
+		       p->stype,
+		       p->ctrl,
+		       p->env,
+		       p->seqid,
+		       p->u.r.ackid,
+		       lp->rcv_nxt);
+
+		if (unlikely(!rx_seq_ok(lp, p->seqid))) {
+			err = rx_bad_seq(lp, p, first_frag);
+			copied = 0;
+			break;
+		}
+
+		if (p->type & LDC_CTRL) {
+			err = process_control_frame(lp, p);
+			if (err < 0)
+				break;
+			err = 0;
+		}
+
+		lp->rcv_nxt = p->seqid;
+
+		/* Not a data packet: skip over it. */
+		if (!(p->type & LDC_DATA)) {
+			new = rx_advance(lp, new);
+			goto no_data;
+		}
+		if (p->stype & (LDC_ACK | LDC_NACK)) {
+			err = data_ack_nack(lp, p);
+			if (err)
+				break;
+		}
+		/* Data packet without payload (e.g. pure ACK): consume
+		 * it right away.
+		 */
+		if (!(p->stype & LDC_INFO)) {
+			new = rx_advance(lp, new);
+			err = rx_set_head(lp, new);
+			if (err)
+				break;
+			goto no_data;
+		}
+
+		pkt_len = p->env & LDC_LEN;
+
+		/* Every initial packet starts with the START bit set.
+		 *
+		 * Singleton packets will have both START+STOP set.
+		 *
+		 * Fragments will have START set in the first frame, STOP
+		 * set in the last frame, and neither bit set in middle
+		 * frames of the packet.
+		 *
+		 * Therefore if we are at the beginning of a packet and
+		 * we don't see START, or we are in the middle of a fragmented
+		 * packet and do see START, we are unsynchronized and should
+		 * flush the RX queue.
+		 */
+		if ((first_frag == NULL && !(p->env & LDC_START)) ||
+		    (first_frag != NULL &&  (p->env & LDC_START))) {
+			if (!first_frag)
+				new = rx_advance(lp, new);
+
+			err = rx_set_head(lp, new);
+			if (err)
+				break;
+
+			if (!first_frag)
+				goto no_data;
+		}
+		if (!first_frag)
+			first_frag = p;
+
+		if (pkt_len > size - copied) {
+			/* User didn't give us a big enough buffer,
+			 * what to do?  This is a pretty serious error.
+			 *
+			 * Since we haven't updated the RX ring head to
+			 * consume any of the packets, signal the error
+			 * to the user and just leave the RX ring alone.
+			 *
+			 * This seems the best behavior because this allows
+			 * a user of the LDC layer to start with a small
+			 * RX buffer for ldc_read() calls and use -EMSGSIZE
+			 * as a cue to enlarge its read buffer.
+			 */
+			err = -EMSGSIZE;
+			break;
+		}
+
+		/* Ok, we are gonna eat this one.  */
+		new = rx_advance(lp, new);
+
+		memcpy(buf,
+		       (lp->cfg.mode == LDC_MODE_UNRELIABLE ?
+			p->u.u_data : p->u.r.r_data), pkt_len);
+		buf += pkt_len;
+		copied += pkt_len;
+
+		if (p->env & LDC_STOP)
+			break;
+
+no_data:
+		/* Caught up with the producer: wait for more packets. */
+		if (new == lp->rx_tail) {
+			err = rx_data_wait(lp, new);
+			if (err)
+				break;
+		}
+	}
+
+	/* Commit the consumed packets. */
+	if (!err)
+		err = rx_set_head(lp, new);
+
+	if (err && first_frag)
+		lp->rcv_nxt = first_frag->seqid - 1;
+
+	if (!err) {
+		err = copied;
+		if (err > 0 && lp->cfg.mode != LDC_MODE_UNRELIABLE)
+			send_data_ack(lp);
+	}
+
+	return err;
+}
+
+/* Read/write operations selected by ldc_alloc() for
+ * LDC_MODE_UNRELIABLE.
+ */
+static const struct ldc_mode_ops nonraw_ops = {
+	.write		=	write_nonraw,
+	.read		=	read_nonraw,
+};
+
+/* STREAM mode write: clamp the request to the channel MTU and hand
+ * it to write_nonraw().  The return value is that of write_nonraw(),
+ * so a large request may be written only partially.
+ */
+static int write_stream(struct ldc_channel *lp, const void *buf,
+			unsigned int size)
+{
+	unsigned int len = size;
+
+	if (len > lp->cfg.mtu)
+		len = lp->cfg.mtu;
+
+	return write_nonraw(lp, buf, len);
+}
+
+/* STREAM mode read: serve the caller from the staging buffer.
+ *
+ * When the staging buffer (lp->mssbuf) is empty it is refilled with
+ * one message of up to lp->cfg.mtu bytes via read_nonraw(); an
+ * error from that call is returned as is.  Then up to 'size' bytes
+ * are copied out and the buffer offset/length are advanced, so a
+ * message may be consumed over several calls.
+ *
+ * Returns the number of bytes copied, which may be 0.
+ */
+static int read_stream(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+	if (!lp->mssbuf_len) {
+		int err = read_nonraw(lp, lp->mssbuf, lp->cfg.mtu);
+		if (err < 0)
+			return err;
+
+		lp->mssbuf_len = err;
+		lp->mssbuf_off = 0;
+	}
+
+	/* Never hand out more than what is buffered. */
+	if (size > lp->mssbuf_len)
+		size = lp->mssbuf_len;
+	memcpy(buf, lp->mssbuf + lp->mssbuf_off, size);
+
+	lp->mssbuf_off += size;
+	lp->mssbuf_len -= size;
+
+	return size;
+}
+
+/* Read/write operations selected by ldc_alloc() for LDC_MODE_STREAM. */
+static const struct ldc_mode_ops stream_ops = {
+	.write		=	write_stream,
+	.read		=	read_stream,
+};
+
+/* Write 'size' bytes from 'buf' to the channel using the write
+ * operation of the channel's mode, with lp->lock held.
+ *
+ * Returns -EINVAL for a NULL buffer, 0 for a zero-length request,
+ * -ENOTCONN unless the handshake state is LDC_HS_COMPLETE, and
+ * otherwise whatever the mode's write operation returns.
+ */
+int ldc_write(struct ldc_channel *lp, const void *buf, unsigned int size)
+{
+	unsigned long flags;
+	int err = -ENOTCONN;
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	if (size == 0)
+		return 0;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	if (lp->hs_state == LDC_HS_COMPLETE)
+		err = lp->mops->write(lp, buf, size);
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ldc_write);
+
+/* Read up to 'size' bytes from the channel into 'buf' using the
+ * read operation of the channel's mode, with lp->lock held.
+ *
+ * Returns -EINVAL for a NULL buffer, 0 for a zero-length request,
+ * -ENOTCONN unless the handshake state is LDC_HS_COMPLETE, and
+ * otherwise whatever the mode's read operation returns.
+ */
+int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size)
+{
+	unsigned long flags;
+	int err = -ENOTCONN;
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	if (size == 0)
+		return 0;
+
+	spin_lock_irqsave(&lp->lock, flags);
+
+	if (lp->hs_state == LDC_HS_COMPLETE)
+		err = lp->mops->read(lp, buf, size);
+
+	spin_unlock_irqrestore(&lp->lock, flags);
+
+	return err;
+}
+EXPORT_SYMBOL(ldc_read);
+
+/* Find and claim 'npages' consecutive free entries in the map
+ * table allocation bitmap.
+ *
+ * The search starts at arena->hint.  If no run is found before the
+ * limit, one second pass scans from 0 up to the original starting
+ * point.  On success the bits are set with the non-atomic
+ * __set_bit(), arena->hint is moved just past the run and the index
+ * of the first entry is returned.  Returns -1 if no run was found.
+ *
+ * The callers in this file hold iommu->lock around this function.
+ *
+ * NOTE(review): 'end >= limit' also rejects a run that would end
+ * exactly at the limit.
+ */
+static long arena_alloc(struct ldc_iommu *iommu, unsigned long npages)
+{
+	struct iommu_arena *arena = &iommu->arena;
+	unsigned long n, i, start, end, limit;
+	int pass;
+
+	limit = arena->limit;
+	start = arena->hint;
+	pass = 0;
+
+again:
+	n = find_next_zero_bit(arena->map, limit, start);
+	end = n + npages;
+	if (unlikely(end >= limit)) {
+		if (likely(pass < 1)) {
+			/* Wrap around: scan [0, original start). */
+			limit = start;
+			start = 0;
+			pass++;
+			goto again;
+		} else {
+			/* Scanned the whole thing, give up. */
+			return -1;
+		}
+	}
+
+	/* Make sure the whole run is free; if not, restart the search
+	 * just past the busy entry.
+	 */
+	for (i = n; i < end; i++) {
+		if (test_bit(i, arena->map)) {
+			start = i + 1;
+			goto again;
+		}
+	}
+
+	for (i = n; i < end; i++)
+		__set_bit(i, arena->map);
+
+	arena->hint = end;
+
+	return n;
+}
+
+/* The top four bits of a transfer cookie hold the page size code. */
+#define COOKIE_PGSZ_CODE	0xf000000000000000ULL
+#define COOKIE_PGSZ_CODE_SHIFT	60ULL
+
+/* Return the page size code for the kernel's PAGE_SIZE:
+ * 8K -> 0, 64K -> 1, 512K -> 2, 4M -> 3, 32M -> 4, 256M -> 5.
+ * Any other page size falls into the default case and yields 0.
+ */
+static u64 pagesize_code(void)
+{
+	switch (PAGE_SIZE) {
+	default:
+	case (8ULL * 1024ULL):
+		return 0;
+	case (64ULL * 1024ULL):
+		return 1;
+	case (512ULL * 1024ULL):
+		return 2;
+	case (4ULL * 1024ULL * 1024ULL):
+		return 3;
+	case (32ULL * 1024ULL * 1024ULL):
+		return 4;
+	case (256ULL * 1024ULL * 1024ULL):
+		return 5;
+	}
+}
+
+/* Build a transfer cookie: page size code in the bits starting at
+ * COOKIE_PGSZ_CODE_SHIFT, map table index shifted up by PAGE_SHIFT,
+ * and the offset within the page in the low bits.
+ */
+static u64 make_cookie(u64 index, u64 pgsz_code, u64 page_offset)
+{
+	u64 cookie = pgsz_code << COOKIE_PGSZ_CODE_SHIFT;
+
+	cookie |= index << PAGE_SHIFT;
+	cookie |= page_offset;
+
+	return cookie;
+}
+
+/* Extract the map table index from a transfer cookie.
+ *
+ * The page size code is taken from the top bits and masked off; the
+ * index is the remainder shifted down by (13 + 3 * code).  *shift is
+ * set to 3 * code.
+ *
+ * NOTE(review): the base shift of 13 is hard-coded here whereas
+ * make_cookie() uses PAGE_SHIFT; these agree only when the page
+ * size matches the code stored in the cookie.
+ */
+static u64 cookie_to_index(u64 cookie, unsigned long *shift)
+{
+	u64 szcode = cookie >> COOKIE_PGSZ_CODE_SHIFT;
+
+	cookie &= ~COOKIE_PGSZ_CODE;
+
+	*shift = szcode * 3;
+
+	return (cookie >> (13ULL + (szcode * 3ULL)));
+}
+
+/* Reserve 'npages' consecutive map table entries via arena_alloc()
+ * and return a pointer to the first one, or NULL if the arena has
+ * no suitable free run.
+ */
+static struct ldc_mtable_entry *alloc_npages(struct ldc_iommu *iommu,
+					     unsigned long npages)
+{
+	long first = arena_alloc(iommu, npages);
+
+	if (likely(first >= 0))
+		return iommu->page_table + first;
+
+	return NULL;
+}
+
+/* Translate LDC_MAP_* permission flags into the permission part of
+ * a map table entry.
+ *
+ * The result starts out as the page size code.  The R/W(/X) bits in
+ * map_perm are then applied once per requested mapping type:
+ * SHADOW sets the COPY_R/COPY_W bits, DIRECT sets READ/WRITE/EXEC,
+ * and IO sets IOMMU_R/IOMMU_W.  LDC_MAP_X only has an effect for
+ * DIRECT mappings.
+ */
+static u64 perm_to_mte(unsigned int map_perm)
+{
+	u64 mte_base;
+
+	mte_base = pagesize_code();
+
+	if (map_perm & LDC_MAP_SHADOW) {
+		if (map_perm & LDC_MAP_R)
+			mte_base |= LDC_MTE_COPY_R;
+		if (map_perm & LDC_MAP_W)
+			mte_base |= LDC_MTE_COPY_W;
+	}
+	if (map_perm & LDC_MAP_DIRECT) {
+		if (map_perm & LDC_MAP_R)
+			mte_base |= LDC_MTE_READ;
+		if (map_perm & LDC_MAP_W)
+			mte_base |= LDC_MTE_WRITE;
+		if (map_perm & LDC_MAP_X)
+			mte_base |= LDC_MTE_EXEC;
+	}
+	if (map_perm & LDC_MAP_IO) {
+		if (map_perm & LDC_MAP_R)
+			mte_base |= LDC_MTE_IOMMU_R;
+		if (map_perm & LDC_MAP_W)
+			mte_base |= LDC_MTE_IOMMU_W;
+	}
+
+	return mte_base;
+}
+
+/* Count the pages touched by the region [base, base + len).  The
+ * first step only covers the part of the region up to the next page
+ * boundary.  At least one page is always counted, even when len is
+ * zero or negative.
+ */
+static int pages_in_region(unsigned long base, long len)
+{
+	int count;
+
+	for (count = 1; ; count++) {
+		unsigned long next = (base + PAGE_SIZE) & PAGE_MASK;
+
+		len -= (next - base);
+		base = next;
+
+		if (len <= 0)
+			break;
+	}
+
+	return count;
+}
+
+/* Working state shared between the ldc_map_*() functions and
+ * fill_cookies() while a mapping is being built.
+ */
+struct cookie_state {
+	struct ldc_mtable_entry		*page_table;	/* channel map table */
+	struct ldc_trans_cookie		*cookies;	/* caller's output array */
+	u64				mte_base;	/* bits ORed into each MTE */
+	u64				prev_cookie;	/* end of the last cookie, for merging */
+	u32				pte_idx;	/* next map table index to fill */
+	u32				nc;		/* cookies produced so far */
+};
+
+/* Fill in map table entries and transfer cookies for one physically
+ * contiguous region.
+ *
+ * 'pa' is the physical address of the first page, 'off' the offset
+ * of the data within that page and 'len' the data length.  One map
+ * table entry is written per page, starting at sp->pte_idx.  A new
+ * cookie is started for each page unless the page's cookie value
+ * continues exactly where the previous cookie ended, in which case
+ * the previous cookie's size is extended instead.
+ *
+ * The loop body runs at least once, even when len is zero.
+ */
+static void fill_cookies(struct cookie_state *sp, unsigned long pa,
+			 unsigned long off, unsigned long len)
+{
+	do {
+		unsigned long tlen, new = pa + PAGE_SIZE;
+		u64 this_cookie;
+
+		sp->page_table[sp->pte_idx].mte = sp->mte_base | pa;
+
+		/* Bytes covered by this page: the rest of the page after
+		 * 'off', capped by what is left of the region.
+		 */
+		tlen = PAGE_SIZE;
+		if (off)
+			tlen = PAGE_SIZE - off;
+		if (tlen > len)
+			tlen = len;
+
+		this_cookie = make_cookie(sp->pte_idx,
+					  pagesize_code(), off);
+
+		/* Only the first page can start at a non-zero offset. */
+		off = 0;
+
+		if (this_cookie == sp->prev_cookie) {
+			sp->cookies[sp->nc - 1].cookie_size += tlen;
+		} else {
+			sp->cookies[sp->nc].cookie_addr = this_cookie;
+			sp->cookies[sp->nc].cookie_size = tlen;
+			sp->nc++;
+		}
+		sp->prev_cookie = this_cookie + tlen;
+
+		sp->pte_idx++;
+
+		len -= tlen;
+		pa = new;
+	} while (len > 0);
+}
+
+/* Return the number of pages spanned by one scatterlist entry, or
+ * -EFAULT if its offset or length is not a multiple of 8 bytes.
+ */
+static int sg_count_one(struct scatterlist *sg)
+{
+	unsigned long base = page_to_pfn(sg->page) << PAGE_SHIFT;
+	long len = sg->length;
+
+	if ((sg->offset | len) & (8UL - 1))
+		return -EFAULT;
+
+	return pages_in_region(base + sg->offset, len);
+}
+
+/* Sum the page counts of the first num_sg scatterlist entries.
+ * Stops at, and returns, the first negative result reported by
+ * sg_count_one().
+ */
+static int sg_count_pages(struct scatterlist *sg, int num_sg)
+{
+	int total = 0;
+	int idx;
+
+	for (idx = 0; idx < num_sg; idx++) {
+		int n = sg_count_one(&sg[idx]);
+
+		if (n < 0)
+			return n;
+
+		total += n;
+	}
+
+	return total;
+}
+
+/* Map a scatterlist for export over the channel.
+ *
+ * Reserves one map table entry per page touched by the list, fills
+ * the entries in and writes the resulting transfer cookies into
+ * 'cookies'.
+ *
+ * Returns the number of cookies written, or:
+ *   -EINVAL    map_perm has bits outside LDC_MAP_ALL
+ *   -EFAULT    an entry's offset or length is not 8-byte aligned
+ *   -EMSGSIZE  the page count exceeds ncookies
+ *   -ENOMEM    no run of free map table entries is available
+ */
+int ldc_map_sg(struct ldc_channel *lp,
+	       struct scatterlist *sg, int num_sg,
+	       struct ldc_trans_cookie *cookies, int ncookies,
+	       unsigned int map_perm)
+{
+	unsigned long i, npages, flags;
+	struct ldc_mtable_entry *base;
+	struct cookie_state state;
+	struct ldc_iommu *iommu;
+	int err;
+
+	if (map_perm & ~LDC_MAP_ALL)
+		return -EINVAL;
+
+	err = sg_count_pages(sg, num_sg);
+	if (err < 0)
+		return err;
+
+	/* Worst case is one cookie per page, so the caller's array
+	 * must be able to hold that many.
+	 */
+	npages = err;
+	if (err > ncookies)
+		return -EMSGSIZE;
+
+	iommu = &lp->iommu;
+
+	/* Only the entry reservation is done under the lock. */
+	spin_lock_irqsave(&iommu->lock, flags);
+	base = alloc_npages(iommu, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (!base)
+		return -ENOMEM;
+
+	state.page_table = iommu->page_table;
+	state.cookies = cookies;
+	state.mte_base = perm_to_mte(map_perm);
+	state.prev_cookie = ~(u64)0;
+	state.pte_idx = (base - iommu->page_table);
+	state.nc = 0;
+
+	for (i = 0; i < num_sg; i++)
+		fill_cookies(&state, page_to_pfn(sg[i].page) << PAGE_SHIFT,
+			     sg[i].offset, sg[i].length);
+
+	return state.nc;
+}
+EXPORT_SYMBOL(ldc_map_sg);
+
+/* Map a single kernel buffer for export over the channel.
+ *
+ * Reserves one map table entry per page touched by the buffer and
+ * describes the whole buffer with transfer cookies written to
+ * 'cookies'.  The result must be exactly one cookie; anything else
+ * triggers BUG_ON().
+ *
+ * Returns the number of cookies written (1), or:
+ *   -EINVAL  map_perm has bits outside LDC_MAP_ALL, or ncookies < 1
+ *   -EFAULT  the buffer's physical address or length is not 8-byte
+ *            aligned
+ *   -ENOMEM  no run of free map table entries is available
+ */
+int ldc_map_single(struct ldc_channel *lp,
+		   void *buf, unsigned int len,
+		   struct ldc_trans_cookie *cookies, int ncookies,
+		   unsigned int map_perm)
+{
+	unsigned long npages, pa, flags;
+	struct ldc_mtable_entry *base;
+	struct cookie_state state;
+	struct ldc_iommu *iommu;
+
+	if ((map_perm & ~LDC_MAP_ALL) || (ncookies < 1))
+		return -EINVAL;
+
+	pa = __pa(buf);
+	if ((pa | len) & (8UL - 1))
+		return -EFAULT;
+
+	npages = pages_in_region(pa, len);
+
+	iommu = &lp->iommu;
+
+	/* Only the entry reservation is done under the lock. */
+	spin_lock_irqsave(&iommu->lock, flags);
+	base = alloc_npages(iommu, npages);
+	spin_unlock_irqrestore(&iommu->lock, flags);
+
+	if (!base)
+		return -ENOMEM;
+
+	state.page_table = iommu->page_table;
+	state.cookies = cookies;
+	state.mte_base = perm_to_mte(map_perm);
+	state.prev_cookie = ~(u64)0;
+	state.pte_idx = (base - iommu->page_table);
+	state.nc = 0;
+	/* Split the address into page base and offset within the page. */
+	fill_cookies(&state, (pa & PAGE_MASK), (pa & ~PAGE_MASK), len);
+	BUG_ON(state.nc != 1);
+
+	return state.nc;
+}
+EXPORT_SYMBOL(ldc_map_single);
+
+/* Release the map table entries described by one transfer cookie.
+ *
+ * The number of entries is derived from the cookie's in-page offset
+ * plus 'size', the first entry from cookie_to_index().  For every
+ * entry in the range: if the entry has a hypervisor cookie recorded,
+ * it is revoked with sun4v_ldc_revoke(); the entry's MTE is cleared;
+ * and the matching bit in the allocation bitmap is released.
+ *
+ * The entry pointer is advanced together with the loop index so
+ * that each entry of a multi-page mapping is processed, rather than
+ * the first entry being processed npages times while the remaining
+ * entries stay populated after their bitmap bits were freed.
+ *
+ * The bitmap is updated with the non-atomic __clear_bit(); the
+ * caller in this file (ldc_unmap()) holds iommu->lock.
+ *
+ * NOTE(review): the address passed to sun4v_ldc_revoke() advances
+ * by (i << shift) with shift == 3 * page size code, i.e. by 'i'
+ * bytes for the smallest page size; confirm this is the intended
+ * per-page cookie address.
+ */
+static void free_npages(unsigned long id, struct ldc_iommu *iommu,
+			u64 cookie, u64 size)
+{
+	struct iommu_arena *arena = &iommu->arena;
+	unsigned long i, shift, index, npages;
+	struct ldc_mtable_entry *base;
+
+	npages = PAGE_ALIGN(((cookie & ~PAGE_MASK) + size)) >> PAGE_SHIFT;
+	index = cookie_to_index(cookie, &shift);
+	base = iommu->page_table + index;
+
+	BUG_ON(index > arena->limit ||
+	       (index + npages) > arena->limit);
+
+	for (i = 0; i < npages; i++, base++) {
+		if (base->cookie)
+			sun4v_ldc_revoke(id, cookie + (i << shift),
+					 base->cookie);
+		base->mte = 0;
+		__clear_bit(index + i, arena->map);
+	}
+}
+
+/* Undo a mapping made by ldc_map_sg() or ldc_map_single(): release
+ * the map table entries behind each of the 'ncookies' transfer
+ * cookies.  The whole operation runs under iommu->lock.
+ */
+void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
+	       int ncookies)
+{
+	struct ldc_iommu *iommu = &lp->iommu;
+	unsigned long flags;
+	int idx;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	for (idx = 0; idx < ncookies; idx++)
+		free_npages(lp->id, iommu,
+			    cookies[idx].cookie_addr,
+			    cookies[idx].cookie_size);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+EXPORT_SYMBOL(ldc_unmap);
+
+/* Copy data between a local buffer and memory described by
+ * transfer cookies, using the hypervisor copy service.
+ *
+ * copy_dir must be LDC_COPY_IN or LDC_COPY_OUT.  'offset' is the
+ * byte offset into the region described by the cookies at which the
+ * copy starts.  The buffer's physical address, 'len' and 'offset'
+ * must all be multiples of 8.
+ *
+ * Returns the number of bytes copied, which is less than 'len' when
+ * the cookies describe less data than requested, or:
+ *   -EINVAL      bad copy_dir
+ *   -EFAULT      misaligned arguments, or a hypervisor copy error
+ *                while the link is still up
+ *   -ECONNRESET  the handshake is not complete or the channel has
+ *                been reset
+ *
+ * lp->lock is not taken here.
+ *
+ * NOTE(review): the inner loop retries until the hypervisor has
+ * copied this_len bytes; it relies on actual_len making progress.
+ */
+int ldc_copy(struct ldc_channel *lp, int copy_dir,
+	     void *buf, unsigned int len, unsigned long offset,
+	     struct ldc_trans_cookie *cookies, int ncookies)
+{
+	unsigned int orig_len;
+	unsigned long ra;
+	int i;
+
+	if (copy_dir != LDC_COPY_IN && copy_dir != LDC_COPY_OUT) {
+		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Bad copy_dir[%d]\n",
+		       lp->id, copy_dir);
+		return -EINVAL;
+	}
+
+	ra = __pa(buf);
+	if ((ra | len | offset) & (8UL - 1)) {
+		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Unaligned buffer "
+		       "ra[%lx] len[%x] offset[%lx]\n",
+		       lp->id, ra, len, offset);
+		return -EFAULT;
+	}
+
+	if (lp->hs_state != LDC_HS_COMPLETE ||
+	    (lp->flags & LDC_FLAG_RESET)) {
+		printk(KERN_ERR PFX "ldc_copy: ID[%lu] Link down hs_state[%x] "
+		       "flags[%x]\n", lp->id, lp->hs_state, lp->flags);
+		return -ECONNRESET;
+	}
+
+	orig_len = len;
+	for (i = 0; i < ncookies; i++) {
+		unsigned long cookie_raddr = cookies[i].cookie_addr;
+		unsigned long this_len = cookies[i].cookie_size;
+		unsigned long actual_len;
+
+		/* Consume the starting offset, skipping cookies that lie
+		 * entirely before it.
+		 */
+		if (unlikely(offset)) {
+			unsigned long this_off = offset;
+
+			if (this_off > this_len)
+				this_off = this_len;
+
+			offset -= this_off;
+			this_len -= this_off;
+			if (!this_len)
+				continue;
+			cookie_raddr += this_off;
+		}
+
+		if (this_len > len)
+			this_len = len;
+
+		/* The hypervisor may copy less than requested; keep
+		 * going until this cookie's share is done.
+		 */
+		while (1) {
+			unsigned long hv_err;
+
+			hv_err = sun4v_ldc_copy(lp->id, copy_dir,
+						cookie_raddr, ra,
+						this_len, &actual_len);
+			if (unlikely(hv_err)) {
+				printk(KERN_ERR PFX "ldc_copy: ID[%lu] "
+				       "HV error %lu\n",
+				       lp->id, hv_err);
+				if (lp->hs_state != LDC_HS_COMPLETE ||
+				    (lp->flags & LDC_FLAG_RESET))
+					return -ECONNRESET;
+				else
+					return -EFAULT;
+			}
+
+			cookie_raddr += actual_len;
+			ra += actual_len;
+			len -= actual_len;
+			if (actual_len == this_len)
+				break;
+
+			this_len -= actual_len;
+		}
+
+		if (!len)
+			break;
+	}
+
+	/* It is caller policy what to do about short copies.
+	 * For example, a networking driver can declare the
+	 * packet a runt and drop it.
+	 */
+
+	return orig_len - len;
+}
+EXPORT_SYMBOL(ldc_copy);
+
+/* Allocate a zeroed buffer of 'len' bytes and map it for export
+ * with ldc_map_single().
+ *
+ * On entry *ncookies is the capacity of 'cookies'; on success it is
+ * updated to the number of cookies written and the buffer is
+ * returned.  On failure an ERR_PTR() is returned: -EINVAL if len is
+ * not a multiple of 8, -ENOMEM if the allocation fails, or the
+ * error from ldc_map_single() (the buffer is freed in that case).
+ */
+void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
+			  struct ldc_trans_cookie *cookies, int *ncookies,
+			  unsigned int map_perm)
+{
+	void *buf;
+	int err;
+
+	if (len & (8UL - 1))
+		return ERR_PTR(-EINVAL);
+
+	buf = kzalloc(len, GFP_KERNEL);
+	if (!buf)
+		return ERR_PTR(-ENOMEM);
+
+	err = ldc_map_single(lp, buf, len, cookies, *ncookies, map_perm);
+	if (err < 0) {
+		kfree(buf);
+		return ERR_PTR(err);
+	}
+	*ncookies = err;
+
+	return buf;
+}
+EXPORT_SYMBOL(ldc_alloc_exp_dring);
+
+/* Counterpart of ldc_alloc_exp_dring(): unmap the cookies and free
+ * the buffer.  The 'len' argument is not used.
+ */
+void ldc_free_exp_dring(struct ldc_channel *lp, void *buf, unsigned int len,
+			struct ldc_trans_cookie *cookies, int ncookies)
+{
+	ldc_unmap(lp, cookies, ncookies);
+	kfree(buf);
+}
+EXPORT_SYMBOL(ldc_free_exp_dring);
+
+/* Boot-time initialisation of the LDC layer.
+ *
+ * Looks up the "domaining-enabled" property of the "platform" node
+ * in the machine description, registers the LDOM hypervisor API
+ * group (version 1.0) and, if the property is non-zero, sets
+ * ldom_domaining_enabled so that ldc_alloc() will accept requests.
+ *
+ * Returns 0 when domaining is enabled and -ENODEV in every other
+ * case (no machine description, no platform node, no property,
+ * hypervisor API registration failure, or domaining disabled).
+ *
+ * The machine description handle taken with mdesc_grab() is dropped
+ * again with mdesc_release() on every path once the property value
+ * has been consumed, so the reference is not leaked.
+ */
+static int __init ldc_init(void)
+{
+	unsigned long major, minor;
+	struct mdesc_handle *hp;
+	const u64 *v;
+	int err;
+	u64 mp;
+
+	hp = mdesc_grab();
+	if (!hp)
+		return -ENODEV;
+
+	err = -ENODEV;
+
+	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
+	if (mp == MDESC_NODE_NULL)
+		goto out;
+
+	v = mdesc_get_property(hp, mp, "domaining-enabled", NULL);
+	if (!v)
+		goto out;
+
+	major = 1;
+	minor = 0;
+	if (sun4v_hvapi_register(HV_GRP_LDOM, major, &minor)) {
+		printk(KERN_INFO PFX "Could not register LDOM hvapi.\n");
+		goto out;
+	}
+
+	printk(KERN_INFO "%s", version);
+
+	/* 'v' points into the machine description, so it must be read
+	 * before the handle is released below.
+	 */
+	if (!*v) {
+		printk(KERN_INFO PFX "Domaining disabled.\n");
+		goto out;
+	}
+	ldom_domaining_enabled = 1;
+	err = 0;
+
+out:
+	mdesc_release(hp);
+	return err;
+}
+
+core_initcall(ldc_init);

+ 398 - 300
arch/sparc64/kernel/mdesc.c

@@ -6,6 +6,9 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/bootmem.h>
 #include <linux/bootmem.h>
 #include <linux/log2.h>
 #include <linux/log2.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
 
 
 #include <asm/hypervisor.h>
 #include <asm/hypervisor.h>
 #include <asm/mdesc.h>
 #include <asm/mdesc.h>
@@ -29,7 +32,7 @@ struct mdesc_hdr {
 	u32	node_sz; /* node block size */
 	u32	node_sz; /* node block size */
 	u32	name_sz; /* name block size */
 	u32	name_sz; /* name block size */
 	u32	data_sz; /* data block size */
 	u32	data_sz; /* data block size */
-};
+} __attribute__((aligned(16)));
 
 
 struct mdesc_elem {
 struct mdesc_elem {
 	u8	tag;
 	u8	tag;
@@ -53,306 +56,402 @@ struct mdesc_elem {
 	} d;
 	} d;
 };
 };
 
 
-static struct mdesc_hdr *main_mdesc;
-static struct mdesc_node *allnodes;
-
-static struct mdesc_node *allnodes_tail;
-static unsigned int unique_id;
+/* Allocation backend for a machine description handle.  A handle
+ * remembers the ops it was allocated with (see mdesc_alloc()) so that
+ * it can later be freed by the matching ->free() method.
+ */
+struct mdesc_mem_ops {
+	/* Allocate a handle with room for mdesc_size bytes of MD. */
+	struct mdesc_handle *(*alloc)(unsigned int mdesc_size);
+	/* Give back a handle obtained from ->alloc(). */
+	void (*free)(struct mdesc_handle *handle);
+};
 
 
-static struct mdesc_node **mdesc_hash;
-static unsigned int mdesc_hash_size;
+/* Reference counted wrapper around one copy of the machine description.
+ * The MD contents begin at 'mdesc', which must be the last member: the
+ * allocators size the handle as sizeof(struct mdesc_handle) minus
+ * sizeof(struct mdesc_hdr) plus the MD size.
+ */
+struct mdesc_handle {
+	struct list_head	list;		/* linkage on mdesc_zombie_list */
+	struct mdesc_mem_ops	*mops;		/* backend used to free this handle */
+	void			*self_base;	/* base address of the raw allocation */
+	atomic_t		refcnt;		/* set to 1 by mdesc_handle_init() */
+	unsigned int		handle_size;	/* bytes covered by handle plus MD */
+	struct mdesc_hdr	mdesc;		/* MD header; MD data follows it */
+};
 
 
-static inline unsigned int node_hashfn(u64 node)
+static void mdesc_handle_init(struct mdesc_handle *hp,
+			      unsigned int handle_size,
+			      void *base)
 {
 {
-	return ((unsigned int) (node ^ (node >> 8) ^ (node >> 16)))
-		& (mdesc_hash_size - 1);
+	BUG_ON(((unsigned long)&hp->mdesc) & (16UL - 1));
+
+	memset(hp, 0, handle_size);
+	INIT_LIST_HEAD(&hp->list);
+	hp->self_base = base;
+	atomic_set(&hp->refcnt, 1);
+	hp->handle_size = handle_size;
 }
 }
 
 
-static inline void hash_node(struct mdesc_node *mp)
+static struct mdesc_handle *mdesc_bootmem_alloc(unsigned int mdesc_size)
 {
 {
-	struct mdesc_node **head = &mdesc_hash[node_hashfn(mp->node)];
+	struct mdesc_handle *hp;
+	unsigned int handle_size, alloc_size;
 
 
-	mp->hash_next = *head;
-	*head = mp;
+	handle_size = (sizeof(struct mdesc_handle) -
+		       sizeof(struct mdesc_hdr) +
+		       mdesc_size);
+	alloc_size = PAGE_ALIGN(handle_size);
 
 
-	if (allnodes_tail) {
-		allnodes_tail->allnodes_next = mp;
-		allnodes_tail = mp;
-	} else {
-		allnodes = allnodes_tail = mp;
-	}
+	hp = __alloc_bootmem(alloc_size, PAGE_SIZE, 0UL);
+	if (hp)
+		mdesc_handle_init(hp, handle_size, hp);
+
+	return hp;
 }
 }
 
 
-static struct mdesc_node *find_node(u64 node)
+static void mdesc_bootmem_free(struct mdesc_handle *hp)
 {
 {
-	struct mdesc_node *mp = mdesc_hash[node_hashfn(node)];
+	unsigned int alloc_size, handle_size = hp->handle_size;
+	unsigned long start, end;
+
+	BUG_ON(atomic_read(&hp->refcnt) != 0);
+	BUG_ON(!list_empty(&hp->list));
 
 
-	while (mp) {
-		if (mp->node == node)
-			return mp;
+	alloc_size = PAGE_ALIGN(handle_size);
 
 
-		mp = mp->hash_next;
+	start = (unsigned long) hp;
+	end = start + alloc_size;
+
+	while (start < end) {
+		struct page *p;
+
+		p = virt_to_page(start);
+		ClearPageReserved(p);
+		__free_page(p);
+		start += PAGE_SIZE;
 	}
 	}
-	return NULL;
 }
 }
 
 
-struct property *md_find_property(const struct mdesc_node *mp,
-				  const char *name,
-				  int *lenp)
+static struct mdesc_mem_ops bootmem_mdesc_memops = {
+	.alloc = mdesc_bootmem_alloc,
+	.free  = mdesc_bootmem_free,
+};
+
+static struct mdesc_handle *mdesc_kmalloc(unsigned int mdesc_size)
 {
 {
-	struct property *pp;
+	unsigned int handle_size;
+	void *base;
 
 
-	for (pp = mp->properties; pp != 0; pp = pp->next) {
-		if (strcasecmp(pp->name, name) == 0) {
-			if (lenp)
-				*lenp = pp->length;
-			break;
-		}
+	handle_size = (sizeof(struct mdesc_handle) -
+		       sizeof(struct mdesc_hdr) +
+		       mdesc_size);
+
+	base = kmalloc(handle_size + 15, GFP_KERNEL);
+	if (base) {
+		struct mdesc_handle *hp;
+		unsigned long addr;
+
+		addr = (unsigned long)base;
+		addr = (addr + 15UL) & ~15UL;
+		hp = (struct mdesc_handle *) addr;
+
+		mdesc_handle_init(hp, handle_size, base);
+		return hp;
 	}
 	}
-	return pp;
+
+	return NULL;
 }
 }
-EXPORT_SYMBOL(md_find_property);
 
 
-/*
- * Find a property with a given name for a given node
- * and return the value.
- */
-const void *md_get_property(const struct mdesc_node *mp, const char *name,
-			    int *lenp)
+static void mdesc_kfree(struct mdesc_handle *hp)
 {
 {
-	struct property *pp = md_find_property(mp, name, lenp);
-	return pp ? pp->value : NULL;
+	BUG_ON(atomic_read(&hp->refcnt) != 0);
+	BUG_ON(!list_empty(&hp->list));
+
+	kfree(hp->self_base);
 }
 }
-EXPORT_SYMBOL(md_get_property);
 
 
-struct mdesc_node *md_find_node_by_name(struct mdesc_node *from,
-					const char *name)
+static struct mdesc_mem_ops kmalloc_mdesc_memops = {
+	.alloc = mdesc_kmalloc,
+	.free  = mdesc_kfree,
+};
+
+static struct mdesc_handle *mdesc_alloc(unsigned int mdesc_size,
+					struct mdesc_mem_ops *mops)
 {
 {
-	struct mdesc_node *mp;
+	struct mdesc_handle *hp = mops->alloc(mdesc_size);
 
 
-	mp = from ? from->allnodes_next : allnodes;
-	for (; mp != NULL; mp = mp->allnodes_next) {
-		if (strcmp(mp->name, name) == 0)
-			break;
-	}
-	return mp;
-}
-EXPORT_SYMBOL(md_find_node_by_name);
+	if (hp)
+		hp->mops = mops;
 
 
-static unsigned int mdesc_early_allocated;
+	return hp;
+}
 
 
-static void * __init mdesc_early_alloc(unsigned long size)
+static void mdesc_free(struct mdesc_handle *hp)
 {
 {
-	void *ret;
+	hp->mops->free(hp);
+}
 
 
-	ret = __alloc_bootmem(size, SMP_CACHE_BYTES, 0UL);
-	if (ret == NULL) {
-		prom_printf("MDESC: alloc of %lu bytes failed.\n", size);
-		prom_halt();
-	}
+static struct mdesc_handle *cur_mdesc;
+static LIST_HEAD(mdesc_zombie_list);
+static DEFINE_SPINLOCK(mdesc_lock);
 
 
-	memset(ret, 0, size);
+struct mdesc_handle *mdesc_grab(void)
+{
+	struct mdesc_handle *hp;
+	unsigned long flags;
 
 
-	mdesc_early_allocated += size;
+	spin_lock_irqsave(&mdesc_lock, flags);
+	hp = cur_mdesc;
+	if (hp)
+		atomic_inc(&hp->refcnt);
+	spin_unlock_irqrestore(&mdesc_lock, flags);
 
 
-	return ret;
+	return hp;
 }
 }
+EXPORT_SYMBOL(mdesc_grab);
 
 
-static unsigned int __init count_arcs(struct mdesc_elem *ep)
+void mdesc_release(struct mdesc_handle *hp)
 {
 {
-	unsigned int ret = 0;
+	unsigned long flags;
 
 
-	ep++;
-	while (ep->tag != MD_NODE_END) {
-		if (ep->tag == MD_PROP_ARC)
-			ret++;
-		ep++;
+	spin_lock_irqsave(&mdesc_lock, flags);
+	if (atomic_dec_and_test(&hp->refcnt)) {
+		list_del_init(&hp->list);
+		hp->mops->free(hp);
 	}
 	}
-	return ret;
+	spin_unlock_irqrestore(&mdesc_lock, flags);
 }
 }
+EXPORT_SYMBOL(mdesc_release);
 
 
-static void __init mdesc_node_alloc(u64 node, struct mdesc_elem *ep, const char *names)
+static void do_mdesc_update(struct work_struct *work)
 {
 {
-	unsigned int num_arcs = count_arcs(ep);
-	struct mdesc_node *mp;
+	unsigned long len, real_len, status;
+	struct mdesc_handle *hp, *orig_hp;
+	unsigned long flags;
+
+	(void) sun4v_mach_desc(0UL, 0UL, &len);
+
+	hp = mdesc_alloc(len, &kmalloc_mdesc_memops);
+	if (!hp) {
+		printk(KERN_ERR "MD: mdesc alloc fails\n");
+		return;
+	}
+
+	status = sun4v_mach_desc(__pa(&hp->mdesc), len, &real_len);
+	if (status != HV_EOK || real_len > len) {
+		printk(KERN_ERR "MD: mdesc reread fails with %lu\n",
+		       status);
+		atomic_dec(&hp->refcnt);
+		mdesc_free(hp);
+		return;
+	}
 
 
-	mp = mdesc_early_alloc(sizeof(*mp) +
-			       (num_arcs * sizeof(struct mdesc_arc)));
-	mp->name = names + ep->name_offset;
-	mp->node = node;
-	mp->unique_id = unique_id++;
-	mp->num_arcs = num_arcs;
+	spin_lock_irqsave(&mdesc_lock, flags);
+	orig_hp = cur_mdesc;
+	cur_mdesc = hp;
 
 
-	hash_node(mp);
+	if (atomic_dec_and_test(&orig_hp->refcnt))
+		mdesc_free(orig_hp);
+	else
+		list_add(&orig_hp->list, &mdesc_zombie_list);
+	spin_unlock_irqrestore(&mdesc_lock, flags);
 }
 }
 
 
-static inline struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
+static DECLARE_WORK(mdesc_update_work, do_mdesc_update);
+
+/* Request a re-read of the machine description.  The work itself is
+ * deferred: this only queues mdesc_update_work, whose handler is
+ * do_mdesc_update().
+ */
+void mdesc_update(void)
+{
+	schedule_work(&mdesc_update_work);
+}
+
+static struct mdesc_elem *node_block(struct mdesc_hdr *mdesc)
 {
 {
 	return (struct mdesc_elem *) (mdesc + 1);
 	return (struct mdesc_elem *) (mdesc + 1);
 }
 }
 
 
-static inline void *name_block(struct mdesc_hdr *mdesc)
+static void *name_block(struct mdesc_hdr *mdesc)
 {
 {
 	return ((void *) node_block(mdesc)) + mdesc->node_sz;
 	return ((void *) node_block(mdesc)) + mdesc->node_sz;
 }
 }
 
 
-static inline void *data_block(struct mdesc_hdr *mdesc)
+static void *data_block(struct mdesc_hdr *mdesc)
 {
 {
 	return ((void *) name_block(mdesc)) + mdesc->name_sz;
 	return ((void *) name_block(mdesc)) + mdesc->name_sz;
 }
 }
 
 
-/* In order to avoid recursion (the graph can be very deep) we use a
- * two pass algorithm.  First we allocate all the nodes and hash them.
- * Then we iterate over each node, filling in the arcs and properties.
- */
-static void __init build_all_nodes(struct mdesc_hdr *mdesc)
+u64 mdesc_node_by_name(struct mdesc_handle *hp,
+		       u64 from_node, const char *name)
 {
 {
-	struct mdesc_elem *start, *ep;
-	struct mdesc_node *mp;
-	const char *names;
-	void *data;
-	u64 last_node;
-
-	start = ep = node_block(mdesc);
-	last_node = mdesc->node_sz / 16;
+	struct mdesc_elem *ep = node_block(&hp->mdesc);
+	const char *names = name_block(&hp->mdesc);
+	u64 last_node = hp->mdesc.node_sz / 16;
+	u64 ret;
+
+	if (from_node == MDESC_NODE_NULL)
+		from_node = 0;
+
+	if (from_node >= last_node)
+		return MDESC_NODE_NULL;
+
+	ret = ep[from_node].d.val;
+	while (ret < last_node) {
+		if (ep[ret].tag != MD_NODE)
+			return MDESC_NODE_NULL;
+		if (!strcmp(names + ep[ret].name_offset, name))
+			break;
+		ret = ep[ret].d.val;
+	}
+	if (ret >= last_node)
+		ret = MDESC_NODE_NULL;
+	return ret;
+}
+EXPORT_SYMBOL(mdesc_node_by_name);
 
 
-	names = name_block(mdesc);
+const void *mdesc_get_property(struct mdesc_handle *hp, u64 node,
+			       const char *name, int *lenp)
+{
+	const char *names = name_block(&hp->mdesc);
+	u64 last_node = hp->mdesc.node_sz / 16;
+	void *data = data_block(&hp->mdesc);
+	struct mdesc_elem *ep;
 
 
-	while (1) {
-		u64 node = ep - start;
+	if (node == MDESC_NODE_NULL || node >= last_node)
+		return NULL;
 
 
-		if (ep->tag == MD_LIST_END)
+	ep = node_block(&hp->mdesc) + node;
+	ep++;
+	for (; ep->tag != MD_NODE_END; ep++) {
+		void *val = NULL;
+		int len = 0;
+
+		switch (ep->tag) {
+		case MD_PROP_VAL:
+			val = &ep->d.val;
+			len = 8;
 			break;
 			break;
 
 
-		if (ep->tag != MD_NODE) {
-			prom_printf("MDESC: Inconsistent element list.\n");
-			prom_halt();
-		}
-
-		mdesc_node_alloc(node, ep, names);
+		case MD_PROP_STR:
+		case MD_PROP_DATA:
+			val = data + ep->d.data.data_offset;
+			len = ep->d.data.data_len;
+			break;
 
 
-		if (ep->d.val >= last_node) {
-			printk("MDESC: Warning, early break out of node scan.\n");
-			printk("MDESC: Next node [%lu] last_node [%lu].\n",
-			       node, last_node);
+		default:
 			break;
 			break;
 		}
 		}
+		if (!val)
+			continue;
 
 
-		ep = start + ep->d.val;
+		if (!strcmp(names + ep->name_offset, name)) {
+			if (lenp)
+				*lenp = len;
+			return val;
+		}
 	}
 	}
 
 
-	data = data_block(mdesc);
-	for (mp = allnodes; mp; mp = mp->allnodes_next) {
-		struct mdesc_elem *ep = start + mp->node;
-		struct property **link = &mp->properties;
-		unsigned int this_arc = 0;
-
-		ep++;
-		while (ep->tag != MD_NODE_END) {
-			switch (ep->tag) {
-			case MD_PROP_ARC: {
-				struct mdesc_node *target;
-
-				if (this_arc >= mp->num_arcs) {
-					prom_printf("MDESC: ARC overrun [%u:%u]\n",
-						    this_arc, mp->num_arcs);
-					prom_halt();
-				}
-				target = find_node(ep->d.val);
-				if (!target) {
-					printk("MDESC: Warning, arc points to "
-					       "missing node, ignoring.\n");
-					break;
-				}
-				mp->arcs[this_arc].name =
-					(names + ep->name_offset);
-				mp->arcs[this_arc].arc = target;
-				this_arc++;
-				break;
-			}
+	return NULL;
+}
+EXPORT_SYMBOL(mdesc_get_property);
 
 
-			case MD_PROP_VAL:
-			case MD_PROP_STR:
-			case MD_PROP_DATA: {
-				struct property *p = mdesc_early_alloc(sizeof(*p));
-
-				p->unique_id = unique_id++;
-				p->name = (char *) names + ep->name_offset;
-				if (ep->tag == MD_PROP_VAL) {
-					p->value = &ep->d.val;
-					p->length = 8;
-				} else {
-					p->value = data + ep->d.data.data_offset;
-					p->length = ep->d.data.data_len;
-				}
-				*link = p;
-				link = &p->next;
-				break;
-			}
+u64 mdesc_next_arc(struct mdesc_handle *hp, u64 from, const char *arc_type)
+{
+	struct mdesc_elem *ep, *base = node_block(&hp->mdesc);
+	const char *names = name_block(&hp->mdesc);
+	u64 last_node = hp->mdesc.node_sz / 16;
 
 
-			case MD_NOOP:
-				break;
+	if (from == MDESC_NODE_NULL || from >= last_node)
+		return MDESC_NODE_NULL;
 
 
-			default:
-				printk("MDESC: Warning, ignoring unknown tag type %02x\n",
-				       ep->tag);
-			}
-			ep++;
-		}
+	ep = base + from;
+
+	ep++;
+	for (; ep->tag != MD_NODE_END; ep++) {
+		if (ep->tag != MD_PROP_ARC)
+			continue;
+
+		if (strcmp(names + ep->name_offset, arc_type))
+			continue;
+
+		return ep - base;
 	}
 	}
+
+	return MDESC_NODE_NULL;
 }
 }
+EXPORT_SYMBOL(mdesc_next_arc);
 
 
-static unsigned int __init count_nodes(struct mdesc_hdr *mdesc)
+u64 mdesc_arc_target(struct mdesc_handle *hp, u64 arc)
 {
 {
-	struct mdesc_elem *ep = node_block(mdesc);
-	struct mdesc_elem *end;
-	unsigned int cnt = 0;
-
-	end = ((void *)ep) + mdesc->node_sz;
-	while (ep < end) {
-		if (ep->tag == MD_NODE)
-			cnt++;
-		ep++;
-	}
-	return cnt;
+	struct mdesc_elem *ep, *base = node_block(&hp->mdesc);
+
+	ep = base + arc;
+
+	return ep->d.val;
+}
+EXPORT_SYMBOL(mdesc_arc_target);
+
+const char *mdesc_node_name(struct mdesc_handle *hp, u64 node)
+{
+	struct mdesc_elem *ep, *base = node_block(&hp->mdesc);
+	const char *names = name_block(&hp->mdesc);
+	u64 last_node = hp->mdesc.node_sz / 16;
+
+	if (node == MDESC_NODE_NULL || node >= last_node)
+		return NULL;
+
+	ep = base + node;
+	if (ep->tag != MD_NODE)
+		return NULL;
+
+	return names + ep->name_offset;
 }
 }
+EXPORT_SYMBOL(mdesc_node_name);
 
 
 static void __init report_platform_properties(void)
 static void __init report_platform_properties(void)
 {
 {
-	struct mdesc_node *pn = md_find_node_by_name(NULL, "platform");
+	struct mdesc_handle *hp = mdesc_grab();
+	u64 pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
 	const char *s;
 	const char *s;
 	const u64 *v;
 	const u64 *v;
 
 
-	if (!pn) {
+	if (pn == MDESC_NODE_NULL) {
 		prom_printf("No platform node in machine-description.\n");
 		prom_printf("No platform node in machine-description.\n");
 		prom_halt();
 		prom_halt();
 	}
 	}
 
 
-	s = md_get_property(pn, "banner-name", NULL);
+	s = mdesc_get_property(hp, pn, "banner-name", NULL);
 	printk("PLATFORM: banner-name [%s]\n", s);
 	printk("PLATFORM: banner-name [%s]\n", s);
-	s = md_get_property(pn, "name", NULL);
+	s = mdesc_get_property(hp, pn, "name", NULL);
 	printk("PLATFORM: name [%s]\n", s);
 	printk("PLATFORM: name [%s]\n", s);
 
 
-	v = md_get_property(pn, "hostid", NULL);
+	v = mdesc_get_property(hp, pn, "hostid", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: hostid [%08lx]\n", *v);
 		printk("PLATFORM: hostid [%08lx]\n", *v);
-	v = md_get_property(pn, "serial#", NULL);
+	v = mdesc_get_property(hp, pn, "serial#", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: serial# [%08lx]\n", *v);
 		printk("PLATFORM: serial# [%08lx]\n", *v);
-	v = md_get_property(pn, "stick-frequency", NULL);
+	v = mdesc_get_property(hp, pn, "stick-frequency", NULL);
 	printk("PLATFORM: stick-frequency [%08lx]\n", *v);
 	printk("PLATFORM: stick-frequency [%08lx]\n", *v);
-	v = md_get_property(pn, "mac-address", NULL);
+	v = mdesc_get_property(hp, pn, "mac-address", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: mac-address [%lx]\n", *v);
 		printk("PLATFORM: mac-address [%lx]\n", *v);
-	v = md_get_property(pn, "watchdog-resolution", NULL);
+	v = mdesc_get_property(hp, pn, "watchdog-resolution", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: watchdog-resolution [%lu ms]\n", *v);
 		printk("PLATFORM: watchdog-resolution [%lu ms]\n", *v);
-	v = md_get_property(pn, "watchdog-max-timeout", NULL);
+	v = mdesc_get_property(hp, pn, "watchdog-max-timeout", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: watchdog-max-timeout [%lu ms]\n", *v);
 		printk("PLATFORM: watchdog-max-timeout [%lu ms]\n", *v);
-	v = md_get_property(pn, "max-cpus", NULL);
+	v = mdesc_get_property(hp, pn, "max-cpus", NULL);
 	if (v)
 	if (v)
 		printk("PLATFORM: max-cpus [%lu]\n", *v);
 		printk("PLATFORM: max-cpus [%lu]\n", *v);
+
+#ifdef CONFIG_SMP
+	{
+		int max_cpu, i;
+
+		if (v) {
+			max_cpu = *v;
+			if (max_cpu > NR_CPUS)
+				max_cpu = NR_CPUS;
+		} else {
+			max_cpu = NR_CPUS;
+		}
+		for (i = 0; i < max_cpu; i++)
+			cpu_set(i, cpu_possible_map);
+	}
+#endif
+
+	mdesc_release(hp);
 }
 }
 
 
 static int inline find_in_proplist(const char *list, const char *match, int len)
 static int inline find_in_proplist(const char *list, const char *match, int len)
@@ -369,15 +468,17 @@ static int inline find_in_proplist(const char *list, const char *match, int len)
 	return 0;
 	return 0;
 }
 }
 
 
-static void __init fill_in_one_cache(cpuinfo_sparc *c, struct mdesc_node *mp)
+static void __devinit fill_in_one_cache(cpuinfo_sparc *c,
+					struct mdesc_handle *hp,
+					u64 mp)
 {
 {
-	const u64 *level = md_get_property(mp, "level", NULL);
-	const u64 *size = md_get_property(mp, "size", NULL);
-	const u64 *line_size = md_get_property(mp, "line-size", NULL);
+	const u64 *level = mdesc_get_property(hp, mp, "level", NULL);
+	const u64 *size = mdesc_get_property(hp, mp, "size", NULL);
+	const u64 *line_size = mdesc_get_property(hp, mp, "line-size", NULL);
 	const char *type;
 	const char *type;
 	int type_len;
 	int type_len;
 
 
-	type = md_get_property(mp, "type", &type_len);
+	type = mdesc_get_property(hp, mp, "type", &type_len);
 
 
 	switch (*level) {
 	switch (*level) {
 	case 1:
 	case 1:
@@ -400,48 +501,45 @@ static void __init fill_in_one_cache(cpuinfo_sparc *c, struct mdesc_node *mp)
 	}
 	}
 
 
 	if (*level == 1) {
 	if (*level == 1) {
-		unsigned int i;
-
-		for (i = 0; i < mp->num_arcs; i++) {
-			struct mdesc_node *t = mp->arcs[i].arc;
+		u64 a;
 
 
-			if (strcmp(mp->arcs[i].name, "fwd"))
-				continue;
+		mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) {
+			u64 target = mdesc_arc_target(hp, a);
+			const char *name = mdesc_node_name(hp, target);
 
 
-			if (!strcmp(t->name, "cache"))
-				fill_in_one_cache(c, t);
+			if (!strcmp(name, "cache"))
+				fill_in_one_cache(c, hp, target);
 		}
 		}
 	}
 	}
 }
 }
 
 
-static void __init mark_core_ids(struct mdesc_node *mp, int core_id)
+static void __devinit mark_core_ids(struct mdesc_handle *hp, u64 mp,
+				    int core_id)
 {
 {
-	unsigned int i;
+	u64 a;
 
 
-	for (i = 0; i < mp->num_arcs; i++) {
-		struct mdesc_node *t = mp->arcs[i].arc;
+	mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) {
+		u64 t = mdesc_arc_target(hp, a);
+		const char *name;
 		const u64 *id;
 		const u64 *id;
 
 
-		if (strcmp(mp->arcs[i].name, "back"))
-			continue;
-
-		if (!strcmp(t->name, "cpu")) {
-			id = md_get_property(t, "id", NULL);
+		name = mdesc_node_name(hp, t);
+		if (!strcmp(name, "cpu")) {
+			id = mdesc_get_property(hp, t, "id", NULL);
 			if (*id < NR_CPUS)
 			if (*id < NR_CPUS)
 				cpu_data(*id).core_id = core_id;
 				cpu_data(*id).core_id = core_id;
 		} else {
 		} else {
-			unsigned int j;
+			u64 j;
 
 
-			for (j = 0; j < t->num_arcs; j++) {
-				struct mdesc_node *n = t->arcs[j].arc;
+			mdesc_for_each_arc(j, hp, t, MDESC_ARC_TYPE_BACK) {
+				u64 n = mdesc_arc_target(hp, j);
+				const char *n_name;
 
 
-				if (strcmp(t->arcs[j].name, "back"))
+				n_name = mdesc_node_name(hp, n);
+				if (strcmp(n_name, "cpu"))
 					continue;
 					continue;
 
 
-				if (strcmp(n->name, "cpu"))
-					continue;
-
-				id = md_get_property(n, "id", NULL);
+				id = mdesc_get_property(hp, n, "id", NULL);
 				if (*id < NR_CPUS)
 				if (*id < NR_CPUS)
 					cpu_data(*id).core_id = core_id;
 					cpu_data(*id).core_id = core_id;
 			}
 			}
@@ -449,78 +547,81 @@ static void __init mark_core_ids(struct mdesc_node *mp, int core_id)
 	}
 	}
 }
 }
 
 
-static void __init set_core_ids(void)
+static void __devinit set_core_ids(struct mdesc_handle *hp)
 {
 {
-	struct mdesc_node *mp;
 	int idx;
 	int idx;
+	u64 mp;
 
 
 	idx = 1;
 	idx = 1;
-	md_for_each_node_by_name(mp, "cache") {
-		const u64 *level = md_get_property(mp, "level", NULL);
+	mdesc_for_each_node_by_name(hp, mp, "cache") {
+		const u64 *level;
 		const char *type;
 		const char *type;
 		int len;
 		int len;
 
 
+		level = mdesc_get_property(hp, mp, "level", NULL);
 		if (*level != 1)
 		if (*level != 1)
 			continue;
 			continue;
 
 
-		type = md_get_property(mp, "type", &len);
+		type = mdesc_get_property(hp, mp, "type", &len);
 		if (!find_in_proplist(type, "instn", len))
 		if (!find_in_proplist(type, "instn", len))
 			continue;
 			continue;
 
 
-		mark_core_ids(mp, idx);
+		mark_core_ids(hp, mp, idx);
 
 
 		idx++;
 		idx++;
 	}
 	}
 }
 }
 
 
-static void __init mark_proc_ids(struct mdesc_node *mp, int proc_id)
+static void __devinit mark_proc_ids(struct mdesc_handle *hp, u64 mp,
+				    int proc_id)
 {
 {
-	int i;
+	u64 a;
 
 
-	for (i = 0; i < mp->num_arcs; i++) {
-		struct mdesc_node *t = mp->arcs[i].arc;
+	mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_BACK) {
+		u64 t = mdesc_arc_target(hp, a);
+		const char *name;
 		const u64 *id;
 		const u64 *id;
 
 
-		if (strcmp(mp->arcs[i].name, "back"))
-			continue;
-
-		if (strcmp(t->name, "cpu"))
+		name = mdesc_node_name(hp, t);
+		if (strcmp(name, "cpu"))
 			continue;
 			continue;
 
 
-		id = md_get_property(t, "id", NULL);
+		id = mdesc_get_property(hp, t, "id", NULL);
 		if (*id < NR_CPUS)
 		if (*id < NR_CPUS)
 			cpu_data(*id).proc_id = proc_id;
 			cpu_data(*id).proc_id = proc_id;
 	}
 	}
 }
 }
 
 
-static void __init __set_proc_ids(const char *exec_unit_name)
+static void __devinit __set_proc_ids(struct mdesc_handle *hp,
+				     const char *exec_unit_name)
 {
 {
-	struct mdesc_node *mp;
 	int idx;
 	int idx;
+	u64 mp;
 
 
 	idx = 0;
 	idx = 0;
-	md_for_each_node_by_name(mp, exec_unit_name) {
+	mdesc_for_each_node_by_name(hp, mp, exec_unit_name) {
 		const char *type;
 		const char *type;
 		int len;
 		int len;
 
 
-		type = md_get_property(mp, "type", &len);
+		type = mdesc_get_property(hp, mp, "type", &len);
 		if (!find_in_proplist(type, "int", len) &&
 		if (!find_in_proplist(type, "int", len) &&
 		    !find_in_proplist(type, "integer", len))
 		    !find_in_proplist(type, "integer", len))
 			continue;
 			continue;
 
 
-		mark_proc_ids(mp, idx);
+		mark_proc_ids(hp, mp, idx);
 
 
 		idx++;
 		idx++;
 	}
 	}
 }
 }
 
 
-static void __init set_proc_ids(void)
+static void __devinit set_proc_ids(struct mdesc_handle *hp)
 {
 {
-	__set_proc_ids("exec_unit");
-	__set_proc_ids("exec-unit");
+	__set_proc_ids(hp, "exec_unit");
+	__set_proc_ids(hp, "exec-unit");
 }
 }
 
 
-static void __init get_one_mondo_bits(const u64 *p, unsigned int *mask, unsigned char def)
+static void __devinit get_one_mondo_bits(const u64 *p, unsigned int *mask,
+					 unsigned char def)
 {
 {
 	u64 val;
 	u64 val;
 
 
@@ -538,35 +639,37 @@ static void __init get_one_mondo_bits(const u64 *p, unsigned int *mask, unsigned
 	*mask = ((1U << def) * 64U) - 1U;
 	*mask = ((1U << def) * 64U) - 1U;
 }
 }
 
 
-static void __init get_mondo_data(struct mdesc_node *mp, struct trap_per_cpu *tb)
+static void __devinit get_mondo_data(struct mdesc_handle *hp, u64 mp,
+				     struct trap_per_cpu *tb)
 {
 {
 	const u64 *val;
 	const u64 *val;
 
 
-	val = md_get_property(mp, "q-cpu-mondo-#bits", NULL);
+	val = mdesc_get_property(hp, mp, "q-cpu-mondo-#bits", NULL);
 	get_one_mondo_bits(val, &tb->cpu_mondo_qmask, 7);
 	get_one_mondo_bits(val, &tb->cpu_mondo_qmask, 7);
 
 
-	val = md_get_property(mp, "q-dev-mondo-#bits", NULL);
+	val = mdesc_get_property(hp, mp, "q-dev-mondo-#bits", NULL);
 	get_one_mondo_bits(val, &tb->dev_mondo_qmask, 7);
 	get_one_mondo_bits(val, &tb->dev_mondo_qmask, 7);
 
 
-	val = md_get_property(mp, "q-resumable-#bits", NULL);
+	val = mdesc_get_property(hp, mp, "q-resumable-#bits", NULL);
 	get_one_mondo_bits(val, &tb->resum_qmask, 6);
 	get_one_mondo_bits(val, &tb->resum_qmask, 6);
 
 
-	val = md_get_property(mp, "q-nonresumable-#bits", NULL);
+	val = mdesc_get_property(hp, mp, "q-nonresumable-#bits", NULL);
 	get_one_mondo_bits(val, &tb->nonresum_qmask, 2);
 	get_one_mondo_bits(val, &tb->nonresum_qmask, 2);
 }
 }
 
 
-static void __init mdesc_fill_in_cpu_data(void)
+void __devinit mdesc_fill_in_cpu_data(cpumask_t mask)
 {
 {
-	struct mdesc_node *mp;
+	struct mdesc_handle *hp = mdesc_grab();
+	u64 mp;
 
 
 	ncpus_probed = 0;
 	ncpus_probed = 0;
-	md_for_each_node_by_name(mp, "cpu") {
-		const u64 *id = md_get_property(mp, "id", NULL);
-		const u64 *cfreq = md_get_property(mp, "clock-frequency", NULL);
+	mdesc_for_each_node_by_name(hp, mp, "cpu") {
+		const u64 *id = mdesc_get_property(hp, mp, "id", NULL);
+		const u64 *cfreq = mdesc_get_property(hp, mp, "clock-frequency", NULL);
 		struct trap_per_cpu *tb;
 		struct trap_per_cpu *tb;
 		cpuinfo_sparc *c;
 		cpuinfo_sparc *c;
-		unsigned int i;
 		int cpuid;
 		int cpuid;
+		u64 a;
 
 
 		ncpus_probed++;
 		ncpus_probed++;
 
 
@@ -575,6 +678,8 @@ static void __init mdesc_fill_in_cpu_data(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 		if (cpuid >= NR_CPUS)
 		if (cpuid >= NR_CPUS)
 			continue;
 			continue;
+		if (!cpu_isset(cpuid, mask))
+			continue;
 #else
 #else
 		/* On uniprocessor we only want the values for the
 		/* On uniprocessor we only want the values for the
 		 * real physical cpu the kernel booted onto, however
 		 * real physical cpu the kernel booted onto, however
@@ -589,35 +694,30 @@ static void __init mdesc_fill_in_cpu_data(void)
 		c->clock_tick = *cfreq;
 		c->clock_tick = *cfreq;
 
 
 		tb = &trap_block[cpuid];
 		tb = &trap_block[cpuid];
-		get_mondo_data(mp, tb);
-
-		for (i = 0; i < mp->num_arcs; i++) {
-			struct mdesc_node *t = mp->arcs[i].arc;
-			unsigned int j;
+		get_mondo_data(hp, mp, tb);
 
 
-			if (strcmp(mp->arcs[i].name, "fwd"))
-				continue;
+		mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) {
+			u64 j, t = mdesc_arc_target(hp, a);
+			const char *t_name;
 
 
-			if (!strcmp(t->name, "cache")) {
-				fill_in_one_cache(c, t);
+			t_name = mdesc_node_name(hp, t);
+			if (!strcmp(t_name, "cache")) {
+				fill_in_one_cache(c, hp, t);
 				continue;
 				continue;
 			}
 			}
 
 
-			for (j = 0; j < t->num_arcs; j++) {
-				struct mdesc_node *n;
+			mdesc_for_each_arc(j, hp, t, MDESC_ARC_TYPE_FWD) {
+				u64 n = mdesc_arc_target(hp, j);
+				const char *n_name;
 
 
-				n = t->arcs[j].arc;
-				if (strcmp(t->arcs[j].name, "fwd"))
-					continue;
-
-				if (!strcmp(n->name, "cache"))
-					fill_in_one_cache(c, n);
+				n_name = mdesc_node_name(hp, n);
+				if (!strcmp(n_name, "cache"))
+					fill_in_one_cache(c, hp, n);
 			}
 			}
 		}
 		}
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 		cpu_set(cpuid, cpu_present_map);
 		cpu_set(cpuid, cpu_present_map);
-		cpu_set(cpuid, phys_cpu_present_map);
 #endif
 #endif
 
 
 		c->core_id = 0;
 		c->core_id = 0;
@@ -628,45 +728,43 @@ static void __init mdesc_fill_in_cpu_data(void)
 	sparc64_multi_core = 1;
 	sparc64_multi_core = 1;
 #endif
 #endif
 
 
-	set_core_ids();
-	set_proc_ids();
+	set_core_ids(hp);
+	set_proc_ids(hp);
 
 
 	smp_fill_in_sib_core_maps();
 	smp_fill_in_sib_core_maps();
+
+	mdesc_release(hp);
 }
 }
 
 
 void __init sun4v_mdesc_init(void)
 void __init sun4v_mdesc_init(void)
 {
 {
+	struct mdesc_handle *hp;
 	unsigned long len, real_len, status;
 	unsigned long len, real_len, status;
+	cpumask_t mask;
 
 
 	(void) sun4v_mach_desc(0UL, 0UL, &len);
 	(void) sun4v_mach_desc(0UL, 0UL, &len);
 
 
 	printk("MDESC: Size is %lu bytes.\n", len);
 	printk("MDESC: Size is %lu bytes.\n", len);
 
 
-	main_mdesc = mdesc_early_alloc(len);
+	hp = mdesc_alloc(len, &bootmem_mdesc_memops);
+	if (hp == NULL) {
+		prom_printf("MDESC: alloc of %lu bytes failed.\n", len);
+		prom_halt();
+	}
 
 
-	status = sun4v_mach_desc(__pa(main_mdesc), len, &real_len);
+	status = sun4v_mach_desc(__pa(&hp->mdesc), len, &real_len);
 	if (status != HV_EOK || real_len > len) {
 	if (status != HV_EOK || real_len > len) {
 		prom_printf("sun4v_mach_desc fails, err(%lu), "
 		prom_printf("sun4v_mach_desc fails, err(%lu), "
 			    "len(%lu), real_len(%lu)\n",
 			    "len(%lu), real_len(%lu)\n",
 			    status, len, real_len);
 			    status, len, real_len);
+		mdesc_free(hp);
 		prom_halt();
 		prom_halt();
 	}
 	}
 
 
-	len = count_nodes(main_mdesc);
-	printk("MDESC: %lu nodes.\n", len);
-
-	len = roundup_pow_of_two(len);
-
-	mdesc_hash = mdesc_early_alloc(len * sizeof(struct mdesc_node *));
-	mdesc_hash_size = len;
-
-	printk("MDESC: Hash size %lu entries.\n", len);
-
-	build_all_nodes(main_mdesc);
-
-	printk("MDESC: Built graph with %u bytes of memory.\n",
-	       mdesc_early_allocated);
+	cur_mdesc = hp;
 
 
 	report_platform_properties();
 	report_platform_properties();
-	mdesc_fill_in_cpu_data();
+
+	cpus_setall(mask);
+	mdesc_fill_in_cpu_data(mask);
 }
 }

+ 31 - 23
arch/sparc64/kernel/power.c

@@ -1,7 +1,6 @@
-/* $Id: power.c,v 1.10 2001/12/11 01:57:16 davem Exp $
- * power.c: Power management driver.
+/* power.c: Power management driver.
  *
  *
- * Copyright (C) 1999 David S. Miller (davem@redhat.com)
+ * Copyright (C) 1999, 2007 David S. Miller (davem@davemloft.net)
  */
  */
 
 
 #include <linux/kernel.h>
 #include <linux/kernel.h>
@@ -19,6 +18,7 @@
 #include <asm/prom.h>
 #include <asm/prom.h>
 #include <asm/of_device.h>
 #include <asm/of_device.h>
 #include <asm/io.h>
 #include <asm/io.h>
+#include <asm/power.h>
 #include <asm/sstate.h>
 #include <asm/sstate.h>
 
 
 #include <linux/unistd.h>
 #include <linux/unistd.h>
@@ -29,24 +29,26 @@
  */
  */
 int scons_pwroff = 1; 
 int scons_pwroff = 1; 
 
 
-#ifdef CONFIG_PCI
-#include <linux/pci.h>
 static void __iomem *power_reg;
 static void __iomem *power_reg;
 
 
 static DECLARE_WAIT_QUEUE_HEAD(powerd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(powerd_wait);
 static int button_pressed;
 static int button_pressed;
 
 
-static irqreturn_t power_handler(int irq, void *dev_id)
+void wake_up_powerd(void)
 {
 {
 	if (button_pressed == 0) {
 	if (button_pressed == 0) {
 		button_pressed = 1;
 		button_pressed = 1;
 		wake_up(&powerd_wait);
 		wake_up(&powerd_wait);
 	}
 	}
+}
+
+static irqreturn_t power_handler(int irq, void *dev_id)
+{
+	wake_up_powerd();
 
 
 	/* FIXME: Check registers for status... */
 	/* FIXME: Check registers for status... */
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
-#endif /* CONFIG_PCI */
 
 
 extern void machine_halt(void);
 extern void machine_halt(void);
 extern void machine_alt_power_off(void);
 extern void machine_alt_power_off(void);
@@ -56,19 +58,18 @@ void machine_power_off(void)
 {
 {
 	sstate_poweroff();
 	sstate_poweroff();
 	if (!serial_console || scons_pwroff) {
 	if (!serial_console || scons_pwroff) {
-#ifdef CONFIG_PCI
 		if (power_reg) {
 		if (power_reg) {
 			/* Both register bits seem to have the
 			/* Both register bits seem to have the
 			 * same effect, so until I figure out
 			 * same effect, so until I figure out
 			 * what the difference is...
 			 * what the difference is...
 			 */
 			 */
 			writel(AUXIO_PCIO_CPWR_OFF | AUXIO_PCIO_SPWR_OFF, power_reg);
 			writel(AUXIO_PCIO_CPWR_OFF | AUXIO_PCIO_SPWR_OFF, power_reg);
-		} else
-#endif /* CONFIG_PCI */
+		} else {
 			if (poweroff_method != NULL) {
 			if (poweroff_method != NULL) {
 				poweroff_method();
 				poweroff_method();
 				/* not reached */
 				/* not reached */
 			}
 			}
+		}
 	}
 	}
 	machine_halt();
 	machine_halt();
 }
 }
@@ -76,7 +77,6 @@ void machine_power_off(void)
 void (*pm_power_off)(void) = machine_power_off;
 void (*pm_power_off)(void) = machine_power_off;
 EXPORT_SYMBOL(pm_power_off);
 EXPORT_SYMBOL(pm_power_off);
 
 
-#ifdef CONFIG_PCI
 static int powerd(void *__unused)
 static int powerd(void *__unused)
 {
 {
 	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
 	static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
@@ -86,7 +86,7 @@ static int powerd(void *__unused)
 	daemonize("powerd");
 	daemonize("powerd");
 
 
 	add_wait_queue(&powerd_wait, &wait);
 	add_wait_queue(&powerd_wait, &wait);
-again:
+
 	for (;;) {
 	for (;;) {
 		set_task_state(current, TASK_INTERRUPTIBLE);
 		set_task_state(current, TASK_INTERRUPTIBLE);
 		if (button_pressed)
 		if (button_pressed)
@@ -100,16 +100,28 @@ static int powerd(void *__unused)
 	/* Ok, down we go... */
 	/* Ok, down we go... */
 	button_pressed = 0;
 	button_pressed = 0;
 	if (kernel_execve("/sbin/shutdown", argv, envp) < 0) {
 	if (kernel_execve("/sbin/shutdown", argv, envp) < 0) {
-		printk("powerd: shutdown execution failed\n");
-		add_wait_queue(&powerd_wait, &wait);
-		goto again;
+		printk(KERN_ERR "powerd: shutdown execution failed\n");
+		machine_power_off();
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
+int start_powerd(void)
+{
+	int err;
+
+	err = kernel_thread(powerd, NULL, CLONE_FS);
+	if (err < 0)
+		printk(KERN_ERR "power: Failed to start power daemon.\n");
+	else
+		printk(KERN_INFO "power: powerd running.\n");
+
+	return err;
+}
+
 static int __init has_button_interrupt(unsigned int irq, struct device_node *dp)
 static int __init has_button_interrupt(unsigned int irq, struct device_node *dp)
 {
 {
-	if (irq == PCI_IRQ_NONE)
+	if (irq == 0xffffffff)
 		return 0;
 		return 0;
 	if (!of_find_property(dp, "button", NULL))
 	if (!of_find_property(dp, "button", NULL))
 		return 0;
 		return 0;
@@ -130,17 +142,14 @@ static int __devinit power_probe(struct of_device *op, const struct of_device_id
 	poweroff_method = machine_halt;  /* able to use the standard halt */
 	poweroff_method = machine_halt;  /* able to use the standard halt */
 
 
 	if (has_button_interrupt(irq, op->node)) {
 	if (has_button_interrupt(irq, op->node)) {
-		if (kernel_thread(powerd, NULL, CLONE_FS) < 0) {
-			printk("Failed to start power daemon.\n");
+		if (start_powerd() < 0)
 			return 0;
 			return 0;
-		}
-		printk("powerd running.\n");
 
 
 		if (request_irq(irq,
 		if (request_irq(irq,
 				power_handler, 0, "power", NULL) < 0)
 				power_handler, 0, "power", NULL) < 0)
-			printk("power: Error, cannot register IRQ handler.\n");
+			printk(KERN_ERR "power: Cannot setup IRQ handler.\n");
 	} else {
 	} else {
-		printk("not using powerd.\n");
+		printk(KERN_INFO "power: Not using powerd.\n");
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -164,4 +173,3 @@ void __init power_init(void)
 	of_register_driver(&power_driver, &of_bus_type);
 	of_register_driver(&power_driver, &of_bus_type);
 	return;
 	return;
 }
 }
-#endif /* CONFIG_PCI */

+ 16 - 5
arch/sparc64/kernel/process.c

@@ -29,6 +29,7 @@
 #include <linux/compat.h>
 #include <linux/compat.h>
 #include <linux/tick.h>
 #include <linux/tick.h>
 #include <linux/init.h>
 #include <linux/init.h>
+#include <linux/cpu.h>
 
 
 #include <asm/oplib.h>
 #include <asm/oplib.h>
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
@@ -49,7 +50,7 @@
 
 
 /* #define VERBOSE_SHOWREGS */
 /* #define VERBOSE_SHOWREGS */
 
 
-static void sparc64_yield(void)
+static void sparc64_yield(int cpu)
 {
 {
 	if (tlb_type != hypervisor)
 	if (tlb_type != hypervisor)
 		return;
 		return;
@@ -57,7 +58,7 @@ static void sparc64_yield(void)
 	clear_thread_flag(TIF_POLLING_NRFLAG);
 	clear_thread_flag(TIF_POLLING_NRFLAG);
 	smp_mb__after_clear_bit();
 	smp_mb__after_clear_bit();
 
 
-	while (!need_resched()) {
+	while (!need_resched() && !cpu_is_offline(cpu)) {
 		unsigned long pstate;
 		unsigned long pstate;
 
 
 		/* Disable interrupts. */
 		/* Disable interrupts. */
@@ -68,7 +69,7 @@ static void sparc64_yield(void)
 			: "=&r" (pstate)
 			: "=&r" (pstate)
 			: "i" (PSTATE_IE));
 			: "i" (PSTATE_IE));
 
 
-		if (!need_resched())
+		if (!need_resched() && !cpu_is_offline(cpu))
 			sun4v_cpu_yield();
 			sun4v_cpu_yield();
 
 
 		/* Re-enable interrupts. */
 		/* Re-enable interrupts. */
@@ -86,15 +87,25 @@ static void sparc64_yield(void)
 /* The idle loop on sparc64. */
 /* The idle loop on sparc64. */
 void cpu_idle(void)
 void cpu_idle(void)
 {
 {
+	int cpu = smp_processor_id();
+
 	set_thread_flag(TIF_POLLING_NRFLAG);
 	set_thread_flag(TIF_POLLING_NRFLAG);
 
 
 	while(1) {
 	while(1) {
 		tick_nohz_stop_sched_tick();
 		tick_nohz_stop_sched_tick();
-		while (!need_resched())
-			sparc64_yield();
+
+		while (!need_resched() && !cpu_is_offline(cpu))
+			sparc64_yield(cpu);
+
 		tick_nohz_restart_sched_tick();
 		tick_nohz_restart_sched_tick();
 
 
 		preempt_enable_no_resched();
 		preempt_enable_no_resched();
+
+#ifdef CONFIG_HOTPLUG_CPU
+		if (cpu_is_offline(cpu))
+			cpu_play_dead();
+#endif
+
 		schedule();
 		schedule();
 		preempt_disable();
 		preempt_disable();
 	}
 	}

+ 1 - 1
arch/sparc64/kernel/prom.c

@@ -1808,7 +1808,7 @@ static void __init of_fill_in_cpu_data(void)
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 		cpu_set(cpuid, cpu_present_map);
 		cpu_set(cpuid, cpu_present_map);
-		cpu_set(cpuid, phys_cpu_present_map);
+		cpu_set(cpuid, cpu_possible_map);
 #endif
 #endif
 	}
 	}
 
 

+ 1 - 4
arch/sparc64/kernel/setup.c

@@ -442,7 +442,6 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 		   "D$ parity tl1\t: %u\n"
 		   "D$ parity tl1\t: %u\n"
 		   "I$ parity tl1\t: %u\n"
 		   "I$ parity tl1\t: %u\n"
 #ifndef CONFIG_SMP
 #ifndef CONFIG_SMP
-		   "Cpu0Bogo\t: %lu.%02lu\n"
 		   "Cpu0ClkTck\t: %016lx\n"
 		   "Cpu0ClkTck\t: %016lx\n"
 #endif
 #endif
 		   ,
 		   ,
@@ -455,10 +454,8 @@ static int show_cpuinfo(struct seq_file *m, void *__unused)
 		   ncpus_probed,
 		   ncpus_probed,
 		   num_online_cpus(),
 		   num_online_cpus(),
 		   dcache_parity_tl1_occurred,
 		   dcache_parity_tl1_occurred,
-		   icache_parity_tl1_occurred
+		   icache_parity_tl1_occurred,
 #ifndef CONFIG_SMP
 #ifndef CONFIG_SMP
-		   , cpu_data(0).udelay_val/(500000/HZ),
-		   (cpu_data(0).udelay_val/(5000/HZ)) % 100,
 		   cpu_data(0).clock_tick
 		   cpu_data(0).clock_tick
 #endif
 #endif
 		);
 		);

+ 201 - 50
arch/sparc64/kernel/smp.c

@@ -1,6 +1,6 @@
 /* smp.c: Sparc64 SMP support.
 /* smp.c: Sparc64 SMP support.
  *
  *
- * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1997, 2007 David S. Miller (davem@davemloft.net)
  */
  */
 
 
 #include <linux/module.h>
 #include <linux/module.h>
@@ -28,6 +28,8 @@
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/mmu_context.h>
 #include <asm/cpudata.h>
 #include <asm/cpudata.h>
+#include <asm/hvtramp.h>
+#include <asm/io.h>
 
 
 #include <asm/irq.h>
 #include <asm/irq.h>
 #include <asm/irq_regs.h>
 #include <asm/irq_regs.h>
@@ -41,22 +43,26 @@
 #include <asm/sections.h>
 #include <asm/sections.h>
 #include <asm/prom.h>
 #include <asm/prom.h>
 #include <asm/mdesc.h>
 #include <asm/mdesc.h>
+#include <asm/ldc.h>
+#include <asm/hypervisor.h>
 
 
 extern void calibrate_delay(void);
 extern void calibrate_delay(void);
 
 
 int sparc64_multi_core __read_mostly;
 int sparc64_multi_core __read_mostly;
 
 
-/* Please don't make this stuff initdata!!!  --DaveM */
-unsigned char boot_cpu_id;
-
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_NONE;
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
 cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
-cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE;
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
 cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly =
 	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
 cpumask_t cpu_core_map[NR_CPUS] __read_mostly =
 	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
 	{ [0 ... NR_CPUS-1] = CPU_MASK_NONE };
+
+EXPORT_SYMBOL(cpu_possible_map);
+EXPORT_SYMBOL(cpu_online_map);
+EXPORT_SYMBOL(cpu_sibling_map);
+EXPORT_SYMBOL(cpu_core_map);
+
 static cpumask_t smp_commenced_mask;
 static cpumask_t smp_commenced_mask;
-static cpumask_t cpu_callout_map;
 
 
 void smp_info(struct seq_file *m)
 void smp_info(struct seq_file *m)
 {
 {
@@ -73,18 +79,17 @@ void smp_bogo(struct seq_file *m)
 	
 	
 	for_each_online_cpu(i)
 	for_each_online_cpu(i)
 		seq_printf(m,
 		seq_printf(m,
-			   "Cpu%dBogo\t: %lu.%02lu\n"
 			   "Cpu%dClkTck\t: %016lx\n",
 			   "Cpu%dClkTck\t: %016lx\n",
-			   i, cpu_data(i).udelay_val / (500000/HZ),
-			   (cpu_data(i).udelay_val / (5000/HZ)) % 100,
 			   i, cpu_data(i).clock_tick);
 			   i, cpu_data(i).clock_tick);
 }
 }
 
 
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
+
 extern void setup_sparc64_timer(void);
 extern void setup_sparc64_timer(void);
 
 
 static volatile unsigned long callin_flag = 0;
 static volatile unsigned long callin_flag = 0;
 
 
-void __init smp_callin(void)
+void __devinit smp_callin(void)
 {
 {
 	int cpuid = hard_smp_processor_id();
 	int cpuid = hard_smp_processor_id();
 
 
@@ -102,8 +107,6 @@ void __init smp_callin(void)
 
 
 	local_irq_enable();
 	local_irq_enable();
 
 
-	calibrate_delay();
-	cpu_data(cpuid).udelay_val = loops_per_jiffy;
 	callin_flag = 1;
 	callin_flag = 1;
 	__asm__ __volatile__("membar #Sync\n\t"
 	__asm__ __volatile__("membar #Sync\n\t"
 			     "flush  %%g6" : : : "memory");
 			     "flush  %%g6" : : : "memory");
@@ -120,7 +123,9 @@ void __init smp_callin(void)
 	while (!cpu_isset(cpuid, smp_commenced_mask))
 	while (!cpu_isset(cpuid, smp_commenced_mask))
 		rmb();
 		rmb();
 
 
+	spin_lock(&call_lock);
 	cpu_set(cpuid, cpu_online_map);
 	cpu_set(cpuid, cpu_online_map);
+	spin_unlock(&call_lock);
 
 
 	/* idle thread is expected to have preempt disabled */
 	/* idle thread is expected to have preempt disabled */
 	preempt_disable();
 	preempt_disable();
@@ -268,6 +273,67 @@ static void smp_synchronize_one_tick(int cpu)
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 	spin_unlock_irqrestore(&itc_sync_lock, flags);
 }
 }
 
 
+#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
+/* XXX Put this in some common place. XXX */
+static unsigned long kimage_addr_to_ra(void *p)
+{
+	unsigned long val = (unsigned long) p;
+
+	return kern_base + (val - KERNBASE);
+}
+
+static void ldom_startcpu_cpuid(unsigned int cpu, unsigned long thread_reg)
+{
+	extern unsigned long sparc64_ttable_tl0;
+	extern unsigned long kern_locked_tte_data;
+	extern int bigkernel;
+	struct hvtramp_descr *hdesc;
+	unsigned long trampoline_ra;
+	struct trap_per_cpu *tb;
+	u64 tte_vaddr, tte_data;
+	unsigned long hv_err;
+
+	hdesc = kzalloc(sizeof(*hdesc), GFP_KERNEL);
+	if (!hdesc) {
+		printk(KERN_ERR "ldom_startcpu_cpuid: Cannot allocate "
+		       "hvtramp_descr.\n");
+		return;
+	}
+
+	hdesc->cpu = cpu;
+	hdesc->num_mappings = (bigkernel ? 2 : 1);
+
+	tb = &trap_block[cpu];
+	tb->hdesc = hdesc;
+
+	hdesc->fault_info_va = (unsigned long) &tb->fault_info;
+	hdesc->fault_info_pa = kimage_addr_to_ra(&tb->fault_info);
+
+	hdesc->thread_reg = thread_reg;
+
+	tte_vaddr = (unsigned long) KERNBASE;
+	tte_data = kern_locked_tte_data;
+
+	hdesc->maps[0].vaddr = tte_vaddr;
+	hdesc->maps[0].tte   = tte_data;
+	if (bigkernel) {
+		tte_vaddr += 0x400000;
+		tte_data  += 0x400000;
+		hdesc->maps[1].vaddr = tte_vaddr;
+		hdesc->maps[1].tte   = tte_data;
+	}
+
+	trampoline_ra = kimage_addr_to_ra(hv_cpu_startup);
+
+	hv_err = sun4v_cpu_start(cpu, trampoline_ra,
+				 kimage_addr_to_ra(&sparc64_ttable_tl0),
+				 __pa(hdesc));
+	if (hv_err)
+		printk(KERN_ERR "ldom_startcpu_cpuid: sun4v_cpu_start() "
+		       "gives error %lu\n", hv_err);
+}
+#endif
+
 extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
 extern void sun4v_init_mondo_queues(int use_bootmem, int cpu, int alloc, int load);
 
 
 extern unsigned long sparc64_cpu_startup;
 extern unsigned long sparc64_cpu_startup;
@@ -280,6 +346,7 @@ static struct thread_info *cpu_new_thread = NULL;
 
 
 static int __devinit smp_boot_one_cpu(unsigned int cpu)
 static int __devinit smp_boot_one_cpu(unsigned int cpu)
 {
 {
+	struct trap_per_cpu *tb = &trap_block[cpu];
 	unsigned long entry =
 	unsigned long entry =
 		(unsigned long)(&sparc64_cpu_startup);
 		(unsigned long)(&sparc64_cpu_startup);
 	unsigned long cookie =
 	unsigned long cookie =
@@ -290,20 +357,25 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 	p = fork_idle(cpu);
 	p = fork_idle(cpu);
 	callin_flag = 0;
 	callin_flag = 0;
 	cpu_new_thread = task_thread_info(p);
 	cpu_new_thread = task_thread_info(p);
-	cpu_set(cpu, cpu_callout_map);
 
 
 	if (tlb_type == hypervisor) {
 	if (tlb_type == hypervisor) {
 		/* Alloc the mondo queues, cpu will load them.  */
 		/* Alloc the mondo queues, cpu will load them.  */
 		sun4v_init_mondo_queues(0, cpu, 1, 0);
 		sun4v_init_mondo_queues(0, cpu, 1, 0);
 
 
-		prom_startcpu_cpuid(cpu, entry, cookie);
+#if defined(CONFIG_SUN_LDOMS) && defined(CONFIG_HOTPLUG_CPU)
+		if (ldom_domaining_enabled)
+			ldom_startcpu_cpuid(cpu,
+					    (unsigned long) cpu_new_thread);
+		else
+#endif
+			prom_startcpu_cpuid(cpu, entry, cookie);
 	} else {
 	} else {
 		struct device_node *dp = of_find_node_by_cpuid(cpu);
 		struct device_node *dp = of_find_node_by_cpuid(cpu);
 
 
 		prom_startcpu(dp->node, entry, cookie);
 		prom_startcpu(dp->node, entry, cookie);
 	}
 	}
 
 
-	for (timeout = 0; timeout < 5000000; timeout++) {
+	for (timeout = 0; timeout < 50000; timeout++) {
 		if (callin_flag)
 		if (callin_flag)
 			break;
 			break;
 		udelay(100);
 		udelay(100);
@@ -313,11 +385,15 @@ static int __devinit smp_boot_one_cpu(unsigned int cpu)
 		ret = 0;
 		ret = 0;
 	} else {
 	} else {
 		printk("Processor %d is stuck.\n", cpu);
 		printk("Processor %d is stuck.\n", cpu);
-		cpu_clear(cpu, cpu_callout_map);
 		ret = -ENODEV;
 		ret = -ENODEV;
 	}
 	}
 	cpu_new_thread = NULL;
 	cpu_new_thread = NULL;
 
 
+	if (tb->hdesc) {
+		kfree(tb->hdesc);
+		tb->hdesc = NULL;
+	}
+
 	return ret;
 	return ret;
 }
 }
 
 
@@ -720,7 +796,6 @@ struct call_data_struct {
 	int wait;
 	int wait;
 };
 };
 
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock);
 static struct call_data_struct *call_data;
 static struct call_data_struct *call_data;
 
 
 extern unsigned long xcall_call_function;
 extern unsigned long xcall_call_function;
@@ -1152,34 +1227,14 @@ void smp_penguin_jailcell(int irq, struct pt_regs *regs)
 	preempt_enable();
 	preempt_enable();
 }
 }
 
 
-void __init smp_tick_init(void)
-{
-	boot_cpu_id = hard_smp_processor_id();
-}
-
 /* /proc/profile writes can call this, don't __init it please. */
 /* /proc/profile writes can call this, don't __init it please. */
 int setup_profiling_timer(unsigned int multiplier)
 int setup_profiling_timer(unsigned int multiplier)
 {
 {
 	return -EINVAL;
 	return -EINVAL;
 }
 }
 
 
-/* Constrain the number of cpus to max_cpus.  */
 void __init smp_prepare_cpus(unsigned int max_cpus)
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 {
-	int i;
-
-	if (num_possible_cpus() > max_cpus) {
-		for_each_possible_cpu(i) {
-			if (i != boot_cpu_id) {
-				cpu_clear(i, phys_cpu_present_map);
-				cpu_clear(i, cpu_present_map);
-				if (num_possible_cpus() <= max_cpus)
-					break;
-			}
-		}
-	}
-
-	cpu_data(boot_cpu_id).udelay_val = loops_per_jiffy;
 }
 }
 
 
 void __devinit smp_prepare_boot_cpu(void)
 void __devinit smp_prepare_boot_cpu(void)
@@ -1190,30 +1245,32 @@ void __devinit smp_fill_in_sib_core_maps(void)
 {
 {
 	unsigned int i;
 	unsigned int i;
 
 
-	for_each_possible_cpu(i) {
+	for_each_present_cpu(i) {
 		unsigned int j;
 		unsigned int j;
 
 
+		cpus_clear(cpu_core_map[i]);
 		if (cpu_data(i).core_id == 0) {
 		if (cpu_data(i).core_id == 0) {
 			cpu_set(i, cpu_core_map[i]);
 			cpu_set(i, cpu_core_map[i]);
 			continue;
 			continue;
 		}
 		}
 
 
-		for_each_possible_cpu(j) {
+		for_each_present_cpu(j) {
 			if (cpu_data(i).core_id ==
 			if (cpu_data(i).core_id ==
 			    cpu_data(j).core_id)
 			    cpu_data(j).core_id)
 				cpu_set(j, cpu_core_map[i]);
 				cpu_set(j, cpu_core_map[i]);
 		}
 		}
 	}
 	}
 
 
-	for_each_possible_cpu(i) {
+	for_each_present_cpu(i) {
 		unsigned int j;
 		unsigned int j;
 
 
+		cpus_clear(cpu_sibling_map[i]);
 		if (cpu_data(i).proc_id == -1) {
 		if (cpu_data(i).proc_id == -1) {
 			cpu_set(i, cpu_sibling_map[i]);
 			cpu_set(i, cpu_sibling_map[i]);
 			continue;
 			continue;
 		}
 		}
 
 
-		for_each_possible_cpu(j) {
+		for_each_present_cpu(j) {
 			if (cpu_data(i).proc_id ==
 			if (cpu_data(i).proc_id ==
 			    cpu_data(j).proc_id)
 			    cpu_data(j).proc_id)
 				cpu_set(j, cpu_sibling_map[i]);
 				cpu_set(j, cpu_sibling_map[i]);
@@ -1242,18 +1299,112 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	return ret;
 	return ret;
 }
 }
 
 
-void __init smp_cpus_done(unsigned int max_cpus)
+#ifdef CONFIG_HOTPLUG_CPU
+void cpu_play_dead(void)
+{
+	int cpu = smp_processor_id();
+	unsigned long pstate;
+
+	idle_task_exit();
+
+	if (tlb_type == hypervisor) {
+		struct trap_per_cpu *tb = &trap_block[cpu];
+
+		sun4v_cpu_qconf(HV_CPU_QUEUE_CPU_MONDO,
+				tb->cpu_mondo_pa, 0);
+		sun4v_cpu_qconf(HV_CPU_QUEUE_DEVICE_MONDO,
+				tb->dev_mondo_pa, 0);
+		sun4v_cpu_qconf(HV_CPU_QUEUE_RES_ERROR,
+				tb->resum_mondo_pa, 0);
+		sun4v_cpu_qconf(HV_CPU_QUEUE_NONRES_ERROR,
+				tb->nonresum_mondo_pa, 0);
+	}
+
+	cpu_clear(cpu, smp_commenced_mask);
+	membar_safe("#Sync");
+
+	local_irq_disable();
+
+	__asm__ __volatile__(
+		"rdpr	%%pstate, %0\n\t"
+		"wrpr	%0, %1, %%pstate"
+		: "=r" (pstate)
+		: "i" (PSTATE_IE));
+
+	while (1)
+		barrier();
+}
+
+int __cpu_disable(void)
 {
 {
-	unsigned long bogosum = 0;
+	int cpu = smp_processor_id();
+	cpuinfo_sparc *c;
 	int i;
 	int i;
 
 
-	for_each_online_cpu(i)
-		bogosum += cpu_data(i).udelay_val;
-	printk("Total of %ld processors activated "
-	       "(%lu.%02lu BogoMIPS).\n",
-	       (long) num_online_cpus(),
-	       bogosum/(500000/HZ),
-	       (bogosum/(5000/HZ))%100);
+	for_each_cpu_mask(i, cpu_core_map[cpu])
+		cpu_clear(cpu, cpu_core_map[i]);
+	cpus_clear(cpu_core_map[cpu]);
+
+	for_each_cpu_mask(i, cpu_sibling_map[cpu])
+		cpu_clear(cpu, cpu_sibling_map[i]);
+	cpus_clear(cpu_sibling_map[cpu]);
+
+	c = &cpu_data(cpu);
+
+	c->core_id = 0;
+	c->proc_id = -1;
+
+	spin_lock(&call_lock);
+	cpu_clear(cpu, cpu_online_map);
+	spin_unlock(&call_lock);
+
+	smp_wmb();
+
+	/* Make sure no interrupts point to this cpu.  */
+	fixup_irqs();
+
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+
+	return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	int i;
+
+	for (i = 0; i < 100; i++) {
+		smp_rmb();
+		if (!cpu_isset(cpu, smp_commenced_mask))
+			break;
+		msleep(100);
+	}
+	if (cpu_isset(cpu, smp_commenced_mask)) {
+		printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+	} else {
+#if defined(CONFIG_SUN_LDOMS)
+		unsigned long hv_err;
+		int limit = 100;
+
+		do {
+			hv_err = sun4v_cpu_stop(cpu);
+			if (hv_err == HV_EOK) {
+				cpu_clear(cpu, cpu_present_map);
+				break;
+			}
+		} while (--limit > 0);
+		if (limit <= 0) {
+			printk(KERN_ERR "sun4v_cpu_stop() fails err=%lu\n",
+			       hv_err);
+		}
+#endif
+	}
+}
+#endif
+
+void __init smp_cpus_done(unsigned int max_cpus)
+{
 }
 }
 
 
 void smp_send_reschedule(int cpu)
 void smp_send_reschedule(int cpu)

+ 2 - 14
arch/sparc64/kernel/sparc64_ksyms.c

@@ -1,7 +1,6 @@
-/* $Id: sparc64_ksyms.c,v 1.121 2002/02/09 19:49:31 davem Exp $
- * arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support.
+/* arch/sparc64/kernel/sparc64_ksyms.c: Sparc64 specific ksyms support.
  *
  *
- * Copyright (C) 1996 David S. Miller (davem@caip.rutgers.edu)
+ * Copyright (C) 1996, 2007 David S. Miller (davem@davemloft.net)
  * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
  * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
  * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
  * Copyright (C) 1999 Jakub Jelinek (jj@ultra.linux.cz)
  */
  */
@@ -28,7 +27,6 @@
 #include <net/compat.h>
 #include <net/compat.h>
 
 
 #include <asm/oplib.h>
 #include <asm/oplib.h>
-#include <asm/delay.h>
 #include <asm/system.h>
 #include <asm/system.h>
 #include <asm/auxio.h>
 #include <asm/auxio.h>
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
@@ -124,10 +122,6 @@ EXPORT_SYMBOL(__write_lock);
 EXPORT_SYMBOL(__write_unlock);
 EXPORT_SYMBOL(__write_unlock);
 EXPORT_SYMBOL(__write_trylock);
 EXPORT_SYMBOL(__write_trylock);
 
 
-/* CPU online map and active count.  */
-EXPORT_SYMBOL(cpu_online_map);
-EXPORT_SYMBOL(phys_cpu_present_map);
-
 EXPORT_SYMBOL(smp_call_function);
 EXPORT_SYMBOL(smp_call_function);
 #endif /* CONFIG_SMP */
 #endif /* CONFIG_SMP */
 
 
@@ -330,12 +324,6 @@ EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(strncmp);
 EXPORT_SYMBOL(strncmp);
 
 
-/* Delay routines. */
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__ndelay);
-EXPORT_SYMBOL(__const_udelay);
-EXPORT_SYMBOL(__delay);
-
 void VISenter(void);
 void VISenter(void);
 /* RAID code needs this */
 /* RAID code needs this */
 EXPORT_SYMBOL(VISenter);
 EXPORT_SYMBOL(VISenter);

+ 0 - 2
arch/sparc64/kernel/sysfs.c

@@ -193,7 +193,6 @@ static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
 }
 }
 
 
 SHOW_CPUDATA_ULONG_NAME(clock_tick, clock_tick);
 SHOW_CPUDATA_ULONG_NAME(clock_tick, clock_tick);
-SHOW_CPUDATA_ULONG_NAME(udelay_val, udelay_val);
 SHOW_CPUDATA_UINT_NAME(l1_dcache_size, dcache_size);
 SHOW_CPUDATA_UINT_NAME(l1_dcache_size, dcache_size);
 SHOW_CPUDATA_UINT_NAME(l1_dcache_line_size, dcache_line_size);
 SHOW_CPUDATA_UINT_NAME(l1_dcache_line_size, dcache_line_size);
 SHOW_CPUDATA_UINT_NAME(l1_icache_size, icache_size);
 SHOW_CPUDATA_UINT_NAME(l1_icache_size, icache_size);
@@ -203,7 +202,6 @@ SHOW_CPUDATA_UINT_NAME(l2_cache_line_size, ecache_line_size);
 
 
 static struct sysdev_attribute cpu_core_attrs[] = {
 static struct sysdev_attribute cpu_core_attrs[] = {
 	_SYSDEV_ATTR(clock_tick,          0444, show_clock_tick, NULL),
 	_SYSDEV_ATTR(clock_tick,          0444, show_clock_tick, NULL),
-	_SYSDEV_ATTR(udelay_val,          0444, show_udelay_val, NULL),
 	_SYSDEV_ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
 	_SYSDEV_ATTR(l1_dcache_size,      0444, show_l1_dcache_size, NULL),
 	_SYSDEV_ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
 	_SYSDEV_ATTR(l1_dcache_line_size, 0444, show_l1_dcache_line_size, NULL),
 	_SYSDEV_ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),
 	_SYSDEV_ATTR(l1_icache_size,      0444, show_l1_icache_size, NULL),

+ 21 - 7
arch/sparc64/kernel/time.c

@@ -849,9 +849,6 @@ static unsigned long sparc64_init_timers(void)
 {
 {
 	struct device_node *dp;
 	struct device_node *dp;
 	unsigned long clock;
 	unsigned long clock;
-#ifdef CONFIG_SMP
-	extern void smp_tick_init(void);
-#endif
 
 
 	dp = of_find_node_by_path("/");
 	dp = of_find_node_by_path("/");
 	if (tlb_type == spitfire) {
 	if (tlb_type == spitfire) {
@@ -874,10 +871,6 @@ static unsigned long sparc64_init_timers(void)
 		clock = of_getintprop_default(dp, "stick-frequency", 0);
 		clock = of_getintprop_default(dp, "stick-frequency", 0);
 	}
 	}
 
 
-#ifdef CONFIG_SMP
-	smp_tick_init();
-#endif
-
 	return clock;
 	return clock;
 }
 }
 
 
@@ -1038,10 +1031,31 @@ static void __init setup_clockevent_multiplier(unsigned long hz)
 	sparc64_clockevent.mult = mult;
 	sparc64_clockevent.mult = mult;
 }
 }
 
 
+static unsigned long tb_ticks_per_usec __read_mostly;
+
+void __delay(unsigned long loops)
+{
+	unsigned long bclock, now;
+
+	bclock = tick_ops->get_tick();
+	do {
+		now = tick_ops->get_tick();
+	} while ((now-bclock) < loops);
+}
+EXPORT_SYMBOL(__delay);
+
+void udelay(unsigned long usecs)
+{
+	__delay(tb_ticks_per_usec * usecs);
+}
+EXPORT_SYMBOL(udelay);
+
 void __init time_init(void)
 void __init time_init(void)
 {
 {
 	unsigned long clock = sparc64_init_timers();
 	unsigned long clock = sparc64_init_timers();
 
 
+	tb_ticks_per_usec = clock / USEC_PER_SEC;
+
 	timer_ticks_per_nsec_quotient =
 	timer_ticks_per_nsec_quotient =
 		clocksource_hz2mult(clock, SPARC64_NSEC_PER_CYC_SHIFT);
 		clocksource_hz2mult(clock, SPARC64_NSEC_PER_CYC_SHIFT);
 
 

+ 395 - 0
arch/sparc64/kernel/vio.c

@@ -0,0 +1,395 @@
+/* vio.c: Virtual I/O channel devices probing infrastructure.
+ *
+ *    Copyright (c) 2003-2005 IBM Corp.
+ *     Dave Engebretsen engebret@us.ibm.com
+ *     Santiago Leon santil@us.ibm.com
+ *     Hollis Blanchard <hollisb@us.ibm.com>
+ *     Stephen Rothwell
+ *
+ * Adapted to sparc64 by David S. Miller davem@davemloft.net
+ */
+
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+
+#include <asm/mdesc.h>
+#include <asm/vio.h>
+
+static inline int find_in_proplist(const char *list, const char *match,
+				   int len)
+{
+	while (len > 0) {
+		int l;
+
+		if (!strcmp(list, match))
+			return 1;
+		l = strlen(list) + 1;
+		list += l;
+		len -= l;
+	}
+	return 0;
+}
+
+static const struct vio_device_id *vio_match_device(
+	const struct vio_device_id *matches,
+	const struct vio_dev *dev)
+{
+	const char *type, *compat;
+	int len;
+
+	type = dev->type;
+	compat = dev->compat;
+	len = dev->compat_len;
+
+	while (matches->type[0] || matches->compat[0]) {
+		int match = 1;
+		if (matches->type[0])
+			match &= !strcmp(matches->type, type);
+
+		if (matches->compat[0]) {
+			match &= len &&
+				find_in_proplist(compat, matches->compat, len);
+		}
+		if (match)
+			return matches;
+		matches++;
+	}
+	return NULL;
+}
+
+static int vio_bus_match(struct device *dev, struct device_driver *drv)
+{
+	struct vio_dev *vio_dev = to_vio_dev(dev);
+	struct vio_driver *vio_drv = to_vio_driver(drv);
+	const struct vio_device_id *matches = vio_drv->id_table;
+
+	if (!matches)
+		return 0;
+
+	return vio_match_device(matches, vio_dev) != NULL;
+}
+
+static int vio_device_probe(struct device *dev)
+{
+	struct vio_dev *vdev = to_vio_dev(dev);
+	struct vio_driver *drv = to_vio_driver(dev->driver);
+	const struct vio_device_id *id;
+	int error = -ENODEV;
+
+	if (drv->probe) {
+		id = vio_match_device(drv->id_table, vdev);
+		if (id)
+			error = drv->probe(vdev, id);
+	}
+
+	return error;
+}
+
+static int vio_device_remove(struct device *dev)
+{
+	struct vio_dev *vdev = to_vio_dev(dev);
+	struct vio_driver *drv = to_vio_driver(dev->driver);
+
+	if (drv->remove)
+		return drv->remove(vdev);
+
+	return 1;
+}
+
+static ssize_t devspec_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *vdev = to_vio_dev(dev);
+	const char *str = "none";
+
+	if (!strcmp(vdev->type, "network"))
+		str = "vnet";
+	else if (!strcmp(vdev->type, "block"))
+		str = "vdisk";
+
+	return sprintf(buf, "%s\n", str);
+}
+
+static ssize_t type_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct vio_dev *vdev = to_vio_dev(dev);
+	return sprintf(buf, "%s\n", vdev->type);
+}
+
+static struct device_attribute vio_dev_attrs[] = {
+	__ATTR_RO(devspec),
+	__ATTR_RO(type),
+	__ATTR_NULL
+};
+
+static struct bus_type vio_bus_type = {
+	.name		= "vio",
+	.dev_attrs	= vio_dev_attrs,
+	.match		= vio_bus_match,
+	.probe		= vio_device_probe,
+	.remove		= vio_device_remove,
+};
+
+int vio_register_driver(struct vio_driver *viodrv)
+{
+	viodrv->driver.bus = &vio_bus_type;
+
+	return driver_register(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_register_driver);
+
+void vio_unregister_driver(struct vio_driver *viodrv)
+{
+	driver_unregister(&viodrv->driver);
+}
+EXPORT_SYMBOL(vio_unregister_driver);
+
+static void __devinit vio_dev_release(struct device *dev)
+{
+	kfree(to_vio_dev(dev));
+}
+
+static ssize_t
+show_pciobppath_attr(struct device *dev, struct device_attribute *attr,
+		     char *buf)
+{
+	struct vio_dev *vdev;
+	struct device_node *dp;
+
+	vdev = to_vio_dev(dev);
+	dp = vdev->dp;
+
+	return snprintf (buf, PAGE_SIZE, "%s\n", dp->full_name);
+}
+
+static DEVICE_ATTR(obppath, S_IRUSR | S_IRGRP | S_IROTH,
+		   show_pciobppath_attr, NULL);
+
+struct device_node *cdev_node;
+
+static struct vio_dev *root_vdev;
+static u64 cdev_cfg_handle;
+
+static void vio_fill_channel_info(struct mdesc_handle *hp, u64 mp,
+				  struct vio_dev *vdev)
+{
+	u64 a;
+
+	mdesc_for_each_arc(a, hp, mp, MDESC_ARC_TYPE_FWD) {
+		const u64 *chan_id;
+		const u64 *irq;
+		u64 target;
+
+		target = mdesc_arc_target(hp, a);
+
+		irq = mdesc_get_property(hp, target, "tx-ino", NULL);
+		if (irq)
+			vdev->tx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
+
+		irq = mdesc_get_property(hp, target, "rx-ino", NULL);
+		if (irq)
+			vdev->rx_irq = sun4v_build_virq(cdev_cfg_handle, *irq);
+
+		chan_id = mdesc_get_property(hp, target, "id", NULL);
+		if (chan_id)
+			vdev->channel_id = *chan_id;
+	}
+}
+
+static struct vio_dev *vio_create_one(struct mdesc_handle *hp, u64 mp,
+				      struct device *parent)
+{
+	const char *type, *compat;
+	struct device_node *dp;
+	struct vio_dev *vdev;
+	int err, tlen, clen;
+
+	type = mdesc_get_property(hp, mp, "device-type", &tlen);
+	if (!type) {
+		type = mdesc_get_property(hp, mp, "name", &tlen);
+		if (!type) {
+			type = mdesc_node_name(hp, mp);
+			tlen = strlen(type) + 1;
+		}
+	}
+	if (tlen > VIO_MAX_TYPE_LEN) {
+		printk(KERN_ERR "VIO: Type string [%s] is too long.\n",
+		       type);
+		return NULL;
+	}
+
+	compat = mdesc_get_property(hp, mp, "device-type", &clen);
+	if (!compat) {
+		clen = 0;
+	} else if (clen > VIO_MAX_COMPAT_LEN) {
+		printk(KERN_ERR "VIO: Compat len %d for [%s] is too long.\n",
+		       clen, type);
+		return NULL;
+	}
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev) {
+		printk(KERN_ERR "VIO: Could not allocate vio_dev\n");
+		return NULL;
+	}
+
+	vdev->mp = mp;
+	memcpy(vdev->type, type, tlen);
+	if (compat)
+		memcpy(vdev->compat, compat, clen);
+	else
+		memset(vdev->compat, 0, sizeof(vdev->compat));
+	vdev->compat_len = clen;
+
+	vdev->channel_id = ~0UL;
+	vdev->tx_irq = ~0;
+	vdev->rx_irq = ~0;
+
+	vio_fill_channel_info(hp, mp, vdev);
+
+	snprintf(vdev->dev.bus_id, BUS_ID_SIZE, "%lx", mp);
+	vdev->dev.parent = parent;
+	vdev->dev.bus = &vio_bus_type;
+	vdev->dev.release = vio_dev_release;
+
+	if (parent == NULL) {
+		dp = cdev_node;
+	} else if (to_vio_dev(parent) == root_vdev) {
+		dp = of_get_next_child(cdev_node, NULL);
+		while (dp) {
+			if (!strcmp(dp->type, type))
+				break;
+
+			dp = of_get_next_child(cdev_node, dp);
+		}
+	} else {
+		dp = to_vio_dev(parent)->dp;
+	}
+	vdev->dp = dp;
+
+	err = device_register(&vdev->dev);
+	if (err) {
+		printk(KERN_ERR "VIO: Could not register device %s, err=%d\n",
+		       vdev->dev.bus_id, err);
+		kfree(vdev);
+		return NULL;
+	}
+	if (vdev->dp)
+		err = sysfs_create_file(&vdev->dev.kobj,
+					&dev_attr_obppath.attr);
+
+	return vdev;
+}
+
+static void walk_tree(struct mdesc_handle *hp, u64 n, struct vio_dev *parent)
+{
+	u64 a;
+
+	mdesc_for_each_arc(a, hp, n, MDESC_ARC_TYPE_FWD) {
+		struct vio_dev *vdev;
+		u64 target;
+
+		target = mdesc_arc_target(hp, a);
+		vdev = vio_create_one(hp, target, &parent->dev);
+		if (vdev)
+			walk_tree(hp, target, vdev);
+	}
+}
+
+static void create_devices(struct mdesc_handle *hp, u64 root)
+{
+	u64 mp;
+
+	root_vdev = vio_create_one(hp, root, NULL);
+	if (!root_vdev) {
+		printk(KERN_ERR "VIO: Coult not create root device.\n");
+		return;
+	}
+
+	walk_tree(hp, root, root_vdev);
+
+	/* Domain services is odd as it doesn't sit underneath the
+	 * channel-devices node, so we plug it in manually.
+	 */
+	mp = mdesc_node_by_name(hp, MDESC_NODE_NULL, "domain-services");
+	if (mp != MDESC_NODE_NULL) {
+		struct vio_dev *parent = vio_create_one(hp, mp,
+							&root_vdev->dev);
+
+		if (parent)
+			walk_tree(hp, mp, parent);
+	}
+}
+
+const char *channel_devices_node = "channel-devices";
+const char *channel_devices_compat = "SUNW,sun4v-channel-devices";
+const char *cfg_handle_prop = "cfg-handle";
+
+static int __init vio_init(void)
+{
+	struct mdesc_handle *hp;
+	const char *compat;
+	const u64 *cfg_handle;
+	int err, len;
+	u64 root;
+
+	err = bus_register(&vio_bus_type);
+	if (err) {
+		printk(KERN_ERR "VIO: Could not register bus type err=%d\n",
+		       err);
+		return err;
+	}
+
+	hp = mdesc_grab();
+	if (!hp)
+		return 0;
+
+	root = mdesc_node_by_name(hp, MDESC_NODE_NULL, channel_devices_node);
+	if (root == MDESC_NODE_NULL) {
+		printk(KERN_INFO "VIO: No channel-devices MDESC node.\n");
+		mdesc_release(hp);
+		return 0;
+	}
+
+	cdev_node = of_find_node_by_name(NULL, "channel-devices");
+	err = -ENODEV;
+	if (!cdev_node) {
+		printk(KERN_INFO "VIO: No channel-devices OBP node.\n");
+		goto out_release;
+	}
+
+	compat = mdesc_get_property(hp, root, "compatible", &len);
+	if (!compat) {
+		printk(KERN_ERR "VIO: Channel devices lacks compatible "
+		       "property\n");
+		goto out_release;
+	}
+	if (!find_in_proplist(compat, channel_devices_compat, len)) {
+		printk(KERN_ERR "VIO: Channel devices node lacks (%s) "
+		       "compat entry.\n", channel_devices_compat);
+		goto out_release;
+	}
+
+	cfg_handle = mdesc_get_property(hp, root, cfg_handle_prop, NULL);
+	if (!cfg_handle) {
+		printk(KERN_ERR "VIO: Channel devices lacks %s property\n",
+		       cfg_handle_prop);
+		goto out_release;
+	}
+
+	cdev_cfg_handle = *cfg_handle;
+
+	create_devices(hp, root);
+
+	mdesc_release(hp);
+
+	return 0;
+
+out_release:
+	mdesc_release(hp);
+	return err;
+}
+
+postcore_initcall(vio_init);

+ 792 - 0
arch/sparc64/kernel/viohs.c

@@ -0,0 +1,792 @@
+/* viohs.c: LDOM Virtual I/O handshake helper layer.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/ldc.h>
+#include <asm/vio.h>
+
+/* Write @len bytes at @data to the LDC channel, retrying while busy.
+ *
+ * ldc_write() is attempted at most 1000 times, pausing 1us after each
+ * -EAGAIN.  The result of the last ldc_write() call is returned, so
+ * -EAGAIN comes back if the channel never accepted the message.
+ */
+int vio_ldc_send(struct vio_driver_state *vio, void *data, int len)
+{
+	int ret = -EINVAL;
+	int tries;
+
+	for (tries = 1000; tries > 0; tries--) {
+		ret = ldc_write(vio->lp, data, len);
+		if (ret != -EAGAIN)
+			break;
+		udelay(1);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(vio_ldc_send);
+
+/* Stamp the outgoing session id into @tag, then transmit @len bytes
+ * starting at the tag.  Returns the vio_ldc_send() result.
+ */
+static int send_ctrl(struct vio_driver_state *vio,
+		     struct vio_msg_tag *tag, int len)
+{
+	u32 sid = vio_send_sid(vio);
+
+	tag->sid = sid;
+
+	return vio_ldc_send(vio, tag, len);
+}
+
+/* Fill in the type, subtype and subtype-envelope fields of @tag.
+ * The sid field is left alone; send_ctrl() sets it.
+ */
+static void init_tag(struct vio_msg_tag *tag, u8 type, u8 stype, u16 stype_env)
+{
+	tag->stype_env = stype_env;
+	tag->stype = stype;
+	tag->type = type;
+}
+
+/* Send a VERSION INFO control packet proposing @major.@minor.
+ *
+ * A fresh local session id is taken from sched_clock() before the
+ * packet is built.  Returns the send_ctrl() result.
+ */
+static int send_version(struct vio_driver_state *vio, u16 major, u16 minor)
+{
+	struct vio_ver_info info;
+
+	vio->_local_sid = (u32) sched_clock();
+
+	memset(&info, 0, sizeof(info));
+	init_tag(&info.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_VER_INFO);
+	info.dev_class = vio->dev_class;
+	info.major = major;
+	info.minor = minor;
+
+	viodbg(HS, "SEND VERSION INFO maj[%u] min[%u] devclass[%u]\n",
+	       major, minor, vio->dev_class);
+
+	return send_ctrl(vio, &info.tag, sizeof(info));
+}
+
+/* Reset the handshake state and propose the first version table entry.
+ *
+ * Returns 0 on success or the negative error from send_version().
+ */
+static int start_handshake(struct vio_driver_state *vio)
+{
+	int ret;
+
+	viodbg(HS, "START HANDSHAKE\n");
+
+	vio->hs_state = VIO_HS_INVALID;
+
+	ret = send_version(vio,
+			   vio->ver_table[0].major,
+			   vio->ver_table[0].minor);
+
+	/* A successful send reports a positive value; callers see 0. */
+	return (ret < 0) ? ret : 0;
+}
+
+/* React to an LDC link event.
+ *
+ * Only LDC_EVENT_UP is acted on: the handshake state is reset, the
+ * set of descriptor rings this device class must register is
+ * recorded in dr_state, and a new handshake is started.  Other device
+ * classes leave dr_state untouched.
+ */
+void vio_link_state_change(struct vio_driver_state *vio, int event)
+{
+	if (event != LDC_EVENT_UP)
+		return;
+
+	vio->hs_state = VIO_HS_INVALID;
+
+	switch (vio->dev_class) {
+	case VDEV_NETWORK:
+	case VDEV_NETWORK_SWITCH:
+		vio->dr_state = (VIO_DR_STATE_TXREQ |
+				 VIO_DR_STATE_RXREQ);
+		break;
+
+	case VDEV_DISK:
+		vio->dr_state = VIO_DR_STATE_TXREQ;
+		break;
+
+	case VDEV_DISK_SERVER:
+		vio->dr_state = VIO_DR_STATE_RXREQ;
+		break;
+	}
+
+	start_handshake(vio);
+}
+EXPORT_SYMBOL(vio_link_state_change);
+
+/* Tear down handshake state after a protocol or transmit failure.
+ *
+ * Clears the "ring registered" bits, zeroes the RX ring state, frees
+ * the descriptor buffer and marks the handshake invalid.  Always
+ * returns -ECONNRESET so callers can "return handshake_failure(vio)".
+ */
+static int handshake_failure(struct vio_driver_state *vio)
+{
+	struct vio_dring_state *rx_ring = &vio->drings[VIO_DRIVER_RX_RING];
+
+	/* XXX Put policy here...  Perhaps start a timer to fire
+	 * XXX in 100 ms, which will bring the link up and retry
+	 * XXX the handshake.
+	 */
+
+	viodbg(HS, "HANDSHAKE FAILURE\n");
+
+	vio->dr_state &= ~(VIO_DR_STATE_TXREG | VIO_DR_STATE_RXREG);
+
+	memset(rx_ring, 0, sizeof(*rx_ring));
+
+	kfree(vio->desc_buf);
+	vio->desc_buf_len = 0;
+	vio->desc_buf = NULL;
+
+	vio->hs_state = VIO_HS_INVALID;
+
+	return -ECONNRESET;
+}
+
+/* Handle a control packet with an unrecognised subtype envelope:
+ * log it, disconnect the LDC channel and report -ECONNRESET.
+ */
+static int process_unknown(struct vio_driver_state *vio, void *arg)
+{
+	struct vio_msg_tag *tag = arg;
+
+	viodbg(HS, "UNKNOWN CONTROL [%02x:%02x:%04x:%08x]\n",
+	       tag->type, tag->stype, tag->stype_env, tag->sid);
+
+	printk(KERN_ERR "vio: ID[%lu] Resetting connection.\n",
+	       vio->vdev->channel_id);
+
+	ldc_disconnect(vio->lp);
+
+	return -ECONNRESET;
+}
+
+/* Send a DRING_REG INFO control packet describing the TX descriptor ring.
+ *
+ * The packet carries dr->ncookies trailing cookies, so the on-stack
+ * buffer is a union sized at run time to hold the fixed header plus
+ * that many struct ldc_trans_cookie entries.  Returns the send_ctrl()
+ * result.
+ */
+static int send_dreg(struct vio_driver_state *vio)
+{
+	struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_TX_RING];
+	union {
+		struct vio_dring_register pkt;
+		char all[sizeof(struct vio_dring_register) +
+			 (sizeof(struct ldc_trans_cookie) *
+			  dr->ncookies)];
+	} u;
+	int i;
+
+	memset(&u, 0, sizeof(u));
+	init_tag(&u.pkt.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_DRING_REG);
+	/* The ident is sent as 0; process_dreg_ack() stores the value
+	 * the peer returns.
+	 */
+	u.pkt.dring_ident = 0;
+	u.pkt.num_descr = dr->num_entries;
+	u.pkt.descr_size = dr->entry_size;
+	u.pkt.options = VIO_TX_DRING;
+	u.pkt.num_cookies = dr->ncookies;
+
+	viodbg(HS, "SEND DRING_REG INFO ndesc[%u] dsz[%u] opt[0x%x] "
+	       "ncookies[%u]\n",
+	       u.pkt.num_descr, u.pkt.descr_size, u.pkt.options,
+	       u.pkt.num_cookies);
+
+	/* Copy the ring's cookies into the space following the header. */
+	for (i = 0; i < dr->ncookies; i++) {
+		u.pkt.cookies[i] = dr->cookies[i];
+
+		viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n",
+		       i,
+		       (unsigned long long) u.pkt.cookies[i].cookie_addr,
+		       (unsigned long long) u.pkt.cookies[i].cookie_size);
+	}
+
+	/* sizeof(u) covers the header plus all cookies. */
+	return send_ctrl(vio, &u.pkt.tag, sizeof(u));
+}
+
+/* Send an RDX INFO control packet.  Returns the send_ctrl() result. */
+static int send_rdx(struct vio_driver_state *vio)
+{
+	struct vio_rdx rdx;
+
+	memset(&rdx, 0, sizeof(rdx));
+	init_tag(&rdx.tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX);
+
+	viodbg(HS, "SEND RDX INFO\n");
+
+	return send_ctrl(vio, &rdx.tag, sizeof(rdx));
+}
+
+/* Ask the driver to send its attribute packet via ops->send_attr(). */
+static int send_attr(struct vio_driver_state *vio)
+{
+	int ret = vio->ops->send_attr(vio);
+
+	return ret;
+}
+
+/* Return the first version table entry whose major number does not
+ * exceed @major, or NULL when every entry has a larger major.
+ */
+static struct vio_version *find_by_major(struct vio_driver_state *vio,
+					 u16 major)
+{
+	int idx;
+
+	for (idx = 0; idx < vio->ver_table_entries; idx++) {
+		struct vio_version *cand = &vio->ver_table[idx];
+
+		if (cand->major <= major)
+			return cand;
+	}
+
+	return NULL;
+}
+
+/* Handle a VERSION INFO packet from the peer.
+ *
+ * The packet is rewritten in place and sent back as:
+ *  - NACK 0.0 when no table entry has a major <= the peer's,
+ *  - NACK with our closest entry when the majors differ,
+ *  - ACK with the minor clamped to ours when the majors match.
+ * On a successfully sent ACK the agreed version is recorded and the
+ * state becomes VIO_HS_GOTVERS.  Returns 0, or the handshake_failure()
+ * result when the reply could not be sent.
+ */
+static int process_ver_info(struct vio_driver_state *vio,
+			    struct vio_ver_info *pkt)
+{
+	struct vio_version agreed = { .major = 0, .minor = 0 };
+	struct vio_version *match;
+	int acked = 0;
+	int err;
+
+	viodbg(HS, "GOT VERSION INFO maj[%u] min[%u] devclass[%u]\n",
+	       pkt->major, pkt->minor, pkt->dev_class);
+
+	if (vio->hs_state != VIO_HS_INVALID) {
+		/* XXX Perhaps invoke start_handshake? XXX */
+		memset(&vio->ver, 0, sizeof(vio->ver));
+		vio->hs_state = VIO_HS_INVALID;
+	}
+
+	match = find_by_major(vio, pkt->major);
+
+	vio->_peer_sid = pkt->tag.sid;
+
+	if (match && match->major == pkt->major) {
+		agreed.major = pkt->major;
+		agreed.minor = pkt->minor;
+		if (agreed.minor > match->minor)
+			agreed.minor = match->minor;
+		pkt->minor = agreed.minor;
+		pkt->tag.stype = VIO_SUBTYPE_ACK;
+		acked = 1;
+		viodbg(HS, "SEND VERSION ACK maj[%u] min[%u]\n",
+		       pkt->major, pkt->minor);
+	} else if (match) {
+		pkt->tag.stype = VIO_SUBTYPE_NACK;
+		pkt->major = match->major;
+		pkt->minor = match->minor;
+		viodbg(HS, "SEND VERSION NACK maj[%u] min[%u]\n",
+		       pkt->major, pkt->minor);
+	} else {
+		pkt->tag.stype = VIO_SUBTYPE_NACK;
+		pkt->major = 0;
+		pkt->minor = 0;
+		viodbg(HS, "SEND VERSION NACK maj[0] min[0]\n");
+	}
+
+	err = send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+	if (err < 0)
+		return handshake_failure(vio);
+
+	if (acked && err > 0) {
+		vio->ver = agreed;
+		vio->hs_state = VIO_HS_GOTVERS;
+	}
+
+	return 0;
+}
+
+/* Handle a VERSION ACK from the peer.
+ *
+ * If a version was already agreed, the ACK must match it, otherwise a
+ * NACK is sent and the handshake fails.  If not, the acknowledged
+ * version is adopted.  Network and disk clients then send their
+ * attribute packet.
+ */
+static int process_ver_ack(struct vio_driver_state *vio,
+			   struct vio_ver_info *pkt)
+{
+	viodbg(HS, "GOT VERSION ACK maj[%u] min[%u] devclass[%u]\n",
+	       pkt->major, pkt->minor, pkt->dev_class);
+
+	if (!(vio->hs_state & VIO_HS_GOTVERS)) {
+		vio->ver.major = pkt->major;
+		vio->ver.minor = pkt->minor;
+		vio->hs_state = VIO_HS_GOTVERS;
+	} else if (vio->ver.major != pkt->major ||
+		   vio->ver.minor != pkt->minor) {
+		pkt->tag.stype = VIO_SUBTYPE_NACK;
+		(void) send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+		return handshake_failure(vio);
+	}
+
+	if (vio->dev_class == VDEV_NETWORK ||
+	    vio->dev_class == VDEV_DISK) {
+		if (send_attr(vio) < 0)
+			return handshake_failure(vio);
+	}
+
+	return 0;
+}
+
+/* Handle a VERSION NACK from the peer.
+ *
+ * A 0.0 NACK, or a suggested major we have no entry for, fails the
+ * handshake.  Otherwise the matching table entry is proposed in a new
+ * VERSION INFO.
+ */
+static int process_ver_nack(struct vio_driver_state *vio,
+			    struct vio_ver_info *pkt)
+{
+	struct vio_version *retry;
+
+	viodbg(HS, "GOT VERSION NACK maj[%u] min[%u] devclass[%u]\n",
+	       pkt->major, pkt->minor, pkt->dev_class);
+
+	if (pkt->major == 0 && pkt->minor == 0)
+		return handshake_failure(vio);
+
+	retry = find_by_major(vio, pkt->major);
+	if (!retry)
+		return handshake_failure(vio);
+
+	if (send_version(vio, retry->major, retry->minor) < 0)
+		return handshake_failure(vio);
+
+	return 0;
+}
+
+/* Dispatch a VERSION packet on its subtype; unknown subtypes fail
+ * the handshake.
+ */
+static int process_ver(struct vio_driver_state *vio, struct vio_ver_info *pkt)
+{
+	if (pkt->tag.stype == VIO_SUBTYPE_INFO)
+		return process_ver_info(vio, pkt);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_ACK)
+		return process_ver_ack(vio, pkt);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_NACK)
+		return process_ver_nack(vio, pkt);
+
+	return handshake_failure(vio);
+}
+
+/* Handle an ATTR packet.
+ *
+ * Requires a negotiated version.  The driver's handle_attr() hook
+ * validates the packet; on success VIO_HS_GOT_ATTR is set and, if this
+ * side must register a TX ring and has not yet done so, DRING_REG is
+ * sent.
+ */
+static int process_attr(struct vio_driver_state *vio, void *pkt)
+{
+	if (!(vio->hs_state & VIO_HS_GOTVERS))
+		return handshake_failure(vio);
+
+	if (vio->ops->handle_attr(vio, pkt) < 0)
+		return handshake_failure(vio);
+
+	vio->hs_state |= VIO_HS_GOT_ATTR;
+
+	if (!(vio->dr_state & VIO_DR_STATE_TXREQ))
+		return 0;
+	if (vio->hs_state & VIO_HS_SENT_DREG)
+		return 0;
+
+	if (send_dreg(vio) < 0)
+		return handshake_failure(vio);
+
+	vio->hs_state |= VIO_HS_SENT_DREG;
+
+	return 0;
+}
+
+/* Return 1 when every descriptor ring flagged as required (xxREQ) in
+ * dr_state is also flagged as registered (xxREG), else 0.
+ */
+static int all_drings_registered(struct vio_driver_state *vio)
+{
+	int rx_missing, tx_missing;
+
+	rx_missing = (vio->dr_state & VIO_DR_STATE_RXREQ) &&
+		     !(vio->dr_state & VIO_DR_STATE_RXREG);
+	tx_missing = (vio->dr_state & VIO_DR_STATE_TXREQ) &&
+		     !(vio->dr_state & VIO_DR_STATE_TXREG);
+
+	return (rx_missing || tx_missing) ? 0 : 1;
+}
+
+/* Handle a DRING_REG INFO packet: the peer registers its ring as our
+ * RX ring.
+ *
+ * The request is NACKed, and the handshake failed, when no RX ring is
+ * required, one is already registered, the peer advertises more
+ * cookies than the ring state can hold, the descriptor buffer cannot
+ * be allocated, or the ACK cannot be sent.  On success the ring
+ * parameters and cookies are recorded, the packet is returned as an
+ * ACK carrying a new ring ident, and VIO_DR_STATE_RXREG is set.
+ */
+static int process_dreg_info(struct vio_driver_state *vio,
+			     struct vio_dring_register *pkt)
+{
+	struct vio_dring_state *dr;
+	int i, len;
+
+	viodbg(HS, "GOT DRING_REG INFO ident[%llx] "
+	       "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+	       (unsigned long long) pkt->dring_ident,
+	       pkt->num_descr, pkt->descr_size, pkt->options,
+	       pkt->num_cookies);
+
+	if (!(vio->dr_state & VIO_DR_STATE_RXREQ))
+		goto send_nack;
+
+	if (vio->dr_state & VIO_DR_STATE_RXREG)
+		goto send_nack;
+
+	dr = &vio->drings[VIO_DRIVER_RX_RING];
+
+	/* num_cookies comes from the peer and bounds the copy into
+	 * dr->cookies[] below, so reject counts the array cannot hold.
+	 */
+	if (pkt->num_cookies > ARRAY_SIZE(dr->cookies))
+		goto send_nack;
+
+	vio->desc_buf = kzalloc(pkt->descr_size, GFP_ATOMIC);
+	if (!vio->desc_buf)
+		goto send_nack;
+
+	vio->desc_buf_len = pkt->descr_size;
+
+	dr->num_entries = pkt->num_descr;
+	dr->entry_size = pkt->descr_size;
+	dr->ncookies = pkt->num_cookies;
+	for (i = 0; i < dr->ncookies; i++) {
+		dr->cookies[i] = pkt->cookies[i];
+
+		viodbg(HS, "DRING COOKIE(%d) [%016llx:%016llx]\n",
+		       i,
+		       (unsigned long long)
+		       pkt->cookies[i].cookie_addr,
+		       (unsigned long long)
+		       pkt->cookies[i].cookie_size);
+	}
+
+	pkt->tag.stype = VIO_SUBTYPE_ACK;
+	pkt->dring_ident = ++dr->ident;
+
+	viodbg(HS, "SEND DRING_REG ACK ident[%llx]\n",
+	       (unsigned long long) pkt->dring_ident);
+
+	/* The ACK echoes the header plus every cookie. */
+	len = (sizeof(*pkt) +
+	       (dr->ncookies * sizeof(struct ldc_trans_cookie)));
+	if (send_ctrl(vio, &pkt->tag, len) < 0)
+		goto send_nack;
+
+	vio->dr_state |= VIO_DR_STATE_RXREG;
+
+	return 0;
+
+send_nack:
+	pkt->tag.stype = VIO_SUBTYPE_NACK;
+	viodbg(HS, "SEND DRING_REG NACK\n");
+	(void) send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+
+	/* handshake_failure() frees desc_buf and clears the RX ring. */
+	return handshake_failure(vio);
+}
+
+/* Handle a DRING_REG ACK: the peer accepted our TX ring.
+ *
+ * Fails the handshake if no TX ring was required.  Otherwise the
+ * peer-assigned ident is stored, the ring is marked registered and,
+ * once every required ring is registered, RDX is sent.
+ */
+static int process_dreg_ack(struct vio_driver_state *vio,
+			    struct vio_dring_register *pkt)
+{
+	struct vio_dring_state *tx_ring = &vio->drings[VIO_DRIVER_TX_RING];
+
+	viodbg(HS, "GOT DRING_REG ACK ident[%llx] "
+	       "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+	       (unsigned long long) pkt->dring_ident,
+	       pkt->num_descr, pkt->descr_size, pkt->options,
+	       pkt->num_cookies);
+
+	if (!(vio->dr_state & VIO_DR_STATE_TXREQ))
+		return handshake_failure(vio);
+
+	tx_ring->ident = pkt->dring_ident;
+	vio->dr_state |= VIO_DR_STATE_TXREG;
+
+	if (!all_drings_registered(vio))
+		return 0;
+
+	if (send_rdx(vio) < 0)
+		return handshake_failure(vio);
+
+	vio->hs_state = VIO_HS_SENT_RDX;
+
+	return 0;
+}
+
+/* Handle a DRING_REG NACK: the peer rejected our ring, so log the
+ * packet and fail the handshake.
+ */
+static int process_dreg_nack(struct vio_driver_state *vio,
+			     struct vio_dring_register *pkt)
+{
+	viodbg(HS, "GOT DRING_REG NACK ident[%llx] "
+	       "ndesc[%u] dsz[%u] opt[0x%x] ncookies[%u]\n",
+	       (unsigned long long) pkt->dring_ident,
+	       pkt->num_descr, pkt->descr_size, pkt->options,
+	       pkt->num_cookies);
+
+	return handshake_failure(vio);
+}
+
+/* Dispatch a DRING_REG packet on its subtype.  A version must already
+ * be negotiated; unknown subtypes fail the handshake.
+ */
+static int process_dreg(struct vio_driver_state *vio,
+			struct vio_dring_register *pkt)
+{
+	if (!(vio->hs_state & VIO_HS_GOTVERS))
+		return handshake_failure(vio);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_INFO)
+		return process_dreg_info(vio, pkt);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_ACK)
+		return process_dreg_ack(vio, pkt);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_NACK)
+		return process_dreg_nack(vio, pkt);
+
+	return handshake_failure(vio);
+}
+
+/* Handle a DRING_UNREG packet.
+ *
+ * When the ident names our RX ring, the ring is marked unregistered,
+ * its state is zeroed and the descriptor buffer is freed.  Packets
+ * naming any other ident are ignored.  Always returns 0.
+ */
+static int process_dunreg(struct vio_driver_state *vio,
+			  struct vio_dring_unregister *pkt)
+{
+	struct vio_dring_state *rx_ring = &vio->drings[VIO_DRIVER_RX_RING];
+
+	viodbg(HS, "GOT DRING_UNREG\n");
+
+	if (pkt->dring_ident == rx_ring->ident) {
+		vio->dr_state &= ~VIO_DR_STATE_RXREG;
+
+		memset(rx_ring, 0, sizeof(*rx_ring));
+
+		kfree(vio->desc_buf);
+		vio->desc_buf_len = 0;
+		vio->desc_buf = NULL;
+	}
+
+	return 0;
+}
+
+/* Handle an RDX INFO: return the packet as an ACK and record that in
+ * the handshake state.  A send failure fails the handshake.
+ */
+static int process_rdx_info(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+	int err;
+
+	viodbg(HS, "GOT RDX INFO\n");
+
+	pkt->tag.stype = VIO_SUBTYPE_ACK;
+	viodbg(HS, "SEND RDX ACK\n");
+
+	err = send_ctrl(vio, &pkt->tag, sizeof(*pkt));
+	if (err < 0)
+		return handshake_failure(vio);
+
+	vio->hs_state |= VIO_HS_SENT_RDX_ACK;
+
+	return 0;
+}
+
+/* Handle an RDX ACK.  It is only valid after we sent RDX; otherwise
+ * the handshake fails.
+ */
+static int process_rdx_ack(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+	viodbg(HS, "GOT RDX ACK\n");
+
+	if (vio->hs_state & VIO_HS_SENT_RDX) {
+		vio->hs_state |= VIO_HS_GOT_RDX_ACK;
+		return 0;
+	}
+
+	return handshake_failure(vio);
+}
+
+/* Handle an RDX NACK: the peer refused, so fail the handshake. */
+static int process_rdx_nack(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+	viodbg(HS, "GOT RDX NACK\n");
+
+	return handshake_failure(vio);
+}
+
+/* Dispatch an RDX packet on its subtype.
+ *
+ * RDX is only legal once every required descriptor ring has been
+ * registered.  If that is not the case the handshake is failed and
+ * the packet is not processed further; previously the failure result
+ * was dropped and the packet was still handled against the state that
+ * handshake_failure() had just reset.
+ */
+static int process_rdx(struct vio_driver_state *vio, struct vio_rdx *pkt)
+{
+	if (!all_drings_registered(vio))
+		return handshake_failure(vio);
+
+	switch (pkt->tag.stype) {
+	case VIO_SUBTYPE_INFO:
+		return process_rdx_info(vio, pkt);
+
+	case VIO_SUBTYPE_ACK:
+		return process_rdx_ack(vio, pkt);
+
+	case VIO_SUBTYPE_NACK:
+		return process_rdx_nack(vio, pkt);
+
+	default:
+		return handshake_failure(vio);
+	}
+}
+
+/* Run one received control packet through the handshake state machine.
+ *
+ * The packet is dispatched on its subtype envelope.  If the handler
+ * succeeds and the handshake state changed to one that includes
+ * VIO_HS_COMPLETE, the driver's handshake_complete() hook is called.
+ * Returns the handler's result.
+ */
+int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt)
+{
+	struct vio_msg_tag *hdr = pkt;
+	u8 old_state = vio->hs_state;
+	int err;
+
+	switch (hdr->stype_env) {
+	case VIO_VER_INFO:
+		err = process_ver(vio, pkt);
+		break;
+
+	case VIO_ATTR_INFO:
+		err = process_attr(vio, pkt);
+		break;
+
+	case VIO_DRING_REG:
+		err = process_dreg(vio, pkt);
+		break;
+
+	case VIO_DRING_UNREG:
+		err = process_dunreg(vio, pkt);
+		break;
+
+	case VIO_RDX:
+		err = process_rdx(vio, pkt);
+		break;
+
+	default:
+		err = process_unknown(vio, pkt);
+		break;
+	}
+
+	if (err)
+		return err;
+
+	if (vio->hs_state != old_state &&
+	    (vio->hs_state & VIO_HS_COMPLETE))
+		vio->ops->handshake_complete(vio);
+
+	return 0;
+}
+EXPORT_SYMBOL(vio_control_pkt_engine);
+
+/* Connection-reset hook for VIO drivers.  Currently an empty
+ * placeholder: no state is touched.
+ */
+void vio_conn_reset(struct vio_driver_state *vio)
+{
+}
+EXPORT_SYMBOL(vio_conn_reset);
+
+/* The issue is that the Solaris virtual disk server just mirrors the
+ * SID values it gets from the client peer.  So we work around that
+ * here in vio_{validate,send}_sid() so that the drivers don't need
+ * to be aware of this crap.
+ */
+/* Check the session id of a received message tag.
+ *
+ * VERSION INFO control packets always pass, since they establish the
+ * SID.  Disk clients expect their own local SID to be echoed back;
+ * every other device class expects the peer's SID.  Returns 0 on a
+ * match and -EINVAL otherwise.
+ */
+int vio_validate_sid(struct vio_driver_state *vio, struct vio_msg_tag *tp)
+{
+	u32 expected;
+
+	/* Always let VERSION+INFO packets through unchecked, they
+	 * define the new SID.
+	 */
+	if (tp->type == VIO_TYPE_CTRL &&
+	    tp->stype == VIO_SUBTYPE_INFO &&
+	    tp->stype_env == VIO_VER_INFO)
+		return 0;
+
+	/* Ok, now figure out which SID to use.  */
+	if (vio->dev_class == VDEV_DISK)
+		expected = vio->_local_sid;
+	else
+		expected = vio->_peer_sid;
+
+	if (expected != tp->sid) {
+		viodbg(DATA, "BAD SID tag->sid[%08x] peer_sid[%08x] local_sid[%08x]\n",
+		       tp->sid, vio->_peer_sid, vio->_local_sid);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(vio_validate_sid);
+
+/* Return the session id to place in outgoing messages: the peer's
+ * SID for a disk server, our local SID for every other device class.
+ */
+u32 vio_send_sid(struct vio_driver_state *vio)
+{
+	if (vio->dev_class == VDEV_DISK_SERVER)
+		return vio->_peer_sid;
+
+	return vio->_local_sid;
+}
+EXPORT_SYMBOL(vio_send_sid);
+
+/* Allocate the LDC channel for a VIO device.
+ *
+ * @base_cfg is copied and its TX/RX interrupts are replaced with the
+ * device's own before ldc_alloc() is called for the device's channel
+ * id.  On success the channel is stored in vio->lp and 0 is returned;
+ * otherwise the error encoded in the ldc_alloc() result is returned.
+ */
+int vio_ldc_alloc(struct vio_driver_state *vio,
+		  struct ldc_channel_config *base_cfg,
+		  void *event_arg)
+{
+	struct ldc_channel_config cfg = *base_cfg;
+	struct vio_dev *vdev = vio->vdev;
+	struct ldc_channel *chan;
+
+	cfg.rx_irq = vdev->rx_irq;
+	cfg.tx_irq = vdev->tx_irq;
+
+	chan = ldc_alloc(vdev->channel_id, &cfg, event_arg);
+	if (IS_ERR(chan))
+		return PTR_ERR(chan);
+
+	vio->lp = chan;
+
+	return 0;
+}
+EXPORT_SYMBOL(vio_ldc_alloc);
+
+/* Release the LDC channel and the descriptor buffer, clearing the
+ * corresponding fields so they are not reused.
+ */
+void vio_ldc_free(struct vio_driver_state *vio)
+{
+	ldc_free(vio->lp);
+	vio->lp = NULL;
+
+	kfree(vio->desc_buf);
+	vio->desc_buf_len = 0;
+	vio->desc_buf = NULL;
+}
+EXPORT_SYMBOL(vio_ldc_free);
+
+/* Try to bring the port's LDC channel up.
+ *
+ * Under vio->lock: a channel still in LDC_STATE_INIT is bound first,
+ * then a connect is attempted.  If either step fails a warning is
+ * logged and the port timer is armed to retry roughly one second
+ * later.
+ */
+void vio_port_up(struct vio_driver_state *vio)
+{
+	unsigned long flags;
+	int err = 0;
+
+	spin_lock_irqsave(&vio->lock, flags);
+
+	if (ldc_state(vio->lp) == LDC_STATE_INIT) {
+		err = ldc_bind(vio->lp, vio->name);
+		if (err)
+			printk(KERN_WARNING "%s: Port %lu bind failed, "
+			       "err=%d\n",
+			       vio->name, vio->vdev->channel_id, err);
+	}
+
+	if (!err) {
+		err = ldc_connect(vio->lp);
+		if (err)
+			printk(KERN_WARNING "%s: Port %lu connect failed, "
+			       "err=%d\n",
+			       vio->name, vio->vdev->channel_id, err);
+	}
+
+	if (err)
+		mod_timer(&vio->timer, round_jiffies(jiffies + HZ));
+
+	spin_unlock_irqrestore(&vio->lock, flags);
+}
+EXPORT_SYMBOL(vio_port_up);
+
+/* Timer callback armed by vio_port_up(): @_arg is the driver state
+ * pointer passed to setup_timer(); retry bringing the port up.
+ */
+static void vio_port_timer(unsigned long _arg)
+{
+	vio_port_up((struct vio_driver_state *) _arg);
+}
+
+/* Initialise the common VIO driver state.
+ *
+ * @dev_class must be one of the network, network-switch, disk or
+ * disk-server classes.  @ops must be non-NULL and provide send_attr,
+ * handle_attr and handshake_complete.  @ver_table must hold at least
+ * one entry, because the handshake always proposes entry 0 first.
+ * @name must be non-NULL.
+ *
+ * Returns 0 on success or -EINVAL if any argument is rejected; the
+ * state is only modified on success.
+ */
+int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
+		    u8 dev_class, struct vio_version *ver_table,
+		    int ver_table_size, struct vio_driver_ops *ops,
+		    char *name)
+{
+	switch (dev_class) {
+	case VDEV_NETWORK:
+	case VDEV_NETWORK_SWITCH:
+	case VDEV_DISK:
+	case VDEV_DISK_SERVER:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!ops ||
+	    !ops->send_attr ||
+	    !ops->handle_attr ||
+	    !ops->handshake_complete)
+		return -EINVAL;
+
+	/* An empty table would make start_handshake() read past it. */
+	if (!ver_table || ver_table_size <= 0)
+		return -EINVAL;
+
+	if (!name)
+		return -EINVAL;
+
+	spin_lock_init(&vio->lock);
+
+	vio->name = name;
+
+	vio->dev_class = dev_class;
+	vio->vdev = vdev;
+
+	vio->ver_table = ver_table;
+	vio->ver_table_entries = ver_table_size;
+
+	vio->ops = ops;
+
+	/* The timer re-runs vio_port_up() after a failed bind/connect. */
+	setup_timer(&vio->timer, vio_port_timer, (unsigned long) vio);
+
+	return 0;
+}
+EXPORT_SYMBOL(vio_driver_init);

+ 1 - 1
arch/sparc64/lib/Makefile

@@ -14,6 +14,6 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \
 	 NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \
 	 NGpage.o NGbzero.o \
 	 NGpage.o NGbzero.o \
 	 copy_in_user.o user_fixup.o memmove.o \
 	 copy_in_user.o user_fixup.o memmove.o \
-	 mcount.o ipcsum.o rwsem.o xor.o delay.o
+	 mcount.o ipcsum.o rwsem.o xor.o
 
 
 obj-y += iomap.o
 obj-y += iomap.o

+ 0 - 46
arch/sparc64/lib/delay.c

@@ -1,46 +0,0 @@
-/* delay.c: Delay loops for sparc64
- *
- * Copyright (C) 2004, 2006 David S. Miller <davem@davemloft.net>
- *
- * Based heavily upon x86 variant which is:
- *	Copyright (C) 1993 Linus Torvalds
- *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
- */
-
-#include <linux/delay.h>
-#include <asm/timer.h>
-
-void __delay(unsigned long loops)
-{
-	unsigned long bclock, now;
-	
-	bclock = tick_ops->get_tick();
-	do {
-		now = tick_ops->get_tick();
-	} while ((now-bclock) < loops);
-}
-
-/* We used to multiply by HZ after shifting down by 32 bits
- * but that runs into problems for higher values of HZ and
- * slow cpus.
- */
-void __const_udelay(unsigned long n)
-{
-	n *= 4;
-
-	n *= (cpu_data(raw_smp_processor_id()).udelay_val * (HZ/4));
-	n >>= 32;
-
-	__delay(n + 1);
-}
-
-void __udelay(unsigned long n)
-{
-	__const_udelay(n * 0x10c7UL);
-}
-
-
-void __ndelay(unsigned long n)
-{
-	__const_udelay(n * 0x5UL);
-}

+ 13 - 0
arch/sparc64/prom/misc.c

@@ -14,6 +14,7 @@
 #include <asm/openprom.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
 #include <asm/oplib.h>
 #include <asm/system.h>
 #include <asm/system.h>
+#include <asm/ldc.h>
 
 
 int prom_service_exists(const char *service_name)
 int prom_service_exists(const char *service_name)
 {
 {
@@ -37,6 +38,10 @@ void prom_sun4v_guest_soft_state(void)
 /* Reset and reboot the machine with the command 'bcommand'. */
 /* Reset and reboot the machine with the command 'bcommand'. */
 void prom_reboot(const char *bcommand)
 void prom_reboot(const char *bcommand)
 {
 {
+#ifdef CONFIG_SUN_LDOMS
+	if (ldom_domaining_enabled)
+		ldom_reboot(bcommand);
+#endif
 	p1275_cmd("boot", P1275_ARG(0, P1275_ARG_IN_STRING) |
 	p1275_cmd("boot", P1275_ARG(0, P1275_ARG_IN_STRING) |
 		  P1275_INOUT(1, 0), bcommand);
 		  P1275_INOUT(1, 0), bcommand);
 }
 }
@@ -91,6 +96,10 @@ void prom_cmdline(void)
  */
  */
 void prom_halt(void)
 void prom_halt(void)
 {
 {
+#ifdef CONFIG_SUN_LDOMS
+	if (ldom_domaining_enabled)
+		ldom_power_off();
+#endif
 again:
 again:
 	p1275_cmd("exit", P1275_INOUT(0, 0));
 	p1275_cmd("exit", P1275_INOUT(0, 0));
 	goto again; /* PROM is out to get me -DaveM */
 	goto again; /* PROM is out to get me -DaveM */
@@ -98,6 +107,10 @@ void prom_halt(void)
 
 
 void prom_halt_power_off(void)
 void prom_halt_power_off(void)
 {
 {
+#ifdef CONFIG_SUN_LDOMS
+	if (ldom_domaining_enabled)
+		ldom_power_off();
+#endif
 	p1275_cmd("SUNW,power-off", P1275_INOUT(0, 0));
 	p1275_cmd("SUNW,power-off", P1275_INOUT(0, 0));
 
 
 	/* if nothing else helps, we just halt */
 	/* if nothing else helps, we just halt */

+ 1 - 0
arch/sparc64/prom/p1275.c

@@ -16,6 +16,7 @@
 #include <asm/system.h>
 #include <asm/system.h>
 #include <asm/spitfire.h>
 #include <asm/spitfire.h>
 #include <asm/pstate.h>
 #include <asm/pstate.h>
+#include <asm/ldc.h>
 
 
 struct {
 struct {
 	long prom_callback;			/* 0x00 */
 	long prom_callback;			/* 0x00 */

+ 11 - 2
arch/sparc64/prom/tree.c

@@ -13,6 +13,7 @@
 
 
 #include <asm/openprom.h>
 #include <asm/openprom.h>
 #include <asm/oplib.h>
 #include <asm/oplib.h>
+#include <asm/ldc.h>
 
 
 /* Return the child of node 'node' or zero if no this node has no
 /* Return the child of node 'node' or zero if no this node has no
  * direct descendent.
  * direct descendent.
@@ -261,9 +262,17 @@ int prom_node_has_property(int node, const char *prop)
 int
 int
 prom_setprop(int node, const char *pname, char *value, int size)
 prom_setprop(int node, const char *pname, char *value, int size)
 {
 {
-	if(size == 0) return 0;
-	if((pname == 0) || (value == 0)) return 0;
+	if (size == 0)
+		return 0;
+	if ((pname == 0) || (value == 0))
+		return 0;
 	
 	
+#ifdef CONFIG_SUN_LDOMS
+	if (ldom_domaining_enabled) {
+		ldom_set_var(pname, value);
+		return 0;
+	}
+#endif
 	return p1275_cmd ("setprop", P1275_ARG(1,P1275_ARG_IN_STRING)|
 	return p1275_cmd ("setprop", P1275_ARG(1,P1275_ARG_IN_STRING)|
 					  P1275_ARG(2,P1275_ARG_IN_BUF)|
 					  P1275_ARG(2,P1275_ARG_IN_BUF)|
 					  P1275_INOUT(4, 1), 
 					  P1275_INOUT(4, 1), 

+ 7 - 0
drivers/block/Kconfig

@@ -423,6 +423,13 @@ config ATA_OVER_ETH
 	This driver provides Support for ATA over Ethernet block
 	This driver provides Support for ATA over Ethernet block
 	devices like the Coraid EtherDrive (R) Storage Blade.
 	devices like the Coraid EtherDrive (R) Storage Blade.
 
 
+config SUNVDC
+	tristate "Sun Virtual Disk Client support"
+	depends on SUN_LDOMS
+	help
+	  Support for virtual disk devices as a client under Sun
+	  Logical Domains.
+
 source "drivers/s390/block/Kconfig"
 source "drivers/s390/block/Kconfig"
 
 
 endif # BLK_DEV
 endif # BLK_DEV

+ 1 - 0
drivers/block/Makefile

@@ -19,6 +19,7 @@ obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
 obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
+obj-$(CONFIG_SUNVDC)		+= sunvdc.o
 
 
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o

+ 972 - 0
drivers/block/sunvdc.c

@@ -0,0 +1,972 @@
+/* sunvdc.c: Sun LDOM Virtual Disk Client.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+/* Module identification strings; PFX prefixes this driver's log messages. */
+#define DRV_MODULE_NAME		"sunvdc"
+#define PFX DRV_MODULE_NAME	": "
+#define DRV_MODULE_VERSION	"1.0"
+#define DRV_MODULE_RELDATE	"June 25, 2007"
+
+static char version[] __devinitdata =
+	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
+MODULE_DESCRIPTION("Sun LDOM virtual disk client driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+/* Number of TX descriptor ring entries.  Must be a power of two:
+ * ring indices are wrapped with "& (VDC_TX_RING_SIZE - 1)".
+ */
+#define VDC_TX_RING_SIZE	256
+
+/* Reasons a waiter can be blocked on, matched against the pending
+ * completion in vdc_finish().  WAITING_FOR_ANY matches every reason.
+ */
+#define WAITING_FOR_LINK_UP	0x01
+#define WAITING_FOR_TX_SPACE	0x02
+#define WAITING_FOR_GEN_CMD	0x04
+#define WAITING_FOR_ANY		-1
+
+/* Per-descriptor bookkeeping: the block request occupying a TX ring
+ * slot.  vdc_end_one() treats a NULL req as a non-block ("special")
+ * command.
+ */
+struct vdc_req_entry {
+	struct request		*req;
+};
+
+/* State for one virtual disk port.  The embedded vio member lets
+ * to_vdc_port() recover the port from a struct vio_driver_state.
+ */
+struct vdc_port {
+	struct vio_driver_state	vio;
+
+	struct vdc		*vp;
+
+	struct gendisk		*disk;
+
+	struct vdc_completion	*cmp;
+
+	u64			req_id;
+	u64			seq;
+	/* Indexed by TX descriptor ring slot. */
+	struct vdc_req_entry	rq_arr[VDC_TX_RING_SIZE];
+
+	/* Sizes the scatterlist built for each request. */
+	unsigned long		ring_cookies;
+
+	/* Sent in our attribute packet; vdc_handle_attr() replaces them
+	 * with the values in the server's ACK (max_xfer_size only when
+	 * the server's is smaller).
+	 */
+	u64			max_xfer_size;
+	u32			vdisk_block_size;
+
+	/* The server fills these in for us in the disk attribute
+	 * ACK packet.
+	 */
+	u64			operations;
+	u32			vdisk_size;
+	u8			vdisk_type;
+	u8			dev_no;
+
+	char			disk_name[32];
+
+	/* Geometry reported through vdc_getgeo(). */
+	struct vio_disk_geom	geom;
+	struct vio_disk_vtoc	label;
+
+	struct list_head	list;
+};
+
+/* Map an embedded vio_driver_state back to its containing vdc_port. */
+static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
+{
+	struct vdc_port *port = container_of(vio, struct vdc_port, vio);
+
+	return port;
+}
+
+/* Per-device state: the VIO device and the list of its ports. */
+struct vdc {
+	/* Protects port_list.  */
+	spinlock_t		lock;
+
+	struct vio_dev		*dev;
+
+	struct list_head	port_list;
+};
+
+/* Protocol versions this driver supports, ordered from largest major
+ * to lowest.  NOTE(review): presumably handed to vio_driver_init() as
+ * the version table -- the call is not visible here; confirm.
+ */
+static struct vio_version vdc_versions[] = {
+	{ .major = 1, .minor = 0 },
+};
+
+/* Block device name, major number and partition shift.
+ * NOTE(review): their users are outside this part of the file.
+ */
+#define VDCBLK_NAME	"vdisk"
+static int vdc_major;
+#define PARTITION_SHIFT	3
+
+/* Free-slot count for the TX ring, per vio_dring_avail() with this
+ * driver's ring size.
+ */
+static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
+{
+	u32 avail = vio_dring_avail(dr, VDC_TX_RING_SIZE);
+
+	return avail;
+}
+
+/* getgeo block operation: report the geometry stored in the port.
+ * Heads and sectors are truncated to 8 bits.  Always returns 0.
+ */
+static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct vdc_port *port = bdev->bd_disk->private_data;
+
+	geo->cylinders = port->geom.num_cyl;
+	geo->sectors = (u8) port->geom.num_sec;
+	geo->heads = (u8) port->geom.num_hd;
+
+	return 0;
+}
+
+/* Block device operations: only geometry queries are implemented. */
+static struct block_device_operations vdc_fops = {
+	.owner		= THIS_MODULE,
+	.getgeo		= vdc_getgeo,
+};
+
+/* Complete the pending waiter, if any, with result @err.
+ *
+ * The waiter is only woken when @waiting_for is WAITING_FOR_ANY or
+ * equals the reason it is blocked on.  The completion pointer is
+ * cleared afterwards so it fires at most once.
+ */
+static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
+{
+	if (!vio->cmp)
+		return;
+
+	if (waiting_for != WAITING_FOR_ANY &&
+	    vio->cmp->waiting_for != waiting_for)
+		return;
+
+	vio->cmp->err = err;
+	complete(&vio->cmp->com);
+	vio->cmp = NULL;
+}
+
+/* Handshake-complete hook: wake a waiter blocked on link-up with a
+ * result of 0.
+ */
+static void vdc_handshake_complete(struct vio_driver_state *vio)
+{
+	vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
+}
+
+/* Handle a message of unknown type or subtype: log its tag,
+ * disconnect the LDC channel and report -ECONNRESET.
+ */
+static int vdc_handle_unknown(struct vdc_port *port, void *arg)
+{
+	struct vio_msg_tag *tag = arg;
+
+	printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n",
+	       tag->type, tag->stype, tag->stype_env, tag->sid);
+	printk(KERN_ERR PFX "Resetting connection.\n");
+
+	ldc_disconnect(port->vio.lp);
+
+	return -ECONNRESET;
+}
+
+/* send_attr hook: send the disk ATTR INFO packet.
+ *
+ * Advertises descriptor-ring transfer mode together with the port's
+ * block size and maximum transfer size.  Returns the vio_ldc_send()
+ * result.
+ */
+static int vdc_send_attr(struct vio_driver_state *vio)
+{
+	struct vdc_port *port = to_vdc_port(vio);
+	struct vio_disk_attr_info pkt;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.tag.sid = vio_send_sid(vio);
+	pkt.tag.stype_env = VIO_ATTR_INFO;
+	pkt.tag.stype = VIO_SUBTYPE_INFO;
+	pkt.tag.type = VIO_TYPE_CTRL;
+
+	pkt.max_xfer_size = port->max_xfer_size;
+	pkt.vdisk_block_size = port->vdisk_block_size;
+	pkt.xfer_mode = VIO_DRING_MODE;
+
+	viodbg(HS, "SEND ATTR xfer_mode[0x%x] blksz[%u] max_xfer[%lu]\n",
+	       pkt.xfer_mode, pkt.vdisk_block_size, pkt.max_xfer_size);
+
+	return vio_ldc_send(&port->vio, &pkt, sizeof(pkt));
+}
+
+/* handle_attr hook: process the server's reply to our ATTR packet.
+ *
+ * Anything but an ACK, an unknown disk type, or a block size larger
+ * than the one we offered yields -ECONNRESET.  Otherwise the server's
+ * operations mask, disk size, disk type and block size are recorded,
+ * max_xfer_size is lowered if the server's limit is smaller, and 0 is
+ * returned.
+ */
+static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
+{
+	struct vdc_port *port = to_vdc_port(vio);
+	struct vio_disk_attr_info *pkt = arg;
+
+	viodbg(HS, "GOT ATTR stype[0x%x] ops[%lx] disk_size[%lu] disk_type[%x] "
+	       "xfer_mode[0x%x] blksz[%u] max_xfer[%lu]\n",
+	       pkt->tag.stype, pkt->operations,
+	       pkt->vdisk_size, pkt->vdisk_type,
+	       pkt->xfer_mode, pkt->vdisk_block_size,
+	       pkt->max_xfer_size);
+
+	if (pkt->tag.stype != VIO_SUBTYPE_ACK) {
+		printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name);
+
+		return -ECONNRESET;
+	}
+
+	if (pkt->vdisk_type != VD_DISK_TYPE_DISK &&
+	    pkt->vdisk_type != VD_DISK_TYPE_SLICE) {
+		printk(KERN_ERR PFX "%s: Bogus vdisk_type 0x%x\n",
+		       vio->name, pkt->vdisk_type);
+		return -ECONNRESET;
+	}
+
+	if (pkt->vdisk_block_size > port->vdisk_block_size) {
+		printk(KERN_ERR PFX "%s: BLOCK size increased "
+		       "%u --> %u\n",
+		       vio->name,
+		       port->vdisk_block_size, pkt->vdisk_block_size);
+		return -ECONNRESET;
+	}
+
+	port->operations = pkt->operations;
+	port->vdisk_size = pkt->vdisk_size;
+	port->vdisk_type = pkt->vdisk_type;
+	port->vdisk_block_size = pkt->vdisk_block_size;
+	if (pkt->max_xfer_size < port->max_xfer_size)
+		port->max_xfer_size = pkt->max_xfer_size;
+
+	return 0;
+}
+
+/* Finish a non-block command: wake the generic-command waiter with
+ * the negated descriptor status as its result.
+ */
+static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc)
+{
+	int status = desc->status;
+
+	vdc_finish(&port->vio, -status, WAITING_FOR_GEN_CMD);
+}
+
+/* Complete @num_sectors of @req.  The request is only retired (and
+ * the disk credited with randomness) once end_that_request_first()
+ * reports that nothing is left.
+ */
+static void vdc_end_request(struct request *req, int uptodate, int num_sectors)
+{
+	if (!end_that_request_first(req, uptodate, num_sectors)) {
+		add_disk_randomness(req->rq_disk);
+		end_that_request_last(req, uptodate);
+	}
+}
+
+/* Retire the TX descriptor at @index if the server marked it done.
+ *
+ * The descriptor's cookies are unmapped, the slot is freed and the
+ * consumer index advances past it.  A slot without a request is a
+ * special command and goes to vdc_end_special(); otherwise the block
+ * request is completed and a stopped queue is restarted.
+ */
+static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
+			unsigned int index)
+{
+	struct vio_disk_desc *desc = vio_dring_entry(dr, index);
+	struct vdc_req_entry *slot = &port->rq_arr[index];
+	struct request *rq;
+
+	if (unlikely(desc->hdr.state != VIO_DESC_DONE))
+		return;
+
+	ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+	desc->hdr.state = VIO_DESC_FREE;
+	dr->cons = (index + 1) & (VDC_TX_RING_SIZE - 1);
+
+	rq = slot->req;
+	if (!rq) {
+		vdc_end_special(port, desc);
+		return;
+	}
+
+	slot->req = NULL;
+
+	/* desc->size is in bytes; the block layer counts 512-byte sectors. */
+	vdc_end_request(rq, !desc->status, desc->size >> 9);
+
+	if (blk_queue_stopped(port->disk->queue))
+		blk_start_queue(port->disk->queue);
+}
+
+/* Handle a DATA ACK from the server.
+ *
+ * The ACK must name our TX ring, cover exactly one descriptor and
+ * carry an in-range index; anything else is silently ignored.
+ * Always returns 0.
+ */
+static int vdc_ack(struct vdc_port *port, void *msgbuf)
+{
+	struct vio_dring_state *tx_ring = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data *ack = msgbuf;
+
+	if (unlikely(ack->dring_ident != tx_ring->ident))
+		return 0;
+	if (unlikely(ack->start_idx != ack->end_idx))
+		return 0;
+	if (unlikely(ack->start_idx >= VDC_TX_RING_SIZE))
+		return 0;
+
+	vdc_end_one(port, tx_ring, ack->start_idx);
+
+	return 0;
+}
+
+/* Handle a DATA NACK from the server.  Not implemented yet: the
+ * message is ignored and 0 is returned.
+ */
+static int vdc_nack(struct vdc_port *port, void *msgbuf)
+{
+	/* XXX Implement me XXX */
+	return 0;
+}
+
+/* LDC event callback for a disk port; @arg is the vdc_port.
+ *
+ * Runs under vio->lock.  RESET/UP events go to
+ * vio_link_state_change().  For DATA_READY, messages are read until
+ * the channel is empty or an error occurs: each has its SID
+ * validated, DATA messages go to the ACK/NACK handlers and CTRL
+ * messages to the handshake engine.  If the loop ends with an error,
+ * any pending waiter is completed with it.
+ */
+static void vdc_event(void *arg, int event)
+{
+	struct vdc_port *port = arg;
+	struct vio_driver_state *vio = &port->vio;
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&vio->lock, flags);
+
+	if (unlikely(event == LDC_EVENT_RESET ||
+		     event == LDC_EVENT_UP)) {
+		vio_link_state_change(vio, event);
+		goto out_unlock;
+	}
+
+	if (unlikely(event != LDC_EVENT_DATA_READY)) {
+		printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event);
+		goto out_unlock;
+	}
+
+	for (;;) {
+		union {
+			struct vio_msg_tag tag;
+			u64 raw[8];
+		} msgbuf;
+
+		err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf));
+		if (unlikely(err < 0)) {
+			if (err == -ECONNRESET)
+				vio_conn_reset(vio);
+			break;
+		}
+		/* Nothing more queued on the channel. */
+		if (err == 0)
+			break;
+
+		viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
+		       msgbuf.tag.type,
+		       msgbuf.tag.stype,
+		       msgbuf.tag.stype_env,
+		       msgbuf.tag.sid);
+
+		err = vio_validate_sid(vio, &msgbuf.tag);
+		if (err < 0)
+			break;
+
+		switch (msgbuf.tag.type) {
+		case VIO_TYPE_DATA:
+			if (msgbuf.tag.stype == VIO_SUBTYPE_ACK)
+				err = vdc_ack(port, &msgbuf);
+			else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK)
+				err = vdc_nack(port, &msgbuf);
+			else
+				err = vdc_handle_unknown(port, &msgbuf);
+			break;
+
+		case VIO_TYPE_CTRL:
+			err = vio_control_pkt_engine(vio, &msgbuf);
+			break;
+
+		default:
+			err = vdc_handle_unknown(port, &msgbuf);
+			break;
+		}
+		if (err < 0)
+			break;
+	}
+
+	if (err < 0)
+		vdc_finish(&port->vio, err, WAITING_FOR_ANY);
+
+out_unlock:
+	spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+/* Send a DRING_DATA message telling the server that the descriptor at
+ * dr->prod is ready.  While the send reports -EAGAIN it is retried
+ * with a doubling delay capped at 128us.  On success (positive return
+ * from vio_ldc_send()) the ring's send sequence number is advanced.
+ * Returns the last vio_ldc_send() result.
+ */
+static int __vdc_tx_trigger(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data hdr = {
+		.tag = {
+			.type		= VIO_TYPE_DATA,
+			.stype		= VIO_SUBTYPE_INFO,
+			.stype_env	= VIO_DRING_DATA,
+			.sid		= vio_send_sid(&port->vio),
+		},
+		.seq			= dr->snd_nxt,
+		.dring_ident		= dr->ident,
+		.start_idx		= dr->prod,
+		.end_idx		= dr->prod,
+	};
+	int backoff = 1;
+	int err;
+
+	for (;;) {
+		err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
+		if (err > 0) {
+			dr->snd_nxt++;
+			break;
+		}
+
+		udelay(backoff);
+		backoff <<= 1;
+		if (backoff > 128)
+			backoff = 128;
+
+		if (err != -EAGAIN)
+			break;
+	}
+
+	return err;
+}
+
+/* Place one block-layer request on the TX descriptor ring and notify
+ * the server.
+ *
+ * The request's segments are mapped through LDC into the cookies of
+ * the current descriptor, the descriptor is filled in and marked
+ * READY, and a DRING_DATA trigger is sent.  Called from
+ * do_vdc_request(), the request function of the queue created in
+ * probe_disk() with port->vio.lock as its queue lock.
+ *
+ * Returns a negative errno on failure, otherwise the result of
+ * __vdc_tx_trigger().  When the ring is full the queue is stopped and
+ * -ENOMEM is returned.  On failure nothing is left behind: the LDC
+ * mapping is torn down, the descriptor is returned to the FREE state
+ * and the request slot is cleared, so the caller may safely complete
+ * the request with an error.
+ */
+static int __send_request(struct request *req)
+{
+	struct vdc_port *port = req->rq_disk->private_data;
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct scatterlist sg[port->ring_cookies];
+	struct vdc_req_entry *rqe;
+	struct vio_disk_desc *desc;
+	unsigned int map_perm;
+	int nsg, err, i;
+	u64 len;
+	u8 op;
+
+	/* No point in building a scatterlist we cannot queue. */
+	if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+		blk_stop_queue(port->disk->queue);
+		return -ENOMEM;
+	}
+
+	map_perm = LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+	if (rq_data_dir(req) == READ) {
+		/* The server writes into our buffers on a read. */
+		map_perm |= LDC_MAP_W;
+		op = VD_OP_BREAD;
+	} else {
+		map_perm |= LDC_MAP_R;
+		op = VD_OP_BWRITE;
+	}
+
+	nsg = blk_rq_map_sg(req->q, req, sg);
+
+	len = 0;
+	for (i = 0; i < nsg; i++)
+		len += sg[i].length;
+
+	desc = vio_dring_cur(dr);
+
+	err = ldc_map_sg(port->vio.lp, sg, nsg,
+			 desc->cookies, port->ring_cookies,
+			 map_perm);
+	if (err < 0) {
+		printk(KERN_ERR PFX "ldc_map_sg() failure, err=%d.\n", err);
+		return err;
+	}
+
+	rqe = &port->rq_arr[dr->prod];
+	rqe->req = req;
+
+	desc->hdr.ack = VIO_ACK_ENABLE;
+	desc->req_id = port->req_id;
+	desc->operation = op;
+	if (port->vdisk_type == VD_DISK_TYPE_DISK) {
+		desc->slice = 2;
+	} else {
+		desc->slice = 0;
+	}
+	desc->status = ~0;
+	desc->offset = (req->sector << 9) / port->vdisk_block_size;
+	desc->size = len;
+	desc->ncookies = err;
+
+	/* This has to be a non-SMP write barrier because we are writing
+	 * to memory which is shared with the peer LDOM.
+	 */
+	wmb();
+	desc->hdr.state = VIO_DESC_READY;
+
+	err = __vdc_tx_trigger(port);
+	if (err < 0) {
+		printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err);
+
+		/* The caller is about to complete this request with an
+		 * error, so the server must not keep access to its
+		 * pages and the ring slot must not keep pointing at it.
+		 */
+		desc->hdr.state = VIO_DESC_FREE;
+		ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+		rqe->req = NULL;
+	} else {
+		port->req_id++;
+		dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1);
+	}
+
+	return err;
+}
+
+/* Request function for the vdc queue.
+ *
+ * Requests are only dequeued while the TX descriptor ring has room.
+ * When the ring is full the queue is stopped and the request is left
+ * on the queue rather than being dequeued and failed; vdc_end_one()
+ * restarts a stopped queue once a descriptor completes.  A request
+ * that fails in __send_request() for any other reason is completed
+ * with an error.
+ */
+static void do_vdc_request(request_queue_t *q)
+{
+	struct request *req;
+
+	while ((req = elv_next_request(q)) != NULL) {
+		struct vdc_port *port = req->rq_disk->private_data;
+		struct vio_dring_state *dr =
+			&port->vio.drings[VIO_DRIVER_TX_RING];
+
+		if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+			/* Ring exhaustion is not an I/O error: wait. */
+			blk_stop_queue(q);
+			break;
+		}
+
+		blkdev_dequeue_request(req);
+		if (__send_request(req) < 0)
+			vdc_end_request(req, 0, req->hard_nr_sectors);
+	}
+}
+
+/* Issue a non-block "special" operation to the virtual disk server
+ * and sleep until it completes.
+ *
+ * @port: port to issue the operation on
+ * @op:   VD_OP_* operation code; it must be advertised in
+ *        port->operations
+ * @buf:  caller buffer; copied into the request for operations the
+ *        server reads (LDC_MAP_R) and filled from the response for
+ *        operations the server writes (LDC_MAP_W)
+ * @len:  size of @buf; clamped to the operation's (8-byte rounded)
+ *        payload size
+ *
+ * Returns -EINVAL for block I/O or unknown opcodes, -EOPNOTSUPP for
+ * operations the server did not advertise or that are not handled
+ * here (EFI), -ENOMEM if the bounce buffer cannot be allocated, a
+ * mapping or trigger error, or otherwise the completion's error code.
+ * @buf is only written when the request was actually sent.
+ */
+static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
+{
+	struct vio_dring_state *dr;
+	struct vio_completion comp;
+	struct vio_disk_desc *desc;
+	unsigned int map_perm;
+	unsigned long flags;
+	int op_len, err;
+	void *req_buf;
+
+	/* The capability mask is 64 bits wide with opcode N at bit N-1;
+	 * anything outside 1..64 would be an undefined shift.
+	 */
+	if (op == 0 || op > 64)
+		return -EINVAL;
+	if (!(((u64)1 << ((u64)op - 1)) & port->operations))
+		return -EOPNOTSUPP;
+
+	switch (op) {
+	case VD_OP_BREAD:
+	case VD_OP_BWRITE:
+	default:
+		return -EINVAL;
+
+	case VD_OP_FLUSH:
+		op_len = 0;
+		map_perm = 0;
+		break;
+
+	case VD_OP_GET_WCE:
+		op_len = sizeof(u32);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_WCE:
+		op_len = sizeof(u32);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_GET_VTOC:
+		op_len = sizeof(struct vio_disk_vtoc);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_VTOC:
+		op_len = sizeof(struct vio_disk_vtoc);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_GET_DISKGEOM:
+		op_len = sizeof(struct vio_disk_geom);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_DISKGEOM:
+		op_len = sizeof(struct vio_disk_geom);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_SCSICMD:
+		op_len = 16;
+		map_perm = LDC_MAP_RW;
+		break;
+
+	case VD_OP_GET_DEVID:
+		op_len = sizeof(struct vio_disk_devid);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_GET_EFI:
+	case VD_OP_SET_EFI:
+		return -EOPNOTSUPP;
+	}
+
+	map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+	/* Payloads are exchanged in 8-byte multiples. */
+	op_len = (op_len + 7) & ~7;
+	req_buf = kzalloc(op_len, GFP_KERNEL);
+	if (!req_buf)
+		return -ENOMEM;
+
+	if (len > op_len)
+		len = op_len;
+
+	if (map_perm & LDC_MAP_R)
+		memcpy(req_buf, buf, len);
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	/* XXX If we want to use this code generically we have to
+	 * XXX handle TX ring exhaustion etc.
+	 */
+	desc = vio_dring_cur(dr);
+
+	err = ldc_map_single(port->vio.lp, req_buf, op_len,
+			     desc->cookies, port->ring_cookies,
+			     map_perm);
+	if (err < 0) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		kfree(req_buf);
+		return err;
+	}
+
+	init_completion(&comp.com);
+	comp.err = 0;
+	comp.waiting_for = WAITING_FOR_GEN_CMD;
+	port->vio.cmp = &comp;
+
+	desc->hdr.ack = VIO_ACK_ENABLE;
+	desc->req_id = port->req_id;
+	desc->operation = op;
+	desc->slice = 0;
+	desc->status = ~0;
+	desc->offset = 0;
+	desc->size = op_len;
+	desc->ncookies = err;
+
+	/* This has to be a non-SMP write barrier because we are writing
+	 * to memory which is shared with the peer LDOM.
+	 */
+	wmb();
+	desc->hdr.state = VIO_DESC_READY;
+
+	err = __vdc_tx_trigger(port);
+	if (err < 0) {
+		/* Never sent: take the descriptor back, drop the mapping
+		 * and leave the caller's buffer untouched.
+		 */
+		port->vio.cmp = NULL;
+		desc->hdr.state = VIO_DESC_FREE;
+		ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		kfree(req_buf);
+		return err;
+	}
+
+	port->req_id++;
+	dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1);
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+
+	wait_for_completion(&comp.com);
+	err = comp.err;
+
+	if (map_perm & LDC_MAP_W)
+		memcpy(buf, req_buf, len);
+
+	kfree(req_buf);
+
+	return err;
+}
+
+/* Allocate and export the TX descriptor ring for @port.
+ *
+ * Each entry is a struct vio_disk_desc followed by room for
+ * port->ring_cookies transfer cookies.  On success the ring state is
+ * initialised (empty, VDC_TX_RING_SIZE entries) and 0 is returned;
+ * otherwise the error from ldc_alloc_exp_dring() is returned.
+ */
+static int __devinit vdc_alloc_tx_ring(struct vdc_port *port)
+{
+	struct vio_dring_state *tx = &port->vio.drings[VIO_DRIVER_TX_RING];
+	unsigned long desc_bytes, ring_bytes;
+	int ncookies = VIO_MAX_RING_COOKIES;
+	void *ring;
+
+	desc_bytes = sizeof(struct vio_disk_desc) +
+		(sizeof(struct ldc_trans_cookie) * port->ring_cookies);
+	ring_bytes = VDC_TX_RING_SIZE * desc_bytes;
+
+	ring = ldc_alloc_exp_dring(port->vio.lp, ring_bytes,
+				   tx->cookies, &ncookies,
+				   (LDC_MAP_SHADOW |
+				    LDC_MAP_DIRECT |
+				    LDC_MAP_RW));
+	if (IS_ERR(ring))
+		return PTR_ERR(ring);
+
+	tx->base = ring;
+	tx->entry_size = desc_bytes;
+	tx->num_entries = VDC_TX_RING_SIZE;
+	tx->ncookies = ncookies;
+	tx->pending = VDC_TX_RING_SIZE;
+	tx->prod = 0;
+	tx->cons = 0;
+
+	return 0;
+}
+
+/* Release the exported TX descriptor ring, if one was allocated, and
+ * reset the ring bookkeeping.  Safe to call when no ring exists.
+ */
+static void vdc_free_tx_ring(struct vdc_port *port)
+{
+	struct vio_dring_state *tx = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	if (!tx->base)
+		return;
+
+	ldc_free_exp_dring(port->vio.lp, tx->base,
+			   (tx->entry_size * tx->num_entries),
+			   tx->cookies, tx->ncookies);
+
+	tx->base = NULL;
+	tx->ncookies = 0;
+	tx->pending = 0;
+	tx->num_entries = 0;
+	tx->entry_size = 0;
+}
+
+/* Bring the port up, query the disk and register it with the block
+ * layer.
+ *
+ * Sleeps until the link-up completion fires, then fetches the VTOC
+ * and geometry with generic_request(), derives the capacity from the
+ * geometry, and creates the request queue and gendisk.  Returns 0 on
+ * success or a negative errno.
+ */
+static int probe_disk(struct vdc_port *port)
+{
+	struct vio_completion link_up;
+	struct request_queue *queue;
+	struct gendisk *disk;
+	int err;
+
+	init_completion(&link_up.com);
+	link_up.err = 0;
+	link_up.waiting_for = WAITING_FOR_LINK_UP;
+	port->vio.cmp = &link_up;
+
+	vio_port_up(&port->vio);
+
+	wait_for_completion(&link_up.com);
+	err = link_up.err;
+	if (err)
+		return err;
+
+	err = generic_request(port, VD_OP_GET_VTOC,
+			      &port->label, sizeof(port->label));
+	if (err < 0) {
+		printk(KERN_ERR PFX "VD_OP_GET_VTOC returns error %d\n", err);
+		return err;
+	}
+
+	err = generic_request(port, VD_OP_GET_DISKGEOM,
+			      &port->geom, sizeof(port->geom));
+	if (err < 0) {
+		printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns error %d\n",
+		       err);
+		return err;
+	}
+
+	/* Capacity is cylinders * heads * sectors-per-track. */
+	port->vdisk_size = ((u64)port->geom.num_cyl *
+			    (u64)port->geom.num_hd *
+			    (u64)port->geom.num_sec);
+
+	queue = blk_init_queue(do_vdc_request, &port->vio.lock);
+	if (!queue) {
+		printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
+		       port->vio.name);
+		return -ENOMEM;
+	}
+
+	disk = alloc_disk(1 << PARTITION_SHIFT);
+	if (!disk) {
+		printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
+		       port->vio.name);
+		blk_cleanup_queue(queue);
+		return -ENOMEM;
+	}
+
+	port->disk = disk;
+
+	/* One segment per ring cookie; cap transfers at max_xfer_size. */
+	blk_queue_max_hw_segments(queue, port->ring_cookies);
+	blk_queue_max_phys_segments(queue, port->ring_cookies);
+	blk_queue_max_sectors(queue, port->max_xfer_size);
+
+	disk->major = vdc_major;
+	disk->first_minor = port->dev_no << PARTITION_SHIFT;
+	strcpy(disk->disk_name, port->disk_name);
+
+	disk->fops = &vdc_fops;
+	disk->queue = queue;
+	disk->private_data = port;
+	disk->driverfs_dev = &port->vio.vdev->dev;
+
+	set_capacity(disk, port->vdisk_size);
+
+	printk(KERN_INFO PFX "%s: %u sectors (%u MB)\n",
+	       disk->disk_name,
+	       port->vdisk_size, (port->vdisk_size >> (20 - 9)));
+
+	add_disk(disk);
+
+	return 0;
+}
+
+/* LDC channel configuration passed to vio_ldc_alloc() for every vdc
+ * port.  Events are delivered to vdc_event(), whose 64-byte message
+ * buffer matches the MTU set here.
+ */
+static struct ldc_channel_config vdc_ldc_cfg = {
+	.event		= vdc_event,
+	.mtu		= 64,
+	.mode		= LDC_MODE_UNRELIABLE,
+};
+
+/* Handshake callbacks handed to vio_driver_init() in vdc_port_probe(). */
+static struct vio_driver_ops vdc_vio_ops = {
+	.send_attr		= vdc_send_attr,
+	.handle_attr		= vdc_handle_attr,
+	.handshake_complete	= vdc_handshake_complete,
+};
+
+/* Probe one "vdc-port" device.
+ *
+ * Reads the port id from the machine description, allocates and names
+ * the port, sets up the VIO state, LDC channel and TX ring, probes the
+ * disk, and finally links the port onto the parent vdc's port list.
+ * Returns 0 on success or a negative errno, unwinding whatever had
+ * been set up.
+ */
+static int __devinit vdc_port_probe(struct vio_dev *vdev,
+				    const struct vio_device_id *id)
+{
+	struct mdesc_handle *hp;
+	struct vdc_port *port;
+	unsigned long flags;
+	const u64 *port_id;
+	struct vdc *vp;
+	int err;
+
+	vp = dev_get_drvdata(vdev->dev.parent);
+	if (!vp) {
+		printk(KERN_ERR PFX "Cannot find port parent vdc.\n");
+		return -ENODEV;
+	}
+
+	hp = mdesc_grab();
+
+	port_id = mdesc_get_property(hp, vdev->mp, "id", NULL);
+	if (!port_id) {
+		printk(KERN_ERR PFX "Port lacks id property.\n");
+		err = -ENODEV;
+		goto err_out_release_mdesc;
+	}
+
+	/* The id becomes the upper part of the minor number. */
+	if ((*port_id << PARTITION_SHIFT) & ~(u64)MINORMASK) {
+		printk(KERN_ERR PFX "Port id [%lu] too large.\n", *port_id);
+		err = -ENODEV;
+		goto err_out_release_mdesc;
+	}
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port) {
+		printk(KERN_ERR PFX "Cannot allocate vdc_port.\n");
+		err = -ENOMEM;
+		goto err_out_release_mdesc;
+	}
+
+	port->vp = vp;
+	port->dev_no = *port_id;
+
+	/* Single letter suffix for the first 26 disks, two after that. */
+	if (port->dev_no < 26)
+		snprintf(port->disk_name, sizeof(port->disk_name),
+			 VDCBLK_NAME "%c", 'a' + (port->dev_no % 26));
+	else
+		snprintf(port->disk_name, sizeof(port->disk_name),
+			 VDCBLK_NAME "%c%c",
+			 'a' + (port->dev_no / 26) - 1,
+			 'a' + (port->dev_no % 26));
+
+	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
+			      vdc_versions, ARRAY_SIZE(vdc_versions),
+			      &vdc_vio_ops, port->disk_name);
+	if (err)
+		goto err_out_free_port;
+
+	/* 128K maximum transfer, expressed in 512-byte blocks; enough
+	 * cookies for that many pages plus two.
+	 */
+	port->vdisk_block_size = 512;
+	port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size);
+	port->ring_cookies = ((port->max_xfer_size *
+			       port->vdisk_block_size) / PAGE_SIZE) + 2;
+
+	err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port);
+	if (err)
+		goto err_out_free_port;
+
+	err = vdc_alloc_tx_ring(port);
+	if (err)
+		goto err_out_free_ldc;
+
+	err = probe_disk(port);
+	if (err)
+		goto err_out_free_tx_ring;
+
+	INIT_LIST_HEAD(&port->list);
+
+	spin_lock_irqsave(&vp->lock, flags);
+	list_add(&port->list, &vp->port_list);
+	spin_unlock_irqrestore(&vp->lock, flags);
+
+	dev_set_drvdata(&vdev->dev, port);
+
+	mdesc_release(hp);
+
+	return 0;
+
+err_out_free_tx_ring:
+	vdc_free_tx_ring(port);
+
+err_out_free_ldc:
+	vio_ldc_free(&port->vio);
+
+err_out_free_port:
+	kfree(port);
+
+err_out_release_mdesc:
+	mdesc_release(hp);
+	return err;
+}
+
+/* Tear down a port set up by vdc_port_probe().
+ *
+ * The port is unlinked from its parent's port list before it is
+ * freed, and the gendisk and request queue created in probe_disk()
+ * are unregistered and released, so that nothing keeps a pointer to
+ * the freed port.  Always returns 0.
+ */
+static int vdc_port_remove(struct vio_dev *vdev)
+{
+	struct vdc_port *port = dev_get_drvdata(&vdev->dev);
+	unsigned long flags;
+
+	if (!port)
+		return 0;
+
+	del_timer_sync(&port->vio.timer);
+
+	/* Undo the list_add() done at the end of vdc_port_probe(). */
+	spin_lock_irqsave(&port->vp->lock, flags);
+	list_del(&port->list);
+	spin_unlock_irqrestore(&port->vp->lock, flags);
+
+	if (port->disk) {
+		del_gendisk(port->disk);
+		blk_cleanup_queue(port->disk->queue);
+		put_disk(port->disk);
+		port->disk = NULL;
+	}
+
+	vdc_free_tx_ring(port);
+	vio_ldc_free(&port->vio);
+
+	dev_set_drvdata(&vdev->dev, NULL);
+
+	kfree(port);
+
+	return 0;
+}
+
+/* Device IDs handled by vdc_port_driver.  The module device table
+ * must name this array; vdc_match gets its own table entry next to
+ * its definition.
+ */
+static struct vio_device_id vdc_port_match[] = {
+	{
+		.type = "vdc-port",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(vio, vdc_port_match);
+
+/* VIO driver bound to "vdc-port" devices; registered in vdc_init(). */
+static struct vio_driver vdc_port_driver = {
+	.id_table	= vdc_port_match,
+	.probe		= vdc_port_probe,
+	.remove		= vdc_port_remove,
+	.driver		= {
+		.name	= "vdc_port",
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Probe a "block" device: allocate the struct vdc that its vdc-port
+ * children look up through their parent's drvdata.  The driver
+ * version banner is printed on the first probe only.  Returns 0 or
+ * -ENOMEM.
+ */
+static int __devinit vdc_probe(struct vio_dev *vdev,
+			       const struct vio_device_id *id)
+{
+	static int vdc_version_printed;
+	struct vdc *vp;
+
+	if (!vdc_version_printed)
+		printk(KERN_INFO "%s", version);
+	vdc_version_printed++;
+
+	vp = kzalloc(sizeof(*vp), GFP_KERNEL);
+	if (!vp)
+		return -ENOMEM;
+
+	vp->dev = vdev;
+	spin_lock_init(&vp->lock);
+	INIT_LIST_HEAD(&vp->port_list);
+
+	dev_set_drvdata(&vdev->dev, vp);
+
+	return 0;
+}
+
+/* Free the struct vdc allocated by vdc_probe() and clear the device's
+ * drvdata.  Always returns 0.
+ */
+static int vdc_remove(struct vio_dev *vdev)
+{
+	struct vdc *vp = dev_get_drvdata(&vdev->dev);
+
+	if (!vp)
+		return 0;
+
+	kfree(vp);
+	dev_set_drvdata(&vdev->dev, NULL);
+
+	return 0;
+}
+
+/* Device IDs handled by vdc_driver: the "block" node whose drvdata
+ * vdc_port_probe() fetches through its device's parent.
+ */
+static struct vio_device_id vdc_match[] = {
+	{
+		.type = "block",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(vio, vdc_match);
+
+/* VIO driver bound to "block" devices; registered in vdc_init(). */
+static struct vio_driver vdc_driver = {
+	.id_table	= vdc_match,
+	.probe		= vdc_probe,
+	.remove		= vdc_remove,
+	.driver		= {
+		.name	= "vdc",
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Module init: obtain a dynamic block major, then register the vdc
+ * driver followed by the vdc-port driver.  Anything already set up is
+ * undone if a later step fails.  Returns 0 or a negative errno.
+ */
+static int __init vdc_init(void)
+{
+	int major, err;
+
+	major = register_blkdev(0, VDCBLK_NAME);
+	if (major < 0)
+		return major;
+
+	vdc_major = major;
+
+	err = vio_register_driver(&vdc_driver);
+	if (err)
+		goto undo_blkdev;
+
+	err = vio_register_driver(&vdc_port_driver);
+	if (!err)
+		return 0;
+
+	vio_unregister_driver(&vdc_driver);
+
+undo_blkdev:
+	unregister_blkdev(vdc_major, VDCBLK_NAME);
+	vdc_major = 0;
+
+	return err;
+}
+
+/* Module exit: undo vdc_init() in reverse order of registration. */
+static void __exit vdc_exit(void)
+{
+	/* Ports first, then the device they hang off. */
+	vio_unregister_driver(&vdc_port_driver);
+	vio_unregister_driver(&vdc_driver);
+
+	unregister_blkdev(vdc_major, VDCBLK_NAME);
+}
+
+module_init(vdc_init);
+module_exit(vdc_exit);

+ 6 - 0
drivers/net/Kconfig

@@ -604,6 +604,12 @@ config CASSINI
 	  Support for the Sun Cassini chip, aka Sun GigaSwift Ethernet. See also
 	  Support for the Sun Cassini chip, aka Sun GigaSwift Ethernet. See also
 	  <http://www.sun.com/products-n-solutions/hardware/docs/pdf/817-4341-10.pdf>
 	  <http://www.sun.com/products-n-solutions/hardware/docs/pdf/817-4341-10.pdf>
 
 
+config SUNVNET
+	tristate "Sun Virtual Network support"
+	depends on SUN_LDOMS
+	help
+	  Support for virtual network devices under Sun Logical Domains.
+
 config NET_VENDOR_3COM
 config NET_VENDOR_3COM
 	bool "3COM cards"
 	bool "3COM cards"
 	depends on ISA || EISA || MCA || PCI
 	depends on ISA || EISA || MCA || PCI

+ 1 - 0
drivers/net/Makefile

@@ -34,6 +34,7 @@ obj-$(CONFIG_SUNBMAC) += sunbmac.o
 obj-$(CONFIG_MYRI_SBUS) += myri_sbus.o
 obj-$(CONFIG_MYRI_SBUS) += myri_sbus.o
 obj-$(CONFIG_SUNGEM) += sungem.o sungem_phy.o
 obj-$(CONFIG_SUNGEM) += sungem.o sungem_phy.o
 obj-$(CONFIG_CASSINI) += cassini.o
 obj-$(CONFIG_CASSINI) += cassini.o
+obj-$(CONFIG_SUNVNET) += sunvnet.o
 
 
 obj-$(CONFIG_MACE) += mace.o
 obj-$(CONFIG_MACE) += mace.o
 obj-$(CONFIG_BMAC) += bmac.o
 obj-$(CONFIG_BMAC) += bmac.o

+ 1164 - 0
drivers/net/sunvnet.c

@@ -0,0 +1,1164 @@
+/* sunvnet.c: Sun LDOM Virtual Network Driver.
+ *
+ * Copyright (C) 2007 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+#include "sunvnet.h"
+
+#define DRV_MODULE_NAME		"sunvnet"
+#define PFX DRV_MODULE_NAME	": "
+#define DRV_MODULE_VERSION	"1.0"
+#define DRV_MODULE_RELDATE	"June 25, 2007"
+
+static char version[] __devinitdata =
+	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
+MODULE_DESCRIPTION("Sun LDOM virtual network driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+/* Ordered from largest major to lowest */
+static struct vio_version vnet_versions[] = {
+	{ .major = 1, .minor = 0 },
+};
+
+/* Result of vio_dring_avail() for a TX ring of VNET_TX_RING_SIZE
+ * entries.
+ */
+static inline u32 vnet_tx_dring_avail(struct vio_dring_state *dr)
+{
+	u32 avail = vio_dring_avail(dr, VNET_TX_RING_SIZE);
+
+	return avail;
+}
+
+/* Log a message whose type is not understood, disconnect the LDC
+ * channel and report -ECONNRESET to the caller.
+ */
+static int vnet_handle_unknown(struct vnet_port *port, void *arg)
+{
+	struct vio_msg_tag *tag = arg;
+
+	printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n",
+	       tag->type, tag->stype, tag->stype_env, tag->sid);
+	printk(KERN_ERR PFX "Resetting connection.\n");
+
+	ldc_disconnect(port->vio.lp);
+
+	return -ECONNRESET;
+}
+
+/* Send our attribute INFO message during the handshake: descriptor
+ * ring transfer mode, Ethernet MAC addressing, the device's MAC
+ * address packed big-endian into the low 48 bits of a u64, and an MTU
+ * of ETH_FRAME_LEN.  Returns the result of vio_ldc_send().
+ */
+static int vnet_send_attr(struct vio_driver_state *vio)
+{
+	struct vnet_port *port = to_vnet_port(vio);
+	struct net_device *dev = port->vp->dev;
+	struct vio_net_attr_info pkt;
+	int i;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.tag.type = VIO_TYPE_CTRL;
+	pkt.tag.stype = VIO_SUBTYPE_INFO;
+	pkt.tag.stype_env = VIO_ATTR_INFO;
+	pkt.tag.sid = vio_send_sid(vio);
+
+	pkt.xfer_mode = VIO_DRING_MODE;
+	pkt.addr_type = VNET_ADDR_ETHERMAC;
+	pkt.ack_freq = 0;
+	pkt.mtu = ETH_FRAME_LEN;
+
+	/* First address byte ends up in the most significant position. */
+	for (i = 0; i < 6; i++)
+		pkt.addr = (pkt.addr << 8) | (u64)dev->dev_addr[i];
+
+	viodbg(HS, "SEND NET ATTR xmode[0x%x] atype[0x%x] addr[%llx] "
+	       "ackfreq[%u] mtu[%llu]\n",
+	       pkt.xfer_mode, pkt.addr_type,
+	       (unsigned long long) pkt.addr,
+	       pkt.ack_freq,
+	       (unsigned long long) pkt.mtu);
+
+	return vio_ldc_send(vio, &pkt, sizeof(pkt));
+}
+
+/* Answer the peer's attribute INFO message.  The packet is reused for
+ * the reply: ACK when the peer asks for dring mode, Ethernet MAC
+ * addressing and an MTU of ETH_FRAME_LEN, NACK otherwise.  Returns
+ * the send result for an ACK and -ECONNRESET after a NACK.
+ */
+static int handle_attr_info(struct vio_driver_state *vio,
+			    struct vio_net_attr_info *pkt)
+{
+	int acceptable;
+
+	viodbg(HS, "GOT NET ATTR INFO xmode[0x%x] atype[0x%x] addr[%llx] "
+	       "ackfreq[%u] mtu[%llu]\n",
+	       pkt->xfer_mode, pkt->addr_type,
+	       (unsigned long long) pkt->addr,
+	       pkt->ack_freq,
+	       (unsigned long long) pkt->mtu);
+
+	pkt->tag.sid = vio_send_sid(vio);
+
+	acceptable = (pkt->xfer_mode == VIO_DRING_MODE &&
+		      pkt->addr_type == VNET_ADDR_ETHERMAC &&
+		      pkt->mtu == ETH_FRAME_LEN);
+
+	if (acceptable) {
+		viodbg(HS, "SEND NET ATTR ACK\n");
+
+		pkt->tag.stype = VIO_SUBTYPE_ACK;
+
+		return vio_ldc_send(vio, pkt, sizeof(*pkt));
+	}
+
+	viodbg(HS, "SEND NET ATTR NACK\n");
+
+	pkt->tag.stype = VIO_SUBTYPE_NACK;
+
+	/* Best effort: the connection is reset whatever happens. */
+	(void) vio_ldc_send(vio, pkt, sizeof(*pkt));
+
+	return -ECONNRESET;
+}
+
+/* The peer accepted our attributes; nothing more to do.  Returns 0. */
+static int handle_attr_ack(struct vio_driver_state *vio,
+			   struct vio_net_attr_info *pkt)
+{
+	viodbg(HS, "GOT NET ATTR ACK\n");
+
+	return 0;
+}
+
+/* The peer rejected our attributes: report -ECONNRESET. */
+static int handle_attr_nack(struct vio_driver_state *vio,
+			    struct vio_net_attr_info *pkt)
+{
+	viodbg(HS, "GOT NET ATTR NACK\n");
+
+	return -ECONNRESET;
+}
+
+/* Dispatch an attribute message by subtype.  Any subtype other than
+ * INFO, ACK or NACK yields -ECONNRESET.
+ */
+static int vnet_handle_attr(struct vio_driver_state *vio, void *arg)
+{
+	struct vio_net_attr_info *attr = arg;
+
+	if (attr->tag.stype == VIO_SUBTYPE_INFO)
+		return handle_attr_info(vio, attr);
+
+	if (attr->tag.stype == VIO_SUBTYPE_ACK)
+		return handle_attr_ack(vio, attr);
+
+	if (attr->tag.stype == VIO_SUBTYPE_NACK)
+		return handle_attr_nack(vio, attr);
+
+	return -ECONNRESET;
+}
+
+/* Handshake finished: restart the send and receive sequence numbers
+ * of both the RX and the TX ring at 1.
+ */
+static void vnet_handshake_complete(struct vio_driver_state *vio)
+{
+	static const int rings[] = {
+		VIO_DRIVER_RX_RING,
+		VIO_DRIVER_TX_RING,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(rings); i++) {
+		struct vio_dring_state *dr = &vio->drings[rings[i]];
+
+		dr->rcv_nxt = 1;
+		dr->snd_nxt = 1;
+	}
+}
+
+/* The hypervisor interface that implements copying to/from imported
+ * memory from another domain requires that copies are done to 8-byte
+ * aligned buffers, and that the lengths of such copies are also 8-byte
+ * multiples.
+ *
+ * So we align skb->data to an 8-byte multiple and pad-out the data
+ * area so we can round the copy length up to the next multiple of
+ * 8 for the copy.
+ *
+ * The transmitter puts the actual start of the packet 6 bytes into
+ * the buffer it sends over, so that the IP headers after the ethernet
+ * header are aligned properly.  These 6 bytes are not in the descriptor
+ * length, they are simply implied.  This offset is represented using
+ * the VNET_PACKET_SKIP macro.
+ */
+static struct sk_buff *alloc_and_align_skb(struct net_device *dev,
+					   unsigned int len)
+{
+	/* Room for the payload, the implied skip bytes, up to 7 bytes
+	 * of alignment slack and the round-up of the copy length.
+	 */
+	unsigned int alloc_len = len + VNET_PACKET_SKIP + 8 + 8;
+	unsigned long misalign;
+	struct sk_buff *skb;
+
+	skb = netdev_alloc_skb(dev, alloc_len);
+	if (unlikely(!skb))
+		return NULL;
+
+	/* Advance skb->data to the next 8-byte boundary if needed. */
+	misalign = (unsigned long) skb->data & 7UL;
+	if (misalign)
+		skb_reserve(skb, 8UL - misalign);
+
+	return skb;
+}
+
+/* Receive one packet of @len bytes described by @cookies.
+ *
+ * The payload is copied in from the peer with the length rounded up
+ * to an 8-byte multiple (including the VNET_PACKET_SKIP lead-in),
+ * then trimmed back to the real frame and handed to netif_rx().
+ * Returns 0 on success; -EMSGSIZE for a bad length, -ENOMEM when no
+ * skb is available, or the ldc_copy() error.  Every failure is also
+ * counted in rx_dropped.
+ */
+static int vnet_rx_one(struct vnet_port *port, unsigned int len,
+		       struct ldc_trans_cookie *cookies, int ncookies)
+{
+	struct net_device *dev = port->vp->dev;
+	unsigned int padded_len;
+	struct sk_buff *skb;
+	int err;
+
+	if (unlikely(len < ETH_ZLEN || len > ETH_FRAME_LEN)) {
+		dev->stats.rx_length_errors++;
+		err = -EMSGSIZE;
+		goto out_dropped;
+	}
+
+	skb = alloc_and_align_skb(dev, len);
+	if (unlikely(!skb)) {
+		dev->stats.rx_missed_errors++;
+		err = -ENOMEM;
+		goto out_dropped;
+	}
+
+	padded_len = (len + VNET_PACKET_SKIP + 7U) & ~7U;
+	skb_put(skb, padded_len);
+
+	err = ldc_copy(port->vio.lp, LDC_COPY_IN,
+		       skb->data, padded_len, 0,
+		       cookies, ncookies);
+	if (unlikely(err < 0)) {
+		dev->stats.rx_frame_errors++;
+		kfree_skb(skb);
+		goto out_dropped;
+	}
+
+	/* Drop the lead-in and the round-up padding. */
+	skb_pull(skb, VNET_PACKET_SKIP);
+	skb_trim(skb, len);
+	skb->protocol = eth_type_trans(skb, dev);
+
+	dev->stats.rx_bytes += len;
+	dev->stats.rx_packets++;
+
+	netif_rx(skb);
+
+	return 0;
+
+out_dropped:
+	dev->stats.rx_dropped++;
+	return err;
+}
+
+/* Send a DRING_DATA ACK covering descriptors @start..@end with the
+ * given ring state.  While the send reports -EAGAIN it is retried
+ * with a doubling delay capped at 128us.  Returns the last
+ * vio_ldc_send() result; on success dr->snd_nxt is advanced.
+ */
+static int vnet_send_ack(struct vnet_port *port, struct vio_dring_state *dr,
+			 u32 start, u32 end, u8 vio_dring_state)
+{
+	struct vio_dring_data hdr = {
+		.tag = {
+			.type		= VIO_TYPE_DATA,
+			.stype		= VIO_SUBTYPE_ACK,
+			.stype_env	= VIO_DRING_DATA,
+			.sid		= vio_send_sid(&port->vio),
+		},
+		.seq			= dr->snd_nxt,
+		.dring_ident		= dr->ident,
+		.start_idx		= start,
+		.end_idx		= end,
+		.state			= vio_dring_state,
+	};
+	int backoff = 1;
+	int err;
+
+	for (;;) {
+		err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
+		if (err > 0) {
+			dr->snd_nxt++;
+			break;
+		}
+
+		udelay(backoff);
+		backoff <<= 1;
+		if (backoff > 128)
+			backoff = 128;
+
+		if (err != -EAGAIN)
+			break;
+	}
+
+	return err;
+}
+
+/* Ring index following @idx, wrapping to 0 at dr->num_entries. */
+static u32 next_idx(u32 idx, struct vio_dring_state *dr)
+{
+	idx++;
+
+	return (idx == dr->num_entries) ? 0 : idx;
+}
+
+/* Ring index preceding @idx, wrapping from 0 to the last entry. */
+static u32 prev_idx(u32 idx, struct vio_dring_state *dr)
+{
+	if (idx != 0)
+		return idx - 1;
+
+	return dr->num_entries - 1;
+}
+
+/* Fetch descriptor @index of the peer's ring into the port's
+ * descriptor buffer.  Returns that buffer, or an ERR_PTR carrying the
+ * ldc_get_dring_entry() error.
+ */
+static struct vio_net_desc *get_rx_desc(struct vnet_port *port,
+					struct vio_dring_state *dr,
+					u32 index)
+{
+	struct vio_net_desc *buf = port->vio.desc_buf;
+	int err = ldc_get_dring_entry(port->vio.lp, buf, dr->entry_size,
+				      (index * dr->entry_size),
+				      dr->cookies, dr->ncookies);
+
+	return (err < 0) ? ERR_PTR(err) : buf;
+}
+
+/* Write @desc back to slot @index of the peer's ring.  Returns 0 on
+ * success or the negative ldc_put_dring_entry() error.
+ */
+static int put_rx_desc(struct vnet_port *port,
+		       struct vio_dring_state *dr,
+		       struct vio_net_desc *desc,
+		       u32 index)
+{
+	int err = ldc_put_dring_entry(port->vio.lp, desc, dr->entry_size,
+				      (index * dr->entry_size),
+				      dr->cookies, dr->ncookies);
+
+	return (err < 0) ? err : 0;
+}
+
+/* Process the RX descriptor at @index.
+ *
+ * Returns 1 when the descriptor is not READY (nothing to do), 0 after
+ * the descriptor has been consumed and written back as DONE, or a
+ * negative errno.  A receive error other than -ECONNRESET still marks
+ * the descriptor DONE.  On a 0 return *@needs_ack holds the
+ * descriptor's ack field.
+ */
+static int vnet_walk_rx_one(struct vnet_port *port,
+			    struct vio_dring_state *dr,
+			    u32 index, int *needs_ack)
+{
+	struct vio_driver_state *vio = &port->vio;
+	struct vio_net_desc *desc;
+	int err;
+
+	desc = get_rx_desc(port, dr, index);
+	if (IS_ERR(desc))
+		return PTR_ERR(desc);
+
+	viodbg(DATA, "vio_walk_rx_one desc[%02x:%02x:%08x:%08x:%lx:%lx]\n",
+	       desc->hdr.state, desc->hdr.ack,
+	       desc->size, desc->ncookies,
+	       desc->cookies[0].cookie_addr,
+	       desc->cookies[0].cookie_size);
+
+	if (desc->hdr.state != VIO_DESC_READY)
+		return 1;
+
+	if (vnet_rx_one(port, desc->size, desc->cookies,
+			desc->ncookies) == -ECONNRESET)
+		return -ECONNRESET;
+
+	desc->hdr.state = VIO_DESC_DONE;
+
+	err = put_rx_desc(port, dr, desc, index);
+	if (err < 0)
+		return err;
+
+	*needs_ack = desc->hdr.ack;
+
+	return 0;
+}
+
+/* Walk the peer's ring from @start up to and including @end; an @end
+ * of (u32)-1 means "until a descriptor that is not READY", bounded by
+ * one trip around the ring.
+ *
+ * Consumed descriptors are acknowledged in batches: an ACTIVE ack is
+ * sent whenever a descriptor asks for one and more remain, and a
+ * final STOPPED ack is always sent.  Returns -ECONNRESET as soon as
+ * it is seen, otherwise the result of the final vnet_send_ack().
+ */
+static int vnet_walk_rx(struct vnet_port *port, struct vio_dring_state *dr,
+			u32 start, u32 end)
+{
+	struct vio_driver_state *vio = &port->vio;
+	int first_unacked = -1, last_done = -1;
+	u32 idx, stop;
+
+	if (end == (u32) -1)
+		stop = prev_idx(start, dr);
+	else
+		stop = next_idx(end, dr);
+
+	viodbg(DATA, "vnet_walk_rx start[%08x] end[%08x]\n", start, stop);
+
+	for (idx = start; idx != stop; ) {
+		int want_ack = 0;
+		int err = vnet_walk_rx_one(port, dr, idx, &want_ack);
+
+		if (err == -ECONNRESET)
+			return err;
+		if (err != 0)
+			break;
+
+		if (first_unacked == -1)
+			first_unacked = idx;
+		last_done = idx;
+		idx = next_idx(idx, dr);
+
+		/* The last descriptor is covered by the final ack. */
+		if (!want_ack || idx == stop)
+			continue;
+
+		err = vnet_send_ack(port, dr, first_unacked, last_done,
+				    VIO_DRING_ACTIVE);
+		if (err == -ECONNRESET)
+			return err;
+		first_unacked = -1;
+	}
+
+	if (unlikely(first_unacked == -1))
+		first_unacked = last_done = prev_idx(idx, dr);
+
+	return vnet_send_ack(port, dr, first_unacked, last_done,
+			     VIO_DRING_STOPPED);
+}
+
+/* Handle a data INFO message from the peer.  Messages that are not
+ * DRING_DATA, or whose sequence number is not the expected one, are
+ * dropped (the latter with a log message) and 0 is returned.
+ * Otherwise the expected sequence number is advanced and the named
+ * descriptor range is walked.
+ */
+static int vnet_rx(struct vnet_port *port, void *msgbuf)
+{
+	struct vio_driver_state *vio = &port->vio;
+	struct vio_dring_state *dr = &vio->drings[VIO_DRIVER_RX_RING];
+	struct vio_dring_data *pkt = msgbuf;
+
+	viodbg(DATA, "vnet_rx stype_env[%04x] seq[%016lx] rcv_nxt[%016lx]\n",
+	       pkt->tag.stype_env, pkt->seq, dr->rcv_nxt);
+
+	if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA))
+		return 0;
+
+	if (unlikely(pkt->seq != dr->rcv_nxt)) {
+		printk(KERN_ERR PFX "RX out of sequence seq[0x%lx] "
+		       "rcv_nxt[0x%lx]\n", pkt->seq, dr->rcv_nxt);
+		return 0;
+	}
+
+	dr->rcv_nxt++;
+
+	/* XXX Validate pkt->start_idx and pkt->end_idx XXX */
+
+	return vnet_walk_rx(port, dr, pkt->start_idx, pkt->end_idx);
+}
+
+/* Return 1 if @end lies in the range [dr->cons, dr->prod) of the
+ * ring, i.e. names a descriptor we have produced and not yet
+ * reclaimed; 0 otherwise.
+ */
+static int idx_is_pending(struct vio_dring_state *dr, u32 end)
+{
+	u32 idx;
+
+	for (idx = dr->cons; idx != dr->prod; idx = next_idx(idx, dr)) {
+		if (idx == end)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Handle a data ACK from the peer: reclaim TX descriptors up to and
+ * including the acknowledged end index.  ACKs that are not DRING_DATA
+ * or that name an index that is not pending are ignored.
+ *
+ * Returns 1 when the device queue is stopped and enough descriptors
+ * are now free to consider waking it, 0 otherwise.
+ */
+static int vnet_ack(struct vnet_port *port, void *msgbuf)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data *pkt = msgbuf;
+	struct net_device *dev = port->vp->dev;
+	u32 acked;
+
+	if (unlikely(pkt->tag.stype_env != VIO_DRING_DATA))
+		return 0;
+
+	acked = pkt->end_idx;
+	if (unlikely(!idx_is_pending(dr, acked)))
+		return 0;
+
+	dr->cons = next_idx(acked, dr);
+
+	if (unlikely(netif_queue_stopped(dev) &&
+		     vnet_tx_dring_avail(dr) >= VNET_TX_WAKEUP_THRESH(dr)))
+		return 1;
+
+	return 0;
+}
+
+/* NACK handling is not implemented: the message is ignored and 0 is
+ * returned.
+ */
+static int vnet_nack(struct vnet_port *port, void *msgbuf)
+{
+	/* XXX just reset or similar XXX */
+
+	return 0;
+}
+
+/* Wake the device's TX queue if it is stopped and every port's TX
+ * ring has at least VNET_TX_WAKEUP_THRESH free descriptors.  Runs
+ * under the netif TX lock.
+ */
+static void maybe_tx_wakeup(struct vnet *vp)
+{
+	struct net_device *dev = vp->dev;
+	struct vnet_port *port;
+
+	netif_tx_lock(dev);
+
+	if (unlikely(!netif_queue_stopped(dev)))
+		goto out_unlock;
+
+	list_for_each_entry(port, &vp->port_list, list) {
+		struct vio_dring_state *dr;
+
+		dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+		/* One congested port is enough to stay stopped. */
+		if (vnet_tx_dring_avail(dr) < VNET_TX_WAKEUP_THRESH(dr))
+			goto out_unlock;
+	}
+
+	netif_wake_queue(dev);
+
+out_unlock:
+	netif_tx_unlock(dev);
+}
+
+/* LDC event callback for a vnet port.
+ *
+ * Link RESET/UP events go to the VIO layer.  DATA_READY drains the
+ * channel: data INFO/ACK/NACK messages are handled locally, control
+ * messages by the VIO handshake engine, anything else resets the
+ * connection.  The drain stops on a read or SID validation error, on
+ * any non-zero control result, or on -ECONNRESET.
+ *
+ * vio->lock is dropped before the TX wakeup check, but interrupts
+ * stay disabled until the very end.
+ */
+static void vnet_event(void *arg, int event)
+{
+	struct vnet_port *port = arg;
+	struct vio_driver_state *vio = &port->vio;
+	unsigned long flags;
+	int tx_wakeup = 0;
+	int err = 0;
+
+	spin_lock_irqsave(&vio->lock, flags);
+
+	switch (event) {
+	case LDC_EVENT_RESET:
+	case LDC_EVENT_UP:
+		vio_link_state_change(vio, event);
+		spin_unlock_irqrestore(&vio->lock, flags);
+		return;
+
+	case LDC_EVENT_DATA_READY:
+		break;
+
+	default:
+		printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event);
+		spin_unlock_irqrestore(&vio->lock, flags);
+		return;
+	}
+
+	for (;;) {
+		union {
+			struct vio_msg_tag tag;
+			u64 raw[8];
+		} msgbuf;
+
+		err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf));
+		if (err == 0)
+			break;		/* channel drained */
+		if (unlikely(err < 0)) {
+			if (err == -ECONNRESET)
+				vio_conn_reset(vio);
+			break;
+		}
+
+		viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
+		       msgbuf.tag.type,
+		       msgbuf.tag.stype,
+		       msgbuf.tag.stype_env,
+		       msgbuf.tag.sid);
+
+		err = vio_validate_sid(vio, &msgbuf.tag);
+		if (err < 0)
+			break;
+
+		if (msgbuf.tag.type == VIO_TYPE_CTRL) {
+			err = vio_control_pkt_engine(vio, &msgbuf);
+			if (err)
+				break;
+			continue;
+		}
+
+		if (unlikely(msgbuf.tag.type != VIO_TYPE_DATA)) {
+			err = vnet_handle_unknown(port, &msgbuf);
+		} else {
+			switch (msgbuf.tag.stype) {
+			case VIO_SUBTYPE_INFO:
+				err = vnet_rx(port, &msgbuf);
+				break;
+			case VIO_SUBTYPE_ACK:
+				err = vnet_ack(port, &msgbuf);
+				if (err > 0)
+					tx_wakeup |= err;
+				break;
+			case VIO_SUBTYPE_NACK:
+				err = vnet_nack(port, &msgbuf);
+				break;
+			}
+		}
+
+		if (err == -ECONNRESET)
+			break;
+	}
+
+	spin_unlock(&vio->lock);
+
+	if (unlikely(tx_wakeup && err != -ECONNRESET))
+		maybe_tx_wakeup(port->vp);
+
+	local_irq_restore(flags);
+}
+
+/* Send a DRING_DATA message announcing new descriptors starting at
+ * dr->prod with an open end index of (u32)-1.  While the send reports
+ * -EAGAIN it is retried with a doubling delay capped at 128us.
+ * Returns the last vio_ldc_send() result; on success dr->snd_nxt is
+ * advanced.
+ */
+static int __vnet_tx_trigger(struct vnet_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data hdr = {
+		.tag = {
+			.type		= VIO_TYPE_DATA,
+			.stype		= VIO_SUBTYPE_INFO,
+			.stype_env	= VIO_DRING_DATA,
+			.sid		= vio_send_sid(&port->vio),
+		},
+		.seq			= dr->snd_nxt,
+		.dring_ident		= dr->ident,
+		.start_idx		= dr->prod,
+		.end_idx		= (u32) -1,
+	};
+	int backoff = 1;
+	int err;
+
+	for (;;) {
+		err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
+		if (err > 0) {
+			dr->snd_nxt++;
+			break;
+		}
+
+		udelay(backoff);
+		backoff <<= 1;
+		if (backoff > 128)
+			backoff = 128;
+
+		if (err != -EAGAIN)
+			break;
+	}
+
+	return err;
+}
+
+/* Pick the port to transmit @skb on: the port whose remote address
+ * equals the bytes at skb->data (the start of the frame), looked up
+ * in the hash table; failing that the first port on the list; NULL
+ * when there are no ports.  tx_port_find() calls this with vp->lock
+ * held.
+ */
+struct vnet_port *__tx_port_find(struct vnet *vp, struct sk_buff *skb)
+{
+	struct hlist_head *bucket = &vp->port_hash[vnet_hashfn(skb->data)];
+	struct hlist_node *node;
+	struct vnet_port *port;
+
+	hlist_for_each_entry(port, node, bucket, hash) {
+		if (compare_ether_addr(port->raddr, skb->data) == 0)
+			return port;
+	}
+
+	if (list_empty(&vp->port_list))
+		return NULL;
+
+	return list_entry(vp->port_list.next, struct vnet_port, list);
+}
+
+/* Locked wrapper around __tx_port_find(): performs the lookup with
+ * vp->lock held and interrupts disabled.
+ */
+struct vnet_port *tx_port_find(struct vnet *vp, struct sk_buff *skb)
+{
+	struct vnet_port *port;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vp->lock, flags);
+	port = __tx_port_find(vp, skb);
+	spin_unlock_irqrestore(&vp->lock, flags);
+
+	return port;
+}
+
+/* Transmit one packet.
+ *
+ * The frame is copied into the pre-mapped buffer for the current TX
+ * slot (VNET_PACKET_SKIP bytes in), zero-padded up to ETH_ZLEN, then
+ * the descriptor is filled in, marked READY and the peer triggered.
+ *
+ * Returns NETDEV_TX_BUSY when fewer than two descriptors are free.
+ * Otherwise the skb is consumed and NETDEV_TX_OK is returned, whether
+ * the packet was sent or dropped.
+ */
+static int vnet_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vnet *vp = netdev_priv(dev);
+	struct vnet_port *port = tx_port_find(vp, skb);
+	struct vio_dring_state *dr;
+	struct vio_net_desc *desc;
+	unsigned long flags;
+	unsigned int len;
+	void *payload;
+	int i, err;
+
+	if (unlikely(!port))
+		goto out_dropped;
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
+		if (!netif_queue_stopped(dev)) {
+			netif_stop_queue(dev);
+
+			/* This is a hard error, log it. */
+			printk(KERN_ERR PFX "%s: BUG! Tx Ring full when "
+			       "queue awake!\n", dev->name);
+			dev->stats.tx_errors++;
+		}
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	desc = vio_dring_cur(dr);
+
+	payload = port->tx_bufs[dr->prod].buf + VNET_PACKET_SKIP;
+	skb_copy_from_linear_data(skb, payload, skb->len);
+
+	len = skb->len;
+	if (len < ETH_ZLEN) {
+		/* Pad runt frames with zeroes. */
+		memset(payload + skb->len, 0, ETH_ZLEN - skb->len);
+		len = ETH_ZLEN;
+	}
+
+	desc->hdr.ack = VIO_ACK_ENABLE;
+	desc->size = len;
+	desc->ncookies = port->tx_bufs[dr->prod].ncookies;
+	for (i = 0; i < desc->ncookies; i++)
+		desc->cookies[i] = port->tx_bufs[dr->prod].cookies[i];
+
+	/* This has to be a non-SMP write barrier because we are writing
+	 * to memory which is shared with the peer LDOM.
+	 */
+	wmb();
+
+	desc->hdr.state = VIO_DESC_READY;
+
+	err = __vnet_tx_trigger(port);
+	if (unlikely(err < 0)) {
+		printk(KERN_INFO PFX "%s: TX trigger error %d\n",
+		       dev->name, err);
+		desc->hdr.state = VIO_DESC_FREE;
+		dev->stats.tx_carrier_errors++;
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		goto out_dropped;
+	}
+
+	dev->stats.tx_packets++;
+	dev->stats.tx_bytes += skb->len;
+
+	dr->prod = (dr->prod + 1) & (VNET_TX_RING_SIZE - 1);
+	if (unlikely(vnet_tx_dring_avail(dr) < 2)) {
+		netif_stop_queue(dev);
+		if (vnet_tx_dring_avail(dr) > VNET_TX_WAKEUP_THRESH(dr))
+			netif_wake_queue(dev);
+	}
+
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+
+	dev_kfree_skb(skb);
+
+	dev->trans_start = jiffies;
+	return NETDEV_TX_OK;
+
+out_dropped:
+	dev_kfree_skb(skb);
+	dev->stats.tx_dropped++;
+	return NETDEV_TX_OK;
+}
+
+/* tx_timeout handler.  Currently a stub: no recovery is attempted. */
+static void vnet_tx_timeout(struct net_device *dev)
+{
+	/* XXX Implement me XXX */
+}
+
+/* open handler: report carrier and start the TX queue.  Always
+ * returns 0.
+ */
+static int vnet_open(struct net_device *dev)
+{
+	netif_carrier_on(dev);
+	netif_start_queue(dev);
+
+	return 0;
+}
+
+/* stop handler: the reverse of vnet_open().  Stops the TX queue and
+ * drops carrier.  Always returns 0.
+ */
+static int vnet_close(struct net_device *dev)
+{
+	netif_stop_queue(dev);
+	netif_carrier_off(dev);
+
+	return 0;
+}
+
+/* set_multicast_list handler.  Currently a stub: the device's
+ * multicast state is ignored.
+ */
+static void vnet_set_rx_mode(struct net_device *dev)
+{
+	/* XXX Implement multicast support XXX */
+}
+
+/* change_mtu handler.  ETH_DATA_LEN is the only MTU accepted; any
+ * other value is rejected with -EINVAL.
+ */
+static int vnet_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu == ETH_DATA_LEN) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* set_mac_address handler.  Changing the address is not supported;
+ * every request fails with -EINVAL.
+ */
+static int vnet_set_mac_addr(struct net_device *dev, void *p)
+{
+	return -EINVAL;
+}
+
+/* ethtool get_drvinfo: report the driver name and version strings.
+ *
+ * NOTE(review): strcpy() does no bounds checking; this relies on
+ * DRV_MODULE_NAME and DRV_MODULE_VERSION fitting the ethtool_drvinfo
+ * fields -- confirm against their definitions.
+ */
+static void vnet_get_drvinfo(struct net_device *dev,
+			     struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRV_MODULE_NAME);
+	strcpy(info->version, DRV_MODULE_VERSION);
+}
+
+/* ethtool get_msglevel: return the value stored in vp->msg_enable. */
+static u32 vnet_get_msglevel(struct net_device *dev)
+{
+	struct vnet *vp = netdev_priv(dev);
+	return vp->msg_enable;
+}
+
+/* ethtool set_msglevel: store @value in vp->msg_enable. */
+static void vnet_set_msglevel(struct net_device *dev, u32 value)
+{
+	struct vnet *vp = netdev_priv(dev);
+	vp->msg_enable = value;
+}
+
+/* ethtool operations for the vnet device.  Link state and permanent
+ * address queries use the generic ethtool_op_* helpers.
+ */
+static const struct ethtool_ops vnet_ethtool_ops = {
+	.get_drvinfo		= vnet_get_drvinfo,
+	.get_msglevel		= vnet_get_msglevel,
+	.set_msglevel		= vnet_set_msglevel,
+	.get_link		= ethtool_op_get_link,
+	.get_perm_addr		= ethtool_op_get_perm_addr,
+};
+
+/* Undo vnet_port_alloc_tx_bufs(): release the exported TX descriptor
+ * ring (if one was allocated) and then unmap and free every TX buffer
+ * that is present.  Slots whose buffer pointer is NULL are skipped, so
+ * this is safe to call on a partially set up port.
+ */
+static void vnet_port_free_tx_bufs(struct vnet_port *port)
+{
+	struct vio_dring_state *ring = &port->vio.drings[VIO_DRIVER_TX_RING];
+	int slot;
+
+	if (ring->base) {
+		ldc_free_exp_dring(port->vio.lp, ring->base,
+				   (ring->entry_size * ring->num_entries),
+				   ring->cookies, ring->ncookies);
+		ring->base = NULL;
+		ring->entry_size = 0;
+		ring->num_entries = 0;
+		ring->pending = 0;
+		ring->ncookies = 0;
+	}
+
+	for (slot = 0; slot < VNET_TX_RING_SIZE; slot++) {
+		struct vnet_tx_entry *ent = &port->tx_bufs[slot];
+
+		if (ent->buf) {
+			ldc_unmap(port->vio.lp, ent->cookies, ent->ncookies);
+			kfree(ent->buf);
+			ent->buf = NULL;
+		}
+	}
+}
+
+/* Allocate and LDC-map one TX buffer per ring slot, then allocate the
+ * exported TX descriptor ring and initialise its state.
+ *
+ * Returns 0 on success.  On any failure everything set up so far is
+ * released through vnet_port_free_tx_bufs() and a negative errno is
+ * returned.
+ */
+static int __devinit vnet_port_alloc_tx_bufs(struct vnet_port *port)
+{
+	struct vio_dring_state *dr;
+	unsigned long len;
+	int i, err, ncookies;
+	void *dring;
+
+	for (i = 0; i < VNET_TX_RING_SIZE; i++) {
+		void *buf = kzalloc(ETH_FRAME_LEN + 8, GFP_KERNEL);
+		/* Mapped length is ETH_FRAME_LEN rounded up to a
+		 * multiple of 8.
+		 */
+		int map_len = (ETH_FRAME_LEN + 7) & ~7;
+
+		err = -ENOMEM;
+		if (!buf) {
+			printk(KERN_ERR "TX buffer allocation failure\n");
+			goto err_out;
+		}
+		/* The buffer is required to be 8-byte aligned. */
+		err = -EFAULT;
+		if ((unsigned long)buf & (8UL - 1)) {
+			printk(KERN_ERR "TX buffer misaligned\n");
+			kfree(buf);
+			goto err_out;
+		}
+
+		/* At most 2 cookies per buffer; a non-negative return
+		 * value is stored as the cookie count below.
+		 */
+		err = ldc_map_single(port->vio.lp, buf, map_len,
+				     port->tx_bufs[i].cookies, 2,
+				     (LDC_MAP_SHADOW |
+				      LDC_MAP_DIRECT |
+				      LDC_MAP_RW));
+		if (err < 0) {
+			kfree(buf);
+			goto err_out;
+		}
+		port->tx_bufs[i].buf = buf;
+		port->tx_bufs[i].ncookies = err;
+	}
+
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	/* Each ring entry is a vio_net_desc followed by room for two
+	 * cookies; the same size is recorded in dr->entry_size below.
+	 */
+	len = (VNET_TX_RING_SIZE *
+	       (sizeof(struct vio_net_desc) +
+		(sizeof(struct ldc_trans_cookie) * 2)));
+
+	ncookies = VIO_MAX_RING_COOKIES;
+	dring = ldc_alloc_exp_dring(port->vio.lp, len,
+				    dr->cookies, &ncookies,
+				    (LDC_MAP_SHADOW |
+				     LDC_MAP_DIRECT |
+				     LDC_MAP_RW));
+	if (IS_ERR(dring)) {
+		err = PTR_ERR(dring);
+		goto err_out;
+	}
+
+	dr->base = dring;
+	dr->entry_size = (sizeof(struct vio_net_desc) +
+			  (sizeof(struct ldc_trans_cookie) * 2));
+	dr->num_entries = VNET_TX_RING_SIZE;
+	dr->prod = dr->cons = 0;
+	dr->pending = VNET_TX_RING_SIZE;
+	dr->ncookies = ncookies;
+
+	return 0;
+
+err_out:
+	vnet_port_free_tx_bufs(port);
+
+	return err;
+}
+
+/* LDC channel parameters used for every vnet port: events go to
+ * vnet_event() and the link runs in unreliable mode.
+ */
+static struct ldc_channel_config vnet_ldc_cfg = {
+	.event		= vnet_event,
+	.mtu		= 64,
+	.mode		= LDC_MODE_UNRELIABLE,
+};
+
+/* Callbacks handed to vio_driver_init() for each port. */
+static struct vio_driver_ops vnet_vio_ops = {
+	.send_attr		= vnet_send_attr,
+	.handle_attr		= vnet_handle_attr,
+	.handshake_complete	= vnet_handshake_complete,
+};
+
+/* Name of the MD property holding the peer's MAC address. */
+const char *remote_macaddr_prop = "remote-mac-address";
+
+/* Probe one "vnet-port" device: allocate the port, read the peer MAC
+ * address from the machine description, set up VIO/LDC state and the
+ * TX buffers, link the port into its parent vnet and bring it up.
+ *
+ * Returns 0 on success or a negative errno; on failure everything
+ * acquired so far is released.
+ */
+static int __devinit vnet_port_probe(struct vio_dev *vdev,
+				     const struct vio_device_id *id)
+{
+	struct mdesc_handle *hp;
+	struct vnet_port *port;
+	unsigned long flags;
+	struct vnet *vp;
+	const u64 *rmac;
+	int len, i, err, switch_port;
+
+	/* The parent device's drvdata is the vnet set by vnet_probe(). */
+	vp = dev_get_drvdata(vdev->dev.parent);
+	if (!vp) {
+		printk(KERN_ERR PFX "Cannot find port parent vnet.\n");
+		return -ENODEV;
+	}
+
+	/* MD properties are only used between grab and release. */
+	hp = mdesc_grab();
+
+	rmac = mdesc_get_property(hp, vdev->mp, remote_macaddr_prop, &len);
+	err = -ENODEV;
+	if (!rmac) {
+		printk(KERN_ERR PFX "Port lacks %s property.\n",
+		       remote_macaddr_prop);
+		goto err_out_put_mdesc;
+	}
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	err = -ENOMEM;
+	if (!port) {
+		printk(KERN_ERR PFX "Cannot allocate vnet_port.\n");
+		goto err_out_put_mdesc;
+	}
+
+	/* Unpack the low 48 bits of the property, most significant
+	 * byte first.
+	 */
+	for (i = 0; i < ETH_ALEN; i++)
+		port->raddr[i] = (*rmac >> (5 - i) * 8) & 0xff;
+
+	port->vp = vp;
+
+	err = vio_driver_init(&port->vio, vdev, VDEV_NETWORK,
+			      vnet_versions, ARRAY_SIZE(vnet_versions),
+			      &vnet_vio_ops, vp->dev->name);
+	if (err)
+		goto err_out_free_port;
+
+	err = vio_ldc_alloc(&port->vio, &vnet_ldc_cfg, port);
+	if (err)
+		goto err_out_free_port;
+
+	/* Cleans up after itself on failure. */
+	err = vnet_port_alloc_tx_bufs(port);
+	if (err)
+		goto err_out_free_ldc;
+
+	INIT_HLIST_NODE(&port->hash);
+	INIT_LIST_HEAD(&port->list);
+
+	switch_port = 0;
+	if (mdesc_get_property(hp, vdev->mp, "switch-port", NULL) != NULL)
+		switch_port = 1;
+
+	/* Switch ports go to the head of port_list, which is the entry
+	 * __tx_port_find() falls back to when no MAC address matches.
+	 */
+	spin_lock_irqsave(&vp->lock, flags);
+	if (switch_port)
+		list_add(&port->list, &vp->port_list);
+	else
+		list_add_tail(&port->list, &vp->port_list);
+	hlist_add_head(&port->hash, &vp->port_hash[vnet_hashfn(port->raddr)]);
+	spin_unlock_irqrestore(&vp->lock, flags);
+
+	dev_set_drvdata(&vdev->dev, port);
+
+	printk(KERN_INFO "%s: PORT ( remote-mac ", vp->dev->name);
+	for (i = 0; i < 6; i++)
+		printk("%2.2x%c", port->raddr[i], i == 5 ? ' ' : ':');
+	if (switch_port)
+		printk("switch-port ");
+	printk(")\n");
+
+	vio_port_up(&port->vio);
+
+	mdesc_release(hp);
+
+	return 0;
+
+err_out_free_ldc:
+	vio_ldc_free(&port->vio);
+
+err_out_free_port:
+	kfree(port);
+
+err_out_put_mdesc:
+	mdesc_release(hp);
+	return err;
+}
+
+/* Remove one port: stop its VIO timer, unlink it from the parent
+ * vnet's list and hash under vp->lock, then release its TX buffers,
+ * LDC state and the port itself.  Always returns 0; a device without
+ * drvdata is left alone.
+ */
+static int vnet_port_remove(struct vio_dev *vdev)
+{
+	struct vnet_port *port = dev_get_drvdata(&vdev->dev);
+
+	if (port) {
+		struct vnet *vp = port->vp;
+		unsigned long flags;
+
+		del_timer_sync(&port->vio.timer);
+
+		/* Make the port unreachable for lookups before its
+		 * resources are freed.
+		 */
+		spin_lock_irqsave(&vp->lock, flags);
+		list_del(&port->list);
+		hlist_del(&port->hash);
+		spin_unlock_irqrestore(&vp->lock, flags);
+
+		vnet_port_free_tx_bufs(port);
+		vio_ldc_free(&port->vio);
+
+		dev_set_drvdata(&vdev->dev, NULL);
+
+		kfree(port);
+	}
+	return 0;
+}
+
+/* VIO device types bound by vnet_port_driver. */
+static struct vio_device_id vnet_port_match[] = {
+	{
+		.type = "vnet-port",
+	},
+	{},
+};
+/* Export this driver's own table, not the one of the vnet driver. */
+MODULE_DEVICE_TABLE(vio, vnet_port_match);
+
+/* VIO driver for the individual ports of a vnet device. */
+static struct vio_driver vnet_port_driver = {
+	.id_table	= vnet_port_match,
+	.probe		= vnet_port_probe,
+	.remove		= vnet_port_remove,
+	.driver		= {
+		.name	= "vnet_port",
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Name of the MD property holding the device's own MAC address. */
+const char *local_mac_prop = "local-mac-address";
+
+/* Probe one "network" VIO device: read its MAC address from the
+ * machine description, allocate and initialise the net_device with
+ * its embedded struct vnet, and register it.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __devinit vnet_probe(struct vio_dev *vdev,
+				const struct vio_device_id *id)
+{
+	static int vnet_version_printed;
+	struct mdesc_handle *hp;
+	struct net_device *dev;
+	struct vnet *vp;
+	const u64 *mac;
+	int err, i, len;
+
+	/* Print the version banner on the first probe only. */
+	if (vnet_version_printed++ == 0)
+		printk(KERN_INFO "%s", version);
+
+	hp = mdesc_grab();
+
+	mac = mdesc_get_property(hp, vdev->mp, local_mac_prop, &len);
+	if (!mac) {
+		printk(KERN_ERR PFX "vnet lacks %s property.\n",
+		       local_mac_prop);
+		err = -ENODEV;
+		goto err_out;
+	}
+
+	dev = alloc_etherdev(sizeof(*vp));
+	if (!dev) {
+		printk(KERN_ERR PFX "Etherdev alloc failed, aborting.\n");
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	/* Unpack the low 48 bits of the property, most significant
+	 * byte first.
+	 */
+	for (i = 0; i < ETH_ALEN; i++)
+		dev->dev_addr[i] = (*mac >> (5 - i) * 8) & 0xff;
+
+	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
+
+	SET_NETDEV_DEV(dev, &vdev->dev);
+
+	vp = netdev_priv(dev);
+
+	spin_lock_init(&vp->lock);
+	vp->dev = dev;
+	vp->vdev = vdev;
+
+	/* Ports are added later by vnet_port_probe(). */
+	INIT_LIST_HEAD(&vp->port_list);
+	for (i = 0; i < VNET_PORT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&vp->port_hash[i]);
+
+	dev->open = vnet_open;
+	dev->stop = vnet_close;
+	dev->set_multicast_list = vnet_set_rx_mode;
+	dev->set_mac_address = vnet_set_mac_addr;
+	dev->tx_timeout = vnet_tx_timeout;
+	dev->ethtool_ops = &vnet_ethtool_ops;
+	dev->watchdog_timeo = VNET_TX_TIMEOUT;
+	dev->change_mtu = vnet_change_mtu;
+	dev->hard_start_xmit = vnet_start_xmit;
+
+	err = register_netdev(dev);
+	if (err) {
+		printk(KERN_ERR PFX "Cannot register net device, "
+		       "aborting.\n");
+		goto err_out_free_dev;
+	}
+
+	printk(KERN_INFO "%s: Sun LDOM vnet ", dev->name);
+
+	for (i = 0; i < 6; i++)
+		printk("%2.2x%c", dev->dev_addr[i], i == 5 ? '\n' : ':');
+
+	/* vnet_port_probe() finds the vnet through this drvdata. */
+	dev_set_drvdata(&vdev->dev, vp);
+
+	mdesc_release(hp);
+
+	return 0;
+
+err_out_free_dev:
+	free_netdev(dev);
+
+err_out:
+	mdesc_release(hp);
+	return err;
+}
+
+/* Remove a vnet device: unregister its net_device and clear the
+ * drvdata.  Always returns 0; a device without drvdata is left alone.
+ *
+ * The device was registered with register_netdev(), so the matching
+ * unregister_netdev() is used here.  It takes the RTNL lock itself,
+ * whereas unregister_netdevice() expects the caller to hold it.
+ *
+ * NOTE(review): the net_device is not released with free_netdev()
+ * here, and ports still attached to this vnet are not checked for --
+ * see the XXX below.
+ */
+static int vnet_remove(struct vio_dev *vdev)
+{
+	struct vnet *vp = dev_get_drvdata(&vdev->dev);
+
+	if (vp) {
+		/* XXX unregister port, or at least check XXX */
+		unregister_netdev(vp->dev);
+		dev_set_drvdata(&vdev->dev, NULL);
+	}
+	return 0;
+}
+
+/* VIO device types bound by vnet_driver. */
+static struct vio_device_id vnet_match[] = {
+	{
+		.type = "network",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(vio, vnet_match);
+
+/* VIO driver for the vnet device itself; its ports are handled by
+ * vnet_port_driver.
+ */
+static struct vio_driver vnet_driver = {
+	.id_table	= vnet_match,
+	.probe		= vnet_probe,
+	.remove		= vnet_remove,
+	.driver		= {
+		.name	= "vnet",
+		.owner	= THIS_MODULE,
+	}
+};
+
+/* Module init: register the vnet driver first, then the port driver.
+ * If the port driver cannot be registered the vnet driver is
+ * unregistered again, so a failure leaves nothing behind.
+ */
+static int __init vnet_init(void)
+{
+	int err;
+
+	err = vio_register_driver(&vnet_driver);
+	if (err)
+		return err;
+
+	err = vio_register_driver(&vnet_port_driver);
+	if (err)
+		vio_unregister_driver(&vnet_driver);
+
+	return err;
+}
+
+/* Module exit: unregister both drivers in the reverse order of
+ * vnet_init(), port driver first.
+ */
+static void __exit vnet_exit(void)
+{
+	vio_unregister_driver(&vnet_port_driver);
+	vio_unregister_driver(&vnet_driver);
+}
+
+module_init(vnet_init);
+module_exit(vnet_exit);

+ 70 - 0
drivers/net/sunvnet.h

@@ -0,0 +1,70 @@
+#ifndef _SUNVNET_H
+#define _SUNVNET_H
+
+/* NOTE(review): despite the name this evaluates to the number of
+ * bytes that follow struct vio_net_desc in a ring entry, not to a
+ * cookie count -- confirm the intent against its users.
+ */
+#define DESC_NCOOKIES(entry_size)	\
+	((entry_size) - sizeof(struct vio_net_desc))
+
+/* length of time before we decide the hardware is borked,
+ * and dev->tx_timeout() should be called to fix the problem
+ */
+#define VNET_TX_TIMEOUT			(5 * HZ)
+
+/* Number of TX ring entries.  Must stay a power of two because the
+ * producer index is wrapped with a mask.
+ */
+#define VNET_TX_RING_SIZE		512
+/* Wake-up threshold: a quarter of the ring's pending value. */
+#define VNET_TX_WAKEUP_THRESH(dr)	((dr)->pending / 4)
+
+/* VNET packets are sent in buffers with the first 6 bytes skipped
+ * so that after the ethernet header the IPv4/IPv6 headers are aligned
+ * properly.
+ */
+#define VNET_PACKET_SKIP		6
+
+/* One pre-allocated transmit buffer together with the LDC cookies
+ * (at most 2) describing its mapping.
+ */
+struct vnet_tx_entry {
+	void			*buf;
+	unsigned int		ncookies;
+	struct ldc_trans_cookie	cookies[2];
+};
+
+struct vnet;
+/* Per-port state.  A port is linked into its parent vnet twice: in
+ * the MAC address hash (via 'hash') and in the port list (via 'list').
+ */
+struct vnet_port {
+	/* Embedded VIO state; to_vnet_port() maps it back to the port. */
+	struct vio_driver_state	vio;
+
+	struct hlist_node	hash;
+	/* MAC address of the peer, used as the hash key. */
+	u8			raddr[ETH_ALEN];
+
+	/* Parent vnet device. */
+	struct vnet		*vp;
+
+	/* One TX buffer per TX ring slot. */
+	struct vnet_tx_entry	tx_bufs[VNET_TX_RING_SIZE];
+
+	struct list_head	list;
+};
+
+/* Map an embedded vio_driver_state back to its containing port. */
+static inline struct vnet_port *to_vnet_port(struct vio_driver_state *vio)
+{
+	return container_of(vio, struct vnet_port, vio);
+}
+
+#define VNET_PORT_HASH_SIZE	16
+#define VNET_PORT_HASH_MASK	(VNET_PORT_HASH_SIZE - 1)
+
+/* Hash bucket for a MAC address: XOR of its last two bytes, reduced
+ * to the table size.
+ */
+static inline unsigned int vnet_hashfn(u8 *mac)
+{
+	unsigned int folded = mac[4] ^ mac[5];
+
+	return folded & VNET_PORT_HASH_MASK;
+}
+
+/* Per-device state, stored in the net_device private area. */
+struct vnet {
+	/* Protects port_list and port_hash.  */
+	spinlock_t		lock;
+
+	/* Back pointer to the owning net_device. */
+	struct net_device	*dev;
+
+	/* Message level read and written through ethtool. */
+	u32			msg_enable;
+	struct vio_dev		*vdev;
+
+	/* All ports; the head entry is the default transmit port. */
+	struct list_head	port_list;
+
+	/* Ports hashed by remote MAC address, see vnet_hashfn(). */
+	struct hlist_head	port_hash[VNET_PORT_HASH_SIZE];
+};
+
+#endif /* _SUNVNET_H */

+ 26 - 4
drivers/serial/sunhv.c

@@ -440,8 +440,16 @@ static void sunhv_console_write_paged(struct console *con, const char *s, unsign
 {
 {
 	struct uart_port *port = sunhv_port;
 	struct uart_port *port = sunhv_port;
 	unsigned long flags;
 	unsigned long flags;
+	int locked = 1;
+
+	local_irq_save(flags);
+	if (port->sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&port->lock);
+	} else
+		spin_lock(&port->lock);
 
 
-	spin_lock_irqsave(&port->lock, flags);
 	while (n > 0) {
 	while (n > 0) {
 		unsigned long ra = __pa(con_write_page);
 		unsigned long ra = __pa(con_write_page);
 		unsigned long page_bytes;
 		unsigned long page_bytes;
@@ -469,7 +477,10 @@ static void sunhv_console_write_paged(struct console *con, const char *s, unsign
 			ra += written;
 			ra += written;
 		}
 		}
 	}
 	}
-	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (locked)
+		spin_unlock(&port->lock);
+	local_irq_restore(flags);
 }
 }
 
 
 static inline void sunhv_console_putchar(struct uart_port *port, char c)
 static inline void sunhv_console_putchar(struct uart_port *port, char c)
@@ -488,7 +499,15 @@ static void sunhv_console_write_bychar(struct console *con, const char *s, unsig
 {
 {
 	struct uart_port *port = sunhv_port;
 	struct uart_port *port = sunhv_port;
 	unsigned long flags;
 	unsigned long flags;
-	int i;
+	int i, locked = 1;
+
+	local_irq_save(flags);
+	if (port->sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&port->lock);
+	} else
+		spin_lock(&port->lock);
 
 
 	spin_lock_irqsave(&port->lock, flags);
 	spin_lock_irqsave(&port->lock, flags);
 	for (i = 0; i < n; i++) {
 	for (i = 0; i < n; i++) {
@@ -496,7 +515,10 @@ static void sunhv_console_write_bychar(struct console *con, const char *s, unsig
 			sunhv_console_putchar(port, '\r');
 			sunhv_console_putchar(port, '\r');
 		sunhv_console_putchar(port, *s++);
 		sunhv_console_putchar(port, *s++);
 	}
 	}
-	spin_unlock_irqrestore(&port->lock, flags);
+
+	if (locked)
+		spin_unlock(&port->lock);
+	local_irq_restore(flags);
 }
 }
 
 
 static struct console sunhv_console = {
 static struct console sunhv_console = {

+ 14 - 5
drivers/serial/sunsab.c

@@ -860,22 +860,31 @@ static int num_channels;
 static void sunsab_console_putchar(struct uart_port *port, int c)
 static void sunsab_console_putchar(struct uart_port *port, int c)
 {
 {
 	struct uart_sunsab_port *up = (struct uart_sunsab_port *)port;
 	struct uart_sunsab_port *up = (struct uart_sunsab_port *)port;
-	unsigned long flags;
-
-	spin_lock_irqsave(&up->port.lock, flags);
 
 
 	sunsab_tec_wait(up);
 	sunsab_tec_wait(up);
 	writeb(c, &up->regs->w.tic);
 	writeb(c, &up->regs->w.tic);
-
-	spin_unlock_irqrestore(&up->port.lock, flags);
 }
 }
 
 
 static void sunsab_console_write(struct console *con, const char *s, unsigned n)
 static void sunsab_console_write(struct console *con, const char *s, unsigned n)
 {
 {
 	struct uart_sunsab_port *up = &sunsab_ports[con->index];
 	struct uart_sunsab_port *up = &sunsab_ports[con->index];
+	unsigned long flags;
+	int locked = 1;
+
+	local_irq_save(flags);
+	if (up->port.sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&up->port.lock);
+	} else
+		spin_lock(&up->port.lock);
 
 
 	uart_console_write(&up->port, s, n, sunsab_console_putchar);
 	uart_console_write(&up->port, s, n, sunsab_console_putchar);
 	sunsab_tec_wait(up);
 	sunsab_tec_wait(up);
+
+	if (locked)
+		spin_unlock(&up->port.lock);
+	local_irq_restore(flags);
 }
 }
 
 
 static int sunsab_console_setup(struct console *con, char *options)
 static int sunsab_console_setup(struct console *con, char *options)

+ 14 - 0
drivers/serial/sunsu.c

@@ -1288,7 +1288,17 @@ static void sunsu_console_write(struct console *co, const char *s,
 				unsigned int count)
 				unsigned int count)
 {
 {
 	struct uart_sunsu_port *up = &sunsu_ports[co->index];
 	struct uart_sunsu_port *up = &sunsu_ports[co->index];
+	unsigned long flags;
 	unsigned int ier;
 	unsigned int ier;
+	int locked = 1;
+
+	local_irq_save(flags);
+	if (up->port.sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&up->port.lock);
+	} else
+		spin_lock(&up->port.lock);
 
 
 	/*
 	/*
 	 *	First save the UER then disable the interrupts
 	 *	First save the UER then disable the interrupts
@@ -1304,6 +1314,10 @@ static void sunsu_console_write(struct console *co, const char *s,
 	 */
 	 */
 	wait_for_xmitr(up);
 	wait_for_xmitr(up);
 	serial_out(up, UART_IER, ier);
 	serial_out(up, UART_IER, ier);
+
+	if (locked)
+		spin_unlock(&up->port.lock);
+	local_irq_restore(flags);
 }
 }
 
 
 /*
 /*

+ 14 - 3
drivers/serial/sunzilog.c

@@ -9,7 +9,7 @@
  * C. Dost, Pete Zaitcev, Ted Ts'o and Alex Buell for their
  * C. Dost, Pete Zaitcev, Ted Ts'o and Alex Buell for their
  * work there.
  * work there.
  *
  *
- *  Copyright (C) 2002, 2006 David S. Miller (davem@davemloft.net)
+ * Copyright (C) 2002, 2006, 2007 David S. Miller (davem@davemloft.net)
  */
  */
 
 
 #include <linux/module.h>
 #include <linux/module.h>
@@ -1151,11 +1151,22 @@ sunzilog_console_write(struct console *con, const char *s, unsigned int count)
 {
 {
 	struct uart_sunzilog_port *up = &sunzilog_port_table[con->index];
 	struct uart_sunzilog_port *up = &sunzilog_port_table[con->index];
 	unsigned long flags;
 	unsigned long flags;
+	int locked = 1;
+
+	local_irq_save(flags);
+	if (up->port.sysrq) {
+		locked = 0;
+	} else if (oops_in_progress) {
+		locked = spin_trylock(&up->port.lock);
+	} else
+		spin_lock(&up->port.lock);
 
 
-	spin_lock_irqsave(&up->port.lock, flags);
 	uart_console_write(&up->port, s, count, sunzilog_putchar);
 	uart_console_write(&up->port, s, count, sunzilog_putchar);
 	udelay(2);
 	udelay(2);
-	spin_unlock_irqrestore(&up->port.lock, flags);
+
+	if (locked)
+		spin_unlock(&up->port.lock);
+	local_irq_restore(flags);
 }
 }
 
 
 static int __init sunzilog_console_setup(struct console *con, char *options)
 static int __init sunzilog_console_setup(struct console *con, char *options)

+ 0 - 5
include/asm-sparc64/bugs.h

@@ -4,12 +4,7 @@
  */
  */
 #include <asm/sstate.h>
 #include <asm/sstate.h>
 
 
-extern unsigned long loops_per_jiffy;
-
 static void __init check_bugs(void)
 static void __init check_bugs(void)
 {
 {
-#ifndef CONFIG_SMP
-	cpu_data(0).udelay_val = loops_per_jiffy;
-#endif
 	sstate_running();
 	sstate_running();
 }
 }

+ 3 - 2
include/asm-sparc64/cpudata.h

@@ -19,7 +19,7 @@ typedef struct {
 	unsigned int	__softirq_pending; /* must be 1st, see rtrap.S */
 	unsigned int	__softirq_pending; /* must be 1st, see rtrap.S */
 	unsigned int	__pad0;
 	unsigned int	__pad0;
 	unsigned long	clock_tick;	/* %tick's per second */
 	unsigned long	clock_tick;	/* %tick's per second */
-	unsigned long	udelay_val;
+	unsigned long	__pad;
 	unsigned int	__pad1;
 	unsigned int	__pad1;
 	unsigned int	__pad2;
 	unsigned int	__pad2;
 
 
@@ -80,7 +80,8 @@ struct trap_per_cpu {
 	unsigned int		dev_mondo_qmask;
 	unsigned int		dev_mondo_qmask;
 	unsigned int		resum_qmask;
 	unsigned int		resum_qmask;
 	unsigned int		nonresum_qmask;
 	unsigned int		nonresum_qmask;
-	unsigned int		__pad2[3];
+	unsigned int		__pad2[1];
+	void			*hdesc;
 } __attribute__((aligned(64)));
 } __attribute__((aligned(64)));
 extern struct trap_per_cpu trap_block[NR_CPUS];
 extern struct trap_per_cpu trap_block[NR_CPUS];
 extern void init_cur_cpu_trap(struct thread_info *);
 extern void init_cur_cpu_trap(struct thread_info *);

+ 6 - 26
include/asm-sparc64/delay.h

@@ -1,37 +1,17 @@
 /* delay.h: Linux delay routines on sparc64.
 /* delay.h: Linux delay routines on sparc64.
  *
  *
- * Copyright (C) 1996, 2004 David S. Miller (davem@davemloft.net).
- *
- * Based heavily upon x86 variant which is:
- * Copyright (C) 1993 Linus Torvalds
- *
- * Delay routines calling functions in arch/sparc64/lib/delay.c
+ * Copyright (C) 1996, 2004, 2007 David S. Miller (davem@davemloft.net).
  */
  */
 
 
-#ifndef __SPARC64_DELAY_H
-#define __SPARC64_DELAY_H
-
-#include <linux/param.h>
-#include <asm/cpudata.h>
+#ifndef _SPARC64_DELAY_H
+#define _SPARC64_DELAY_H
 
 
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
 
 
-extern void __bad_udelay(void);
-extern void __bad_ndelay(void);
-
-extern void __udelay(unsigned long usecs);
-extern void __ndelay(unsigned long nsecs);
-extern void __const_udelay(unsigned long usecs);
 extern void __delay(unsigned long loops);
 extern void __delay(unsigned long loops);
-
-#define udelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
-	__udelay(n))
-	
-#define ndelay(n) (__builtin_constant_p(n) ? \
-	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
-	__ndelay(n))
+extern void udelay(unsigned long usecs);
+#define mdelay(n)	udelay((n) * 1000)
 
 
 #endif /* !__ASSEMBLY__ */
 #endif /* !__ASSEMBLY__ */
 
 
-#endif /* defined(__SPARC64_DELAY_H) */
+#endif /* _SPARC64_DELAY_H */

+ 37 - 0
include/asm-sparc64/hvtramp.h

@@ -0,0 +1,37 @@
+#ifndef _SPARC64_HVTRAP_H
+#define _SPARC64_HVTRAP_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+
+/* One virtual address / TTE pair; laid out as described by the
+ * HVTRAMP_MAPPING_* offsets defined later in this header.
+ */
+struct hvtramp_mapping {
+	__u64		vaddr;
+	__u64		tte;
+};
+
+/* Descriptor whose physical address is passed to hv_cpu_startup().
+ * The HVTRAMP_DESCR_* offsets defined later in this header mirror
+ * this layout and must be kept in sync with it.
+ */
+struct hvtramp_descr {
+	__u32			cpu;
+	__u32			num_mappings;
+	__u64			fault_info_va;
+	__u64			fault_info_pa;
+	__u64			thread_reg;
+	struct hvtramp_mapping	maps[2];
+};
+
+extern void hv_cpu_startup(unsigned long hvdescr_pa);
+
+#endif
+
+#define HVTRAMP_DESCR_CPU		0x00
+#define HVTRAMP_DESCR_NUM_MAPPINGS	0x04
+#define HVTRAMP_DESCR_FAULT_INFO_VA	0x08
+#define HVTRAMP_DESCR_FAULT_INFO_PA	0x10
+#define HVTRAMP_DESCR_THREAD_REG	0x18
+#define HVTRAMP_DESCR_MAPS		0x20
+
+#define HVTRAMP_MAPPING_VADDR		0x00
+#define HVTRAMP_MAPPING_TTE		0x08
+#define HVTRAMP_MAPPING_SIZE		0x10
+
+#endif /* _SPARC64_HVTRAP_H */

+ 1 - 1
include/asm-sparc64/hypervisor.h

@@ -98,7 +98,7 @@
 #define HV_FAST_MACH_EXIT		0x00
 #define HV_FAST_MACH_EXIT		0x00
 
 
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
-extern void sun4v_mach_exit(unsigned long exit_core);
+extern void sun4v_mach_exit(unsigned long exit_code);
 #endif
 #endif
 
 
 /* Domain services.  */
 /* Domain services.  */

+ 2 - 0
include/asm-sparc64/irq.h

@@ -53,6 +53,8 @@ extern unsigned int sun4v_build_msi(u32 devhandle, unsigned int *virt_irq_p,
 extern void sun4v_destroy_msi(unsigned int virt_irq);
 extern void sun4v_destroy_msi(unsigned int virt_irq);
 extern unsigned int sbus_build_irq(void *sbus, unsigned int ino);
 extern unsigned int sbus_build_irq(void *sbus, unsigned int ino);
 
 
+extern void fixup_irqs(void);
+
 static __inline__ void set_softint(unsigned long bits)
 static __inline__ void set_softint(unsigned long bits)
 {
 {
 	__asm__ __volatile__("wr	%0, 0x0, %%set_softint"
 	__asm__ __volatile__("wr	%0, 0x0, %%set_softint"

+ 138 - 0
include/asm-sparc64/ldc.h

@@ -0,0 +1,138 @@
+#ifndef _SPARC64_LDC_H
+#define _SPARC64_LDC_H
+
+#include <asm/hypervisor.h>
+
+extern int ldom_domaining_enabled;
+extern void ldom_set_var(const char *var, const char *value);
+extern void ldom_reboot(const char *boot_command);
+extern void ldom_power_off(void);
+
+/* The event handler will be invoked when link state changes
+ * or data becomes available on the receive side.
+ *
+ * For non-RAW links, if the LDC_EVENT_RESET event arrives the
+ * driver should reset all of its internal state and reinvoke
+ * ldc_connect() to try and bring the link up again.
+ *
+ * For RAW links, ldc_connect() is not used.  Instead the driver
+ * just waits for the LDC_EVENT_UP event.
+ */
+struct ldc_channel_config {
+	void (*event)(void *arg, int event);
+
+	u32			mtu;
+	unsigned int		rx_irq;
+	unsigned int		tx_irq;
+	u8			mode;
+#define LDC_MODE_RAW		0x00
+#define LDC_MODE_UNRELIABLE	0x01
+#define LDC_MODE_RESERVED	0x02
+#define LDC_MODE_STREAM		0x03
+
+	u8			debug;
+#define LDC_DEBUG_HS		0x01
+#define LDC_DEBUG_STATE		0x02
+#define LDC_DEBUG_RX		0x04
+#define LDC_DEBUG_TX		0x08
+#define LDC_DEBUG_DATA		0x10
+};
+
+#define LDC_EVENT_RESET		0x01
+#define LDC_EVENT_UP		0x02
+#define LDC_EVENT_DATA_READY	0x04
+
+#define LDC_STATE_INVALID	0x00
+#define LDC_STATE_INIT		0x01
+#define LDC_STATE_BOUND		0x02
+#define LDC_STATE_READY		0x03
+#define LDC_STATE_CONNECTED	0x04
+
+struct ldc_channel;
+
+/* Allocate state for a channel.  */
+extern struct ldc_channel *ldc_alloc(unsigned long id,
+				     const struct ldc_channel_config *cfgp,
+				     void *event_arg);
+
+/* Shut down and free state for a channel.  */
+extern void ldc_free(struct ldc_channel *lp);
+
+/* Register TX and RX queues of the link with the hypervisor.  */
+extern int ldc_bind(struct ldc_channel *lp, const char *name);
+
+/* For non-RAW protocols we need to complete a handshake before
+ * communication can proceed.  ldc_connect() does that, if the
+ * handshake completes successfully, an LDC_EVENT_UP event will
+ * be sent up to the driver.
+ */
+extern int ldc_connect(struct ldc_channel *lp);
+extern int ldc_disconnect(struct ldc_channel *lp);
+
+extern int ldc_state(struct ldc_channel *lp);
+
+/* Read and write operations.  Only valid when the link is up.  */
+extern int ldc_write(struct ldc_channel *lp, const void *buf,
+		     unsigned int size);
+extern int ldc_read(struct ldc_channel *lp, void *buf, unsigned int size);
+
+#define LDC_MAP_SHADOW	0x01
+#define LDC_MAP_DIRECT	0x02
+#define LDC_MAP_IO	0x04
+#define LDC_MAP_R	0x08
+#define LDC_MAP_W	0x10
+#define LDC_MAP_X	0x20
+#define LDC_MAP_RW	(LDC_MAP_R | LDC_MAP_W)
+#define LDC_MAP_RWX	(LDC_MAP_R | LDC_MAP_W | LDC_MAP_X)
+#define LDC_MAP_ALL	0x03f
+
+struct ldc_trans_cookie {
+	u64			cookie_addr;
+	u64			cookie_size;
+};
+
+struct scatterlist;
+extern int ldc_map_sg(struct ldc_channel *lp,
+		      struct scatterlist *sg, int num_sg,
+		      struct ldc_trans_cookie *cookies, int ncookies,
+		      unsigned int map_perm);
+
+extern int ldc_map_single(struct ldc_channel *lp,
+			  void *buf, unsigned int len,
+			  struct ldc_trans_cookie *cookies, int ncookies,
+			  unsigned int map_perm);
+
+extern void ldc_unmap(struct ldc_channel *lp, struct ldc_trans_cookie *cookies,
+		      int ncookies);
+
+extern int ldc_copy(struct ldc_channel *lp, int copy_dir,
+		    void *buf, unsigned int len, unsigned long offset,
+		    struct ldc_trans_cookie *cookies, int ncookies);
+
+/* Convenience wrapper: ldc_copy() in the LDC_COPY_IN direction. */
+static inline int ldc_get_dring_entry(struct ldc_channel *lp,
+				      void *buf, unsigned int len,
+				      unsigned long offset,
+				      struct ldc_trans_cookie *cookies,
+				      int ncookies)
+{
+	return ldc_copy(lp, LDC_COPY_IN, buf, len, offset, cookies, ncookies);
+}
+
+/* Convenience wrapper: ldc_copy() in the LDC_COPY_OUT direction. */
+static inline int ldc_put_dring_entry(struct ldc_channel *lp,
+				      void *buf, unsigned int len,
+				      unsigned long offset,
+				      struct ldc_trans_cookie *cookies,
+				      int ncookies)
+{
+	return ldc_copy(lp, LDC_COPY_OUT, buf, len, offset, cookies, ncookies);
+}
+
+extern void *ldc_alloc_exp_dring(struct ldc_channel *lp, unsigned int len,
+				 struct ldc_trans_cookie *cookies,
+				 int *ncookies, unsigned int map_perm);
+
+extern void ldc_free_exp_dring(struct ldc_channel *lp, void *buf,
+			       unsigned int len,
+			       struct ldc_trans_cookie *cookies, int ncookies);
+
+#endif /* _SPARC64_LDC_H */

+ 58 - 30
include/asm-sparc64/mdesc.h

@@ -2,38 +2,66 @@
 #define _SPARC64_MDESC_H
 #define _SPARC64_MDESC_H
 
 
 #include <linux/types.h>
 #include <linux/types.h>
+#include <linux/cpumask.h>
 #include <asm/prom.h>
 #include <asm/prom.h>
 
 
-struct mdesc_node;
-struct mdesc_arc {
-	const char		*name;
-	struct mdesc_node	*arc;
-};
-
-struct mdesc_node {
-	const char		*name;
-	u64			node;
-	unsigned int		unique_id;
-	unsigned int		num_arcs;
-	unsigned int		irqs[2];
-	struct property		*properties;
-	struct mdesc_node	*hash_next;
-	struct mdesc_node	*allnodes_next;
-	struct mdesc_arc	arcs[0];
-};
-
-extern struct mdesc_node *md_find_node_by_name(struct mdesc_node *from,
-					       const char *name);
-#define md_for_each_node_by_name(__mn, __name) \
-	for (__mn = md_find_node_by_name(NULL, __name); __mn; \
-	     __mn = md_find_node_by_name(__mn, __name))
-
-extern struct property *md_find_property(const struct mdesc_node *mp,
-					 const char *name,
-					 int *lenp);
-extern const void *md_get_property(const struct mdesc_node *mp,
-				   const char *name,
-				   int *lenp);
+struct mdesc_handle;
+
+/* Machine description operations are to be surrounded by grab and
+ * release calls.  The mdesc_handle returned from the grab is
+ * the first argument to all of the operational calls that work
+ * on mdescs.
+ */
+extern struct mdesc_handle *mdesc_grab(void);
+extern void mdesc_release(struct mdesc_handle *);
+
+#define MDESC_NODE_NULL		(~(u64)0)
+
+extern u64 mdesc_node_by_name(struct mdesc_handle *handle,
+			      u64 from_node, const char *name);
+#define mdesc_for_each_node_by_name(__hdl, __node, __name) \
+	for (__node = mdesc_node_by_name(__hdl, MDESC_NODE_NULL, __name); \
+	     (__node) != MDESC_NODE_NULL; \
+	     __node = mdesc_node_by_name(__hdl, __node, __name))
+
+/* Property values returned from mdesc_get_property() are only
+ * valid inside of a mdesc_grab()/mdesc_release() sequence.
+ * Once mdesc_release() is called, these pointers may reference
+ * freed memory.
+ *
+ * Therefore callers must make copies of any property values
+ * they need.
+ *
+ * These same rules apply to mdesc_node_name().
+ */
+extern const void *mdesc_get_property(struct mdesc_handle *handle,
+				      u64 node, const char *name, int *lenp);
+extern const char *mdesc_node_name(struct mdesc_handle *hp, u64 node);
+
+/* MD arc iteration, the standard sequence is:
+ *
+ *	unsigned long arc;
+ *	mdesc_for_each_arc(arc, handle, node, MDESC_ARC_TYPE_{FWD,BACK}) {
+ *		unsigned long target = mdesc_arc_target(handle, arc);
+ *		...
+ *	}
+ */
+
+#define MDESC_ARC_TYPE_FWD	"fwd"
+#define MDESC_ARC_TYPE_BACK	"back"
+
+extern u64 mdesc_next_arc(struct mdesc_handle *handle, u64 from,
+			  const char *arc_type);
+#define mdesc_for_each_arc(__arc, __hdl, __node, __type) \
+	for (__arc = mdesc_next_arc(__hdl, __node, __type); \
+	     (__arc) != MDESC_NODE_NULL; \
+	     __arc = mdesc_next_arc(__hdl, __arc, __type))
+
+extern u64 mdesc_arc_target(struct mdesc_handle *hp, u64 arc);
+
+extern void mdesc_update(void);
+
+extern void mdesc_fill_in_cpu_data(cpumask_t mask);
 
 
 extern void sun4v_mdesc_init(void);
 extern void sun4v_mdesc_init(void);
 
 

+ 3 - 0
include/asm-sparc64/mmu_context.h

@@ -76,6 +76,9 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, str
 	unsigned long ctx_valid, flags;
 	unsigned long ctx_valid, flags;
 	int cpu;
 	int cpu;
 
 
+	if (unlikely(mm == &init_mm))
+		return;
+
 	spin_lock_irqsave(&mm->context.lock, flags);
 	spin_lock_irqsave(&mm->context.lock, flags);
 	ctx_valid = CTX_VALID(mm->context);
 	ctx_valid = CTX_VALID(mm->context);
 	if (!ctx_valid)
 	if (!ctx_valid)

+ 7 - 0
include/asm-sparc64/power.h

@@ -0,0 +1,7 @@
+#ifndef _SPARC64_POWER_H
+#define _SPARC64_POWER_H
+
+extern void wake_up_powerd(void);
+extern int start_powerd(void);
+
+#endif /* !(_SPARC64_POWER_H) */

+ 6 - 5
include/asm-sparc64/smp.h

@@ -29,9 +29,6 @@
 #include <asm/bitops.h>
 #include <asm/bitops.h>
 #include <asm/atomic.h>
 #include <asm/atomic.h>
 
 
-extern cpumask_t phys_cpu_present_map;
-#define cpu_possible_map phys_cpu_present_map
-
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_sibling_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 extern int sparc64_multi_core;
 extern int sparc64_multi_core;
@@ -44,7 +41,12 @@ extern int hard_smp_processor_id(void);
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 #define raw_smp_processor_id() (current_thread_info()->cpu)
 
 
 extern void smp_fill_in_sib_core_maps(void);
 extern void smp_fill_in_sib_core_maps(void);
-extern unsigned char boot_cpu_id;
+extern void cpu_play_dead(void);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+#endif
 
 
 #endif /* !(__ASSEMBLY__) */
 #endif /* !(__ASSEMBLY__) */
 
 
@@ -52,7 +54,6 @@ extern unsigned char boot_cpu_id;
 
 
 #define hard_smp_processor_id()		0
 #define hard_smp_processor_id()		0
 #define smp_fill_in_sib_core_maps() do { } while (0)
 #define smp_fill_in_sib_core_maps() do { } while (0)
-#define boot_cpu_id	(0)
 
 
 #endif /* !(CONFIG_SMP) */
 #endif /* !(CONFIG_SMP) */
 
 

+ 404 - 0
include/asm-sparc64/vio.h

@@ -0,0 +1,404 @@
+#ifndef _SPARC64_VIO_H
+#define _SPARC64_VIO_H
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/timer.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/list.h>
+
+#include <asm/ldc.h>
+#include <asm/mdesc.h>
+
+struct vio_msg_tag {
+	u8			type;
+#define VIO_TYPE_CTRL		0x01
+#define VIO_TYPE_DATA		0x02
+#define VIO_TYPE_ERR		0x04
+
+	u8			stype;
+#define VIO_SUBTYPE_INFO	0x01
+#define VIO_SUBTYPE_ACK		0x02
+#define VIO_SUBTYPE_NACK	0x04
+
+	u16			stype_env;
+#define VIO_VER_INFO		0x0001
+#define VIO_ATTR_INFO		0x0002
+#define VIO_DRING_REG		0x0003
+#define VIO_DRING_UNREG		0x0004
+#define VIO_RDX			0x0005
+#define VIO_PKT_DATA		0x0040
+#define VIO_DESC_DATA		0x0041
+#define VIO_DRING_DATA		0x0042
+#define VNET_MCAST_INFO		0x0101
+
+	u32		sid;
+};
+
+struct vio_rdx {
+	struct vio_msg_tag	tag;
+	u64			resv[6];
+};
+
+struct vio_ver_info {
+	struct vio_msg_tag	tag;
+	u16			major;
+	u16			minor;
+	u8			dev_class;
+#define VDEV_NETWORK		0x01
+#define VDEV_NETWORK_SWITCH	0x02
+#define VDEV_DISK		0x03
+#define VDEV_DISK_SERVER	0x04
+
+	u8			resv1[3];
+	u64			resv2[5];
+};
+
+struct vio_dring_register {
+	struct vio_msg_tag	tag;
+	u64			dring_ident;
+	u32			num_descr;
+	u32			descr_size;
+	u16			options;
+#define VIO_TX_DRING		0x0001
+#define VIO_RX_DRING		0x0002
+	u16			resv;
+	u32			num_cookies;
+	struct ldc_trans_cookie	cookies[0];
+};
+
+struct vio_dring_unregister {
+	struct vio_msg_tag	tag;
+	u64			dring_ident;
+	u64			resv[5];
+};
+
+/* Data transfer modes */
+#define VIO_PKT_MODE		0x01 /* Packet based transfer	*/
+#define VIO_DESC_MODE		0x02 /* In-band descriptors	*/
+#define VIO_DRING_MODE		0x03 /* Descriptor rings	*/
+
+struct vio_dring_data {
+	struct vio_msg_tag	tag;
+	u64			seq;
+	u64			dring_ident;
+	u32			start_idx;
+	u32			end_idx;
+	u8			state;
+#define VIO_DRING_ACTIVE	0x01
+#define VIO_DRING_STOPPED	0x02
+
+	u8			__pad1;
+	u16			__pad2;
+	u32			__pad3;
+	u64			__par4[2];	/* NOTE(review): presumably meant __pad4, following __pad1..3 -- confirm */
+};
+
+struct vio_dring_hdr {
+	u8			state;
+#define VIO_DESC_FREE		0x01
+#define VIO_DESC_READY		0x02
+#define VIO_DESC_ACCEPTED	0x03
+#define VIO_DESC_DONE		0x04
+	u8			ack;
+#define VIO_ACK_ENABLE		0x01
+#define VIO_ACK_DISABLE		0x00
+
+	u16			__pad1;
+	u32			__pad2;
+};
+
+/* VIO disk specific structures and defines */
+struct vio_disk_attr_info {
+	struct vio_msg_tag	tag;
+	u8			xfer_mode;
+	u8			vdisk_type;
+#define VD_DISK_TYPE_SLICE	0x01 /* Slice in block device	*/
+#define VD_DISK_TYPE_DISK	0x02 /* Entire block device	*/
+	u16			resv1;
+	u32			vdisk_block_size;
+	u64			operations;
+	u64			vdisk_size;
+	u64			max_xfer_size;
+	u64			resv2[2];
+};
+
+struct vio_disk_desc {
+	struct vio_dring_hdr	hdr;
+	u64			req_id;
+	u8			operation;
+#define VD_OP_BREAD		0x01 /* Block read			*/
+#define VD_OP_BWRITE		0x02 /* Block write			*/
+#define VD_OP_FLUSH		0x03 /* Flush disk contents		*/
+#define VD_OP_GET_WCE		0x04 /* Get write-cache status		*/
+#define VD_OP_SET_WCE		0x05 /* Enable/disable write-cache	*/
+#define VD_OP_GET_VTOC		0x06 /* Get VTOC			*/
+#define VD_OP_SET_VTOC		0x07 /* Set VTOC			*/
+#define VD_OP_GET_DISKGEOM	0x08 /* Get disk geometry		*/
+#define VD_OP_SET_DISKGEOM	0x09 /* Set disk geometry		*/
+#define VD_OP_SCSICMD		0x0a /* SCSI control command		*/
+#define VD_OP_GET_DEVID		0x0b /* Get device ID			*/
+#define VD_OP_GET_EFI		0x0c /* Get EFI				*/
+#define VD_OP_SET_EFI		0x0d /* Set EFI				*/
+	u8			slice;
+	u16			resv1;
+	u32			status;
+	u64			offset;
+	u64			size;
+	u32			ncookies;
+	u32			resv2;
+	struct ldc_trans_cookie	cookies[0];
+};
+
+#define VIO_DISK_VNAME_LEN	8
+#define VIO_DISK_ALABEL_LEN	128
+#define VIO_DISK_NUM_PART	8
+
+struct vio_disk_vtoc {
+	u8			volume_name[VIO_DISK_VNAME_LEN];
+	u16			sector_size;
+	u16			num_partitions;
+	u8			ascii_label[VIO_DISK_ALABEL_LEN];
+	struct {
+		u16		id;
+		u16		perm_flags;
+		u32		resv;
+		u64		start_block;
+		u64		num_blocks;
+	} partitions[VIO_DISK_NUM_PART];
+};
+
+struct vio_disk_geom {
+	u16			num_cyl; /* Num data cylinders		*/
+	u16			alt_cyl; /* Num alternate cylinders	*/
+	u16			beg_cyl; /* Cyl off of fixed head area	*/
+	u16			num_hd;  /* Num heads			*/
+	u16			num_sec; /* Num sectors			*/
+	u16			ifact;   /* Interleave factor		*/
+	u16			apc;     /* Alts per cylinder (SCSI)	*/
+	u16			rpm;	 /* Revolutions per minute	*/
+	u16			phy_cyl; /* Num physical cylinders	*/
+	u16			wr_skip; /* Num sects to skip, writes	*/
+	u16			rd_skip; /* Num sects to skip, reads	*/
+};
+
+struct vio_disk_devid {
+	u16			resv;
+	u16			type;
+	u32			len;
+	char			id[0];
+};
+
+struct vio_disk_efi {
+	u64			lba;
+	u64			len;
+	char			data[0];
+};
+
+/* VIO net specific structures and defines */
+struct vio_net_attr_info {
+	struct vio_msg_tag	tag;
+	u8			xfer_mode;
+	u8			addr_type;
+#define VNET_ADDR_ETHERMAC	0x01
+	u16			ack_freq;
+	u32			resv1;
+	u64			addr;
+	u64			mtu;
+	u64			resv2[3];
+};
+
+#define VNET_NUM_MCAST		7
+
+struct vio_net_mcast_info {
+	struct vio_msg_tag	tag;
+	u8			set;
+	u8			count;
+	u8			mcast_addr[VNET_NUM_MCAST * 6];
+	u32			resv;
+};
+
+struct vio_net_desc {
+	struct vio_dring_hdr	hdr;
+	u32			size;
+	u32			ncookies;
+	struct ldc_trans_cookie	cookies[0];
+};
+
+#define VIO_MAX_RING_COOKIES	24
+
+struct vio_dring_state {
+	u64			ident;
+	void			*base;
+	u64			snd_nxt;
+	u64			rcv_nxt;
+	u32			entry_size;
+	u32			num_entries;
+	u32			prod;
+	u32			cons;
+	u32			pending;
+	int			ncookies;
+	struct ldc_trans_cookie	cookies[VIO_MAX_RING_COOKIES];
+};
+
+static inline void *vio_dring_cur(struct vio_dring_state *dr)
+{
+	return dr->base + (dr->entry_size * dr->prod); /* address of the entry at index dr->prod */
+}
+
+static inline void *vio_dring_entry(struct vio_dring_state *dr,
+				    unsigned int index)
+{
+	return dr->base + (dr->entry_size * index); /* address of entry 'index'; index is not range-checked here */
+}
+
+static inline u32 vio_dring_avail(struct vio_dring_state *dr,
+				  unsigned int ring_size)
+{
+	/* Ensure build-time power-of-2 (the mask below relies on it).  */
+	BUILD_BUG_ON(ring_size & (ring_size - 1)); /* NOTE(review): ring_size is a function parameter; confirm this still triggers when it is not a visible constant */
+
+	return (dr->pending -
+		((dr->prod - dr->cons) & (ring_size - 1))); /* pending minus (prod - cons) masked to the ring size */
+}
+
+#define VIO_MAX_TYPE_LEN	64
+#define VIO_MAX_COMPAT_LEN	64
+
+struct vio_dev {
+	u64			mp;
+	struct device_node	*dp;
+
+	char			type[VIO_MAX_TYPE_LEN];
+	char			compat[VIO_MAX_COMPAT_LEN];
+	int			compat_len;
+
+	unsigned long		channel_id;
+
+	unsigned int		tx_irq;
+	unsigned int		rx_irq;
+
+	struct device		dev;
+};
+
+struct vio_driver {
+	struct list_head		node;
+	const struct vio_device_id	*id_table;
+	int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
+	int (*remove)(struct vio_dev *dev);
+	void (*shutdown)(struct vio_dev *dev);
+	unsigned long			driver_data;
+	struct device_driver		driver;
+};
+
+struct vio_version {
+	u16		major;
+	u16		minor;
+};
+
+struct vio_driver_state;
+struct vio_driver_ops {
+	int	(*send_attr)(struct vio_driver_state *vio);
+	int	(*handle_attr)(struct vio_driver_state *vio, void *pkt);
+	void	(*handshake_complete)(struct vio_driver_state *vio);
+};
+
+struct vio_completion {
+	struct completion	com;
+	int			err;
+	int			waiting_for;
+};
+
+struct vio_driver_state {
+	/* Protects VIO handshake and, optionally, driver private state.  */
+	spinlock_t		lock;
+
+	struct ldc_channel	*lp;
+
+	u32			_peer_sid;
+	u32			_local_sid;
+	struct vio_dring_state	drings[2];
+#define VIO_DRIVER_TX_RING	0
+#define VIO_DRIVER_RX_RING	1
+
+	u8			hs_state;
+#define VIO_HS_INVALID		0x00
+#define VIO_HS_GOTVERS		0x01
+#define VIO_HS_GOT_ATTR		0x04
+#define VIO_HS_SENT_DREG	0x08
+#define VIO_HS_SENT_RDX		0x10
+#define VIO_HS_GOT_RDX_ACK	0x20
+#define VIO_HS_GOT_RDX		0x40
+#define VIO_HS_SENT_RDX_ACK	0x80
+#define VIO_HS_COMPLETE		(VIO_HS_GOT_RDX_ACK | VIO_HS_SENT_RDX_ACK)
+
+	u8			dev_class;
+
+	u8			dr_state;
+#define VIO_DR_STATE_TXREG	0x01
+#define VIO_DR_STATE_RXREG	0x02
+#define VIO_DR_STATE_TXREQ	0x10
+#define VIO_DR_STATE_RXREQ	0x20
+
+	u8			debug;
+#define VIO_DEBUG_HS		0x01
+#define VIO_DEBUG_DATA		0x02
+
+	void			*desc_buf;
+	unsigned int		desc_buf_len;
+
+	struct vio_completion	*cmp;
+
+	struct vio_dev		*vdev;
+
+	struct timer_list	timer;
+
+	struct vio_version	ver;
+
+	struct vio_version	*ver_table;
+	int			ver_table_entries;
+
+	char			*name;
+
+	struct vio_driver_ops	*ops;
+};
+
+#define viodbg(TYPE, f, a...) /* uses a 'vio' identifier from the caller's scope; it is not a macro parameter */ \
+do {	if (vio->debug & VIO_DEBUG_##TYPE) /* TYPE is pasted onto VIO_DEBUG_, e.g. HS or DATA */ \
+		printk(KERN_INFO "vio: ID[%lu] " f, \
+		       vio->vdev->channel_id, ## a); \
+} while (0)
+
+extern int vio_register_driver(struct vio_driver *drv);
+extern void vio_unregister_driver(struct vio_driver *drv);
+
+static inline struct vio_driver *to_vio_driver(struct device_driver *drv)
+{
+	return container_of(drv, struct vio_driver, driver); /* drv must be the 'driver' member embedded in a struct vio_driver */
+}
+
+static inline struct vio_dev *to_vio_dev(struct device *dev)
+{
+	return container_of(dev, struct vio_dev, dev); /* dev must be the 'dev' member embedded in a struct vio_dev */
+}
+
+extern int vio_ldc_send(struct vio_driver_state *vio, void *data, int len);
+extern void vio_link_state_change(struct vio_driver_state *vio, int event);
+extern void vio_conn_reset(struct vio_driver_state *vio);
+extern int vio_control_pkt_engine(struct vio_driver_state *vio, void *pkt);
+extern int vio_validate_sid(struct vio_driver_state *vio,
+			    struct vio_msg_tag *tp);
+extern u32 vio_send_sid(struct vio_driver_state *vio);
+extern int vio_ldc_alloc(struct vio_driver_state *vio,
+			 struct ldc_channel_config *base_cfg, void *event_arg);
+extern void vio_ldc_free(struct vio_driver_state *vio);
+extern int vio_driver_init(struct vio_driver_state *vio, struct vio_dev *vdev,
+			   u8 dev_class, struct vio_version *ver_table,
+			   int ver_table_size, struct vio_driver_ops *ops,
+			   char *name);
+
+extern void vio_port_up(struct vio_driver_state *vio);
+
+#endif /* _SPARC64_VIO_H */