changeset 1991:f29baf5bf770

FWARC 2005/633 Project Q Logical Domaining Umbrella FWARC 2005/739 sun4v channels FWARC 2006/055 Domain Services FWARC 2006/072 sun4v virtual devices machine description data FWARC 2006/074 sun4v interrupt cookies FWARC 2006/075 Channel devices, Virtual Disk client and server bindings FWARC 2006/076 Virtual Network Client and switch Bindings FWARC 2006/081 Virtual Logical Domain Channel (vldc) Bindings FWARC 2006/086 LDOM variables FWARC 2006/105 LDOM support for NCP FWARC 2006/110 Domain Services MD node and other misc properties FWARC 2006/117 Virtual Console Concentrator Bindings FWARC 2006/135 sun4v channel console packets FWARC 2006/140 sun4v channels transport protocol FWARC 2006/141 FMA Domain Services FWARC 2006/174 NCS HV Update FWARC 2006/184 sun4v channels shared memory FWARC 2006/195 Virtual IO Communication Protocol PSARC 2006/152 Logical Domain Channels Transport API 6272074 Need interface to determine execution unit sharing. 6354556 Add machine description kernel framework support 6391870 LDoms v1.0 Solaris Changes
author heppo
date Tue, 16 May 2006 16:05:21 -0700
parents 2960cf15fee6
children 234183bdc3e0
files usr/src/Targetdirs usr/src/cmd/Makefile usr/src/cmd/Makefile.cmd usr/src/cmd/mdb/common/kmdb/kctl/kctl.h usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h usr/src/cmd/mdb/common/kmdb/kmdb_fault.c usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h usr/src/cmd/mdb/common/kmdb/kmdb_main.c usr/src/cmd/mdb/common/kmdb/kmdb_promif.c usr/src/cmd/mdb/common/kmdb/kmdb_promif.h usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s usr/src/cmd/mdb/sun4v/Makefile.kmdb usr/src/cmd/mdb/sun4v/modules/Makefile usr/src/cmd/mdb/sun4v/modules/vdsk/Makefile usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile usr/src/cmd/mdb/sun4v/modules/vdsk/vdsk.c usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c usr/src/cmd/picl/plugins/sun4v/mdesc/dr.c usr/src/cmd/picl/plugins/sun4v/mdesc/init.c usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h usr/src/cmd/vntsd/Makefile usr/src/cmd/vntsd/chars.h usr/src/cmd/vntsd/cmd.c usr/src/cmd/vntsd/common.c usr/src/cmd/vntsd/console.c usr/src/cmd/vntsd/listen.c usr/src/cmd/vntsd/queue.c usr/src/cmd/vntsd/read.c usr/src/cmd/vntsd/svc-vntsd usr/src/cmd/vntsd/vcc.h usr/src/cmd/vntsd/vntsd.c usr/src/cmd/vntsd/vntsd.h usr/src/cmd/vntsd/vntsd.xml usr/src/cmd/vntsd/vntsdvcc.c usr/src/cmd/vntsd/write.c usr/src/common/mdesc/mdesc_diff.c usr/src/common/mdesc/mdesc_fini.c usr/src/common/mdesc/mdesc_getbinsize.c usr/src/common/mdesc/mdesc_getgen.c usr/src/common/mdesc/mdesc_init_intern.c usr/src/common/mdesc/mdesc_rootnode.c usr/src/common/mdesc/mdesc_scandag.c usr/src/lib/libpcp/common/libpcp.c usr/src/pkgdefs/Makefile usr/src/pkgdefs/SUNWldomr.v/Makefile usr/src/pkgdefs/SUNWldomr.v/i.manifest usr/src/pkgdefs/SUNWldomr.v/pkginfo.tmpl usr/src/pkgdefs/SUNWldomr.v/postinstall usr/src/pkgdefs/SUNWldomr.v/preremove 
usr/src/pkgdefs/SUNWldomr.v/prototype_com usr/src/pkgdefs/SUNWldomr.v/prototype_sparc usr/src/pkgdefs/SUNWldomr.v/r.manifest usr/src/pkgdefs/SUNWldomu.v/Makefile usr/src/pkgdefs/SUNWldomu.v/depend usr/src/pkgdefs/SUNWldomu.v/pkginfo.tmpl usr/src/pkgdefs/SUNWldomu.v/prototype_com usr/src/pkgdefs/SUNWldomu.v/prototype_sparc usr/src/pkgdefs/SUNWmdb/prototype_sparc usr/src/pkgdefs/SUNWmdbr/prototype_sparc usr/src/pkgdefs/etc/exception_list_i386 usr/src/tools/scripts/bfu.sh usr/src/uts/common/sys/mdesc.h usr/src/uts/common/sys/mdesc_impl.h usr/src/uts/sfmmu/ml/sfmmu_kdi.s usr/src/uts/sun4/io/trapstat.c usr/src/uts/sun4/os/ddi_impl.c usr/src/uts/sun4/os/mlsetup.c usr/src/uts/sun4/os/mp_startup.c usr/src/uts/sun4/os/startup.c usr/src/uts/sun4u/os/mach_ddi_impl.c usr/src/uts/sun4u/os/mach_startup.c usr/src/uts/sun4v/Makefile.files usr/src/uts/sun4v/Makefile.rules usr/src/uts/sun4v/Makefile.sun4v.shared usr/src/uts/sun4v/cnex/Makefile usr/src/uts/sun4v/cpu/common_asm.s usr/src/uts/sun4v/cpu/generic.c usr/src/uts/sun4v/cpu/niagara.c usr/src/uts/sun4v/dr_cpu/Makefile usr/src/uts/sun4v/ds/Makefile usr/src/uts/sun4v/fault_iso/Makefile usr/src/uts/sun4v/io/cnex.c usr/src/uts/sun4v/io/dr_cpu.c usr/src/uts/sun4v/io/dr_util.c usr/src/uts/sun4v/io/ds.c usr/src/uts/sun4v/io/fault_iso.c usr/src/uts/sun4v/io/ldc.c usr/src/uts/sun4v/io/mdeg.c usr/src/uts/sun4v/io/mdesc.c usr/src/uts/sun4v/io/platsvc.c usr/src/uts/sun4v/io/qcn.c usr/src/uts/sun4v/io/vcc.c usr/src/uts/sun4v/io/vdc.c usr/src/uts/sun4v/io/vds.c usr/src/uts/sun4v/io/vldc.c usr/src/uts/sun4v/io/vnet.c usr/src/uts/sun4v/io/vnet_gen.c usr/src/uts/sun4v/io/vnex.c usr/src/uts/sun4v/io/vsw.c usr/src/uts/sun4v/ldc/Makefile usr/src/uts/sun4v/ml/hcall.s usr/src/uts/sun4v/ml/mach_offsets.in usr/src/uts/sun4v/ml/mach_proc_init.s usr/src/uts/sun4v/ml/mach_subr_asm.s usr/src/uts/sun4v/ml/trap_table.s usr/src/uts/sun4v/os/fillsysinfo.c usr/src/uts/sun4v/os/hsvc.c usr/src/uts/sun4v/os/intrq.c usr/src/uts/sun4v/os/lpad.c 
usr/src/uts/sun4v/os/mach_cpu_states.c usr/src/uts/sun4v/os/mach_descrip.c usr/src/uts/sun4v/os/mach_mp_startup.c usr/src/uts/sun4v/os/mach_mp_states.c usr/src/uts/sun4v/os/mach_startup.c usr/src/uts/sun4v/platsvc/Makefile usr/src/uts/sun4v/promif/promif_asr.c usr/src/uts/sun4v/promif/promif_cpu.c usr/src/uts/sun4v/promif/promif_emul.c usr/src/uts/sun4v/promif/promif_interp.c usr/src/uts/sun4v/promif/promif_io.c usr/src/uts/sun4v/promif/promif_key.c usr/src/uts/sun4v/promif/promif_mon.c usr/src/uts/sun4v/promif/promif_node.c usr/src/uts/sun4v/promif/promif_power_off.c usr/src/uts/sun4v/promif/promif_prop.c usr/src/uts/sun4v/promif/promif_reboot.c usr/src/uts/sun4v/promif/promif_stree.c usr/src/uts/sun4v/promif/promif_test.c usr/src/uts/sun4v/promif/promif_version.c usr/src/uts/sun4v/sys/cnex.h usr/src/uts/sun4v/sys/cpu_module.h usr/src/uts/sun4v/sys/dr_cpu.h usr/src/uts/sun4v/sys/dr_util.h usr/src/uts/sun4v/sys/ds.h usr/src/uts/sun4v/sys/ds_impl.h usr/src/uts/sun4v/sys/error.h usr/src/uts/sun4v/sys/fault_iso.h usr/src/uts/sun4v/sys/hsvc.h usr/src/uts/sun4v/sys/hypervisor_api.h usr/src/uts/sun4v/sys/ldc.h usr/src/uts/sun4v/sys/ldc_impl.h usr/src/uts/sun4v/sys/ldoms.h usr/src/uts/sun4v/sys/lpad.h usr/src/uts/sun4v/sys/mach_descrip.h usr/src/uts/sun4v/sys/machcpuvar.h usr/src/uts/sun4v/sys/machparam.h usr/src/uts/sun4v/sys/machsystm.h usr/src/uts/sun4v/sys/mdeg.h usr/src/uts/sun4v/sys/mmu.h usr/src/uts/sun4v/sys/ncp.h usr/src/uts/sun4v/sys/ncs.h usr/src/uts/sun4v/sys/platsvc.h usr/src/uts/sun4v/sys/promif_impl.h usr/src/uts/sun4v/sys/varconfig.h usr/src/uts/sun4v/sys/vcc.h usr/src/uts/sun4v/sys/vcc_impl.h usr/src/uts/sun4v/sys/vdc.h usr/src/uts/sun4v/sys/vdsk_common.h usr/src/uts/sun4v/sys/vdsk_mailbox.h usr/src/uts/sun4v/sys/vio_common.h usr/src/uts/sun4v/sys/vio_mailbox.h usr/src/uts/sun4v/sys/vldc.h usr/src/uts/sun4v/sys/vldc_impl.h usr/src/uts/sun4v/sys/vnet.h usr/src/uts/sun4v/sys/vnet_common.h usr/src/uts/sun4v/sys/vnet_gen.h usr/src/uts/sun4v/sys/vnet_mailbox.h 
usr/src/uts/sun4v/sys/vnet_proxy.h usr/src/uts/sun4v/sys/vnetmsg.h usr/src/uts/sun4v/sys/vsw.h usr/src/uts/sun4v/sys/vsw_fdb.h usr/src/uts/sun4v/vcc/Makefile usr/src/uts/sun4v/vdc/Makefile usr/src/uts/sun4v/vds/Makefile usr/src/uts/sun4v/vldc/Makefile usr/src/uts/sun4v/vnet/Makefile usr/src/uts/sun4v/vsw/Makefile
diffstat 183 files changed, 56893 insertions(+), 2066 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/Targetdirs	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/Targetdirs	Tue May 16 16:05:21 2006 -0700
@@ -159,6 +159,7 @@
 	/var/svc/manifest/platform \
 	/var/svc/manifest/platform/i86pc \
 	/var/svc/manifest/platform/sun4u \
+	/var/svc/manifest/platform/sun4v \
 	/var/svc/manifest/site \
 	/var/svc/profile
 
@@ -170,7 +171,13 @@
 	/usr/lib/inet/wanboot
 # EXPORT DELETE END
 
+i386_ROOT.BIN=
+
+sparc_ROOT.BIN= \
+	/usr/lib/ldoms
+
 ROOT.BIN= \
+	$($(MACH)_ROOT.BIN) \
 	/etc/saf \
 	/etc/sma \
 	/etc/sma/snmp \
--- a/usr/src/cmd/Makefile	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -456,6 +456,7 @@
 	sckmd		\
 	sf880drd	\
 	stmsboot	\
+	vntsd		\
 	wrsmconf	\
 	wrsmstat
 
@@ -692,6 +693,7 @@
 	prtdscp		\
 	prtfru		\
 	stmsboot	\
+	vntsd		\
 	wrsmconf	\
 	wrsmstat
 
@@ -765,6 +767,7 @@
 	syseventd			\
 	syslogd				\
 	utmpd				\
+	vntsd				\
 	ypcmd				\
 	zoneadmd
 
--- a/usr/src/cmd/Makefile.cmd	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/Makefile.cmd	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -209,6 +208,7 @@
 ROOTSVCPLATFORM=		$(ROOTVARSVCMANIFEST)/platform
 ROOTSVCPLATFORMI86PC=		$(ROOTSVCPLATFORM)/i86pc
 ROOTSVCPLATFORMSUN4U=		$(ROOTSVCPLATFORM)/sun4u
+ROOTSVCPLATFORMSUN4V=		$(ROOTSVCPLATFORM)/sun4v
 ROOTSVCAPPLICATION=		$(ROOTVARSVCMANIFEST)/application
 ROOTSVCAPPLICATIONMANAGEMENT=	$(ROOTSVCAPPLICATION)/management
 ROOTSVCAPPLICATIONSECURITY=	$(ROOTSVCAPPLICATION)/security
@@ -388,6 +388,9 @@
 $(ROOTSVCPLATFORMSUN4U)/%: %
 	$(INS.file)
 
+$(ROOTSVCPLATFORMSUN4V)/%: %
+	$(INS.file)
+
 $(ROOTMAN1)/%: %.sunman
 	$(INS.rename)
 
--- a/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kctl/kctl.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -106,6 +105,10 @@
 extern void kctl_auxv_init_isadep(kmdb_auxv_t *, void *);
 extern void kctl_auxv_fini(kmdb_auxv_t *);
 extern void kctl_auxv_fini_isadep(kmdb_auxv_t *);
+#ifdef sun4v
+extern void kctl_auxv_set_promif(kmdb_auxv_t *);
+extern void kctl_switch_promif(void);
+#endif
 
 extern void kctl_wrintr(void);
 extern void kctl_wrintr_fire(void);
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_auxv.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -53,6 +52,9 @@
 #include <gelf.h>
 #include <sys/machelf.h>
 #include <sys/kdi.h>
+#ifdef sun4v
+#include <sys/obpdefs.h>
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -104,6 +106,16 @@
 #ifdef __sparc
 	void (*kav_ktrap_install)(int, void (*)(void)); /* Add to krnl trptbl */
 	void (*kav_ktrap_restore)(void);	/* Restore krnl trap hdlrs */
+#ifdef sun4v
+	uint_t		kav_domaining;		/* Domaining status */
+	caddr_t		kav_promif_root;	/* PROM shadow tree root */
+	ihandle_t	kav_promif_in;		/* PROM input dev instance */
+	ihandle_t	kav_promif_out;		/* PROM output dev instance */
+	phandle_t	kav_promif_pin;		/* PROM input dev package */
+	phandle_t	kav_promif_pout;	/* PROM output dev package */
+	pnode_t		kav_promif_chosennode;	/* PROM "/chosen" node */
+	pnode_t		kav_promif_optionsnode;	/* PROM "/options" node */
+#endif
 #endif
 
 } kmdb_auxv_t;
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_fault.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -78,7 +78,10 @@
 		for (;;) {
 			mdb_iob_printf(mdb.m_err, "\n%s: "
 #if defined(__sparc)
-			    "(o)bp, (p)anic"
+#ifndef sun4v
+			    "(o)bp, "
+#endif /* sun4v */
+			    "(p)anic"
 #else
 			    "reboo(t)"
 #endif
@@ -98,8 +101,10 @@
 				continue;
 #endif
 
+#ifndef sun4v
 			case 'o':
 			case 'O':
+#endif /* sun4v */
 			case 't':
 			case 'T':
 				kmdb_dpi_enter_mon();
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_kctl.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -53,6 +52,16 @@
 
 extern int kmdb_init(const char *, kmdb_auxv_t *);
 
+/*
+ * This function should only be defined for sun4v. However the mdb build
+ * uses a custom tool (hdr2map) to generate mapfile from header files but
+ * this tool does not take care of preprocessor directives and functions
+ * are included into the mapfile whatever the architecture is and even
+ * if there is an #ifdef sun4v. So we always declare this function but it
+ * has a fake definition for all architecture but sun4v.
+ */
+extern void kmdb_init_promif(char *, kmdb_auxv_t *);
+
 extern void kmdb_activate(kdi_debugvec_t **, uint_t);
 extern void kmdb_deactivate(void);
 
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_main.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_main.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -276,6 +275,28 @@
 	return (0);
 }
 
+#ifdef sun4v
+
+void
+kmdb_init_promif(char *pgmname, kmdb_auxv_t *kav)
+{
+	kmdb_prom_init_promif(pgmname, kav);
+}
+
+#else
+
+/*ARGSUSED*/
+void
+kmdb_init_promif(char *pgmname, kmdb_auxv_t *kav)
+{
+	/*
+	 * Fake function for non sun4v. See comments in kmdb_ctl.h
+	 */
+	ASSERT(0);
+}
+
+#endif
+
 /*
  * First-time kmdb startup.  Run when kmdb has control of the machine for the
  * first time.
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.c	Tue May 16 16:05:21 2006 -0700
@@ -18,7 +18,6 @@
  *
  * CDDL HEADER END
  */
-
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -29,6 +28,9 @@
 #include <sys/types.h>
 #include <sys/termios.h>
 #include <sys/promif.h>
+#ifdef sun4v
+#include <sys/promif_impl.h>
+#endif
 #include <unistd.h>
 #include <string.h>
 #include <stdlib.h>
@@ -754,7 +756,14 @@
 void
 kmdb_prom_init_begin(char *pgmname, kmdb_auxv_t *kav)
 {
+#ifdef sun4v
+	if (kav->kav_domaining)
+		kmdb_prom_init_promif(pgmname, kav);
+	else
+		prom_init(pgmname, kav->kav_romp);
+#else
 	prom_init(pgmname, kav->kav_romp);
+#endif
 
 	/* Initialize the interrupt ring buffer */
 	kmdb_prom_readbuf_head = kmdb_prom_readbuf_tail;
@@ -764,6 +773,18 @@
 #endif
 }
 
+#ifdef sun4v
+void
+kmdb_prom_init_promif(char *pgmname, kmdb_auxv_t *kav)
+{
+	ASSERT(kav->kav_domaining);
+	cif_init(pgmname, kav->kav_promif_root,
+	    kav->kav_promif_in, kav->kav_promif_out,
+	    kav->kav_promif_pin, kav->kav_promif_pout,
+	    kav->kav_promif_chosennode, kav->kav_promif_optionsnode);
+}
+#endif
+
 /*
  * Conclude the initialization of the debugger/PROM interface.  Memory
  * allocation and the global `mdb' object are now available.
--- a/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/common/kmdb/kmdb_promif.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -42,6 +41,9 @@
 
 extern void kmdb_prom_init_begin(char *, kmdb_auxv_t *);
 extern void kmdb_prom_init_finish(kmdb_auxv_t *);
+#ifdef sun4v
+extern void kmdb_prom_init_promif(char *, kmdb_auxv_t *);
+#endif
 extern ssize_t kmdb_prom_read(void *, size_t, struct termios *);
 extern ssize_t kmdb_prom_write(const void *, size_t, struct termios *);
 extern ihandle_t kmdb_prom_get_handle(char *);
--- a/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sparc/kmdb/kaif_activate.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -131,7 +131,34 @@
 	bcopy((caddr_t)kaif_hdlr_generic, tgt, 32);
 }
 
-#ifndef sun4v
+#ifdef	sun4v
+
+/*ARGSUSED*/
+static void
+kaif_install_goto_tt64(caddr_t tgt, caddr_t arg)
+{
+	/* LINTED - pointer alignment */
+	uint32_t *hdlr = (uint32_t *)tgt;
+	uint32_t disp = (T_FAST_INSTR_MMU_MISS - T_INSTR_MMU_MISS) * 0x20;
+
+	*hdlr++ = 0x10480000 | (disp >> 2);	/* ba,pt (to tt64) */
+	*hdlr++ = 0x01000000;			/* nop */
+}
+
+/*ARGSUSED*/
+static void
+kaif_install_goto_tt68(caddr_t tgt, caddr_t arg)
+{
+	/* LINTED - pointer alignment */
+	uint32_t *hdlr = (uint32_t *)tgt;
+	uint32_t disp = (T_FAST_DATA_MMU_MISS - T_DATA_MMU_MISS) * 0x20;
+
+	*hdlr++ = 0x10480000 | (disp >> 2);	/* ba,pt (to tt68) */
+	*hdlr++ = 0x01000000;			/* nop */
+}
+
+#endif	/* sun4v */
+
 static void
 kaif_install_dmmumiss(caddr_t tgt, caddr_t vatotte)
 {
@@ -159,25 +186,31 @@
 	*patch++ |= (uintptr_t)vatotte >> 10;
 	*patch |= ((uintptr_t)vatotte) & 0x3ff;
 }
-#endif /* sun4v */
 
 static struct kaif_trap_handlers {
 	uint_t th_tt;
 	void (*th_install)(caddr_t, caddr_t);
 } kaif_trap_handlers[] = {
 	{ T_INSTR_EXCEPTION,			kaif_install_generic },
+#ifdef sun4v
+	{ T_INSTR_MMU_MISS,			kaif_install_goto_tt64 },
+#endif
 	{ T_IDIV0,				kaif_install_generic },
 	{ T_DATA_EXCEPTION,			kaif_install_generic },
+#ifdef sun4v
+	{ T_DATA_MMU_MISS,			kaif_install_goto_tt68 },
+#endif
 	{ T_DATA_ERROR,				kaif_install_generic },
 	{ T_ALIGNMENT,				kaif_install_generic },
-#ifdef sun4v
-#else /* sun4v */
 	{ T_FAST_INSTR_MMU_MISS,		kaif_install_immumiss },
 	{ T_FAST_DATA_MMU_MISS,			kaif_install_dmmumiss },
 	{ T_FAST_DATA_MMU_PROT,			kaif_install_generic },
+#ifdef sun4v
+	{ T_INSTR_MMU_MISS + T_TL1,		kaif_install_goto_tt64 },
+	{ T_DATA_MMU_MISS + T_TL1,		kaif_install_goto_tt68 },
+#endif
 	{ T_FAST_INSTR_MMU_MISS + T_TL1,	kaif_install_immumiss },
 	{ T_FAST_DATA_MMU_MISS + T_TL1,		kaif_install_dmmumiss },
-#endif /* sun4v */
 	{ 0 }
 };
 
@@ -189,34 +222,27 @@
 	int i;
 
 	/*
+	 * sun4u:
 	 * We rely upon OBP for the handling of a great many traps.  As such,
 	 * we begin by populating our table with pointers to OBP's handlers.
 	 * We then copy in our own handlers where appropriate.  At some point,
 	 * when we provide the bulk of the handlers, this process will be
 	 * reversed.
+	 *
+	 * sun4v:
+	 * The sun4v kernel dismisses OBP at boot. Both fast and slow TLB
+	 * misses are handled by KMDB. Breakpoint traps go directly to KMDB.
+	 * All other trap entries are redirected to their respective
+	 * trap implementation within the Solaris trap table.
 	 */
 	for (i = 0; i < kaif_tba_native_sz; i += 0x20) {
 		/* LINTED - pointer alignment */
 		uint32_t *hdlr = (uint32_t *)(kaif_tba_native + i);
 #ifdef	sun4v
-		uint32_t tt = i/0x20;
-
-		/*
-		 * We use obp's tl0 handlers. Sine kmdb installs itsdebug
-		 * hook in obp, if obp cannot handle any traps, such as
-		 * user enter an invalid address in kmdb, obp will call
-		 * kmdb's callback and the control goes back to kmdb.
-		 * For tl>0 traps, kernel's trap handlers are good at
-		 * handling these on sun4v.
-		 */
-		if (tt >= T_TL1)
-			brtgt = (uintptr_t)(kaif_tba_kernel + i);
-		else
-			brtgt = (uintptr_t)(kaif_tba_obp + i);
-#else /* !sun4v */
+		brtgt = (uintptr_t)(kaif_tba_kernel + i);
+#else
 		brtgt = (uintptr_t)(kaif_tba_obp + i);
-#endif /* sun4v */
-
+#endif
 		*hdlr++ = 0x03000000 | (brtgt >> 10);	/* sethi brtgt, %g1 */
 		*hdlr++ = 0x81c06000 | (brtgt & 0x3ff);	/* jmp %g1 + brtgt */
 		*hdlr++ = 0x01000000;			/* nop */
--- a/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sparc/kmdb/kctl/kctl_isadep.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -36,6 +35,11 @@
 #include <sys/cpuvar.h>
 #include <sys/kobj.h>
 #include <sys/kobj_impl.h>
+#ifdef sun4v
+#include <sys/ldoms.h>
+#include <sys/promif_impl.h>
+#include <kmdb/kmdb_kctl.h>
+#endif
 
 #include <kmdb/kctl/kctl.h>
 
@@ -229,8 +233,48 @@
 
 	kav->kav_ktrap_install = kctl_ktrap_install;
 	kav->kav_ktrap_restore = kctl_ktrap_restore;
+#ifdef sun4v
+	if (kctl.kctl_boot_loaded) {
+		/*
+		 * When booting kmdb, kmdb starts before domaining is
+		 * enabled and before the cif handler is changed to the
+		 * kernel cif handler. So we start kmdb with using the
+		 * OBP and we will change this when the cif handler is
+		 * installed.
+		 */
+		kav->kav_domaining = 0;
+	} else {
+		kctl_auxv_set_promif(kav);
+	}
+#endif
 }
 
+#ifdef sun4v
+
+void
+kctl_auxv_set_promif(kmdb_auxv_t *kav)
+{
+	kav->kav_domaining = domaining_enabled;
+	kav->kav_promif_root = promif_stree_getroot();
+	kav->kav_promif_in = prom_stdin_ihandle();
+	kav->kav_promif_out = prom_stdout_ihandle();
+	kav->kav_promif_pin = prom_stdin_node();
+	kav->kav_promif_pout = prom_stdout_node();
+	kav->kav_promif_chosennode = prom_chosennode();
+	kav->kav_promif_optionsnode = prom_finddevice("/options");
+}
+
+void
+kctl_switch_promif(void)
+{
+	kmdb_auxv_t kav;
+
+	kctl_auxv_set_promif(&kav);
+	kmdb_init_promif(NULL, &kav);
+}
+
+#endif
+
 /*ARGSUSED*/
 void
 kctl_auxv_fini_isadep(kmdb_auxv_t *auxv)
--- a/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sparc/v9/kmdb/kaif_handlers.s	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,6 +31,20 @@
 #include <sys/machtrap.h>
 #include <sys/privregs.h>
 #include <sys/mmu.h>
+#include <vm/mach_sfmmu.h>
+
+#if defined(sun4v) && !defined(lint)
+#include <sys/machparam.h>
+#endif
+
+#if defined(sun4v) && defined(KMDB_TRAPCOUNT)
+/*
+ * The sun4v implementations of the fast miss handlers are larger than those
+ * of their sun4u kin. This is unfortunate because there is not enough space
+ * remaining in the respective trap table entries for this debug feature.
+ */
+#error "KMDB_TRAPCOUNT not supported on sun4v"
+#endif
 
 /*
  * This file contains the trap handlers that will be copied to kmdb's trap
@@ -50,12 +64,7 @@
 
 #if defined(lint)
 #include <kmdb/kaif.h>
-#endif /* lint */
 
-#if defined(lint)
-
-#ifdef sun4v
-#else /* sun4v */
 void
 kaif_hdlr_dmiss(void)
 {
@@ -65,51 +74,149 @@
 kaif_itlb_handler(void)
 {
 }
-#endif /* sun4v */
-#else	/* lint */
+
+#else /* lint */
 
 #ifdef sun4v
+
+#define	GET_MMU_D_ADDR_CTX(daddr, ctx)			\
+	MMU_FAULT_STATUS_AREA(ctx);			\
+	ldx	[ctx + MMFSA_D_ADDR], daddr;		\
+	ldx	[ctx + MMFSA_D_CTX], ctx
+
+#define	GET_MMU_I_ADDR_CTX(iaddr, ctx)			\
+	MMU_FAULT_STATUS_AREA(ctx);			\
+	ldx	[ctx + MMFSA_I_ADDR], iaddr;		\
+	ldx	[ctx + MMFSA_I_CTX], ctx
+
+/*
+ * KAIF_ITLB_STUFF
+ * derived from ITLB_STUFF in uts/sun4v/vm/mach_sfmmu.h
+ *
+ * Load ITLB entry
+ *
+ * In:
+ *   tte = reg containing tte
+ *   ouch = branch target label used if hcall fails (sun4v only)
+ *   scr1, scr2, scr3, scr4 = scratch registers (must not be %o0-%o3)
+ */
+#define	KAIF_ITLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4)	\
+	mov	%o0, scr1;				\
+	mov	%o1, scr2;				\
+	mov	%o2, scr3;				\
+	mov	%o3, scr4;				\
+	MMU_FAULT_STATUS_AREA(%o2);			\
+	ldx	[%o2 + MMFSA_I_ADDR], %o0;		\
+	ldx	[%o2 + MMFSA_I_CTX], %o1;		\
+	srlx	%o0, PAGESHIFT, %o0;			\
+	sllx	%o0, PAGESHIFT, %o0;			\
+	mov	tte, %o2;				\
+	mov	MAP_ITLB, %o3;				\
+	ta	MMU_MAP_ADDR;				\
+	/* BEGIN CSTYLED */				\
+	brnz,a,pn %o0, ouch;				\
+	  nop;						\
+	/* END CSTYLED */				\
+	mov	scr1, %o0;				\
+	mov	scr2, %o1;				\
+	mov	scr3, %o2;				\
+	mov	scr4, %o3
+
+/*
+ * KAIF_DTLB_STUFF
+ * derived from DTLB_STUFF in uts/sun4v/vm/mach_sfmmu.h
+ *
+ * Load DTLB entry
+ *
+ * In:
+ *   tte = reg containing tte
+ *   ouch = branch target label used if hcall fails (sun4v only)
+ *   scr1, scr2, scr3, scr4 = scratch registers (must not be %o0-%o3)
+ */
+#define	KAIF_DTLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4)	\
+	mov	%o0, scr1;				\
+	mov	%o1, scr2;				\
+	mov	%o2, scr3;				\
+	mov	%o3, scr4;				\
+	MMU_FAULT_STATUS_AREA(%o2);			\
+	ldx	[%o2 + MMFSA_D_ADDR], %o0;		\
+	ldx	[%o2 + MMFSA_D_CTX], %o1;		\
+	srlx	%o0, PAGESHIFT, %o0;			\
+	sllx	%o0, PAGESHIFT, %o0;			\
+	mov	tte, %o2;				\
+	mov	MAP_DTLB, %o3;				\
+	ta	MMU_MAP_ADDR;				\
+	/* BEGIN CSTYLED */				\
+	brnz,a,pn %o0, ouch;				\
+	  nop;						\
+	/* END CSTYLED */				\
+	mov	scr1, %o0;				\
+	mov	scr2, %o1;				\
+	mov	scr3, %o2;				\
+	mov	scr4, %o3
+
 #else /* sun4v */
 
-	.global	kaif_hdlr_dmiss_patch
-	.global	kaif_hdlr_imiss_patch
+#define	GET_MMU_D_ADDR_CTX(daddr, ctx)			\
+	mov	MMU_TAG_ACCESS, ctx;			\
+	ldxa	[ctx]ASI_DMMU, daddr;			\
+	sllx	daddr, TAGACC_CTX_LSHIFT, ctx;		\
+	srlx	ctx, TAGACC_CTX_LSHIFT, ctx
 
-	/*
-	 * This routine must be exactly 32 instructions long.
-	 */
-	ENTRY_NP(kaif_hdlr_dmiss)
-	mov	MMU_TAG_ACCESS, %g1
-	ldxa	[%g1]ASI_DMMU, %g1		/* %g1 = addr|ctx */
-	sllx	%g1, TAGACC_CTX_LSHIFT, %g2	/* strip addr */
-	srlx	%g2, TAGACC_CTX_LSHIFT, %g2	/* %g2 = ctx */
+#define	GET_MMU_I_ADDR_CTX(iaddr, ctx)			\
+	rdpr	%tpc, iaddr;				\
+	ldxa	[%g0]ASI_IMMU, ctx;			\
+	srlx	ctx, TTARGET_CTX_SHIFT, ctx
+
+#define	KAIF_DTLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4)	\
+	DTLB_STUFF(tte, scr1, scr2, scr3, scr4)
+
+#define	KAIF_ITLB_STUFF(tte, ouch, scr1, scr2, scr3, scr4)	\
+	ITLB_STUFF(tte, scr1, scr2, scr3, scr4)
 
-	/*
-	 * Use kdi_vatotte to look up the tte.  We don't bother stripping the
-	 * context, as it won't change the tte we get.
-	 */
-kaif_hdlr_dmiss_patch:
-	sethi	%hi(0), %g3	/* set by kaif to kdi_vatotte */
-	or	%g3, %lo(0), %g3
-	jmpl	%g3, %g7	/* uses all regs, ret to %g7, tte or 0 in %g1 */
-	add	%g7, 8, %g7	/* adjust return */
+#endif /* sun4v */
+	
+/*
+ * KAIF_CALL_KDI_VATOTTE
+ *
+ * Use kdi_vatotte to look up the tte.  We don't bother stripping the
+ * context, as it won't change the tte we get.
+ *
+ * The two instruction at patch_lbl are modified during runtime
+ * by kaif to point to kdi_vatotte
+ *
+ * Clobbers all globals.
+ * Returns tte in %g1 if successful, otherwise 0 in %g1
+ * Leaves address of next instruction following this macro in scr1
+ */
+#define	KAIF_CALL_KDI_VATOTTE(addr, ctx, patch_lbl, scr0, scr1)	\
+	.global	patch_lbl;					\
+patch_lbl:							\
+	sethi	%hi(0), scr0;					\
+	or	scr0, %lo(0), scr0;				\
+	jmpl	scr0, scr1;					\
+	add	scr1, 8, scr1
 
-	brz	%g1, 1f
+
+	ENTRY_NP(kaif_hdlr_dmiss)
+	GET_MMU_D_ADDR_CTX(%g1, %g2)
+
+	KAIF_CALL_KDI_VATOTTE(%g1, %g2, kaif_hdlr_dmiss_patch, %g3, %g7)
+0:	brz	%g1, 1f
 	nop
 
 	/* 
 	 * kdi_vatotte gave us a TTE to use.  Load it up and head back 
 	 * into the world, but first bump a counter.
 	 */
-#ifdef	KMDB_TRAPCOUNT
-	ldx	[%g7 + 0x40], %g2	/* Trap counter.  See top comment */
+
+#ifdef	KMDB_TRAPCOUNT			/* Trap counter.  See top comment */
+	ldx	[%g7 + .count-0b], %g2
 	add	%g2, 1, %g2
-	stx	%g2, [%g7 + 0x40]
-#else
-	nop
-	nop
-	nop
+	stx	%g2, [%g7 + .count-0b]
 #endif
-	stxa	%g1, [%g0]ASI_DTLB_IN
+
+	KAIF_DTLB_STUFF(%g1, 1f, %g2, %g3, %g4, %g5)
 	retry
 
 1:	/* 
@@ -126,63 +233,47 @@
 	 * find the TTE for the debugger without missing.
 	 */
 
-#ifdef	KMDB_TRAPCOUNT
-	mov	MMU_TAG_ACCESS, %g1	/* Trap address "counter". */
-	ldxa	[%g1]ASI_DMMU, %g1
-	stx	%g1, [%g7 + 0x48]
-#else
-	nop
-	nop
-	nop
+#ifdef	KMDB_TRAPCOUNT			/* Trap address "counter". */
+	GET_MMU_D_ADDR(%g2, %g3)
+	stx	%g2, [%g7 + .daddr-0b]
+	stx	%g1, [%g7 + .ecode-0b]
 #endif
 
-	mov	PTSTATE_KERN_COMMON | PSTATE_AG, %g3
-	wrpr	%g3, %pstate
-	sethi	%hi(kaif_dtrap), %g4
-	jmp	%g4 + %lo(kaif_dtrap)
+	sethi	%hi(kaif_dtrap), %g1
+	jmp	%g1 + %lo(kaif_dtrap)
 	nop
-	unimp	0
-	unimp	0	/* counter goes here (base + 0x60) */
-	unimp	0
-	unimp	0	/* miss address goes here (base + 0x68) */
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
+	/* NOTREACHED */
+
+#ifdef KMDB_TRAPCOUNT
+	.align 8
+.count:	.xword 0			/* counter goes here */
+.daddr:	.xword 0			/* miss address goes here */
+.ecode:	.xword 0			/* sun4v: g1 contains err code */
+#endif
+
+	.align 32*4			/* force length to 32 instr. */
 	SET_SIZE(kaif_hdlr_dmiss)
 
-	/*
-	 * This routine must be exactly 32 instructions long.
-	 */
-	ENTRY_NP(kaif_hdlr_imiss)
-	rdpr	%tpc, %g1
-	ldxa	[%g0]ASI_IMMU, %g2
-	srlx	%g2, TTARGET_CTX_SHIFT, %g2
+
 
-kaif_hdlr_imiss_patch:
-	sethi	%hi(0), %g3	/* set by kaif to kdi_vatotte */
-	or	%g3, %lo(0), %g3
-	jmpl	%g3, %g7	/* uses all regs, ret to %g7, tte or 0 in %g1 */
-	add	%g7, 8, %g7	/* adjust return */
+	ENTRY_NP(kaif_hdlr_imiss)
+	GET_MMU_I_ADDR_CTX(%g1, %g2)
 
-	brz	%g1, 1f
+	KAIF_CALL_KDI_VATOTTE(%g1, %g2, kaif_hdlr_imiss_patch, %g3, %g7)
+0:	brz	%g1, 1f
 	nop
 
 	/* 
 	 * kdi_vatotte gave us a TTE to use.  Load it up and head back 
 	 * into the world, but first bump a counter.
 	 */
-#ifdef	KMDB_TRAPCOUNT
-	ldx	[%g7 + 0x3c], %g2	/* Trap counter.  See top comment */
+#ifdef	KMDB_TRAPCOUNT			/* Trap counter.  See top comment */
+	ldx	[%g7 + .count-0b], %g2
 	add	%g2, 1, %g2
-	stx	%g2, [%g7 + 0x3c]
-#else
-	nop
-	nop
-	nop
+	stx	%g2, [%g7 + .count-0b]
 #endif
-	stxa	%g1, [%g0]ASI_ITLB_IN
+
+	KAIF_ITLB_STUFF(%g1, 1f, %g2, %g3, %g4, %g5)
 	retry
 
 1:	/* 
@@ -197,42 +288,41 @@
 	 * We will only reach this point at TL=1, as kdi_vatotte will always
 	 * find the TTE for the debugger without missing.
 	 */
-	rdpr	%pstate, %g1
-	or	%g0, PTSTATE_KERN_COMMON | PSTATE_AG, %g2
-	set	kaif_dtrap, %g3
-	jmp	%g3
-	wrpr	%g2, %pstate
-	unimp	0
-	unimp	0
-	unimp	0	/* counter goes here */
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
-	unimp	0
+
+	sethi	%hi(kaif_dtrap), %g1
+	jmp	%g1 + %lo(kaif_dtrap)
+	nop
+	/* NOTREACHED */
+
+#ifdef KMDB_TRAPCOUNT
+	.align	8
+.count:	.xword	0
+#endif
+
+	.align	32*4			/* force length to 32 instr. */
 	SET_SIZE(kaif_hdlr_imiss)
-#endif /* sun4v */
+
+
 
 	ENTRY_NP(kaif_hdlr_generic)
-#ifdef	KMDB_TRAPCOUNT
-	rd	%pc, %g3		/* Trap counter.  See top comment */
-	ld	[%g3 + 0x1c], %g4
+#ifdef	KMDB_TRAPCOUNT			/* Trap counter.  See top comment */
+0:	rd	%pc, %g3
+	ldx	[%g3 + .count-0b], %g4
 	add	%g4, 1, %g4
-	st	%g4, [%g3 + 0x1c]
-#else
-	nop
+	stx	%g4, [%g3 + .count-0b]
+#endif
+
+	sethi	%hi(kaif_dtrap), %g1
+	jmp	%g1 + %lo(kaif_dtrap)
 	nop
-	nop
-	nop
+	/* NOTREACHED */
+
+#ifdef	KMDB_TRAPCOUNT
+	.align	8
+.count:	.xword	0			/* counter goes here */
 #endif
-	sethi	%hi(kaif_dtrap), %g3
-	jmp	%g3 + %lo(kaif_dtrap)
-	rdpr	%pstate, %g1
-	unimp	0	/* counter goes here */
+
+	.align	32*4			/* force length to 32 instr. */
 	SET_SIZE(kaif_hdlr_generic)
 
-#endif
+#endif /* lint */
--- a/usr/src/cmd/mdb/sun4v/Makefile.kmdb	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sun4v/Makefile.kmdb	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
@@ -46,10 +45,19 @@
 	prom_printf.c \
 	prom_prop.c \
 	prom_putchar.c \
+	prom_reboot.c \
 	prom_sparc.c \
 	prom_stdin.c \
 	prom_stdout.c \
-	prom_string.c
+	prom_string.c \
+	promif_emul.c \
+	promif_interp.c \
+	promif_io.c \
+	promif_mon.c \
+	promif_node.c \
+	promif_prop.c \
+	promif_reboot.c \
+	promif_stree.c
 
 KMDBSRCS += \
 	kaif.c \
@@ -113,6 +121,10 @@
 	$(COMPILE.c) $<
 	$(CTFCONVERT_O)
 
+%.o: ../../../../../uts/sun4v/promif/%.c
+	$(COMPILE.c) $<
+	$(CTFCONVERT_O)
+
 %.ln: ../../../../../psm/promif/ieee1275/common/%.c
 	$(LINT.c) -c $<
 
@@ -121,3 +133,6 @@
 
 %.ln: ../../../../../psm/promif/ieee1275/sun4u/%.c
 	$(LINT.c) -c $<
+
+%.ln: ../../../../../uts/sun4v/promif/%.c
+	$(LINT.c) -c $<
--- a/usr/src/cmd/mdb/sun4v/modules/Makefile	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sun4v/modules/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -19,11 +18,13 @@
 #
 # CDDL HEADER END
 #
+
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
+#
 
-SUBDIRS = unix
+SUBDIRS = unix vdsk
 include ../../Makefile.subdirs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,30 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+SUBDIRS = v9
+include ../../../Makefile.subdirs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/v9/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,44 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+MODULE = vdsk.so
+MDBTGT = kvm
+
+MODSRCS = vdsk.c
+
+include ../../../../../Makefile.cmd
+include ../../../../../Makefile.cmd.64
+include ../../../../sparc/Makefile.sparcv9
+include ../../../Makefile.sun4v
+include ../../../../Makefile.module
+
+MODSRCS_DIR = ../
+
+CPPFLAGS += -DMP -D_MACHDEP
+CPPFLAGS += -Dsun4v
+CPPFLAGS += -I$(SRC)/uts/sun4v
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/mdb/sun4v/modules/vdsk/vdsk.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * This module provides debugging tools for the LDoms vDisk drivers
+ * (vds and vdc).
+ */
+
+#include <sys/mdb_modapi.h>
+
+#include <sys/vdsk_common.h>
+
+/*
+ */
+int
+vd_dring_entry_walk_init(mdb_walk_state_t *wsp)
+{
+	/* Must have a start addr.  */
+	if (wsp->walk_addr == NULL) {
+		mdb_warn("Descriptor Ring base address required\n");
+
+		return (WALK_ERR);
+	}
+
+	return (WALK_NEXT);
+}
+
+
+/*
+ * Generic entry walker step routine.
+ */
+int
+vd_dring_entry_walk_step(mdb_walk_state_t *wsp)
+{
+	static int		entry_count = 0;
+	int			status;
+	vd_dring_entry_t	dring_entry;
+
+	if (mdb_vread(&dring_entry, VD_DRING_ENTRY_SZ,
+	    (uintptr_t)wsp->walk_addr) == -1) {
+		mdb_warn("failed to read vd_dring_entry_t at %p",
+		    wsp->walk_addr);
+
+		return (WALK_ERR);
+	}
+
+	status = wsp->walk_callback(wsp->walk_addr, &dring_entry,
+	    wsp->walk_cbdata);
+	wsp->walk_addr = (uintptr_t)(wsp->walk_addr + VD_DRING_ENTRY_SZ);
+
+	/* Check if we're at the last element */
+	if (++entry_count >= VD_DRING_LEN) {
+		/* reset counter for next call to this walker */
+		entry_count = 0;
+
+		return (WALK_DONE);
+	}
+
+	return (status);
+}
+
+/*
+ * MDB module linkage information:
+ */
+
+static const mdb_walker_t walkers[] = {
+	{ "vd_dring_entry", "walk vDisk public Descriptor Ring entries",
+	    vd_dring_entry_walk_init, vd_dring_entry_walk_step, NULL, NULL },
+	{ NULL }
+};
+
+static const mdb_modinfo_t modinfo = {
+	MDB_API_VERSION, NULL, walkers
+};
+
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+	return (&modinfo);
+}
--- a/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sun4v/v9/Makefile.kmdb	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #ident	"%Z%%M%	%I%	%E% SMI"
@@ -35,6 +34,7 @@
 
 KMDBML += \
 	client_handler.s \
+	hcall.s \
 	kaif_handlers.s \
 	kaif_invoke.s \
 	kaif_resume.s \
--- a/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/mdb/sun4v/v9/kmdb/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -18,6 +18,7 @@
 #
 # CDDL HEADER END
 #
+
 #
 # Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
@@ -58,6 +59,9 @@
 # will be added for the trap table, and each handler installed by KMDB will use
 # its padding to keep a trap count.  See kaif_handlers.s.
 #
+# NOTE: not currently supported by the sun4v fast miss handlers. See
+# ../../../sparc/v9/kmdb/kaif_handlers.s to verify before use.
+#
 #TRAPCOUNT = -DKMDB_TRAPCOUNT
 
 CPPFLAGS += -I../../../sparc/mdb -I.. -I$(SRC)/uts/sun4 -I$(SRC)/uts/sun4v
@@ -146,6 +150,9 @@
 	$(COMPILE.c) $<
 	$(CTFCONVERT_O)
 
+%.o: ../../../../../uts/sun4v/ml/%.s
+	$(COMPILE.s) -o $@ $<
+
 #
 # Lint
 #
@@ -189,6 +196,9 @@
 %.ln: $(SRC)/common/net/util/%.c
 	$(LINT.c) -c $<
 
+%.ln: ../../../../../uts/sun4v/ml/%.s
+	$(LINT.s) -c $<
+
 #
 # Installation targets
 #
--- a/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License").  You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or http://www.opensolaris.org/os/licensing.
@@ -19,12 +18,13 @@
 #
 # CDDL HEADER END
 #
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
 #
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
 # cmd/picl/plugins/sun4v/mdesc/Makefile
 #
 LIBRARY=	libmdescplugin.a
@@ -32,7 +32,7 @@
 
 OBJS_DIR=	pics
 
-OBJECTS=	mdescplugin.o init.o \
+OBJECTS=	mdescplugin.o init.o dr.o \
 		cpu_prop_update.o disk_discovery.o \
 		mdesc_findname.o mdesc_findnodeprop.o \
 		mdesc_fini.o mdesc_getpropstr.o \
@@ -48,7 +48,7 @@
 
 SRCS=		$(OBJECTS:%.o=%.c)
 
-LINT_SRC=	./mdescplugin.c ./init.c \
+LINT_SRC=	./mdescplugin.c ./init.c ./dr.c \
 		./cpu_prop_update.c ./disk_discovery.c \
 		$(SRC)/common/mdesc/mdesc_findname.c \
 		$(SRC)/common/mdesc/mdesc_findnodeprop.c \
@@ -85,7 +85,7 @@
 LDLIBS +=	-L$(ROOT)/usr/lib/picl/plugins -L$(ROOT)/usr/lib/sparcv9
 DYNFLAGS +=	-R$(DYNFLAGS_COM)
 
-LDLIBS +=	-lc -lpicltree -lrt -lpicldevtree -lcfgadm -lnvpair
+LDLIBS +=	-lc -lpicltree -ldevinfo -lrt -lpicldevtree -lcfgadm -lnvpair
 
 LINTFLAGS +=	-erroff=E_BAD_PTR_CAST_ALIGN -v
 
--- a/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/cpu_prop_update.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -213,16 +212,16 @@
 	int x, num_nodes;
 	int ncpus, ncaches, ntlbs;
 	int status;
-	int reg_prop[4], reg;
+	int reg_prop[SUN4V_CPU_REGSIZE], cpuid;
 	uint64_t int_value;
 
-	status = ptree_get_propval_by_name(node, "reg", reg_prop,
+	status = ptree_get_propval_by_name(node, OBP_REG, reg_prop,
 	    sizeof (reg_prop));
 	if (status != PICL_SUCCESS) {
 		return (PICL_WALK_TERMINATE);
 	}
 
-	reg = reg_prop[0] & 0x3f;
+	cpuid = CFGHDL_TO_CPUID(reg_prop[0]);
 
 	/*
 	 * Allocate space for our searches.
@@ -266,7 +265,7 @@
 			continue;
 		}
 
-		if (int_value != reg)
+		if (int_value != cpuid)
 			continue;
 
 		add_md_prop(node, sizeof (int_value), "cpuid", &int_value,
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/dr.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,517 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include "mdescplugin.h"
+
+static	di_prom_handle_t	ph = DI_PROM_HANDLE_NIL;
+
+typedef struct cpu_lookup {
+	di_node_t di_node;
+	picl_nodehdl_t nodeh;
+	int result;
+} cpu_lookup_t;
+
+extern int add_cpu_prop(picl_nodehdl_t node, void *args);
+extern md_t *mdesc_devinit(void);
+
+/*
+ * This function is identical to the one in the picldevtree plugin.
+ * Unfortunately we can't just reuse that code.
+ */
+static int
+add_string_list_prop(picl_nodehdl_t nodeh, char *name, char *strlist,
+    unsigned int nrows)
+{
+	ptree_propinfo_t	propinfo;
+	picl_prophdl_t		proph;
+	picl_prophdl_t		tblh;
+	int			err;
+	unsigned int		i;
+	unsigned int		j;
+	picl_prophdl_t		*proprow;
+	int			len;
+
+#define	NCOLS_IN_STRING_TABLE	1
+
+	err = ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+	    PICL_PTYPE_TABLE, PICL_READ, sizeof (picl_prophdl_t), name,
+	    NULL, NULL);
+	if (err != PICL_SUCCESS)
+		return (err);
+
+	err = ptree_create_table(&tblh);
+	if (err != PICL_SUCCESS)
+		return (err);
+
+	err = ptree_create_and_add_prop(nodeh, &propinfo, &tblh, &proph);
+	if (err != PICL_SUCCESS)
+		return (err);
+
+	proprow = alloca(sizeof (picl_prophdl_t) * nrows);
+	if (proprow == NULL) {
+		(void) ptree_destroy_prop(proph);
+		return (PICL_FAILURE);
+	}
+
+	for (j = 0; j < nrows; ++j) {
+		len = strlen(strlist) + 1;
+		err = ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+		    PICL_PTYPE_CHARSTRING, PICL_READ, len, name,
+		    NULL, NULL);
+		if (err != PICL_SUCCESS)
+			break;
+		err = ptree_create_prop(&propinfo, strlist, &proprow[j]);
+		if (err != PICL_SUCCESS)
+			break;
+		strlist += len;
+		err = ptree_add_row_to_table(tblh, NCOLS_IN_STRING_TABLE,
+		    &proprow[j]);
+		if (err != PICL_SUCCESS)
+			break;
+	}
+
+	if (err != PICL_SUCCESS) {
+		for (i = 0; i < j; ++i)
+			(void) ptree_destroy_prop(proprow[i]);
+		(void) ptree_delete_prop(proph);
+		(void) ptree_destroy_prop(proph);
+		return (err);
+	}
+
+	return (PICL_SUCCESS);
+}
+
+/*
+ * This function is identical to the one in the picldevtree plugin.
+ * Unfortunately we can't just reuse that code.
+ */
+static void
+add_devinfo_props(picl_nodehdl_t nodeh, di_node_t di_node)
+{
+	int			instance;
+	char			*di_val;
+	di_prop_t		di_prop;
+	int			di_ptype;
+	ptree_propinfo_t	propinfo;
+
+	instance = di_instance(di_node);
+	(void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+	    PICL_PTYPE_INT, PICL_READ, sizeof (instance), PICL_PROP_INSTANCE,
+	    NULL, NULL);
+	(void) ptree_create_and_add_prop(nodeh, &propinfo, &instance, NULL);
+
+	di_val = di_bus_addr(di_node);
+	if (di_val) {
+		(void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+		    PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1,
+		    PICL_PROP_BUS_ADDR, NULL, NULL);
+		(void) ptree_create_and_add_prop(nodeh, &propinfo, di_val,
+		    NULL);
+	}
+
+	di_val = di_binding_name(di_node);
+	if (di_val) {
+		(void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+		    PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1,
+		    PICL_PROP_BINDING_NAME, NULL, NULL);
+		(void) ptree_create_and_add_prop(nodeh, &propinfo, di_val,
+		    NULL);
+	}
+
+	di_val = di_driver_name(di_node);
+	if (di_val) {
+		(void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+		    PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1,
+		    PICL_PROP_DRIVER_NAME, NULL, NULL);
+		(void) ptree_create_and_add_prop(nodeh, &propinfo, di_val,
+		    NULL);
+	}
+
+	di_val = di_devfs_path(di_node);
+	if (di_val) {
+		(void) ptree_init_propinfo(&propinfo, PTREE_PROPINFO_VERSION,
+		    PICL_PTYPE_CHARSTRING, PICL_READ, strlen(di_val) + 1,
+		    PICL_PROP_DEVFS_PATH, NULL, NULL);
+		(void) ptree_create_and_add_prop(nodeh, &propinfo, di_val,
+		    NULL);
+		di_devfs_path_free(di_val);
+	}
+
+	for (di_prop = di_prop_next(di_node, DI_PROP_NIL);
+	    di_prop != DI_PROP_NIL;
+		di_prop = di_prop_next(di_node, di_prop)) {
+
+		di_val = di_prop_name(di_prop);
+		di_ptype = di_prop_type(di_prop);
+		switch (di_ptype) {
+		case DI_PROP_TYPE_BOOLEAN:
+			(void) ptree_init_propinfo(&propinfo,
+			    PTREE_PROPINFO_VERSION, PICL_PTYPE_VOID,
+			    PICL_READ, (size_t)0, di_val, NULL, NULL);
+			(void) ptree_create_and_add_prop(nodeh, &propinfo,
+			    NULL, NULL);
+			break;
+		case DI_PROP_TYPE_INT: {
+			int	*idata;
+			int	len;
+
+			len = di_prop_ints(di_prop, &idata);
+			if (len < 0)
+				/* Recieved error, so ignore prop */
+				break;
+
+			if (len == 1)
+				(void) ptree_init_propinfo(&propinfo,
+				    PTREE_PROPINFO_VERSION, PICL_PTYPE_INT,
+				    PICL_READ, len * sizeof (int), di_val,
+				    NULL, NULL);
+			else
+				(void) ptree_init_propinfo(&propinfo,
+				    PTREE_PROPINFO_VERSION,
+				    PICL_PTYPE_BYTEARRAY, PICL_READ,
+				    len * sizeof (int), di_val,
+				    NULL, NULL);
+
+			(void) ptree_create_and_add_prop(nodeh, &propinfo,
+			    idata, NULL);
+		}
+		break;
+		case DI_PROP_TYPE_STRING: {
+			char	*sdata;
+			int	len;
+
+			len = di_prop_strings(di_prop, &sdata);
+			if (len < 0)
+				break;
+
+			if (len == 1) {
+				(void) ptree_init_propinfo(&propinfo,
+				    PTREE_PROPINFO_VERSION,
+				    PICL_PTYPE_CHARSTRING, PICL_READ,
+				    strlen(sdata) + 1, di_val,
+				    NULL, NULL);
+				(void) ptree_create_and_add_prop(nodeh,
+				    &propinfo, sdata, NULL);
+			} else {
+				(void) add_string_list_prop(nodeh, di_val,
+				    sdata, len);
+			}
+		}
+		break;
+		case DI_PROP_TYPE_BYTE: {
+			int		len;
+			unsigned char *bdata;
+
+			len = di_prop_bytes(di_prop, &bdata);
+			if (len < 0)
+				break;
+			(void) ptree_init_propinfo(&propinfo,
+			    PTREE_PROPINFO_VERSION, PICL_PTYPE_BYTEARRAY,
+			    PICL_READ, len, di_val, NULL, NULL);
+			(void) ptree_create_and_add_prop(nodeh, &propinfo,
+			    bdata, NULL);
+		}
+		break;
+		case DI_PROP_TYPE_UNKNOWN:
+			break;
+		case DI_PROP_TYPE_UNDEF_IT:
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/*
+ * Create a  picl node of type cpu and fill it.
+ * properties are filled from both the device tree and the
+ * Machine description.
+ */
+static int
+construct_cpu_node(picl_nodehdl_t plath, di_node_t dn)
+{
+	int		err;
+	char		*nodename;
+	picl_nodehdl_t	anodeh;
+
+	nodename = di_node_name(dn);	/* PICL_PROP_NAME */
+
+	err = ptree_create_and_add_node(plath, nodename, PICL_CLASS_CPU,
+	    &anodeh);
+	if (err != PICL_SUCCESS)
+		return (err);
+
+	add_devinfo_props(anodeh, dn);
+	(void) add_cpu_prop(anodeh, NULL);
+
+	return (err);
+}
+
+/*
+ * Given a devinfo node find its reg property.
+ */
+static int
+get_reg_prop(di_node_t dn, int **pdata)
+{
+	int dret = 0;
+
+	dret = di_prop_lookup_ints(DDI_DEV_T_ANY, dn, OBP_REG, pdata);
+	if (dret > 0)
+		return (dret);
+
+	if (!ph)
+		return (0);
+	dret = di_prom_prop_lookup_ints(ph, dn, OBP_REG, pdata);
+	return (dret < 0? 0 : dret);
+}
+
+/*
+ * Given a devinfo cpu node find its cpuid property.
+ */
+int
+get_cpuid(di_node_t di_node)
+{
+	int	len;
+	int	*idata;
+	int	dcpuid = -1;
+
+	len = get_reg_prop(di_node, &idata);
+
+	if (len != SUN4V_CPU_REGSIZE)
+		return (dcpuid);
+	if (len == SUN4V_CPU_REGSIZE)
+		dcpuid = CFGHDL_TO_CPUID(idata[0]);
+
+	return (dcpuid);
+}
+
+int
+find_cpu(di_node_t node, int cpuid)
+{
+	int	dcpuid;
+	di_node_t cnode;
+	char	*nodename;
+
+	for (cnode = di_child_node(node); cnode != DI_NODE_NIL;
+	    cnode = di_sibling_node(cnode)) {
+		nodename = di_node_name(cnode);
+		if (nodename == NULL)
+			continue;
+		if (strcmp(nodename, OBP_CPU) == 0) {
+			dcpuid = get_cpuid(cnode);
+			if (dcpuid == cpuid) {
+				return (1);
+			}
+		}
+	}
+	return (0);
+}
+
+/*
+ * Callback to the ptree walk function during remove_cpus.
+ * As a part of the args receives a picl nodeh, searches
+ * the device tree for a cpu whose cpuid matches the picl cpu node.
+ * Sets arg struct's result to 1 if it failed to match and terminates
+ * the walk.
+ */
+static int
+remove_cpu_candidate(picl_nodehdl_t nodeh, void *c_args)
+{
+	di_node_t	di_node;
+	cpu_lookup_t	*cpu_arg;
+	int	err;
+	int	pcpuid;
+	int reg_prop[SUN4V_CPU_REGSIZE];
+
+	if (c_args == NULL)
+		return (PICL_INVALIDARG);
+
+	cpu_arg = c_args;
+	di_node = cpu_arg->di_node;
+
+	err = ptree_get_propval_by_name(nodeh, OBP_REG, reg_prop,
+	    sizeof (reg_prop));
+
+	if (err != PICL_SUCCESS) {
+		return (PICL_WALK_CONTINUE);
+	}
+
+	pcpuid = CFGHDL_TO_CPUID(reg_prop[0]);
+
+	if (!find_cpu(di_node, pcpuid)) {
+		cpu_arg->result = 1;
+		cpu_arg->nodeh = nodeh;
+		return (PICL_WALK_TERMINATE);
+	}
+
+	cpu_arg->result = 0;
+	return (PICL_WALK_CONTINUE);
+}
+
+/*
+ * Given the start node of the device tree.
+ * find all cpus in the picl tree that don't have
+ * device tree counterparts and remove them.
+ */
+static void
+remove_cpus(di_node_t di_start)
+{
+	int		err;
+	picl_nodehdl_t	plath;
+	cpu_lookup_t	cpu_arg;
+
+	err = ptree_get_node_by_path(PLATFORM_PATH, &plath);
+	if (err != PICL_SUCCESS)
+		return;
+
+	do {
+		cpu_arg.di_node = di_start;
+		cpu_arg.nodeh = 0;
+		cpu_arg.result = 0;
+
+		if (ptree_walk_tree_by_class(plath,
+		    PICL_CLASS_CPU, &cpu_arg, remove_cpu_candidate)
+		    != PICL_SUCCESS)
+			return;
+
+		if (cpu_arg.result == 1) {
+			err = ptree_delete_node(cpu_arg.nodeh);
+			if (err == PICL_SUCCESS)
+				ptree_destroy_node(cpu_arg.nodeh);
+		}
+	} while (cpu_arg.result);
+}
+
+/*
+ * Callback to the ptree walk function during add_cpus.
+ * As a part of the args receives a cpu di_node, compares
+ * each picl cpu node's cpuid to the device tree node's cpuid.
+ * Sets arg struct's result to 1 on a match.
+ */
+static int
+cpu_exists(picl_nodehdl_t nodeh, void *c_args)
+{
+	di_node_t	di_node;
+	cpu_lookup_t	*cpu_arg;
+	int	err;
+	int	dcpuid, pcpuid;
+	int reg_prop[4];
+
+	if (c_args == NULL)
+		return (PICL_INVALIDARG);
+
+	cpu_arg = c_args;
+	di_node = cpu_arg->di_node;
+	dcpuid = get_cpuid(di_node);
+
+	err = ptree_get_propval_by_name(nodeh, OBP_REG, reg_prop,
+	    sizeof (reg_prop));
+
+	if (err != PICL_SUCCESS)
+		return (PICL_WALK_CONTINUE);
+
+	pcpuid = CFGHDL_TO_CPUID(reg_prop[0]);
+
+	if (dcpuid == pcpuid) {
+		cpu_arg->result = 1;
+		return (PICL_WALK_TERMINATE);
+	}
+
+	cpu_arg->result = 0;
+	return (PICL_WALK_CONTINUE);
+}
+
+/*
+ * Given the root node of the device tree.
+ * compare it to the picl tree and add to it cpus
+ * that are new.
+ */
+static void
+add_cpus(di_node_t di_node)
+{
+	int		err;
+	di_node_t	cnode;
+	picl_nodehdl_t	plath;
+	cpu_lookup_t	cpu_arg;
+	char		*nodename;
+
+	err = ptree_get_node_by_path(PLATFORM_PATH, &plath);
+	if (err != PICL_SUCCESS)
+		return;
+
+	for (cnode = di_child_node(di_node); cnode != DI_NODE_NIL;
+	    cnode = di_sibling_node(cnode)) {
+		nodename = di_node_name(cnode);
+		if (nodename == NULL)
+			continue;
+		if (strcmp(nodename, OBP_CPU) == 0) {
+			cpu_arg.di_node = cnode;
+
+			if (ptree_walk_tree_by_class(plath,
+			    PICL_CLASS_CPU, &cpu_arg, cpu_exists)
+			    != PICL_SUCCESS)
+				return;
+
+			if (cpu_arg.result == 0)
+				/*
+				 * Didn't find a matching cpu, add it.
+				 */
+				(void) construct_cpu_node(plath,
+				    cnode);
+		}
+	}
+}
+
+/*
+ * Handle DR events. Only supports cpu add and remove.
+ */
+int
+update_devices(char *dev, int op)
+{
+	di_node_t	di_root;
+
+	if ((di_root = di_init("/", DINFOCPYALL)) == DI_NODE_NIL)
+		return (PICL_FAILURE);
+
+	if ((ph = di_prom_init()) == NULL)
+		return (PICL_FAILURE);
+
+	if (op == DEV_ADD) {
+		if (strcmp(dev, OBP_CPU) == 0)
+			add_cpus(di_root);
+	}
+
+	if (op == DEV_REMOVE) {
+		if (strcmp(dev, OBP_CPU) == 0)
+			remove_cpus(di_root);
+	}
+
+	di_fini(di_root);
+	di_prom_fini(ph);
+	return (PICL_SUCCESS);
+}
--- a/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/init.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -43,17 +43,20 @@
 #define	SIZE	8192
 
 static void mdesc_free(void *bufp, size_t size);
+uint8_t *md_bufp;
 
 md_t *
 mdesc_devinit(void)
 {
 	int fh;
-	uint8_t *bufp = NULL;
 	int res;
 	int size;
 	int offset;
 	md_t *mdp;
 
+	if (md_bufp != NULL)
+		return (NULL);
+
 	fh = open(MDESC_PATH, O_RDONLY, 0);
 	if (fh < 0) {
 		return (NULL);
@@ -62,8 +65,8 @@
 	size = SIZE;	/* initial size */
 	offset = 0;
 
-	bufp = malloc(size);
-	if (NULL == bufp) {
+	md_bufp = malloc(size);
+	if (NULL == md_bufp) {
 		return (NULL);
 	}
 
@@ -76,18 +79,18 @@
 
 		while (len < SIZE) {
 			size += SIZE;
-			bufp = realloc(bufp, size);
-			if (NULL == bufp)
+			md_bufp = realloc(md_bufp, size);
+			if (NULL == md_bufp)
 				return (NULL);
 			len = size - offset;
 		}
 
 		do {
-			res = read(fh, bufp+offset, len);
+			res = read(fh, md_bufp + offset, len);
 		} while ((res < 0) && (errno == EAGAIN));
 
 		if (res < 0) {
-			free(bufp);
+			free(md_bufp);
 			return (NULL);
 		}
 
@@ -96,13 +99,13 @@
 
 	(void) close(fh);
 
-	bufp = realloc(bufp, offset);
-	if (NULL == bufp)
+	md_bufp = realloc(md_bufp, offset);
+	if (NULL == md_bufp)
 		return (NULL);
 
-	mdp = md_init_intern((uint64_t *)bufp, malloc, mdesc_free);
+	mdp = md_init_intern((uint64_t *)md_bufp, malloc, mdesc_free);
 	if (NULL == mdp) {
-		free(bufp);
+		free(md_bufp);
 		return (NULL);
 	}
 
@@ -113,5 +116,17 @@
 void
 mdesc_free(void *bufp, size_t size)
 {
-	free(bufp);
+	if (bufp)
+		free(bufp);
 }
+
+void
+mdesc_devfini(md_t *mdp)
+{
+	if (mdp)
+		(void) md_fini(mdp);
+
+	if (md_bufp)
+		free(md_bufp);
+	md_bufp = NULL;
+}
--- a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -50,6 +50,8 @@
 extern int add_cpu_prop(picl_nodehdl_t node, void *args);
 extern int disk_discovery(void);
 extern md_t *mdesc_devinit(void);
+extern void mdesc_devfini(md_t *mdp);
+extern int update_devices(char *dev, int op);
 
 picld_plugin_reg_t mdescplugin_reg = {
 	PICLD_PLUGIN_VERSION_1,
@@ -91,6 +93,65 @@
 }
 
 /*
+ * DR event handler
+ * respond to the picl events:
+ *      PICLEVENT_DR_AP_STATE_CHANGE
+ */
+static void
+dr_handler(const char *ename, const void *earg, size_t size, void *cookie)
+{
+	nvlist_t	*nvlp = NULL;
+	char		*dtype;
+	char		*ap_id;
+	char		*hint;
+
+
+	if (strcmp(ename, PICLEVENT_DR_AP_STATE_CHANGE) != 0) {
+		return;
+	}
+
+	if (nvlist_unpack((char *)earg, size, &nvlp, NULL)) {
+		return;
+	}
+
+	if (nvlist_lookup_string(nvlp, PICLEVENTARG_DATA_TYPE, &dtype)) {
+		nvlist_free(nvlp);
+		return;
+	}
+
+	if (strcmp(dtype, PICLEVENTARG_PICLEVENT_DATA) != 0) {
+		nvlist_free(nvlp);
+		return;
+	}
+
+	if (nvlist_lookup_string(nvlp, PICLEVENTARG_AP_ID, &ap_id)) {
+		nvlist_free(nvlp);
+		return;
+	}
+
+	if (nvlist_lookup_string(nvlp, PICLEVENTARG_HINT, &hint)) {
+		nvlist_free(nvlp);
+		return;
+	}
+
+	mdp = mdesc_devinit();
+	if (mdp == NULL) {
+		nvlist_free(nvlp);
+		return;
+	}
+
+	rootnode = md_root_node(mdp);
+
+	if (strcmp(hint, DR_HINT_INSERT) == 0)
+		(void) update_devices(ap_id, DEV_ADD);
+	else if (strcmp(hint, DR_HINT_REMOVE) == 0)
+		(void) update_devices(ap_id, DEV_REMOVE);
+
+	mdesc_devfini(mdp);
+	nvlist_free(nvlp);
+}
+
+/*
  * Discovery event handler
  * respond to the picl events:
  *      PICLEVENT_SYSEVENT_DEVICE_ADDED
@@ -170,8 +231,10 @@
 	    dsc_handler, NULL);
 	(void) ptree_register_handler(PICLEVENT_SYSEVENT_DEVICE_REMOVED,
 	    dsc_handler, NULL);
+	(void) ptree_register_handler(PICLEVENT_DR_AP_STATE_CHANGE,
+	    dr_handler, NULL);
 
-	(void) md_fini(mdp);
+	mdesc_devfini(mdp);
 }
 
 void
@@ -182,6 +245,8 @@
 	    dsc_handler, NULL);
 	(void) ptree_unregister_handler(PICLEVENT_SYSEVENT_DEVICE_REMOVED,
 	    dsc_handler, NULL);
+	(void) ptree_unregister_handler(PICLEVENT_DR_AP_STATE_CHANGE,
+	    dr_handler, NULL);
 }
 
 void
--- a/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/cmd/picl/plugins/sun4v/mdesc/mdescplugin.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,10 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include <picl.h>
 #include <picltree.h>
 #include <picldefs.h>
@@ -50,6 +54,9 @@
 #include <dirent.h>
 #include <config_admin.h>
 #include <sys/param.h>
+#include <libdevinfo.h>
+#include <sys/systeminfo.h>
+#include <sys/sysevent/dr.h>
 
 #define	MAXSTRLEN 256
 #define	ICACHE_FLAG 0x01
@@ -58,5 +65,13 @@
 #define	DISK_DISCOVERY_NAME "disk_discovery"
 #define	CONFIGURED "configured"
 #define	UNCONFIGURED "unconfigured"
+#define	DEV_ADD		0
+#define	DEV_REMOVE	1
+#define	SUN4V_CPU_REGSIZE	4
+#define	CFGHDL_TO_CPUID(x)	(x  & ~(0xful << 28))
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif	/* _MDESCPLUGIN_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,74 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+PROG =	vntsd
+SRCS =	cmd.c common.c console.c listen.c queue.c read.c vntsd.c vntsdvcc.c \
+        write.c	
+OBJS = $(SRCS:.c=.o)
+
+include ../Makefile.cmd
+
+POFILES = $(SRCS:.c=.po)
+POFILE = $(PROG)_msg.po
+
+MANIFEST = vntsd.xml
+SVCMETHOD = svc-vntsd
+
+CFLAGS += $(CCVERBOSE)
+
+LDLIBS += -lsocket -lnsl
+
+ROOTCMDDIR =	$(ROOTLIB)/ldoms
+ROOTMANIFESTDIR = $(ROOTSVCPLATFORMSUN4V)
+$(ROOTMANIFEST) := FILEMODE = 0444
+
+
+.KEEP_STATE:
+
+all: $(PROG) 
+
+install: all		\
+	$(ROOTCMD)	\
+	$(ROOTMANIFEST) \
+	$(ROOTSVCMETHOD)
+
+$(PROG): $(OBJS)
+	$(LINK.c) $(OBJS) -o $@ $(LDLIBS)
+	$(POST_PROCESS)
+
+$(POFILE): $(POFILES)
+	$(RM) $@
+	$(CAT) $(POFILES) > $@
+
+check: $(CHKMANIFEST)
+
+lint: lint_SRCS
+
+clean:
+	$(RM) $(OBJS)
+
+include ../Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/chars.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,87 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CHARS_H
+#define	_CHARS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	    CR	    13
+#define	    LF	    10
+
+
+/* telnet protocol command support */
+#define	    BEL	    7	    /* not support */
+#define	    BS	    8	    /* supported */
+#define	    HT	    9	    /* eoln */
+#define	    VT	    11	    /* not support */
+#define	    FF	    12	    /* not support */
+#define	    STOP    18
+#define	    START   19
+
+#define	    SE	    240	    /* end of subnegotiation params */
+#define	    NOP	    241
+#define	    DM	    242	    /* Data Mark not support */
+#define	    BRK	    243	    /* terminal support  */
+#define	    IP	    244	    /* control-C */
+#define	    AO	    245	    /* abort output  not support */
+#define	    AYT	    246	    /* Are you there */
+#define	    EC	    247	    /* Erase character - not support */
+#define	    EL	    248	    /* Erase line   - not support */
+#define	    GA	    249	    /* Go ahead. */
+#define	    SB	    250	    /* Subnegotiation of the indicated option */
+#define	    WILL    251	    /* will do */
+#define	    WONT    252	    /* refuse */
+#define	    DO	    253	    /* request do */
+#define	    DONT    254	    /* request do not do */
+#define	    IAC	    255	    /* command */
+
+
+
+/* telnet options */
+
+#define	    TEL_ECHO	1
+#define	    SUPRESS	3
+#define	    STATUS	5
+#define	    TM		6	/* timing mark - not supported */
+#define	    TERM_TYPE	24	/* Terminal type -not supported */
+#define	    WIN_SIZE	31	/*  window size - not supported */
+#define	    TERM_SP	32	/* terminal speed - not supported */
+#define	    FC		33	/* remote flow control - not supported */
+#define	    LINEMODE	34	/* line mode */
+#define	    ENV		36	/* environment variables */
+
+#define	    VNTSD_DAEMON_CMD	'~'
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CHARS_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/cmd.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,486 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Vntsd handles two types of special commands, one is telnet
+ * commands and another is vntsd special commands.
+ * telnet commands supported are:
+ * WILL
+ * WONT
+ * DO
+ * DONT
+ *  TEL_ECHO
+ *  SUPRESS
+ *  LINEMODE
+ * BRK
+ * AYT
+ * HT
+ *
+ * Vntsd special commands are:
+ *  send break		(~#)
+ *  exit		(~.)
+ *  force write access	(~w)
+ *  cycle console down	(~n)
+ *  cycle console up	(~p)
+ *  help		(~?)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <ctype.h>
+#include <sys/termio.h>
+#include <libintl.h>
+#include <syslog.h>
+#include "vntsd.h"
+#include "chars.h"
+
+char vntsd_eol[] = { CR, LF, 0};
+
+typedef	int	    (*e_func_t)(vntsd_client_t *clientp);
+/* structure for daemon special cmd */
+typedef struct {
+	char e_char;				/* char to match on */
+	char *e_help;				/* help string */
+	e_func_t e_func;			/* command */
+} esctable_t;
+
+/* genbrk() -  send a break to vcc driver */
+static int
+genbrk(vntsd_client_t *clientp)
+{
+
+	vntsd_cons_t *consp;
+
+	assert(clientp);
+	assert(clientp->cons);
+
+	consp = clientp->cons;
+	D1(stderr, "t@%d genbrk fd=%d sockfd %d\n", thr_self(),
+	    consp->vcc_fd, clientp->sockfd);
+
+	assert(consp->clientpq != NULL);
+	if (consp->clientpq->handle != clientp) {
+		/* reader */
+		return (vntsd_write_line(clientp,
+			    gettext(VNTSD_NO_WRITE_ACCESS_MSG)));
+	}
+
+	/* writer */
+	if (ioctl(consp->vcc_fd, TCSBRK, NULL)) {
+		return (VNTSD_ERR_VCC_IOCTL);
+	}
+
+	return (VNTSD_STATUS_CONTINUE);
+}
+
+/*
+ * console_forward()  - cycle client to the next console
+ * in the group queue.
+ */
+static int
+console_forward(void)
+{
+	return (VNTSD_STATUS_MOV_CONS_FORWARD);
+}
+
+/*
+ * console_backward()  - cycle client to the previous
+ * console in the group queue.
+ */
+static int
+console_backward(void)
+{
+	return (VNTSD_STATUS_MOV_CONS_BACKWARD);
+}
+
+/* acquire_write() - acquire write access to a console. */
+static int
+acquire_write(vntsd_client_t *clientp)
+{
+	int	rv;
+	int	yes_no = 1;
+	vntsd_cons_t *consp;
+
+	assert(clientp);
+	consp = clientp->cons;
+	assert(consp);
+
+	if (consp->clientpq->handle == clientp) {
+		/* client is a  writer */
+		if ((rv = vntsd_write_line(clientp,
+			    gettext("You have write permission"))) !=
+		    VNTSD_SUCCESS) {
+			return (rv);
+
+		}
+		return (VNTSD_STATUS_CONTINUE);
+	}
+
+	/* message to client */
+	if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN))
+	    != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/*
+	 * TRANSLATION_NOTE
+	 * The following string should be formatted to fit on multiple lines
+	 * assuming a line width of at most 78 characters. There must be no
+	 * trailing newline.
+	 */
+	if ((rv = vntsd_write_lines(clientp,
+			    gettext("Warning: another user currently "
+	    "has write permission\nto this console and forcibly removing "
+	    "him/her will terminate\nany current write action and all work "
+	    "will be lost."))) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/* get client yes no */
+	if ((rv = vntsd_write_client(clientp, vntsd_eol,
+			    VNTSD_EOL_LEN)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = vntsd_get_yes_no(clientp,
+			    gettext("Would you like to continue?"),
+			    &yes_no)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if (yes_no == B_FALSE) {
+		/* client change mind no need to acquire  write access */
+		return (VNTSD_STATUS_CONTINUE);
+	}
+
+	return (VNTSD_STATUS_ACQUIRE_WRITER);
+}
+
+/* client_exit()  - disconnect client from the console. */
+static int
+client_exit(void)
+{
+	return (VNTSD_STATUS_RESELECT_CONS);
+}
+
+static int daemon_cmd_help(vntsd_client_t *clientp);
+
+/* table for daemon commands */
+
+static esctable_t  etable[] = {
+
+	/* send a break to vcc */
+	{'#', "send break",  genbrk},
+
+	/* exit */
+	{'.', "exit from this console",  (e_func_t)client_exit},
+
+	/* acquire write access */
+	{'w', "force write access", acquire_write},
+
+	/* connect to next console in queue */
+	{'n', "console down", (e_func_t)console_forward},
+
+	/* connect to previous console in queue */
+	{'p', "console up", (e_func_t)console_backward},
+
+	/* help must be next to last */
+	{'?', "_", daemon_cmd_help},
+
+	/* table terminator */
+	{0, 0, 0}
+};
+
+void
+vntsd_init_esctable_msgs(void)
+{
+	esctable_t  *p;
+
+	for (p = etable; p->e_char != '\0'; p++) {
+		p->e_help = gettext(p->e_help);
+	}
+}
+
+/* daemon_cmd_help() - print help. */
+static int
+daemon_cmd_help(vntsd_client_t *clientp)
+{
+	esctable_t  *p;
+	int	    rv;
+	char	    buf[VNTSD_LINE_LEN];
+
+	if ((rv = vntsd_write_client(clientp, vntsd_eol,
+			    VNTSD_EOL_LEN)) != VNTSD_SUCCESS) {
+	    return (rv);
+	}
+
+	/*
+	 * TRANSLATION_NOTE
+	 * VNTSD is the name of the VNTS daemon and should not be translated.
+	 */
+	if ((rv = vntsd_write_line(clientp, gettext("VNTSD commands"))) !=
+	    VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	for (p = etable; p->e_char; p++) {
+		(void) snprintf(buf, sizeof (buf),
+				"~%c --%s", p->e_char, p->e_help);
+
+		if ((rv = vntsd_write_line(clientp, buf)) != VNTSD_SUCCESS) {
+			return (rv);
+		}
+	}
+
+	return (VNTSD_STATUS_CONTINUE);
+}
+
+/* exit from daemon command */
+static int
+exit_daemon_cmd(vntsd_client_t *clientp, int rv)
+{
+	(void) mutex_lock(&clientp->lock);
+	clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+	(void) mutex_unlock(&clientp->lock);
+	return (rv);
+}
+
+/* vntsd_process_daemon_cmd() - special commands */
+int
+vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c)
+{
+	esctable_t *p;
+	int	    rv;
+
+	if (c != VNTSD_DAEMON_CMD) {
+		/* not a daemon command */
+		return (VNTSD_SUCCESS);
+	}
+
+	if (clientp->status & VNTSD_CLIENT_DISABLE_DAEMON_CMD) {
+		return (VNTSD_STATUS_CONTINUE);
+	}
+
+	/* no reentry to process_daemon_cmd */
+	(void) mutex_lock(&clientp->lock);
+	clientp->status |= VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+	(void) mutex_unlock(&clientp->lock);
+
+	D3(stderr, "t@%d process_daemon_cmd %d %d \n", thr_self(),
+	    clientp->cons->vcc_fd, clientp->sockfd);
+
+	/* read in command */
+	if ((rv = vntsd_read_char(clientp, &c)) != VNTSD_SUCCESS) {
+		return (exit_daemon_cmd(clientp, rv));
+	}
+
+	for (p = etable; p->e_char; p++) {
+		if (p->e_char == c) {
+			/* found match */
+			assert(p->e_func);
+			rv = (*p->e_func)(clientp);
+			return (exit_daemon_cmd(clientp, rv));
+		}
+	}
+
+	/* no match, print out the help */
+	p--;
+	assert(p->e_char == '?');
+	rv = (*p->e_func)(clientp);
+
+	return (exit_daemon_cmd(clientp, rv));
+
+}
+
+/* vntsd_set_telnet_options() - change  telnet client to  character mode. */
+int
+vntsd_set_telnet_options(int fd)
+{
+	/* set client telnet options */
+	uint8_t buf[] = {IAC, DONT, LINEMODE, IAC, WILL, SUPRESS, IAC, WILL,
+		TEL_ECHO, IAC, DONT, TERM_TYPE, IAC, DONT, TERM_SP,
+		IAC, DONT, STATUS, IAC, DONT, FC, IAC, DONT, TM, IAC, DONT, ENV,
+		IAC, DONT, WIN_SIZE};
+
+	return (vntsd_write_fd(fd, (char *)buf, 30));
+}
+
+/*  vntsd_telnet_cmd() process telnet commands */
+int
+vntsd_telnet_cmd(vntsd_client_t *clientp, char c)
+{
+	uint8_t	buf[4];
+	char	cmd;
+	int	rv = VNTSD_STATUS_CONTINUE;
+
+	bzero(buf, 4);
+
+	if ((uint8_t)c != IAC) {
+		/* not telnet cmd */
+		return (VNTSD_SUCCESS);
+	}
+
+	if ((rv = vntsd_read_char(clientp, &cmd)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = vntsd_read_char(clientp, &c)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+
+	switch ((uint8_t)cmd) {
+
+	case WILL:
+
+		switch ((uint8_t)c) {
+		case TEL_ECHO:
+		case SUPRESS:
+		case LINEMODE:
+			break;
+		default:
+			syslog(LOG_ERR, "not support telnet WILL %x\n", c);
+			break;
+		}
+		break;
+
+	case  WONT:
+
+		switch ((uint8_t)c) {
+		case TEL_ECHO:
+		case SUPRESS:
+		case LINEMODE:
+		default:
+			syslog(LOG_ERR, "not support telnet WONT %x\n", c);
+			break;
+		}
+		break;
+
+	case DO:
+	case DONT:
+
+		buf[0] = IAC;
+		buf[1] = WILL;
+		buf[2] = c;
+		rv = vntsd_write_client(clientp, (char *)buf, 3);
+
+		break;
+
+	case BRK:
+
+		/* send break to vcc */
+		rv = genbrk(clientp);
+		break;
+
+	case IP:
+
+		break;
+
+	case AYT:
+
+		rv = vntsd_write_client(clientp, &c, 1);
+		break;
+
+	case HT:
+		return (VNTSD_STATUS_CONTINUE);
+
+	default:
+		syslog(LOG_ERR, "not support telnet ctrl %x\n", c);
+		break;
+	}
+
+	if (rv == VNTSD_SUCCESS) {
+		return (VNTSD_STATUS_CONTINUE);
+	} else {
+		return (rv);
+	}
+}
+
+
+/*
+ * vntsd_ctrl_cmd()   - control keys
+ * read and write suspend are supported.
+ */
+int
+vntsd_ctrl_cmd(vntsd_client_t *clientp, char c)
+{
+	int	cmd;
+
+	D3(stderr, "t@%d vntsd_ctrl_cmd%d %d\n", thr_self(),
+	    clientp->cons->vcc_fd, clientp->sockfd);
+
+	if ((c != START) && (c != STOP)) {
+		/* not a supported control command */
+		return (VNTSD_SUCCESS);
+	}
+
+	if (c == START) {
+
+		D3(stderr, "t@%d client restart\n", thr_self());
+
+		/* send resume read */
+		cmd = 1;
+
+		if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) {
+			return (VNTSD_STATUS_VCC_IO_ERR);
+		}
+
+		/* send resume write */
+		cmd = 3;
+
+		if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) {
+			return (VNTSD_STATUS_VCC_IO_ERR);
+		}
+	}
+
+	if (c == STOP) {
+		D3(stderr, "t@%d client suspend\n", thr_self());
+
+		/* send suspend read */
+		cmd = 0;
+
+		if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) {
+			return (VNTSD_STATUS_VCC_IO_ERR);
+		}
+
+		/* send suspend write */
+		cmd = 2;
+
+		if (ioctl(clientp->cons->vcc_fd, TCXONC, &cmd)) {
+			perror("ioctl TCXONC");
+			return (VNTSD_STATUS_VCC_IO_ERR);
+		}
+	}
+
+	return (VNTSD_STATUS_CONTINUE);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/common.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,654 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * supporting modules.
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <sys/poll.h>
+#include <wait.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <signal.h>
+#include <ctype.h>
+#include <langinfo.h>
+#include <libintl.h>
+#include <syslog.h>
+#include "vntsd.h"
+#include "chars.h"
+
+/*  vntsd_write_line() - write a line to TCP client */
+int
+vntsd_write_line(vntsd_client_t *clientp, char *line)
+{
+	int rv;
+
+	rv = vntsd_write_client(clientp, line, strlen(line));
+	if (rv == VNTSD_SUCCESS) {
+		rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN);
+	}
+
+	return (rv);
+}
+
+/*  vntsd_write_lines() write one or more lines to client.  */
+int
+vntsd_write_lines(vntsd_client_t *clientp, char *lines)
+{
+	char	*buf;
+	char	*line;
+	char 	*endofline;
+
+	buf = strdup(lines);
+	if (buf == NULL) {
+		return (VNTSD_ERR_NO_MEM);
+	}
+
+	line = buf;
+
+	while ((line != NULL) && (*line != '\0')) {
+
+		endofline = strchr(line, '\n');
+		if (endofline != NULL) {
+			*endofline = '\0';
+		}
+
+		(void) vntsd_write_line(clientp, line);
+
+		if (endofline != NULL)
+			line = endofline + 1;
+		else
+			line = NULL;
+	}
+
+	free(buf);
+	return (VNTSD_SUCCESS);
+}
+
+/* vntsd_get_yes_no() -  read in a "y" or "n" */
+int
+vntsd_get_yes_no(vntsd_client_t *clientp, char *msg, int *yes_no)
+{
+	char	c;
+	char	yesno[8];
+	int	rv;
+
+	/* create [y/n] prompt */
+	(void) snprintf(yesno, sizeof (yesno), "[%c/%c] ",
+	    *nl_langinfo(YESSTR), *nl_langinfo(NOSTR));
+
+	for (; ; ) {
+		if ((rv = vntsd_write_client(clientp, msg, strlen(msg)))
+		    != VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		if ((rv = vntsd_write_client(clientp, yesno, strlen(yesno))) !=
+		    VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		if ((rv = vntsd_read_data(clientp, &c))
+		    != VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		/* echo */
+		if ((rv = vntsd_write_client(clientp, &c, 1)) !=
+		    VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		if ((rv = vntsd_write_client(clientp, vntsd_eol,
+			VNTSD_EOL_LEN)) !=
+		    VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		c = tolower(c);
+
+		if (c == *nl_langinfo(YESSTR)) {
+			*yes_no = B_TRUE;
+			return (VNTSD_SUCCESS);
+		}
+
+		if (c == *nl_langinfo(NOSTR)) {
+			*yes_no = B_FALSE;
+			return (VNTSD_SUCCESS);
+		}
+
+		if ((rv = vntsd_write_line(clientp,
+		    gettext("Invalid response. Try again.")))
+		    != VNTSD_SUCCESS) {
+			return (rv);
+		}
+	}
+
+	/*NOTREACHED*/
+	return (0);
+}
+
+/* vntsd_open_vcc()  -  open a vcc port */
+int
+vntsd_open_vcc(char *dev_name, uint_t cons_no)
+{
+	int	drvfd;
+	int	sz;
+	char	*path;
+	sz = strlen(VCC_DEVICE_PATH) + strlen(dev_name)+1;
+
+	path = calloc(sz, 1);
+
+	if (path == NULL) {
+		return (-1);
+	}
+
+	(void) snprintf(path, sz-1, VCC_DEVICE_PATH, dev_name);
+
+	for (; ; ) {
+		drvfd = open(path, O_RDWR);
+
+		if ((drvfd < 0) && (errno == EAGAIN)) {
+			if (vntsd_vcc_ioctl(VCC_FORCE_CLOSE, cons_no, &cons_no)
+				!= VNTSD_SUCCESS) {
+				break;
+			    }
+		} else {
+			break;
+		}
+	}
+
+
+	if (drvfd < 0) {
+		D1(stderr, "t@%d open_vcc@%s exit\n", thr_self(), dev_name);
+		free(path);
+		return (-1);
+	}
+
+	free(path);
+	return (drvfd);
+}
+
+/* vntsd_cons_by_consno() - match a console structure to cons no */
+boolean_t
+vntsd_cons_by_consno(vntsd_cons_t *consp, int *cons_id)
+{
+	if (consp->status & VNTSD_CONS_DELETED) {
+		return (B_FALSE);
+	}
+	return (consp->cons_no == *cons_id);
+}
+
+/* vntsd_write_client() write to telnet client */
+int
+vntsd_write_client(vntsd_client_t *client, char *buffer, size_t sz)
+{
+	int rv;
+
+
+	/* write to client */
+	rv = vntsd_write_fd(client->sockfd, buffer, sz);
+
+	/* client has output, reset timer */
+	vntsd_reset_timer(client->cons_tid);
+
+	return (rv);
+}
+
+/* vntsd_write_fd() write to tcp socket file descriptor  */
+int
+vntsd_write_fd(int fd, void *buf, size_t sz)
+{
+	int n;
+
+	while (sz > 0) {
+		n = write(fd, buf, sz);
+		if (n < 0) {
+			if (errno == EINTR) {
+				return (VNTSD_STATUS_INTR);
+			}
+
+			return (VNTSD_STATUS_CLIENT_QUIT);
+		}
+
+		if (n == 0) {
+			return (VNTSD_STATUS_CLIENT_QUIT);
+		}
+
+		buf =  (caddr_t)buf + n;
+		sz -= n;
+	}
+	return (VNTSD_SUCCESS);
+
+}
+
+/*
+ * vntsd_read_char() - read a char from TCP client. Returns:
+ * VNTSD_SUCCESS, VNTSD_STATUS_CLIENT_QUIT or VNTSD_STATUS_INTR
+ */
+int
+vntsd_read_char(vntsd_client_t *clientp, char *c)
+{
+	int		n;
+	vntsd_timeout_t tmo;
+	int		rv;
+
+	tmo.tid = thr_self();
+	tmo.minutes = 0;
+	tmo.clientp = clientp;
+
+	/* attach to timer */
+	if ((rv = vntsd_attach_timer(&tmo)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	n = read(clientp->sockfd, c, 1);
+
+	/* detach from timer */
+	if ((rv = vntsd_detach_timer(&tmo)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if (n == 1) {
+		return (VNTSD_SUCCESS);
+	}
+
+	if (n == 0) {
+		return (VNTSD_STATUS_CLIENT_QUIT);
+	}
+
+	/*
+	 * read error or wake up by signal, either console is being removed or
+	 * timeout occurs.
+	 */
+	if (errno == EINTR) {
+		return (VNTSD_STATUS_INTR);
+	}
+
+	/* any other error, we close client */
+	return (VNTSD_STATUS_CLIENT_QUIT);
+}
+
+/*
+ * vntsd_read_data() -  handle special commands
+ * such as telnet, daemon and ctrl cmds. Returns:
+ * from vntsd_read_char:
+ *	    VNTSD_STATUS_CLIENT_QUIT
+ *	    VNTSD_STATUS_INTR
+ * from vnts_process_daemon_cmd:
+ *	    VNTSD_STATUS_RESELECT_CONS
+ *	    VNTSD_STATUS_MOV_CONS_FORWARD
+ *	    VNTSD_STATUS_MOV_CONS_BACKWARD
+ *	    VNTSD_STATUS_ACQUIRE_WRITER
+ *	    VNTSD_STATUS_CONTINUE
+ * from vntsd_telnet_cmd
+ *	    VNTSD_STATUS_CONTINUE
+ */
+int
+vntsd_read_data(vntsd_client_t *clientp, char *c)
+{
+	int rv;
+
+	for (; ; ) {
+		if ((rv = vntsd_read_char(clientp, c)) != VNTSD_SUCCESS) {
+		    return (rv);
+		}
+
+		/* daemon cmd? */
+		rv = vntsd_process_daemon_cmd(clientp, *c);
+
+		if (rv == VNTSD_SUCCESS) {
+			/* telnet cmd? */
+			rv = vntsd_telnet_cmd(clientp, *c);
+		}
+
+		if (rv == VNTSD_STATUS_CONTINUE) {
+			continue;
+		}
+
+		return (rv);
+	}
+
+	/*NOTREACHED*/
+	return (0);
+}
+/* vntsd_read_line() -  read a line from TCP client */
+int
+vntsd_read_line(vntsd_client_t *clientp, char *buf, int *in_sz)
+{
+	char	c;
+	int	rv;
+	int	out_sz = 0;
+
+
+	for (; ; ) {
+
+		if ((rv =  vntsd_read_data(clientp, &c)) !=  VNTSD_SUCCESS) {
+		    return (rv);
+		}
+
+		if (c == BS) {
+			/* back */
+			if ((rv = vntsd_write_client(clientp, &c, 1)) !=
+			    VNTSD_SUCCESS) {
+				return (rv);
+			}
+
+			c = ' ';
+			if ((rv = vntsd_write_client(clientp, &c, 1)) !=
+			    VNTSD_SUCCESS) {
+				return (rv);
+			}
+
+			buf--;
+			out_sz--;
+			continue;
+		}
+		/* echo */
+		if ((rv = vntsd_write_client(clientp, &c, 1)) !=
+		    VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		*buf++ = c;
+		out_sz++;
+
+		if (c == CR) {
+			/* end of line */
+			*in_sz = out_sz;
+			return (VNTSD_SUCCESS);
+		}
+
+		if (out_sz == *in_sz) {
+			return (VNTSD_SUCCESS);
+		}
+	}
+
+	/*NOTREACHED*/
+	return (0);
+}
+
+/* free a client */
+void
+vntsd_free_client(vntsd_client_t *clientp)
+{
+
+	if (clientp->sockfd != -1) {
+		(void) close(clientp->sockfd);
+	}
+
+	(void) mutex_destroy(&clientp->lock);
+
+	free(clientp);
+}
+
+
+/* check if a vcc console port still ok */
+boolean_t
+vntsd_vcc_cons_alive(vntsd_cons_t *consp)
+{
+	vcc_console_t	vcc_cons;
+	int		rv;
+
+	assert(consp);
+	assert(consp->group);
+
+	/* construct current configuration */
+	(void) strncpy(vcc_cons.domain_name, consp->domain_name, MAXPATHLEN);
+	(void) strncpy(vcc_cons.group_name, consp->group->group_name,
+	    MAXPATHLEN);
+	vcc_cons.tcp_port = consp->group->tcp_port;
+	vcc_cons.cons_no   = consp->cons_no;
+
+	/* call vcc to verify */
+	rv = vntsd_vcc_ioctl(VCC_CONS_STATUS, consp->cons_no, &vcc_cons);
+	if (rv != VNTSD_SUCCESS) {
+		return (B_FALSE);
+	}
+
+	if (vcc_cons.cons_no == -1) {
+		/* port is gone */
+		return (B_FALSE);
+	}
+
+	/* port is ok */
+	return (B_TRUE);
+
+}
+
+/* add to total if a console is alive  */
+static boolean_t
+total_cons(vntsd_cons_t *consp, int *num_cons)
+{
+	int rv;
+
+	assert(consp->group);
+	rv = vntsd_vcc_err(consp);
+	if (rv == VNTSD_STATUS_CONTINUE) {
+		(*num_cons)++;
+	}
+	return (B_FALSE);
+}
+
+
+/* total alive consoles in a group  */
+int
+vntsd_chk_group_total_cons(vntsd_group_t *groupp)
+{
+	uint_t num_cons = 0;
+
+	(void) vntsd_que_find(groupp->conspq, (compare_func_t)total_cons,
+	    &num_cons);
+	return (num_cons);
+}
+
+/* vntsd_log() log function for errors */
+void
+vntsd_log(vntsd_status_t status, char *msg)
+{
+	char	*status_msg = NULL;
+	int	critical = 0;
+
+	switch (status) {
+
+	case VNTSD_SUCCESS:
+		status_msg = "STATUS_OK";
+		break;
+
+	case VNTSD_STATUS_CONTINUE:
+		status_msg = "CONTINUE";
+		break;
+
+	case VNTSD_STATUS_EXIT_SIG:
+		critical = 1;
+		status_msg = "KILL SIGNAL RECV";
+		break;
+
+	case VNTSD_STATUS_SIG:
+		status_msg = "SIG RECV";
+		break;
+
+	case VNTSD_STATUS_NO_HOST_NAME:
+		status_msg = "Warining NO HOST NAME";
+		break;
+
+	case VNTSD_STATUS_CLIENT_QUIT:
+		status_msg = "CLIENT CLOSED  GROUP CONNECTION";
+		break;
+
+	case VNTSD_STATUS_RESELECT_CONS:
+		status_msg = "CLIENT RESELECTS CONSOLE";
+		break;
+
+	case VNTSD_STATUS_VCC_IO_ERR:
+		status_msg = "CONSOLE WAS DELETED";
+		break;
+
+	case VNTSD_STATUS_MOV_CONS_FORWARD:
+		status_msg = "MOVE CONSOLE FORWARD";
+		break;
+
+	case VNTSD_STATUS_MOV_CONS_BACKWARD:
+		status_msg = "MOVE CONSOLE BACKWARD";
+		break;
+
+	case VNTSD_STATUS_ACQUIRE_WRITER:
+		status_msg = "FORCE CONSOLE WRITE";
+		break;
+
+	case VNTSD_STATUS_INTR:
+		status_msg = "RECV SIGNAL";
+		break;
+
+	case VNTSD_STATUS_DISCONN_CONS:
+		status_msg = "DELETING CONSOLE";
+		break;
+
+	case VNTSD_STATUS_NO_CONS:
+		status_msg = "GROUP HAS NO CONSOLE";
+		break;
+
+	case VNTSD_ERR_NO_MEM:
+		critical = 1;
+		status_msg = "NO MEMORY";
+		break;
+
+	case VNTSD_ERR_NO_DRV:
+		critical = 1;
+		status_msg = "NO VCC DRIVER";
+		break;
+
+	case VNTSD_ERR_WRITE_CLIENT:
+		status_msg  =  "WRITE CLIENT ERR";
+		break;
+
+	case VNTSD_ERR_EL_NOT_FOUND:
+		critical = 1;
+		status_msg = "ELEMENT_NOT_FOUND";
+		break;
+
+	case VNTSD_ERR_VCC_CTRL_DATA:
+		critical = 1;
+		status_msg = "VCC CTRL DATA  ERROR";
+		break;
+
+	case VNTSD_ERR_VCC_POLL:
+		critical = 1;
+		status_msg = "VCC POLL ERROR";
+		break;
+
+	case VNTSD_ERR_VCC_IOCTL:
+		critical = 1;
+		status_msg = "VCC IOCTL ERROR";
+		break;
+
+	case VNTSD_ERR_VCC_GRP_NAME:
+		critical = 1;
+		status_msg = "VCC GROUP NAME ERROR";
+		break;
+
+	case VNTSD_ERR_CREATE_LISTEN_THR:
+		critical = 1;
+		status_msg = "FAIL TO CREATE LISTEN THREAD";
+		break;
+
+	case VNTSD_ERR_CREATE_WR_THR:
+		critical = 1;
+		status_msg = "FAIL TO CREATE WRITE THREAD";
+		break;
+
+	case VNTSD_ERR_ADD_CONS_FAILED:
+		critical = 1;
+		status_msg = "FAIL TO ADD A CONSOLE";
+		break;
+
+	case VNTSD_ERR_LISTEN_SOCKET:
+		critical = 1;
+		status_msg = "LISTEN SOCKET ERROR";
+		break;
+
+	case VNTSD_ERR_LISTEN_OPTS:
+		critical = 1;
+		status_msg = "SET SOCKET OPTIONS ERROR";
+		break;
+
+	case VNTSD_ERR_LISTEN_BIND:
+		critical = 1;
+		status_msg = "BIND SOCKET ERROR";
+		break;
+
+	case VNTSD_STATUS_ACCEPT_ERR:
+		critical = 1;
+		status_msg = "LISTEN ACCEPT ERROR";
+		break;
+
+	case VNTSD_ERR_CREATE_CONS_THR:
+		critical = 1;
+		status_msg = "CREATE CONSOLE THREAD ERROR ";
+		break;
+
+	case VNTSD_ERR_SIG:
+		critical = 1;
+		status_msg = "RECV UNKNOWN SIG";
+		break;
+
+	case VNTSD_ERR_UNKNOWN_CMD:
+		critical = 1;
+		status_msg = "RECV UNKNOWN COMMAND";
+		break;
+
+	case VNTSD_ERR_CLIENT_TIMEOUT:
+		status_msg  =  "CLOSE CLIENT BECAUSE TIMEOUT";
+		break;
+	default:
+		status_msg = "Unknown status recv";
+		break;
+	}
+
+
+	if (critical) {
+		syslog(LOG_ERR, "%s: thread[%d] %s\n", status_msg,
+		    thr_self(), msg);
+	}
+#ifdef DEBUG
+	DERR(stderr, "%s: thread[%d] %s\n", status_msg,
+		    thr_self(), msg);
+	syslog(LOG_ERR, "%s: thread[%d] %s\n", status_msg, thr_self(), msg);
+#endif
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/console.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,721 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * The listen thread creates a console thread whenever a tcp client
+ * makes a connection to its port. In the console thread, if there are
+ * multiple consoles in the group, the client is asked to select a console.
+ * A write thread for a console is created when the first client connects
+ * to a selected console, and the console thread becomes the read thread.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <synch.h>
+#include <signal.h>
+#include <assert.h>
+#include <ctype.h>
+#include <syslog.h>
+#include <libintl.h>
+#include <netdb.h>
+#include "vntsd.h"
+#include "chars.h"
+
+/* display domain names in the group */
+static boolean_t
+display_domain_name(vntsd_cons_t *consp,  int  *fd)
+{
+	char	buf[VNTSD_LINE_LEN];
+	char	*status;
+
+
+	if (consp->clientpq != NULL) {
+		status = gettext("connected");
+	} else if (consp->status & VNTSD_CONS_DELETED) {
+		status = gettext("removing...");
+	} else {
+		status = gettext("online");
+	}
+
+	(void) snprintf(buf, sizeof (buf), "%-20d%-30s%-25s%s",
+	    consp->cons_no, consp->domain_name, status, vntsd_eol);
+
+	return (vntsd_write_fd(*fd, buf, strlen(buf)) != VNTSD_SUCCESS);
+}
+
+/* output connected message to tcp client */
+static int
+write_connect_msg(vntsd_client_t *clientp, char *group_name,
+    char *domain_name)
+{
+
+	int	rv = VNTSD_SUCCESS;
+	char	buf[VNTSD_LINE_LEN];
+
+	if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN)) !=
+	    VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	(void) snprintf(buf, sizeof (buf),
+	    gettext("Connecting to console \"%s\" in group \"%s\" ...."),
+	    domain_name, group_name);
+
+	if ((rv = vntsd_write_line(clientp, buf)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = vntsd_write_line(clientp,
+			    gettext("Press ~? for control options .."))) !=
+	    VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	return (VNTSD_SUCCESS);
+}
+
+static int
+create_write_thread(vntsd_cons_t *consp)
+{
+
+	assert(consp);
+
+	/* create write thread for the console */
+	(void) mutex_lock(&consp->lock);
+	if (thr_create(NULL, 0, (thr_func_t)vntsd_write_thread,
+		    (void *)consp, NULL, &consp->wr_tid)) {
+
+		DERR(stderr, "t@%d create_rd_wr_thread@%d: "
+		    "create write thread failed\n",
+		    thr_self(), consp->cons_no);
+		(void) close(consp->vcc_fd);
+		consp->vcc_fd = -1;
+		(void) mutex_unlock(&consp->lock);
+
+		return (VNTSD_ERR_CREATE_WR_THR);
+	}
+	(void) mutex_unlock(&consp->lock);
+	return (VNTSD_SUCCESS);
+}
+
+/* Display all domain consoles in a group. */
+static int
+list_all_domains(vntsd_group_t *groupp, vntsd_client_t *clientp)
+{
+	char	    vntsd_line[VNTSD_LINE_LEN];
+	int	    rv = VNTSD_SUCCESS;
+
+	if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN))
+	    != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/*
+	 * TRANSLATION_NOTE
+	 * The following three strings of the form "DOMAIN .." are table
+	 * headers and should be all uppercase.
+	 */
+	(void) snprintf(vntsd_line, sizeof (vntsd_line),
+	    "%-20s%-30s%-25s",
+	    gettext("DOMAIN ID"), gettext("DOMAIN NAME"),
+	    gettext("DOMAIN STATE"));
+
+	if ((rv = vntsd_write_line(clientp, vntsd_line)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	(void) mutex_lock(&groupp->lock);
+
+	if (vntsd_que_find(groupp->conspq, (compare_func_t)display_domain_name,
+		    &(clientp->sockfd)) != NULL) {
+		rv = VNTSD_ERR_WRITE_CLIENT;
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+
+	return (rv);
+}
+
+/* display help */
+static int
+display_help(vntsd_client_t *clientp)
+{
+	int	rv = VNTSD_SUCCESS;
+	char	*bufp;
+
+	if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN))
+		!= VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/*
+	 * TRANSLATION_NOTE
+	 * The following three strings of the form ". -- ..." are help
+	 * messages for single character commands. Do not translate the
+	 * character before the --.
+	 */
+	bufp = gettext("h -- this help)");
+
+	if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	bufp = gettext("l -- list of consoles");
+
+	if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	bufp = gettext("q -- quit");
+
+	if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/*
+	 * TRANSLATION_NOTE
+	 * In the following string, "id" is a short mnemonic for
+	 * "identifier" and both occurrences should be translated.
+	 */
+
+	bufp = gettext("[c[c ]]{id} -- connect to console of domain {id}");
+
+	if ((rv = vntsd_write_line(clientp, bufp)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	return (VNTSD_SUCCESS);
+}
+
+/* select a console to connect */
+static int
+select_cons(vntsd_group_t *groupp, int num_cons, vntsd_cons_t **consp,
+    vntsd_client_t *clientp, char c)
+{
+	int	    cons_no = -2;
+	int	    n;
+	int	    i;
+	char	    buf[VNTSD_LINE_LEN];
+	int	    rv;
+
+
+
+	(void) mutex_lock(&groupp->lock);
+	if (groupp->num_cons == 0) {
+		(void) mutex_unlock(&groupp->lock);
+		/* no console in this group */
+		return (VNTSD_STATUS_NO_CONS);
+	}
+	(void) mutex_unlock(&groupp->lock);
+
+	if (num_cons == 1) {
+		/* bypass console selection */
+		*consp = (vntsd_cons_t *)(groupp->conspq->handle);
+		return (VNTSD_SUCCESS);
+	}
+
+
+	if (isdigit(c)) {
+		/* {id} input */
+		cons_no = c - '0';
+	} else if (c == 'c') {
+		/* c{id} or c {id} input */
+		cons_no = -1;
+	} else if (!isspace(c)) {
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+
+	/* get client selections */
+	n = VNTSD_LINE_LEN;
+
+	if ((rv = vntsd_read_line(clientp, buf, &n)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/* parse command */
+	for (i = 0; i < n; i++) {
+		if (cons_no == -1) {
+			/* c{id} */
+			cons_no = atoi(buf + i);
+			break;
+		}
+
+		if (isspace(buf[i]) && cons_no == -2) {
+			/* skip space */
+			continue;
+		}
+
+		if (buf[i] == 'c') {
+			/* c{id} or c {id} */
+			cons_no = -1;
+		} else if (buf[i] == CR) {
+			break;
+		} else {
+			return (VNTSD_ERR_INVALID_INPUT);
+		}
+	}
+
+	if (cons_no < 0) {
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+
+	/* get selected console */
+	(void) mutex_lock(&groupp->lock);
+
+	*consp = (vntsd_cons_t *)vntsd_que_find(groupp->conspq,
+		    (compare_func_t)vntsd_cons_by_consno, &cons_no);
+
+	if (*consp == NULL) {
+		/* during console selection, the console has been deleted */
+		(void) mutex_unlock(&groupp->lock);
+
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+	if ((*consp)->status & VNTSD_CONS_DELETED) {
+		return (VNTSD_ERR_INVALID_INPUT);
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+
+	return (VNTSD_SUCCESS);
+}
+
+/* check if there is a matching console in the group */
+static boolean_t
+find_cons_in_group(vntsd_cons_t *consp_in_group, vntsd_cons_t *consp)
+{
+	if (consp_in_group == consp) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/* connect a client to a console */
+static int
+connect_cons(vntsd_cons_t *consp, vntsd_client_t *clientp)
+{
+	int	rv, rv1;
+	vntsd_group_t *groupp;
+
+	assert(consp);
+	groupp = consp->group;
+	assert(groupp);
+	assert(clientp);
+
+	(void) mutex_lock(&groupp->lock);
+
+	/* check if console is valid */
+	consp = vntsd_que_find(groupp->conspq,
+	    (compare_func_t)find_cons_in_group, consp);
+
+	if (consp == NULL) {
+		(void) mutex_unlock(&groupp->lock);
+		return (VNTSD_STATUS_NO_CONS);
+	}
+	if (consp->status & VNTSD_CONS_DELETED) {
+		(void) mutex_unlock(&groupp->lock);
+		return (VNTSD_STATUS_NO_CONS);
+	}
+
+	(void) mutex_lock(&consp->lock);
+	(void) mutex_lock(&clientp->lock);
+
+
+	clientp->cons = consp;
+
+	/* enable daemon cmd */
+	clientp->status &= ~VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+
+	if (consp->clientpq == NULL) {
+		/* first connect to console - a writer */
+		assert(consp->vcc_fd == -1);
+		/* open vcc */
+		consp->vcc_fd = vntsd_open_vcc(consp->dev_name, consp->cons_no);
+		if (consp->vcc_fd < 0) {
+			(void) mutex_unlock(&clientp->lock);
+			(void) mutex_unlock(&consp->lock);
+			(void) mutex_unlock(&groupp->lock);
+			assert(consp->group);
+			return (vntsd_vcc_err(consp));
+		}
+	}
+
+	(void) mutex_unlock(&clientp->lock);
+
+	/*
+	 * move the client from group's no console selected queue
+	 * to cons queue
+	 */
+
+	rv = vntsd_que_rm(&groupp->no_cons_clientpq, clientp);
+	assert(rv == VNTSD_SUCCESS);
+
+	rv = vntsd_que_append(&consp->clientpq, clientp);
+	(void) mutex_unlock(&groupp->lock);
+
+	if (rv != VNTSD_SUCCESS) {
+		if (consp->clientpq->handle == clientp) {
+			/* writer */
+			(void) close(consp->vcc_fd);
+			consp->vcc_fd = -1;
+		}
+
+		(void) mutex_unlock(&consp->lock);
+		return (rv);
+	}
+
+	(void) mutex_unlock(&consp->lock);
+
+	if (consp->clientpq->handle == clientp) {
+		/* create a write thread */
+		rv = create_write_thread(consp);
+		if (rv != VNTSD_SUCCESS) {
+			return (rv);
+		}
+	}
+
+	/* write connecting message */
+	if ((rv = write_connect_msg(clientp, consp->group->group_name,
+	    consp->domain_name)) != VNTSD_SUCCESS) {
+			return (rv);
+	}
+
+	/* process input from client */
+	rv = vntsd_read(clientp);
+
+	/* client disconnected from the console */
+	(void) mutex_lock(&groupp->lock);
+
+	/* remove client from console queue */
+	(void) mutex_lock(&consp->lock);
+	rv1 = vntsd_que_rm(&consp->clientpq, clientp);
+	assert(rv1 == VNTSD_SUCCESS);
+
+	/* append client to group's no-console-selected queue */
+	rv1 = vntsd_que_append(&groupp->no_cons_clientpq, clientp);
+	(void) mutex_unlock(&groupp->lock);
+
+	if (consp->clientpq == NULL) {
+		/* clean up console since there is no client connected to it */
+		assert(consp->vcc_fd != -1);
+
+		/* close vcc port */
+		(void) close(consp->vcc_fd);
+		consp->vcc_fd = -1;
+
+		/* force write thread to exit */
+		assert(consp->wr_tid != (thread_t)-1);
+		(void) thr_kill(consp->wr_tid, SIGUSR1);
+		(void) mutex_unlock(&consp->lock);
+		(void) thr_join(consp->wr_tid, NULL, NULL);
+		(void) mutex_lock(&consp->lock);
+	}
+
+	if (consp->status & VNTSD_CONS_SIG_WAIT) {
+		/* console is waiting for client to disconnect */
+		(void) cond_signal(&consp->cvp);
+	}
+
+	(void) mutex_unlock(&consp->lock);
+
+	return (rv1 == VNTSD_SUCCESS ? rv : rv1);
+
+}
+
+/* read command line input */
+static int
+read_cmd(vntsd_client_t *clientp, char *prompt, char *cmd)
+{
+	int		rv;
+
+	/* disable daemon special command */
+	(void) mutex_lock(&clientp->lock);
+	clientp->status |= VNTSD_CLIENT_DISABLE_DAEMON_CMD;
+	(void) mutex_unlock(&clientp->lock);
+
+	if ((rv = vntsd_write_client(clientp, vntsd_eol, VNTSD_EOL_LEN))
+	    != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = vntsd_write_client(clientp, prompt, strlen(prompt)))
+		!= VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	if ((rv = vntsd_read_data(clientp, cmd)) != VNTSD_SUCCESS) {
+		return (rv);
+	}
+	if (*cmd == BS) {
+		return (VNTSD_SUCCESS);
+	}
+
+	rv = vntsd_write_client(clientp, cmd, 1);
+
+	*cmd = tolower(*cmd);
+
+	return (rv);
+}
+
+/* reset client for selecting a console in the group */
+static void
+client_init(vntsd_client_t *clientp)
+{
+	(void) mutex_lock(&clientp->lock);
+	clientp->cons = NULL;
+	clientp->status = 0;
+	(void) mutex_unlock(&clientp->lock);
+}
+
+/* clean up client and exit the thread */
+static void
+client_fini(vntsd_group_t *groupp, vntsd_client_t *clientp)
+{
+
+	assert(groupp);
+	assert(clientp);
+
+	/* disconnect client from tcp port */
+	assert(clientp->sockfd != -1);
+	(void) close(clientp->sockfd);
+
+	(void) mutex_lock(&groupp->lock);
+	(void) vntsd_que_rm(&groupp->no_cons_clientpq, clientp);
+
+	if ((groupp->no_cons_clientpq == NULL) &&
+	    (groupp->status & VNTSD_GROUP_SIG_WAIT)) {
+		/* group is waiting to be deleted */
+		groupp->status &= ~VNTSD_GROUP_SIG_WAIT;
+		(void) cond_signal(&groupp->cvp);
+	}
+	(void) mutex_unlock(&groupp->lock);
+
+	(void) mutex_destroy(&clientp->lock);
+	free(clientp);
+
+	thr_exit(0);
+}
+
+/*  check client's status. exit if client quits or fatal errors */
+static void
+console_chk_status(vntsd_group_t *groupp, vntsd_client_t *clientp, int status)
+{
+	char    err_msg[VNTSD_LINE_LEN];
+
+	D1(stderr, "t@%d console_chk_status() status=%d "
+	    "client status=%x num consoles=%d \n",
+	    thr_self(), status, clientp->status, groupp->num_cons);
+
+	(void) snprintf(err_msg, VNTSD_LINE_LEN, "console_chk_status client%d"
+	    " num_cos=%d", clientp->sockfd, groupp->num_cons);
+
+	if (groupp->num_cons == 0) {
+		/* no more console in the group */
+		client_fini(groupp, clientp);
+	}
+
+	if (status == VNTSD_STATUS_INTR) {
+		/* reason for signal? */
+		status = vntsd_cons_chk_intr(clientp);
+	}
+
+	switch (status) {
+
+	case VNTSD_STATUS_CLIENT_QUIT:
+		client_fini(groupp, clientp);
+		return;
+
+	case VNTSD_STATUS_RESELECT_CONS:
+		assert(clientp->cons);
+		if ((groupp->num_cons == 1) &&
+		    (groupp->conspq->handle == clientp->cons)) {
+			/* no other selection available */
+			client_fini(groupp, clientp);
+		} else {
+			client_init(clientp);
+		}
+		return;
+
+	case VNTSD_STATUS_VCC_IO_ERR:
+		if ((clientp->status & VNTSD_CLIENT_CONS_DELETED) == 0) {
+			/* check if console was deleted  */
+			status = vntsd_vcc_err(clientp->cons);
+		}
+
+		if (status != VNTSD_STATUS_CONTINUE) {
+			/* console was deleted */
+			if (groupp->num_cons == 1) {
+				client_fini(groupp, clientp);
+			}
+		}
+
+		/* console is ok */
+		client_init(clientp);
+		return;
+
+	case VNTSD_STATUS_MOV_CONS_FORWARD:
+	case VNTSD_STATUS_MOV_CONS_BACKWARD:
+		if (groupp->num_cons == 1) {
+			/* same console */
+			return;
+		}
+
+		/* get selected console */
+		(void) mutex_lock(&(clientp->cons->group->lock));
+		clientp->cons = vntsd_que_pos(clientp->cons->group->conspq,
+		    clientp->cons,
+		    (status == VNTSD_STATUS_MOV_CONS_FORWARD)?(1):(-1));
+		(void) mutex_unlock(&(clientp->cons->group->lock));
+		return;
+
+	case VNTSD_SUCCESS:
+	case VNTSD_STATUS_CONTINUE:
+	case VNTSD_STATUS_NO_CONS:
+		client_init(clientp);
+		return;
+
+	case VNTSD_ERR_INVALID_INPUT:
+		return;
+
+	default:
+		/* fatal error */
+		vntsd_log(status, err_msg);
+		client_fini(groupp, clientp);
+		return;
+	}
+}
+
+/* console thread */
+void *
+vntsd_console_thread(vntsd_thr_arg_t *argp)
+{
+	vntsd_group_t	    *groupp;
+	vntsd_cons_t	    *consp;
+	vntsd_client_t	    *clientp;
+
+	char		    buf[MAXHOSTNAMELEN];
+	char		    prompt[72];
+	char		    cmd;
+	int		    rv = VNTSD_SUCCESS;
+	int		    num_cons;
+
+
+	groupp = (vntsd_group_t *)argp->handle;
+	clientp = (vntsd_client_t *)argp->arg;
+
+	assert(groupp);
+	assert(clientp);
+
+	/* check if group is removed */
+
+	D1(stderr, "t@%d get_client_sel@%lld:client@%d\n", thr_self(),
+	    groupp->tcp_port, clientp->sockfd);
+
+	bzero(buf, MAXHOSTNAMELEN);
+
+	/* host name */
+	if (gethostname(buf, MAXHOSTNAMELEN)) {
+		vntsd_log(VNTSD_STATUS_NO_HOST_NAME, "vntsd_console_thread()");
+		(void) snprintf(buf, sizeof (buf), "unkown host");
+	}
+
+	if (snprintf(prompt, sizeof (prompt),
+		    "%s-vnts-%s: h,l,{id},c{id},c {id},q:",
+	    buf, groupp->group_name) >= sizeof (prompt)) {
+		/* long prompt doesn't fit, use short one */
+		(void) snprintf(prompt, sizeof (prompt),
+				"vnts: h,l,{id},c{id},c {id}, q:");
+	}
+
+
+	for (;;) {
+		cmd = ' ';
+		D1(stderr, "t@%d console_thread()@%lld:client@%d\n", thr_self(),
+		    groupp->tcp_port, clientp->sockfd);
+
+		num_cons = vntsd_chk_group_total_cons(groupp);
+
+		if ((num_cons > 1) && (clientp->cons == NULL)) {
+			/*  console to connect to */
+			rv = read_cmd(clientp, prompt, &cmd);
+			/* check error and may exit */
+			console_chk_status(groupp, clientp, rv);
+		}
+
+		switch (cmd) {
+
+		case 'l':
+
+			/* list domain names */
+			rv = list_all_domains(groupp, clientp);
+			break;
+
+
+		case 'q':
+
+			rv = VNTSD_STATUS_CLIENT_QUIT;
+			break;
+
+		case 'h':
+			rv = display_help(clientp);
+			break;
+
+		default:
+			/* select console */
+			if (clientp->cons == NULL) {
+				rv = select_cons(groupp, num_cons,
+				    &consp, clientp, cmd);
+				if (rv == VNTSD_ERR_INVALID_INPUT) {
+					rv = display_help(clientp);
+					break;
+				}
+			} else {
+				consp = clientp->cons;
+			}
+			assert(consp);
+
+			/* connect to console */
+			rv = connect_cons(consp, clientp);
+			D1(stderr, "t@%d console_thread()"
+			    "connect_cons returns %d\n",
+			    thr_self(), rv);
+			break;
+
+		}
+		/* check error and may  exit */
+		console_chk_status(groupp, clientp, rv);
+	}
+
+	/*NOTREACHED*/
+	return (NULL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/listen.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,285 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Each group has a listen thread. It is created at the time
+ * of a group creation and destroyed when a group does not have
+ * any console associated with it.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <assert.h>
+#include <signal.h>
+#include <ctype.h>
+#include <syslog.h>
+#include "vntsd.h"
+
+/*
+ * check the state of the listen thread. exit if there is a fatal error
+ * or the group is removed.
+ */
+static void
+listen_chk_status(vntsd_group_t *groupp, int status)
+{
+	char	    err_msg[VNTSD_LINE_LEN];
+
+
+	D1(stderr, "t@%d listen_chk_status() status=%d group=%s "
+	    "tcp=%lld group status = %x\n", thr_self(), status,
+	    groupp->group_name, groupp->tcp_port, groupp->status);
+
+	(void) snprintf(err_msg, sizeof (err_msg),
+	    "Group:%s TCP port %lld status %x",
+	    groupp->group_name, groupp->tcp_port, groupp->status);
+
+
+	switch (status) {
+
+	case VNTSD_SUCCESS:
+		return;
+
+	case VNTSD_STATUS_INTR:
+		assert(groupp->status & VNTSD_GROUP_SIG_WAIT);
+		/* close listen socket */
+		(void) mutex_lock(&groupp->lock);
+		(void) close(groupp->sockfd);
+		groupp->sockfd = -1;
+
+		/* let group know */
+		groupp->status &= ~VNTSD_GROUP_SIG_WAIT;
+		(void) cond_signal(&groupp->cvp);
+
+		(void) mutex_unlock(&groupp->lock);
+		/* exit thread */
+		thr_exit(0);
+		break;
+
+	case VNTSD_STATUS_ACCEPT_ERR:
+		return;
+
+	case VNTSD_STATUS_NO_CONS:
+	default:
+		/* fatal, exit thread */
+
+		(void) mutex_lock(&groupp->lock);
+		(void) close(groupp->sockfd);
+		groupp->sockfd = -1;
+		(void) mutex_unlock(&groupp->lock);
+		vntsd_log(status, err_msg);
+		vntsd_clean_group(groupp);
+
+		thr_exit(0);
+		break;
+	}
+}
+
+/* allocate and initialize listening socket. */
+static int
+open_socket(int port_no, int *sockfd)
+{
+
+	struct	    sockaddr_in addr;
+	int	    on;
+
+
+	/* allocate a socket */
+	*sockfd = socket(AF_INET, SOCK_STREAM, 0);
+	if (*sockfd < 0) {
+		if (errno == EINTR) {
+			return (VNTSD_STATUS_INTR);
+		}
+		return (VNTSD_ERR_LISTEN_SOCKET);
+	}
+
+#ifdef DEBUG
+	/* set reuse local socket address */
+	on = 1;
+	if (setsockopt(*sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on))) {
+		return (VNTSD_ERR_LISTEN_OPTS);
+	}
+#endif
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = (vntsd_ip_addr()).s_addr;
+	addr.sin_port = htons(port_no);
+
+	/* bind socket */
+	if (bind(*sockfd, (struct sockaddr *)&addr, sizeof (addr)) < 0) {
+		if (errno == EINTR) {
+			return (VNTSD_STATUS_INTR);
+		}
+		return (VNTSD_ERR_LISTEN_BIND);
+
+	}
+
+	if (listen(*sockfd, VNTSD_MAX_SOCKETS) == -1) {
+		if (errno == EINTR) {
+			return (VNTSD_STATUS_INTR);
+		}
+		return (VNTSD_ERR_LISTEN_BIND);
+	}
+
+	D1(stderr, "t@%d open_socket() sockfd=%d\n", thr_self(), *sockfd);
+	return (VNTSD_SUCCESS);
+}
+
+/* create console selection thread */
+static int
+create_console_thread(vntsd_group_t *groupp, int sockfd)
+{
+	vntsd_client_t	    *clientp;
+	vntsd_thr_arg_t	    arg;
+	int		    rv;
+
+
+	assert(groupp);
+	D1(stderr, "t@%d create_console_thread@%lld:client@%d\n", thr_self(),
+	    groupp->tcp_port, sockfd);
+
+	/* allocate a new client */
+	clientp = (vntsd_client_t *)malloc(sizeof (vntsd_client_t));
+	if (clientp  == NULL) {
+		return (VNTSD_ERR_NO_MEM);
+	}
+
+	/* initialize the client */
+	bzero(clientp, sizeof (vntsd_client_t));
+
+	clientp->sockfd = sockfd;
+	clientp->cons_tid = (thread_t)-1;
+
+	(void) mutex_init(&clientp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL);
+
+	/* append client to group */
+	(void) mutex_lock(&groupp->lock);
+
+	if ((rv = vntsd_que_append(&groupp->no_cons_clientpq, clientp))
+	    != VNTSD_SUCCESS) {
+		(void) mutex_unlock(&groupp->lock);
+		vntsd_free_client(clientp);
+		return (rv);
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+
+	(void) mutex_lock(&clientp->lock);
+
+	/* parameters for console thread */
+	bzero(&arg, sizeof (arg));
+
+	arg.handle = groupp;
+	arg.arg = clientp;
+
+	/* create console selection thread */
+	if (thr_create(NULL, 0, (thr_func_t)vntsd_console_thread,
+		    &arg, THR_DETACHED, &clientp->cons_tid)) {
+
+		(void) mutex_unlock(&clientp->lock);
+		(void) mutex_lock(&groupp->lock);
+		(void) vntsd_que_rm(&groupp->no_cons_clientpq, clientp);
+		(void) mutex_unlock(&groupp->lock);
+		vntsd_free_client(clientp);
+
+		return (VNTSD_ERR_CREATE_CONS_THR);
+	}
+
+	(void) mutex_unlock(&clientp->lock);
+
+	return (VNTSD_SUCCESS);
+}
+
+/* listen thread */
+void *
+vntsd_listen_thread(vntsd_group_t *groupp)
+{
+
+	int		newsockfd;
+	size_t		clilen;
+	struct		sockaddr_in cli_addr;
+	int		rv;
+	int		num_cons;
+
+	assert(groupp);
+
+	D1(stderr, "t@%d listen@%lld\n", thr_self(), groupp->tcp_port);
+
+
+	/* initialize listen socket */
+	(void) mutex_lock(&groupp->lock);
+	rv = open_socket(groupp->tcp_port, &groupp->sockfd);
+	(void) mutex_unlock(&groupp->lock);
+	listen_chk_status(groupp, rv);
+
+	for (; ; ) {
+
+		clilen = sizeof (cli_addr);
+
+		/* listen to the socket */
+		newsockfd = accept(groupp->sockfd, (struct sockaddr *)&cli_addr,
+			    &clilen);
+
+		D1(stderr, "t@%d listen_thread() connected sockfd=%d\n",
+		    thr_self(), newsockfd);
+
+		if (newsockfd <=  0) {
+
+			if (errno == EINTR) {
+				listen_chk_status(groupp, VNTSD_STATUS_INTR);
+			} else {
+				listen_chk_status(groupp,
+				    VNTSD_STATUS_ACCEPT_ERR);
+			}
+			continue;
+		}
+		num_cons = vntsd_chk_group_total_cons(groupp);
+		if (num_cons == 0) {
+			(void) close(newsockfd);
+			listen_chk_status(groupp, VNTSD_STATUS_NO_CONS);
+		}
+
+		/* a connection is established */
+		rv = vntsd_set_telnet_options(newsockfd);
+		if (rv != VNTSD_SUCCESS) {
+			(void) close(newsockfd);
+			listen_chk_status(groupp, rv);
+		}
+		rv = create_console_thread(groupp, newsockfd);
+		if (rv != VNTSD_SUCCESS) {
+			(void) close(newsockfd);
+			listen_chk_status(groupp, rv);
+		}
+	}
+
+	/*NOTREACHED*/
+	return (NULL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/queue.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,288 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * utility for vntsd queue handling
+ */
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <wait.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <signal.h>
+#include "vntsd.h"
+
+/* alloc_que_el() allocates a queue element */
+static vntsd_que_t *
+alloc_que_el(void *handle)
+{
+	vntsd_que_t *el;
+
+	/* allocate a queue element */
+	el = (vntsd_que_t *)malloc(sizeof (vntsd_que_t));
+	if (el == NULL) {
+		return (NULL);
+	}
+
+
+	el->nextp = NULL;
+	el->prevp = NULL;
+	el->handle = handle;
+
+	return (el);
+}
+
+/* vntsd_que_append() appends a element to a queue */
+int
+vntsd_que_append(vntsd_que_t **que_hd, void *handle)
+{
+	vntsd_que_t *p;
+	vntsd_que_t *el;
+
+	assert(que_hd);
+	assert(handle);
+
+	/* allocate a queue element */
+	el = alloc_que_el(handle);
+
+	if (el == NULL) {
+		return (VNTSD_ERR_NO_MEM);
+	}
+
+	p = *que_hd;
+
+	if (p == NULL) {
+		/* first one */
+		*que_hd  = el;
+	} else {
+		/* walk to the last one */
+		while (p->nextp != NULL)
+			p = p->nextp;
+		p->nextp = el;
+	}
+
+	el->prevp = p;
+
+	return (VNTSD_SUCCESS);
+}
+
+/* vntsd_que_insert_after() inserts an element after the handle */
+int
+vntsd_que_insert_after(vntsd_que_t *que, void *handle, void *next)
+{
+	vntsd_que_t *q, *el;
+
+	assert(que);
+
+	q = que;
+
+	while (q != NULL) {
+		if (q->handle == handle) {
+			break;
+		}
+
+		q = q->nextp;
+	}
+
+	if (q == NULL) {
+		/* not in queue */
+		return (VNTSD_ERR_EL_NOT_FOUND);
+	}
+
+	el = alloc_que_el(next);
+
+	if (el == NULL) {
+		return (VNTSD_ERR_NO_MEM);
+	}
+
+	el->nextp = q->nextp;
+	q->nextp = el;
+	el->prevp = q;
+
+	return (VNTSD_SUCCESS);
+}
+
+
+
+/* vntsd_que_rm() removes an element from a queue */
+int
+vntsd_que_rm(vntsd_que_t **que_hd, void *handle)
+{
+	vntsd_que_t	*p = *que_hd;
+	vntsd_que_t	*prevp = NULL;
+
+
+	while (p != NULL) {
+		/* match handle */
+		if (p->handle == handle) {
+			break;
+		}
+		prevp = p;
+		p = p->nextp;
+	}
+
+	if (p == NULL) {
+		/* not found */
+		return (VNTSD_ERR_EL_NOT_FOUND);
+	}
+
+	/* found */
+	if (p == *que_hd) {
+		/* first one */
+		*que_hd = p->nextp;
+	} else {
+		prevp->nextp = p->nextp;
+	}
+
+	if (p->nextp != NULL) {
+		p->nextp->prevp = prevp;
+	}
+
+	handle = p->handle;
+
+	free(p);
+
+	return (VNTSD_SUCCESS);
+
+}
+
+/* vntsd_que_walk() - walk queue and apply function to each element */
+void *
+vntsd_que_walk(vntsd_que_t *que_hd, el_func_t el_func)
+{
+	vntsd_que_t *p = que_hd;
+
+	while (p != NULL) {
+		if ((*el_func)(p->handle)) {
+		    return (p->handle);
+		}
+
+		p = p->nextp;
+	}
+	return (VNTSD_SUCCESS);
+}
+
+
+/* vntsd_que_find() finds first match */
+void *
+vntsd_que_find(vntsd_que_t *que_hd, compare_func_t compare_func, void *data)
+{
+	vntsd_que_t *p = que_hd;
+
+	assert(compare_func != NULL);
+	while (p != NULL) {
+		if ((*compare_func)(p->handle, data)) {
+			/* found match */
+			return (p->handle);
+		}
+
+		p = p->nextp;
+	}
+
+	/* not found */
+	return (NULL);
+}
+
+/* vntsd_free_que() frees entire queue */
+void
+vntsd_free_que(vntsd_que_t **q, clean_func_t clean_func)
+{
+	vntsd_que_t *p;
+
+	while (*q != NULL) {
+		p = *q;
+
+		*q  = p->nextp;
+
+		if (clean_func) {
+			/* clean func will free the handle */
+			(*clean_func)(p->handle);
+		} else {
+			free(p->handle);
+		}
+
+		free(p);
+	}
+}
+
+/*
+ * vntsd_que_pos() matches a handle and returns a handle located at "pos"
+ * relative to the matched handle. pos supported are 1 or -1.
+ */
+void *
+vntsd_que_pos(vntsd_que_t *que_hd, void *handle, int pos)
+{
+	vntsd_que_t *p = que_hd;
+
+	assert((pos == 1) || (pos == -1));
+
+
+	while (p != NULL) {
+		if (p->handle == handle) {
+			/* find match */
+			if (pos == 1) {
+				/* forward 1 */
+				if (p->nextp != NULL) {
+					return (p->nextp->handle);
+				}
+
+				/* last one go to first */
+				return (que_hd->handle);
+
+			} else {
+				/* backward 1 */
+				if (p->prevp != NULL) {
+					return (p->prevp->handle);
+				}
+
+				/* first one, return last one */
+				while (p->nextp != NULL) {
+					p = p->nextp;
+				}
+
+				assert(p != NULL);
+				assert(p->handle != NULL);
+				return (p->handle);
+
+			}
+		}
+		p = p->nextp;
+	}
+
+	DERR(stderr, "t@%d vntsd_que_pos can not find handle \n",
+	    thr_self());
+
+	return (NULL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/read.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,265 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * read thread - Read from the TCP client and write to the vcc driver. There
+ * is one writer and there may be multiple readers per console. The first
+ * client who connects to a console gets write access. An error message is
+ * returned to readers if they attempt to input commands. The read thread
+ * accepts special daemon commands from all clients.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <synch.h>
+#include <signal.h>
+#include <assert.h>
+#include <ctype.h>
+#include <syslog.h>
+#include <libintl.h>
+#include "vntsd.h"
+#include "chars.h"
+
+/*
+ * write_vcc() - write one character to the vcc virtual console.
+ * NUL characters are silently dropped.  A write interrupted by a signal
+ * (EINTR) is translated via vntsd_cons_chk_intr(); any other write
+ * failure returns VNTSD_STATUS_VCC_IO_ERR.
+ * NOTE(review): errno is referenced but <errno.h> is not included in this
+ * file's visible include list; presumably vntsd.h pulls it in -- confirm.
+ */
+static int
+write_vcc(vntsd_client_t *clientp, char c)
+{
+	int	n;
+
+
+	assert(clientp);
+	assert(clientp->cons);
+
+	if (c == 0) {
+		/* do not forward NUL to the console */
+		return (VNTSD_SUCCESS);
+	}
+	n = write(clientp->cons->vcc_fd, &c, 1);
+
+	if (n < 0) {
+		/* write error */
+		if (errno == EINTR) {
+			return (vntsd_cons_chk_intr(clientp));
+		}
+
+		return (VNTSD_STATUS_VCC_IO_ERR);
+	}
+
+	/* a 1-byte write never legitimately returns 0 */
+	assert(n != 0);
+	return (VNTSD_SUCCESS);
+
+}
+
+/*
+ * acquire_writer() the client is going to be writer.
+ * insert the client to the head of the console client queue (the head
+ * element holds write access) and demote the current writer to second
+ * place, then warn the previous writer that its connection is now
+ * read-only.
+ *
+ * Lock order here: consp->lock, then writerp->lock, then clientp->lock;
+ * unlocked in that same order before the warning message is written.
+ */
+static int
+acquire_writer(vntsd_client_t *clientp)
+{
+	vntsd_cons_t	    *consp;
+	vntsd_client_t	    *writerp;
+	int		    rv;
+
+	D1(stderr, "t@%d:acuire_writer :client@%d\n", thr_self(),
+	    clientp->sockfd);
+
+	assert(clientp != NULL);
+	consp = clientp->cons;
+
+	assert(consp);
+
+	(void) mutex_lock(&consp->lock);
+
+	assert(consp->clientpq != NULL);
+	if (consp->clientpq->handle == clientp) {
+		/* clientp is a writer already */
+		(void) mutex_unlock(&consp->lock);
+		return (VNTSD_SUCCESS);
+	}
+
+	/* current writer */
+	writerp = (vntsd_client_t *)(consp->clientpq->handle);
+
+	(void) mutex_lock(&writerp->lock);
+
+	/* unlink clientp from its current position in the queue */
+	rv = vntsd_que_rm(&(consp->clientpq), clientp);
+	assert(rv == VNTSD_SUCCESS);
+
+	(void) mutex_lock(&clientp->lock);
+
+	/* move client to be first in the console queue */
+	consp->clientpq->handle = clientp;
+
+	/* move previous writer to be the second in the queue */
+	rv =  vntsd_que_insert_after(consp->clientpq, clientp, writerp);
+
+	(void) mutex_unlock(&consp->lock);
+	(void) mutex_unlock(&writerp->lock);
+	(void) mutex_unlock(&clientp->lock);
+
+	if (rv != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	/* write warning message to the writer */
+
+	if ((rv = vntsd_write_line(writerp,
+	    gettext("Warning: Console connection forced into read-only mode")))
+	    != VNTSD_SUCCESS) {
+		return (rv);
+	}
+
+	return (VNTSD_SUCCESS);
+}
+
+/*
+ * vntsd_cons_chk_intr() - map a client's status bits to the status code an
+ * interrupted I/O operation should return.  Checked in priority order:
+ * timeout, console deleted, socket I/O error; otherwise continue.
+ */
+int
+vntsd_cons_chk_intr(vntsd_client_t *clientp)
+{
+	int	status;
+
+	if (clientp->status & VNTSD_CLIENT_TIMEOUT) {
+		status = VNTSD_STATUS_CLIENT_QUIT;
+	} else if (clientp->status & VNTSD_CLIENT_CONS_DELETED) {
+		status = VNTSD_STATUS_RESELECT_CONS;
+	} else if (clientp->status & VNTSD_CLIENT_IO_ERR) {
+		status = VNTSD_STATUS_CLIENT_QUIT;
+	} else {
+		status = VNTSD_STATUS_CONTINUE;
+	}
+
+	return (status);
+}
+
+/*
+ * read_char() - read one input character from the client, transparently
+ * handling daemon-command statuses: CONTINUE means the input was consumed
+ * (read again), ACQUIRE_WRITER means the client asked for write access.
+ * Any other status is returned to the caller.
+ */
+static int
+read_char(vntsd_client_t *clientp, char *c)
+{
+	int	    rv;
+
+	for (; ; ) {
+		rv = vntsd_read_data(clientp, c);
+
+		if (rv == VNTSD_STATUS_CONTINUE) {
+			/* input consumed by the daemon; read again */
+			continue;
+		}
+
+		if (rv != VNTSD_STATUS_ACQUIRE_WRITER) {
+			/* a real character, or an error status */
+			return (rv);
+		}
+
+		/* client requested write access */
+		rv = acquire_writer(clientp);
+		if (rv != VNTSD_SUCCESS) {
+			return (rv);
+		}
+	}
+}
+
+/*
+ * vntsd_read() - per-client read worker.  Loop forever: read one character
+ * from the client, reject input (other than CR/LF) from read-only clients,
+ * let vntsd_ctrl_cmd() intercept daemon escape commands, and forward
+ * everything else to the vcc console.  Returns a VNTSD_STATUS_* code when
+ * the client quits, times out, reselects a console, or hits an error.
+ *
+ * Fixes: the unreachable trailing return used NULL (a pointer constant)
+ * in an int-returning function -- now 0; dead "break" after "continue"
+ * removed.
+ */
+int
+vntsd_read(vntsd_client_t *clientp)
+{
+	char		c;
+	int		rv;
+
+
+	assert(clientp);
+	D3(stderr, "t@%d vntsd_read@%d\n", thr_self(), clientp->sockfd);
+
+	for (; ; ) {
+
+		/* client input */
+		rv = read_char(clientp, &c);
+
+		if (rv == VNTSD_STATUS_INTR) {
+			/* interrupted: map status bits to a return code */
+			rv = vntsd_cons_chk_intr(clientp);
+		}
+
+		if (rv != VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		assert(clientp->cons);
+		if (clientp->cons->clientpq->handle != clientp) {
+			/* reader - print  error message */
+			if ((c != CR) && (c != LF)) {
+				rv = vntsd_write_line(clientp,
+				    gettext(VNTSD_NO_WRITE_ACCESS_MSG));
+
+				/* check errors and may exit */
+				if (rv == VNTSD_STATUS_INTR) {
+					rv = vntsd_cons_chk_intr(clientp);
+				}
+
+				if (rv != VNTSD_SUCCESS) {
+					return (rv);
+				}
+
+			}
+
+			continue;
+		}
+
+		/* writer: let the daemon command processor see it first */
+		rv = vntsd_ctrl_cmd(clientp, c);
+
+		switch (rv) {
+		case VNTSD_STATUS_CONTINUE:
+			/* character consumed by the command processor */
+			continue;
+		case VNTSD_STATUS_INTR:
+			rv = vntsd_cons_chk_intr(clientp);
+			if (rv != VNTSD_SUCCESS) {
+				return (rv);
+			}
+			break;
+		case VNTSD_SUCCESS:
+			break;
+		default:
+			return (rv);
+		}
+
+		/* write to vcc */
+		rv = write_vcc(clientp, c);
+		if (rv == VNTSD_STATUS_INTR) {
+			rv = vntsd_cons_chk_intr(clientp);
+		}
+		if (rv != VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+	}
+
+	/*NOTREACHED*/
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/svc-vntsd	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,64 @@
+#!/sbin/sh
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# Start script for vntsd
+#
+# For modifying parameters passed to vntsd, do not edit
+# this script. Instead use svccfg(1m) to modify the SMF
+# repository. For example:
+#
+# svccfg
+# svc:> select ldoms/vntsd
+# svc:/ldoms/vntsd> setprop vntsd/vcc_device = "virtual-console-concentrator@1"
+# svc:/ldoms/vntsd> setprop vntsd/listen_addr = "192.168.1.1"
+# svc:/ldoms/vntsd> exit
+
+. /lib/svc/share/smf_include.sh
+
+# Build the vntsd argument list from SMF properties.  Every property is
+# optional; svcprop failures are silenced and defaults apply.
+
+# VCC device instance (-i); defaults to instance 0 when unset.
+vcc_device=`svcprop -p vntsd/vcc_device $SMF_FMRI 2>/dev/null`
+if [ -z "$vcc_device" ]; then
+	vcc_device="virtual-console-concentrator@0"
+fi
+args="-i $vcc_device"
+
+# Listen address (-p); omitted when unset so vntsd uses its default.
+listen_addr=`svcprop -p vntsd/listen_addr $SMF_FMRI 2>/dev/null`
+if [ -n "$listen_addr" ]; then
+	args="$args -p $listen_addr"
+fi
+
+# Inactivity timeout in minutes (-t); 0 means indefinite.
+timeout=`svcprop -p vntsd/timeout_minutes $SMF_FMRI 2>/dev/null`
+if [ -n "$timeout" ]; then
+	args="$args -t $timeout"
+fi
+
+# Launch the daemon; a missing binary or startup failure is a
+# configuration error as far as SMF is concerned.
+if [ -x /usr/lib/ldoms/vntsd ]; then
+    /usr/lib/ldoms/vntsd $args || exit $SMF_EXIT_ERR_CONFIG
+else
+    echo "WARNING: /usr/lib/ldoms/vntsd is missing or not executable" >& 2
+    exit $SMF_EXIT_ERR_CONFIG
+fi
+
+exit $SMF_EXIT_OK
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/vcc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#ifndef _VCC_H
+#define	_VCC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* maximum length of console, group and domain name strings */
+#define	VCC_MAX_NAME		25
+
+/* inquiry/ioctl codes exchanged with the vcc driver */
+#define	VCC_NUM_CONSOLE		0x1	    /* total number of groups */
+#define	VCC_PORT_TBL		0x2	    /* download all port in a group */
+
+#define	VCC_INQUIRY		0x4	    /* inquiry events */
+#define	VCC_PORT_CONFIG		0x8	    /* download one port */
+#define	VCC_CLEAN_POLL		0x10	    /* vntsd exits */
+#define	VCC_DEL_PORT_OK		0x20	    /* vntsd delete port ok */
+#define	VCC_PORT_HELLO		0x1
+
+/* port-change messages from vcc to vntsd */
+typedef enum {
+	VNTSD_MSG_ADD_PORT,
+	VNTSD_MSG_DEL_PORT
+} vntsd_msg_t;
+
+
+#define	VCC_PORT_ON		0x40
+
+
+/* per-console record; status presumably carries VCC_PORT_* bits -- confirm */
+typedef struct vntsd_console {
+	int cons_no;
+	uint64_t status;
+	char domain_name[VCC_MAX_NAME];
+} vntsd_console_t;
+
+/* console configuration that is downloaded to vntsd */
+typedef struct vntsd_vcc_console {
+	vntsd_console_t	console;
+	char 		group_name[VCC_MAX_NAME];
+	uint64_t	tcp_port;
+} vntsd_vcc_console_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VCC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/vntsd.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,582 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * VNTSD main
+ *
+ * VNTSD takes the following options:
+ * -i	<device instance>
+ *	VCC device instance to use, e.g. virtual-console-concentrator@0.
+ *	Required option.
+ * -p	<ip address>
+ *	IP address VNTSD listens to.
+ * -d
+ *	Do not daemonize. This is only available in a DEBUG build.
+ * -t	timeout for inactivity 0 = indefinite
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <libintl.h>
+#include <locale.h>
+#include <syslog.h>
+#include "vntsd.h"
+#include "chars.h"
+
+#if !defined(TEXT_DOMAIN)	/* Should be defined by cc -D */
+#define	TEXT_DOMAIN "SYS_TEST"	/* Use this only if it weren't. */
+#endif
+
+/* global variables */
+
+#ifdef DEBUG
+int vntsddbg = 0x8;
+#endif
+
+#define	MINUTE		60
+
+static vntsd_t *vntsdp;
+
+
+static void vntsd_exit(void);
+/*
+ * Signal handler for SIGINT, SIGTERM and SIGHUP (as registered in main();
+ * SIGKILL cannot be caught).  Logs the signal, then exits -- which runs
+ * vntsd_exit() via the atexit(3C) registration.
+ */
+static void
+exit_sig_handler(int sig)
+{
+
+	char err_msg[VNTSD_LINE_LEN];
+
+	D1(stderr, "t@%d exit_sig_handler%d \n", thr_self(), sig);
+
+	(void) snprintf(err_msg, sizeof (err_msg), "exit_sig_handler() sig=%d",
+	    sig);
+
+	vntsd_log(VNTSD_STATUS_EXIT_SIG, err_msg);
+
+	exit(0);
+}
+
+/*
+ * Before a thread reads in client's input, it attaches to vntsd timer so that
+ * it can be woken up if a client does not access the connection for
+ * VNTSD_INPUT_TIMEOUT(10) minutes.
+ */
+
+/* attach a thread to timer: append its entry to the shared timeout queue */
+int
+vntsd_attach_timer(vntsd_timeout_t *tmop)
+{
+	int	rv;
+
+	if (vntsdp->timeout == 0) {
+		/* no inactivity timeout configured: nothing to track */
+		return (VNTSD_SUCCESS);
+	}
+
+	(void) mutex_lock(&vntsdp->tmo_lock);
+	rv = vntsd_que_append(&vntsdp->tmoq, (void *)tmop);
+	(void) mutex_unlock(&vntsdp->tmo_lock);
+	return (rv);
+}
+
+/* detach a thread from timer: remove its entry from the shared timeout queue */
+int
+vntsd_detach_timer(vntsd_timeout_t *tmop)
+{
+	int	rv;
+
+	if (vntsdp->timeout == 0) {
+		/* no inactivity timeout configured: nothing was attached */
+		return (VNTSD_SUCCESS);
+	}
+
+	(void) mutex_lock(&vntsdp->tmo_lock);
+	rv = vntsd_que_rm(&vntsdp->tmoq, (void *)tmop);
+	(void) mutex_unlock(&vntsdp->tmo_lock);
+
+	return (rv);
+}
+
+/*
+ * check thread's timeout (el_func_t callback run once a minute per entry).
+ * When the idle-minute count reaches the configured timeout, mark the
+ * client timed out and interrupt its thread with SIGALRM.
+ */
+static boolean_t
+chk_timeout(vntsd_timeout_t *tmop)
+{
+	/* one more minute without client activity */
+	tmop->minutes++;
+
+	if (tmop->minutes == vntsdp->timeout) {
+		/* wake up the thread */
+		tmop->clientp->status |= VNTSD_CLIENT_TIMEOUT;
+		(void) thr_kill(tmop->tid, SIGALRM);
+	}
+
+	/* return false to walk the queue */
+	return (B_FALSE);
+}
+
+/*
+ * reset timer.  Invoked through vntsd_que_find() with the thread id
+ * smuggled in as the "data" pointer.
+ * NOTE(review): this relies on thread_t round-tripping through a void *
+ * cast -- confirm on all supported data models.
+ */
+static boolean_t
+reset_timeout(vntsd_timeout_t *tmop, thread_t tid)
+{
+	if (tmop->tid == tid) {
+		/* clear the idle-minute count for this thread's entry */
+		tmop->minutes = 0;
+	}
+	/* return false to walk the queue */
+	return (B_FALSE);
+}
+
+/*
+ * vntsd_reset_timer() - zero the idle-minute count of the timeout entry
+ * belonging to thread tid.  No-op when the inactivity timeout is disabled.
+ */
+void
+vntsd_reset_timer(thread_t tid)
+{
+	if (vntsdp->timeout != 0) {
+		(void) mutex_lock(&vntsdp->tmo_lock);
+		/*
+		 * reset_timeout() always returns B_FALSE, so the whole
+		 * queue is walked; the find result is irrelevant here.
+		 */
+		(void) vntsd_que_find(vntsdp->tmoq,
+		    (compare_func_t)reset_timeout, (void*)tid);
+		(void) mutex_unlock(&vntsdp->tmo_lock);
+	}
+}
+
+/*
+ * When alarm goes off, wake up timeout threads. Alarm is set off every
+ * minute (see MINUTE) while an inactivity timeout is configured; only the
+ * main thread services the tick -- other threads receive SIGALRM merely to
+ * interrupt their blocking I/O.
+ * NOTE(review): taking tmo_lock via mutex_lock() inside a signal handler
+ * is not async-signal-safe in general; safe only if SIGALRM can never
+ * interrupt a holder of that lock in this thread -- confirm.
+ */
+static void
+vntsd_alarm_sig_handler(int sig)
+{
+	static thread_t main_thread = 0;
+
+	D1(stderr, "t@%d alarm signal %d\n", thr_self(), sig);
+	if (vntsdp->timeout == 0) {
+		DERR(stderr, "t@%d alarm signal should not recv %d\n",
+		    thr_self(), sig);
+		return;
+	}
+
+
+	if (main_thread == 0) {
+		/* initialize thread id  */
+		main_thread = thr_self();
+	} else if (main_thread != thr_self()) {
+		/* get signal because thread is timeout */
+		return;
+	}
+
+	/* in main thread */
+	(void) mutex_lock(&vntsdp->tmo_lock);
+
+	/* wake up timeout threads */
+	(void) vntsd_que_walk(vntsdp->tmoq, (el_func_t)chk_timeout);
+	(void) mutex_unlock(&vntsdp->tmo_lock);
+
+	/* reset alarm */
+	(void) alarm(MINUTE);
+}
+
+/*
+ * Handler for SIGUSR1 (used internally to wake up the main thread).
+ * SIGUSR1 itself is expected and silently ignored; any other signal
+ * delivered here is logged.
+ */
+static void
+vntsd_sig_handler(int sig)
+{
+	char err_msg[VNTSD_LINE_LEN];
+
+	(void) snprintf(err_msg, sizeof (err_msg), "sig_handler() sig=%d",
+	    sig);
+
+	if (sig != SIGUSR1) {
+		/* unexpected signal - record it */
+		vntsd_log(VNTSD_STATUS_SIG, err_msg);
+	}
+}
+
+/*
+ * vntsd exits.  atexit(3C) handler: cancel the inactivity timer, tear
+ * down all console groups, close the vcc control port and release the
+ * global vntsdp state.
+ */
+static void
+vntsd_exit(void)
+{
+	D1(stderr, "t@%d vntsd_exit\n", thr_self());
+
+	(void) mutex_lock(&vntsdp->lock);
+
+	if (vntsdp->timeout > 0) {
+		/* cancel the timer */
+		(void) alarm(0);
+	}
+	/* delete all  groups */
+	vntsd_free_que(&vntsdp->grouppq, (clean_func_t)vntsd_clean_group);
+
+	/* close control port */
+	(void) close(vntsdp->ctrl_fd);
+
+	/* every timeout entry must have been detached by now */
+	assert(vntsdp->tmoq == NULL);
+	(void) mutex_unlock(&vntsdp->lock);
+
+	/* clean up vntsdp */
+	(void) mutex_destroy(&vntsdp->tmo_lock);
+	(void) mutex_destroy(&vntsdp->lock);
+	free(vntsdp);
+	closelog();
+}
+
+/*
+ * vntsd_help()
+ * print out valid command line options (localized via gettext) to stderr
+ */
+static void
+vntsd_help(void)
+{
+
+	(void) fprintf(stderr, gettext("Usage: vntsd -i <VCC device instance> "
+		    "[-p <listen address>] [-t <timeout in minutes>]\n"));
+}
+
+
+#ifdef DEBUG
+#define	DEBUG_OPTIONS	"d"
+#else
+#define	DEBUG_OPTIONS	""
+#endif
+
+/*
+ * main() - parse command line options, open the VCC control port,
+ * daemonize (unless a DEBUG build is run with -d), install signal
+ * handlers, read the initial console configuration, and then poll the
+ * control port forever for configuration changes.
+ */
+int
+main(int argc, char ** argv)
+{
+	char	    *path;
+	struct	    pollfd poll_drv[1];
+	struct	    sigaction act;
+	char	    *listen_addr = NULL;
+	pid_t	    pid;
+	int	    i;
+	int	    option;
+	int	    sz;
+	int	    fd;
+	int	    n;
+
+	/* internationalization */
+	(void) setlocale(LC_MESSAGES, "");
+	(void) textdomain(TEXT_DOMAIN);
+	vntsd_init_esctable_msgs();
+
+	/* initialization */
+	bzero(&act, sizeof (act));
+
+	/*
+	 * NOTE(review): arguments are (size, nmemb) -- harmless since the
+	 * product is the same, but conventionally calloc(nmemb, size).
+	 */
+	vntsdp = calloc(sizeof (vntsd_t), 1);
+	if (vntsdp == NULL) {
+		vntsd_log(VNTSD_ERR_NO_MEM, "main:vntsdp");
+		exit(1);
+	}
+
+	vntsdp->ctrl_fd = -1;
+	vntsdp->devinst = NULL;
+
+	(void) mutex_init(&vntsdp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL);
+	(void) mutex_init(&vntsdp->tmo_lock, USYNC_THREAD|LOCK_ERRORCHECK,
+	    NULL);
+
+	/* get CLI options */
+	while ((option = getopt(argc, argv, "i:t:p:"DEBUG_OPTIONS)) != EOF) {
+		switch (option) {
+#ifdef DEBUG
+		case 'd':
+			/* -d: stay in the foreground (DEBUG builds only) */
+			vntsdp->options |= VNTSD_OPT_DAEMON_OFF;
+			break;
+#endif
+		case 'i':
+			vntsdp->devinst = optarg;
+			break;
+		case 'p':
+			listen_addr = optarg;
+			break;
+
+		case 't':
+			n = sscanf(optarg, "%d", &(vntsdp->timeout));
+			if (n  != 1) {
+				/* flag unparsable -t value for check below */
+				vntsdp->timeout = -1;
+			}
+			break;
+
+		default:
+			vntsd_help();
+			exit(1);
+		}
+	}
+
+	/* -i is mandatory; timeout == -1 marks a bad -t argument */
+	if ((vntsdp->devinst == NULL) || (vntsdp->timeout == -1)) {
+		vntsd_help();
+		exit(1);
+	}
+
+	if (listen_addr == NULL || strcmp(listen_addr, "localhost") == 0) {
+		/* by default listen on loopback interface */
+		vntsdp->ip_addr.s_addr = htonl(INADDR_LOOPBACK);
+	} else if (strcmp(listen_addr, "any") == 0) {
+		vntsdp->ip_addr.s_addr = htonl(INADDR_ANY);
+	} else {
+		vntsdp->ip_addr.s_addr = inet_addr(listen_addr);
+		if (vntsdp->ip_addr.s_addr == (in_addr_t)(-1)) {
+			(void) fprintf(stderr,
+			    gettext("Invalid listen address '%s'\n"),
+			    listen_addr);
+			exit(1);
+		}
+	}
+
+	D3(stderr, "options = %llx, instance = %s, listen = %s\n",
+	    vntsdp->options, vntsdp->devinst,
+	    listen_addr ? listen_addr : "<null>");
+
+	/* open VCC driver control port */
+	sz = strlen(VCC_DEVICE_CTL_PATH) + strlen(vntsdp->devinst) + 1;
+	path = calloc(sz, 1);
+	if (path == NULL) {
+		vntsd_log(VNTSD_ERR_NO_MEM, "main(): alloc dev path");
+		exit(1);
+	}
+	/*
+	 * NOTE(review): the trailing sizeof (vntsdp->devinst) argument is
+	 * surplus for the usual single-%s format, and sz-1 needlessly
+	 * gives up one byte of the buffer (snprintf NUL-terminates within
+	 * its size) -- confirm against VCC_DEVICE_CTL_PATH's format.
+	 */
+	(void) snprintf(path, sz-1, VCC_DEVICE_CTL_PATH, vntsdp->devinst,
+	    sizeof (vntsdp->devinst));
+	vntsdp->ctrl_fd = open(path, O_RDWR);
+	free(path);
+
+	if (vntsdp->ctrl_fd == -1) {
+		/*
+		 * do not print error if device is not present
+		 * the daemon is probably being started incorrectly
+		 */
+		if (errno != ENOENT) {
+			syslog(LOG_ERR,
+			    "Error opening VCC device control port: %s",
+			    strerror(errno));
+		}
+		exit(1);
+	}
+	if ((vntsdp->options & VNTSD_OPT_DAEMON_OFF) == 0) {
+		/* daemonize it */
+		pid = fork();
+		if (pid < 0) {
+			perror("fork");
+			exit(1);
+		}
+		if (pid > 0) {
+			/* parent */
+			exit(0);
+		}
+
+		/*
+		 * child process (daemon)
+		 *
+		 * Close all file descriptors other than 2 and the ctrl fd.
+		 */
+		(void) close(0);
+		(void) close(1);
+		for (i = 3; i < vntsdp->ctrl_fd; i++) {
+			(void) close(i);
+		}
+		closefrom(vntsdp->ctrl_fd + 1);
+
+		/* obtain a new process group */
+		(void) setsid();
+		fd =  open("/dev/null", O_RDWR);
+		if (fd < 0) {
+			syslog(LOG_ERR, "Can not open /dev/null");
+			exit(1);
+		}
+		/* handle standard I/O: point stdin and stdout at /dev/null */
+		if (dup2(fd, 0) < 0) {
+			syslog(LOG_ERR, "Failed dup2()");
+			exit(1);
+		}
+
+		if (dup2(fd, 1) < 0) {
+			syslog(LOG_ERR, "Failed dup2()");
+			exit(1);
+		}
+
+		/* ignore terminal signals */
+		(void) signal(SIGTSTP, SIG_IGN);
+		(void) signal(SIGTTOU, SIG_IGN);
+		(void) signal(SIGTTIN, SIG_IGN);
+	}
+
+
+	/* set up signal handlers */
+
+	/* exit signals */
+	act.sa_handler = exit_sig_handler;
+
+	(void) sigemptyset(&act.sa_mask);
+	(void) sigaction(SIGINT, &act, NULL);
+	(void) sigaction(SIGTERM, &act, NULL);
+	(void) sigaction(SIGHUP, &act, NULL);
+
+	/* vntsd internal signals */
+	act.sa_handler = vntsd_sig_handler;
+	(void) sigemptyset(&act.sa_mask);
+	(void) sigaction(SIGUSR1, &act, NULL);
+
+
+	act.sa_handler = vntsd_alarm_sig_handler;
+	(void) sigemptyset(&act.sa_mask);
+	(void) sigaction(SIGALRM, &act, NULL);
+
+
+	/* setup exit */
+	(void) atexit(vntsd_exit);
+
+
+
+	/* initialization */
+	openlog("vntsd", LOG_CONS, LOG_DAEMON);
+
+
+	/* set alarm: once a minute, see vntsd_alarm_sig_handler() */
+	if (vntsdp->timeout > 0) {
+		(void) alarm(MINUTE);
+	}
+
+	vntsdp->tid = thr_self();
+
+	/* get existing consoles from vcc */
+	vntsd_get_config(vntsdp);
+
+	for (; ; ) {
+		/* poll vcc for configuration change */
+		bzero(poll_drv, sizeof (poll_drv));
+
+		poll_drv[0].fd = vntsdp->ctrl_fd;
+		poll_drv[0].events = POLLIN;
+
+		if (poll(poll_drv, 1, -1) == -1) {
+			if (errno == EINTR) {
+				/* woken up because a console was deleted */
+				vntsd_delete_cons(vntsdp);
+				continue;
+			}
+			vntsd_log(VNTSD_ERR_VCC_POLL,
+			    "vcc control poll err! aborting..");
+			exit(1);
+		}
+
+		D1(stderr, "t@%d driver event %x\n", thr_self(),
+		    poll_drv[0].revents);
+
+		vntsd_daemon_wakeup(vntsdp);
+
+	}
+
+	/*NOTREACHED*/
+	return (0);
+}
+
+/* export ip_addr: return the listen address selected by the -p option */
+struct in_addr
+vntsd_ip_addr(void)
+{
+	return (vntsdp->ip_addr);
+}
+
+/*
+ * ioctl to vcc control port
+ * Supported ioctls interface are:
+ *		ioctl code	    parameters	   return data
+ *		VCC_NUM_CONSOLE	    none	   uint_t  no consoles
+ *		VCC_CONS_TBL	    none	   array of vcc_cons_t
+ *		VCC_INQUIRY	    none	   vcc_response_t response
+ *		VCC_CONS_INFO	    uint_t portno   vcc_cons_t
+ *		VCC_CONS_STATUS	    uint_t portno
+ *		VCC_FORCE_CLOSE	    uint_t portno
+ *
+ * On ioctl failure the main thread is sent SIGINT (daemon exits) and
+ * VNTSD_STATUS_VCC_IO_ERR is returned.
+ */
+int
+vntsd_vcc_ioctl(int ioctl_code, uint_t portno, void *buf)
+{
+	D1(stderr, "t@%d vcc_ioctl@%d code=%x\n", thr_self(), portno,
+	    ioctl_code);
+
+	if ((ioctl_code == (VCC_CONS_INFO)) ||
+	    (ioctl_code == (VCC_FORCE_CLOSE))) {
+		/* construct vcc in buf: these two codes pass portno in *buf */
+		*((uint_t *)buf) = portno;
+	}
+
+	if (ioctl(vntsdp->ctrl_fd, ioctl_code, (caddr_t)buf)) {
+		/*  control port get error */
+		syslog(LOG_ERR, "vcc control port error! abort vntsd");
+		(void) thr_kill(vntsdp->tid, SIGINT);
+		return (VNTSD_STATUS_VCC_IO_ERR);
+	}
+
+	return (VNTSD_SUCCESS);
+}
+
+/*
+ * check if a vcc i/o error is caused by removal of a console. If so notify
+ * all clients connected to the console and wake up main thread to cleanup
+ * the console.  Returns VNTSD_STATUS_CONTINUE when the console is still
+ * alive, VNTSD_STATUS_VCC_IO_ERR otherwise.
+ */
+int
+vntsd_vcc_err(vntsd_cons_t *consp)
+{
+	vntsd_group_t *groupp;
+
+	assert(consp);
+	groupp = consp->group;
+	assert(groupp);
+
+	if (consp->status & VNTSD_CONS_DELETED) {
+		/* console was deleted  */
+		return (VNTSD_STATUS_VCC_IO_ERR);
+	}
+
+	if (vntsd_vcc_cons_alive(consp)) {
+		/* console is ok */
+		return (VNTSD_STATUS_CONTINUE);
+	}
+
+	/* console needs to be deleted */
+	(void) mutex_lock(&consp->lock);
+	consp->status |= VNTSD_CONS_DELETED;
+
+	/* signal all clients to disconnect from console */
+	(void) vntsd_que_walk(consp->clientpq,
+	    (el_func_t)vntsd_notify_client_cons_del);
+	(void) mutex_unlock(&consp->lock);
+
+	/* mark the group */
+	(void) mutex_lock(&groupp->lock);
+	groupp->status |= VNTSD_GROUP_CLEAN_CONS;
+	(void) mutex_unlock(&groupp->lock);
+
+	/* signal main thread to delete the console */
+	(void) thr_kill(vntsdp->tid, SIGUSR1);
+
+	return (VNTSD_STATUS_VCC_IO_ERR);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/vntsd.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * vntsd uses configuration information provided by vcc to export access
+ * to Ldom console access over regular TCP sockets. When it starts, it opens
+ * the vcc driver control port and obtains the list of ports that have been
+ * created by the vcc driver as well as TCP port number and group associated
+ * with each port.
+ * vntsd consists of multiple components, as follows:
+ *
+ * vntsd.c
+ * This module initializes the vntsd daemon, processes user options such as
+ * instance number, ip address, etc., and provides the main thread to poll
+ * for any console port change.
+ *
+ * vntsdvcc.c
+ * This module provides vcc driver interface. It opens vcc driver control
+ * ports, read initial configuration, and provides interface to read, write and
+ * ioctl virtual console ports. This module creates a listen thread for each
+ * console group. It further dynamically adds and removes virtual consoles
+ * and groups following instructions of the vcc driver. This module
+ * is executed in the same thread as vntsd.c which is blocked on vcc control
+ * poll interface.
+ *
+ * listen.c
+ * This is a group listen thread. Each group's tcp-port has a listen thread
+ * associated with it. The thread is created when a console is associated with
+ * a new group and is removed when all consoles in the group are removed.
+ *
+ * console.c
+ * This is a console selection thread. The thread is created when a client
+ * connects to a group TCP port and exited when client disconnects. If there is
+ * only one console in the group, the client is connected to that console. If
+ * there are multiple consoles in the group, the client is asked to select a
+ * console. After determining which console to connect to, this thread creates
+ * a write thread if the client is a writer, and itself reads client input.
+ *
+ * read.c
+ * It reads input from a TCP client, processes
+ * special daemon and telnet commands, and writes to the vcc driver if the
+ * client is a writer. The client is a writer if it is the first one to
+ * connect to the console. The read thread prints an error message if a reader
+ * attempts to input to vcc. The read thread exits if the console is deleted,
+ * the client disconnects, or there is a fatal error.
+ *
+ * Write.c
+ * Write thread is created when first client connects to a console. It reads
+ * from vcc and writes to all clients that connect to the same console.
+ * Write thread exits when all clients disconnect from the console.
+ *
+ * cmd.c
+ * This is a supporting module for handling special daemon and telnet commands.
+ *
+ * common.c
+ * supporting modules shared by threads modules.
+ *
+ * queue.c
+ * This is a module supporting queue operations. Vntsd organizes its data
+ * in multiple queues <see data structure below>.
+ *
+ * vntsd.xml
+ * This is a manifest to support SMF interfaces.
+ *
+ * Data structures
+ * each group has a vntsd_group_t structure, which contains a queue of
+ * all console in that group.
+ * each console has a vntsd_cons_t structure, which contains a queue of
+ * all clients that connected to the console.
+ *
+ *     +----------+   +----------+   +----------+
+ *     |  group	  |-->|  group   |-->|   group  |-->....
+ *     +----------+   +----------+   +----------+
+ *          |
+ *          |<-----------------------------------------+
+ *          |<------------------------+                |
+ *          |<--------+               |                |
+ *          |         |               |                |
+ *          |      +----------+     +----------+     +----------+
+ *          +----->| console  |---->| console  |---->| lconsole |---> ....
+ *                 +----------+     +----------+     +----------+
+ *                     |  |
+ *		       |  |     +----------+      +----------+
+ *		       |  +---->|  client  |----->|   client |----->......
+ *		       |	+----------+      +----------+
+ *		       |	     |                 |
+ *		       |<------------+                 |
+ *		       |<------------------------------+
+ *
+ * Locks
+ *  Each vntsd has one lock to protect the group queue
+ *  Each group has one lock to protect the console queue,  the queue for
+ *  clients without a console connection and status.
+ *  Each console has one lock to protect client queue and status.
+ *  Each client has one lock to protect the state of the client. The client
+ *  states are:
+ *
+ *  VCC_CLIENT_READER
+ *	A client is connected to a console as either a writer or a reader.
+ *	if this client is the first one to connect to the console, the client is
+ *	a writer, otherwise the client is a reader. A writer's write thread
+ *	reads from vcc and send output to all readers connected to the
+ *	same console. a reader's write thread is blocked until a reader becomes
+ *	a writer.
+ *
+ *	When a client selected a console, the client becomes a reader if
+ *	there is another client connected to the console before the client.
+ *	A client will be a writer if
+ *	1. client is the first one connected to the console or
+ *	2. client has entered a ~w daemon command or
+ *	3. all clients connected to the console before the client have
+ *	   disconnected from the console.
+ *
+ *  VCC_CLIENT_MOVE_CONS_FORWARD
+ *  VCC_CLIENT_MOVE_CONS_BACKWARD
+ *	A client is disconnecting from one console and move to the next or
+ *	previous console in the group queue.
+ *	A client is in one of these state if
+ *	1. the client has entered the daemon command and
+ *	2. the vntsd is in process of switching the client from one
+ *	   console to another.
+ *
+ *  VCC_CLIENT_DISABLE_DAEMON_CMD
+ *	vntsd is in processing of a client's daemon command or the client is
+ *	in selecting console.
+ *	A client is in this state if
+ *	1. the client has not selected a console or
+ *	2. the vntsd is processing a client's daemon command.
+ *
+ *  VCC_CLIENT_ACQUIRE_WRITER
+ *	A reader forces to become a writer via vntsd special command.
+ *	A client is in this state if
+ *	1. the client is a reader and
+ *	2. client has entered a daemon command to become a writer.
+ *
+ *  VCC_CLIENT_CONS_DELETED
+ *	The console that the client is connected to is being deleted and
+ *	waiting for the client to disconnect.
+ *	A client is in this state if
+ *	1. the console a client is connected to is being removed and
+ *	2. the vntsd is in process of disconnecting the client from the console.
+ *
+ */
+
+#ifndef _VNTSD_H
+#define	_VNTSD_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include	<sys/shm.h>
+#include	<strings.h>
+#include	<assert.h>
+#include	<sys/wait.h>
+#include	<sys/stat.h>
+#include	<fcntl.h>
+#include	<stropts.h>
+#include	<errno.h>
+#include	<sys/param.h>
+#include	"../../uts/sun4v/sys/vcc.h"
+
+#define	DEBUG
+
+/* vntsd limits */
+#define	    VNTSD_MAX_BUF_SIZE		128
+#define	    VNTSD_LINE_LEN		100
+#define	    VNTSD_MAX_SOCKETS		5
+#define	    VNTSD_EOL_LEN		2
+
+/* seconds before re-sending signal for cv_wait */
+#define	    VNTSD_CV_WAIT_DELTIME	10
+
+#define	    VCC_PATH_PREFIX     \
+		"/devices/virtual-devices@100/channel-devices@200/"
+#define	    VCC_DEVICE_PATH			"/devices%s"
+#define	    VCC_DEVICE_CTL_PATH VCC_PATH_PREFIX "%s:ctl"
+
+/* common messages */
+#define	    VNTSD_NO_WRITE_ACCESS_MSG	"You do not have write access"
+
+/* vntsd options */
+#define	    VNTSD_OPT_DAEMON_OFF	0x1
+
+/* group states */
+
+#define	    VNTSD_GROUP_SIG_WAIT	0x1	/*  waiting for signal */
+#define	    VNTSD_GROUP_CLEAN_CONS	0x2	/*  cons needs to be cleaned */
+#define	    VNTSD_GROUP_CLEANUP		0x4	/*  group cleanup in progress */
+
+
+
+
+
+/* console status */
+
+#define	    VNTSD_CONS_DELETED		0x1	/* deleted */
+#define	    VNTSD_CONS_SIG_WAIT		0x2	/* waiting for signal */
+
+
+#define	    VNTSD_CLIENT_IO_ERR		    0x1	    /* client i/o error */
+#define	    VNTSD_CLIENT_DISABLE_DAEMON_CMD 0x2	    /* disable daemon cmd */
+#define	    VNTSD_CLIENT_TIMEOUT	    0x4	    /* timeout */
+#define	    VNTSD_CLIENT_CONS_DELETED	    0x8	    /* console deleted */
+
+/* generic que structure */
+typedef	struct vntsd_que {
+	void			*handle;	/* element in queue */
+	struct vntsd_que	*nextp;		/* next queue element */
+	struct vntsd_que	*prevp;		/* previous queue element */
+} vntsd_que_t;
+
+struct vntsd_cons;
+struct vntsd_group;
+struct vntsd;
+
+/* client structure  */
+typedef struct vntsd_client {
+	mutex_t	    lock;	    /* protect the client */
+	uint_t	    status;	    /* client's state */
+
+	int	    sockfd;	    /* connection socket */
+	thread_t    cons_tid;	    /* console thread */
+
+	struct vntsd_cons    *cons; /* back link to console configuration */
+
+} vntsd_client_t;
+
+/* console structure */
+typedef struct vntsd_cons {
+	mutex_t		lock;			    /* protect console port */
+	cond_t		cvp;			    /* sync between threads */
+
+	vntsd_que_t	*clientpq;		    /* client que */
+	uint_t		status;			    /* client's state */
+	int		vcc_fd;			    /* vcc console port */
+	thread_t	wr_tid;			    /* write thread */
+
+	uint_t		cons_no;		    /* console port number  */
+	char		domain_name[MAXPATHLEN];    /* domain name */
+	char		dev_name[MAXPATHLEN];
+
+	struct vntsd_group   *group;		    /* back link to group */
+} vntsd_cons_t;
+
+/* group structure  */
+typedef struct vntsd_group {
+	mutex_t	    lock;		    /* protect group */
+	cond_t	    cvp;		    /* sync remove group */
+
+	uint_t	    status;		    /* group status */
+	char	    group_name[MAXPATHLEN];
+	uint64_t    tcp_port;		    /* telnet port */
+
+	thread_t    listen_tid;		    /* listen thread */
+	int	    sockfd;		    /* listen socket */
+
+	vntsd_que_t *conspq;		    /* console queue */
+	uint_t	    num_cons;		    /* num console */
+
+	/* clients have no console connection */
+	vntsd_que_t *no_cons_clientpq;
+	struct vntsd   *vntsd;
+
+} vntsd_group_t;
+
+/* daemon structure */
+typedef struct vntsd {
+
+	mutex_t		lock;			/* protect vntsd */
+	mutex_t		tmo_lock;		/* protect tmo queue */
+
+	int		instance;		/* vcc instance */
+	struct in_addr  ip_addr;		/* ip address to listen */
+	uint64_t	options;		/* daemon options */
+	int		timeout;		/* connection timeout */
+
+	char		*devinst;		/* device name */
+	int		ctrl_fd;		/* vcc ctrl port */
+
+	vntsd_que_t	*grouppq;		/* group queue */
+	uint_t		num_grps;		/* num groups */
+
+	vntsd_que_t	*tmoq;			/* timeout queue */
+	thread_t	tid;			/* main thread id */
+
+} vntsd_t;
+
+/* handle for creating thread */
+typedef	struct vntsd_thr_arg {
+	void	*handle;
+	void	*arg;
+} vntsd_thr_arg_t;
+
+/* timeout structure */
+typedef struct vntsd_timeout {
+	thread_t	tid;		    /* thread tid */
+	uint_t		minutes;	    /* idle minutes */
+	vntsd_client_t	*clientp;	    /* client */
+} vntsd_timeout_t;
+
+/* vntsd status and error  definitions */
+typedef enum {
+
+	/* status */
+	VNTSD_SUCCESS = 0,		/* success */
+	VNTSD_STATUS_CONTINUE,		/* continue to execute */
+	VNTSD_STATUS_EXIT_SIG,		/* exit siginal */
+	VNTSD_STATUS_SIG,		/* known signal */
+	VNTSD_STATUS_NO_HOST_NAME,	/* no host name set */
+	VNTSD_STATUS_CLIENT_QUIT,	/* client disconnected from group */
+	VNTSD_STATUS_RESELECT_CONS,	/* client re-selecting console */
+	VNTSD_STATUS_VCC_IO_ERR,	/* a vcc io error occurs */
+	VNTSD_STATUS_MOV_CONS_FORWARD,	/* down arrow  */
+	VNTSD_STATUS_MOV_CONS_BACKWARD,	/* up  arrow  */
+	VNTSD_STATUS_ACQUIRE_WRITER,	/* force become the writer */
+	VNTSD_STATUS_INTR,		/* thread receive a signal */
+	VNTSD_STATUS_DISCONN_CONS,	/* disconnect a client from cons */
+	VNTSD_STATUS_NO_CONS,		/* disconnect a client from cons */
+
+	/* resource errors */
+	VNTSD_ERR_NO_MEM,		/* memory allocation error */
+	VNTSD_ERR_NO_DRV,		/* cannot open vcc port */
+
+	/* vcc errors */
+	VNTSD_ERR_VCC_CTRL_DATA,	/* vcc ctrl data error */
+	VNTSD_ERR_VCC_POLL,		/* error poll vcc driver */
+	VNTSD_ERR_VCC_IOCTL,		/* vcc ioctl call error */
+	VNTSD_ERR_VCC_GRP_NAME,		/* group name differs from database */
+	VNTSD_ERR_ADD_CONS_FAILED,	/* addition of a console failed */
+
+	/* create thread errors */
+	VNTSD_ERR_CREATE_LISTEN_THR,	/* listen thread creation failed */
+	VNTSD_ERR_CREATE_CONS_THR,	/* create console thread err  */
+	VNTSD_ERR_CREATE_WR_THR,	/* listen thread creation failed */
+
+	/* listen thread errors */
+	VNTSD_ERR_LISTEN_SOCKET,	/* can not create tcp socket */
+	VNTSD_ERR_LISTEN_OPTS,		/* can not set socket opt */
+	VNTSD_ERR_LISTEN_BIND,		/* can not bind socket */
+	VNTSD_STATUS_ACCEPT_ERR,	/* accept error  */
+
+	/* tcp client read and write errors */
+	VNTSD_ERR_WRITE_CLIENT,		/* writing tcp client err */
+
+	/* tcp client timeout */
+	VNTSD_ERR_CLIENT_TIMEOUT,	/* client has no activity for timeout */
+
+	/* signal errors */
+	VNTSD_ERR_SIG,			/* unknown signal */
+
+	/* user input error */
+	VNTSD_ERR_INVALID_INPUT,	/* client typed in */
+
+	/* internal errors */
+	VNTSD_ERR_EL_NOT_FOUND,		/* element not found */
+	VNTSD_ERR_UNKNOWN_CMD		/* unknown error/cmd */
+
+} vntsd_status_t;
+
+/* function prototype defines */
+typedef	int	    (*compare_func_t)(void *el, void *data);
+typedef	int	    (*el_func_t)(void *el);
+typedef	void	    (*clean_func_t)(void *el);
+typedef	void	    (*sig_handler_t)(int sig);
+typedef	void	    *(*thr_func_t)(void *);
+
+
+
+/* function prototype */
+void		vntsd_log(vntsd_status_t err, char *msg);
+struct in_addr	vntsd_ip_addr(void);
+
+void		vntsd_get_config(vntsd_t *vntsdp);
+void		vntsd_daemon_wakeup(vntsd_t *vntsdp);
+int		vntsd_open_vcc(char *domain_name, uint_t cons_no);
+void		vntsd_delete_cons(vntsd_t *vntsdp);
+void		vntsd_clean_group(vntsd_group_t *groupp);
+
+
+void		*vntsd_listen_thread(vntsd_group_t *groupp);
+void		*vntsd_console_thread(vntsd_thr_arg_t *argp);
+int		vntsd_read(vntsd_client_t *clientp);
+void		*vntsd_write_thread(vntsd_cons_t *consp);
+
+boolean_t	vntsd_cons_by_consno(vntsd_cons_t *consp, int *cons_id);
+
+int		vntsd_que_append(vntsd_que_t **que_hd, void *handle);
+int		vntsd_que_rm(vntsd_que_t **que_hd, void *handle);
+void		*vntsd_que_find(vntsd_que_t *que_hd, compare_func_t
+			compare_func, void *data);
+void		*vntsd_que_walk(vntsd_que_t *que_hd, el_func_t el_func);
+
+int		vntsd_que_insert_after(vntsd_que_t *que, void *handle,
+			void *next);
+void		*vntsd_que_pos(vntsd_que_t *que_hd, void *handle, int pos);
+void		vntsd_free_que(vntsd_que_t **q, clean_func_t clean_func);
+
+int		vntsd_read_char(vntsd_client_t *clientp, char *c);
+int		vntsd_read_line(vntsd_client_t *clientp, char *buf, int *size);
+int		vntsd_read_data(vntsd_client_t *clientp, char *c);
+int		vntsd_get_yes_no(vntsd_client_t *clientp, char *msg,
+			int *yes_no);
+int		vntsd_ctrl_cmd(vntsd_client_t *clientp, char c);
+int		vntsd_process_daemon_cmd(vntsd_client_t *clientp, char c);
+int		vntsd_telnet_cmd(vntsd_client_t *clientp, char c);
+
+int		vntsd_set_telnet_options(int fd);
+int		vntsd_write_client(vntsd_client_t *client, char *buffer,
+	size_t sz);
+int		vntsd_write_fd(int fd, void *buffer, size_t sz);
+int		vntsd_write_line(vntsd_client_t *clientp, char *line);
+int		vntsd_write_lines(vntsd_client_t *clientp, char *lines);
+extern char	vntsd_eol[];
+
+void		vntsd_clean_group(vntsd_group_t *portp);
+void		vntsd_free_client(vntsd_client_t *clientp);
+int		vntsd_attach_timer(vntsd_timeout_t *tmop);
+int		vntsd_detach_timer(vntsd_timeout_t *tmop);
+void		vntsd_reset_timer(thread_t tid);
+void		vntsd_init_esctable_msgs(void);
+int		vntsd_vcc_ioctl(int ioctl_code, uint_t portno, void *buf);
+int		vntsd_vcc_err(vntsd_cons_t *consp);
+int		vntsd_cons_chk_intr(vntsd_client_t *clientp);
+boolean_t	vntsd_vcc_cons_alive(vntsd_cons_t *consp);
+boolean_t	vntsd_notify_client_cons_del(vntsd_client_t *clientp);
+int		vntsd_chk_group_total_cons(vntsd_group_t *groupp);
+
+
+#ifdef	DEBUG
+
+extern int vntsddbg;
+
+#define	D1 	if (vntsddbg & 0x01) (void) fprintf
+#define	D2	if (vntsddbg & 0x02) (void) fprintf
+#define	D3 	if (vntsddbg & 0x04) (void) fprintf
+#define	DERR 	if (vntsddbg & 0x08) (void) fprintf
+
+#else  /* not DEBUG */
+
+#define	D1
+#define	D2
+#define	D3
+#define	DERR
+
+#endif /* not DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VNTSD_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/vntsd.xml	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,94 @@
+<?xml version="1.0"?>
+<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">
+<!--
+ Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ Use is subject to license terms.
+
+ CDDL HEADER START
+
+ The contents of this file are subject to the terms of the
+ Common Development and Distribution License (the "License").
+ You may not use this file except in compliance with the License.
+
+ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ or http://www.opensolaris.org/os/licensing.
+ See the License for the specific language governing permissions
+ and limitations under the License.
+
+ When distributing Covered Code, include this CDDL HEADER in each
+ file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ If applicable, add the following below this CDDL HEADER, with the
+ fields enclosed by brackets "[]" replaced with your own identifying
+ information: Portions Copyright [yyyy] [name of copyright owner]
+
+ CDDL HEADER END
+
+	ident	"%Z%%M%	%I%	%E% SMI"
+
+	NOTE:  This service manifest is not editable; its contents will
+	be overwritten by package or patch operations, including
+	operating system upgrade.  Make customizations in a different
+	file.
+-->
+
+<service_bundle type='manifest' name='SUNWldomu:vntsd'>
+
+<service
+    name='ldoms/vntsd'
+    type='service'
+    version='1'>
+
+	<create_default_instance enabled='false' />
+
+	<dependency
+		name='network'
+		grouping='optional_all'
+		restart_on='error'
+		type='service'>
+		<service_fmri value='svc:/milestone/network' />
+	</dependency>
+
+	<dependency
+		name='syslog'
+		grouping='optional_all'
+		restart_on='none'
+		type='service'>
+		<service_fmri value='svc:/system/system-log' />
+	</dependency>
+
+	<exec_method
+	    type='method'
+	    name='start'
+	    exec='/lib/svc/method/svc-vntsd'
+	    timeout_seconds='60' />
+
+	<exec_method
+	    type='method'
+	    name='stop'
+	    exec=':kill'
+	    timeout_seconds='30' />
+
+	<!-- these are passed to vntsd in the method script -->
+	<property_group name='vntsd' type='application'>
+		<propval name='vcc_device' type='astring' 
+			value='virtual-console-concentrator@0' />
+		<propval name='listen_addr' type='astring' value='localhost' />
+		<propval name='timeout_minutes' type='integer' value='0' />
+	</property_group>
+
+	<stability value='Unstable' />
+
+	<template>
+		<common_name>
+			<loctext xml:lang='C'>
+			virtual network terminal server
+			</loctext>
+		</common_name>
+		<documentation>
+			<manpage title='vntsd' section='1M'
+				manpath='/usr/share/man' />
+		</documentation>
+	</template>
+</service>
+
+</service_bundle>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/vntsdvcc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,633 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Configuration and setup interface to vcc driver.
+ * At initialization time, vntsd opens the vcc ctrl port and reads the initial
+ * configuration. It manages console groups, creates the listen thread,
+ * dynamically adds and removes virtual console within a group.
+ */
+
+
+#include <syslog.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+#include <wait.h>
+#include <time.h>
+#include <synch.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <signal.h>
+#include "vntsd.h"
+
+/* signal all clients that console has been deleted */
+boolean_t
+vntsd_notify_client_cons_del(vntsd_client_t *clientp)
+{
+	(void) mutex_lock(&clientp->lock);
+	clientp->status |= VNTSD_CLIENT_CONS_DELETED;
+	(void) thr_kill(clientp->cons_tid, SIGUSR1);
+	(void) mutex_unlock(&clientp->lock);
+	return (B_FALSE);
+}
+
+/* free console  structure */
+static void
+free_cons(vntsd_cons_t *consp)
+{
+	assert(consp);
+	(void) mutex_destroy(&consp->lock);
+	(void) cond_destroy(&consp->cvp);
+	free(consp);
+}
+
+/*
+ *  all clients connected to a console must disconnect before
+ *  removing a console.
+ */
+static void
+cleanup_cons(vntsd_cons_t *consp)
+{
+	vntsd_group_t	*groupp;
+	timestruc_t	to;
+
+	assert(consp);
+	D1(stderr, "t@%d vntsd_disconn_clients@%d\n", thr_self(),
+	    consp->cons_no);
+
+	groupp = consp->group;
+	assert(groupp);
+
+
+	(void) mutex_lock(&consp->lock);
+
+	/* wait for all clients to disconnect from the console */
+	while (consp->clientpq != NULL) {
+		consp->status |= VNTSD_CONS_SIG_WAIT;
+
+		/* signal client to disconnect the console */
+		(void) vntsd_que_walk(consp->clientpq,
+		    (el_func_t)vntsd_notify_client_cons_del);
+
+		(void) thr_kill(consp->wr_tid, SIGUSR1);
+		to.tv_sec = VNTSD_CV_WAIT_DELTIME;
+		to.tv_nsec = 0;
+
+		/* wait for clients to disconnect  */
+		(void) cond_reltimedwait(&consp->cvp, &consp->lock, &to);
+	}
+
+	(void) mutex_unlock(&consp->lock);
+
+	free_cons(consp);
+}
+
+/* search for a group whose console is being deleted */
+static boolean_t
+find_clean_cons_group(vntsd_group_t *groupp)
+{
+	if (groupp->status & VNTSD_GROUP_CLEAN_CONS) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/* search for a console that is being deleted */
+static boolean_t
+find_clean_cons(vntsd_cons_t *consp)
+{
+	if (consp->status & VNTSD_CONS_DELETED) {
+		return (B_TRUE);
+	} else {
+		return (B_FALSE);
+	}
+}
+
+/* delete a console */
+void
+vntsd_delete_cons(vntsd_t *vntsdp)
+{
+	vntsd_group_t *groupp;
+	vntsd_cons_t *consp;
+
+	for (; ; ) {
+		/* get a group that contains a deleted console */
+		(void) mutex_lock(&vntsdp->lock);
+		groupp = vntsd_que_walk(vntsdp->grouppq,
+		    (el_func_t)find_clean_cons_group);
+		if (groupp == NULL) {
+			/* no more group has console deleted */
+			(void) mutex_unlock(&vntsdp->lock);
+			return;
+		}
+		groupp->status &= ~VNTSD_GROUP_CLEAN_CONS;
+		(void) mutex_unlock(&vntsdp->lock);
+
+		for (; ; ) {
+			/* get the console to be deleted */
+			(void) mutex_lock(&groupp->lock);
+			assert(groupp->conspq);
+			consp = vntsd_que_walk(groupp->conspq,
+			    (el_func_t)find_clean_cons);
+			if (consp == NULL) {
+				/* no more cons to delete */
+				(void) mutex_unlock(&groupp->lock);
+				break;
+			}
+
+			/* remove console from the group */
+			(void) vntsd_que_rm(&groupp->conspq, consp);
+			groupp->num_cons--;
+			(void) mutex_unlock(&groupp->lock);
+
+			/* clean up the console */
+			cleanup_cons(consp);
+
+			/* delete group? */
+			if (groupp->num_cons == 0) {
+				/* no more consoles; delete the group */
+				assert(groupp->vntsd);
+
+				(void) mutex_lock(&groupp->vntsd->lock);
+				(void) vntsd_que_rm(&groupp->vntsd->grouppq,
+						    groupp);
+				(void) mutex_unlock(&groupp->vntsd->lock);
+
+				/* clean up the group */
+				vntsd_clean_group(groupp);
+				break;
+			}
+		}
+	}
+}
+
+/* clean up a group */
+void
+vntsd_clean_group(vntsd_group_t *groupp)
+{
+
+	timestruc_t	to;
+
+	D1(stderr, "t@%d clean_group() group=%s tcp=%lld\n", thr_self(),
+	    groupp->group_name, groupp->tcp_port);
+
+	(void) mutex_lock(&groupp->lock);
+
+	/* prevent from reentry */
+	if (groupp->status & VNTSD_GROUP_CLEANUP) {
+		(void) mutex_unlock(&groupp->lock);
+		return;
+	}
+	groupp->status |= VNTSD_GROUP_CLEANUP;
+	vntsd_free_que(&groupp->conspq, (clean_func_t)cleanup_cons);
+	(void) mutex_unlock(&groupp->lock);
+
+	/* walk through no cons client queue */
+	while (groupp->no_cons_clientpq != NULL) {
+		groupp->status |= VNTSD_GROUP_SIG_WAIT;
+		(void) vntsd_que_walk(groupp->no_cons_clientpq,
+		    (el_func_t)vntsd_notify_client_cons_del);
+		to.tv_sec = VNTSD_CV_WAIT_DELTIME;
+		to.tv_nsec = 0;
+		(void) cond_reltimedwait(&groupp->cvp, &groupp->lock, &to);
+	}
+
+	if (groupp->listen_tid == thr_self()) {
+		/* listen thread is exiting */
+		(void) mutex_lock(&(groupp->vntsd->lock));
+		(void) vntsd_que_rm(&groupp->vntsd->grouppq, groupp);
+		(void) mutex_unlock(&groupp->vntsd->lock);
+
+		(void) cond_destroy(&groupp->cvp);
+		(void) mutex_unlock(&groupp->lock);
+		(void) mutex_destroy(&groupp->lock);
+		free(groupp);
+		return;
+	}
+
+	/* signal listen thread to exit  */
+	groupp->status |= VNTSD_GROUP_SIG_WAIT;
+
+	while (groupp->status & VNTSD_GROUP_SIG_WAIT) {
+		(void) thr_kill(groupp->listen_tid, SIGUSR1);
+		to.tv_sec = VNTSD_CV_WAIT_DELTIME;
+		to.tv_nsec = 0;
+		/* wait listen thread to exit  */
+		(void) cond_reltimedwait(&groupp->cvp, &groupp->lock, &to);
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+	(void) thr_join(groupp->listen_tid, NULL, NULL);
+	/* free group */
+	(void) cond_destroy(&groupp->cvp);
+	(void) mutex_destroy(&groupp->lock);
+	free(groupp);
+}
+
+/* allocate and initialize console structure */
+static vntsd_cons_t *
+alloc_cons(vntsd_group_t *groupp, vcc_console_t *consolep)
+{
+	vntsd_cons_t *consp;
+	int	rv;
+
+	/* allocate console */
+	consp = (vntsd_cons_t *)malloc(sizeof (vntsd_cons_t));
+	if (consp == NULL) {
+		vntsd_log(VNTSD_ERR_NO_MEM, "alloc_cons");
+		return (NULL);
+	}
+
+	/* initialize console */
+	bzero(consp, sizeof (vntsd_cons_t));
+
+	(void) mutex_init(&consp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL);
+	(void) cond_init(&consp->cvp, USYNC_THREAD, NULL);
+
+	consp->cons_no = consolep->cons_no;
+	(void) strlcpy(consp->domain_name, consolep->domain_name, MAXPATHLEN);
+	(void) strlcpy(consp->dev_name, consolep->dev_name, MAXPATHLEN);
+	consp->wr_tid = (thread_t)-1;
+	consp->vcc_fd = (thread_t)-1;
+
+	/* join the group */
+	(void) mutex_lock(&groupp->lock);
+
+	if ((rv = vntsd_que_append(&groupp->conspq, consp)) !=
+	    VNTSD_SUCCESS) {
+		(void) mutex_unlock(&groupp->lock);
+		vntsd_log(rv, "alloc_cons");
+		free_cons(consp);
+		return (NULL);
+	}
+	groupp->num_cons++;
+	consp->group = groupp;
+
+	(void) mutex_unlock(&groupp->lock);
+
+	D1(stderr, "t@%d alloc_cons@%d %s %s\n", thr_self(),
+	    consp->cons_no, consp->domain_name, consp->dev_name);
+
+	return (consp);
+}
+
+/* compare tcp with group->tcp */
+static boolean_t
+grp_by_tcp(vntsd_group_t *groupp, uint64_t *tcp_port)
+{
+	assert(groupp);
+	assert(tcp_port);
+	return (groupp->tcp_port == *tcp_port);
+}
+
+/* allocate and initialize group */
+static vntsd_group_t *
+alloc_group(vntsd_t *vntsdp, char *group_name, uint64_t tcp_port)
+{
+	vntsd_group_t *groupp;
+
+	/* allocate group */
+	groupp = (vntsd_group_t *)malloc(sizeof (vntsd_group_t));
+	if (groupp == NULL) {
+		vntsd_log(VNTSD_ERR_NO_MEM, "alloc_group");
+		return (NULL);
+	}
+
+	/* initialize group */
+	bzero(groupp, sizeof (vntsd_group_t));
+
+	(void) mutex_init(&groupp->lock, USYNC_THREAD|LOCK_ERRORCHECK, NULL);
+	(void) cond_init(&groupp->cvp, USYNC_THREAD, NULL);
+
+	if (group_name != NULL) {
+		(void) memcpy(groupp->group_name, group_name, MAXPATHLEN);
+	}
+
+	groupp->tcp_port = tcp_port;
+	groupp->listen_tid = (thread_t)-1;
+	groupp->sockfd = (thread_t)-1;
+	groupp->vntsd = vntsdp;
+
+	D1(stderr, "t@%d alloc_group@%lld:%s\n", thr_self(), groupp->tcp_port,
+	    groupp->group_name);
+
+	return (groupp);
+}
+
+/*
+ * Initialize a console; if the console is associated with a
+ * new group, initialize the group as well.
+ */
+static int
+alloc_cons_with_group(vntsd_t *vntsdp, vcc_console_t *consp,
+    vntsd_group_t **new_groupp)
+{
+	vntsd_group_t	*groupp = NULL;
+	int		rv;
+
+	*new_groupp = NULL;
+
+	/* match group by tcp port */
+
+
+	(void) mutex_lock(&vntsdp->lock);
+	groupp = vntsd_que_find(vntsdp->grouppq,
+	    (compare_func_t)grp_by_tcp, (void *)&(consp->tcp_port));
+	(void) mutex_unlock(&vntsdp->lock);
+
+	if (groupp != NULL) {
+		/* group with same tcp port found */
+
+		if (strcmp(groupp->group_name, consp->group_name)) {
+			/* conflict group name */
+			vntsd_log(VNTSD_ERR_VCC_GRP_NAME,
+			    "group name is different from existing group");
+			return (VNTSD_ERR_VCC_CTRL_DATA);
+		}
+
+	} else {
+		/* new group */
+		groupp = alloc_group(vntsdp, consp->group_name,
+		    consp->tcp_port);
+		if (groupp == NULL) {
+			return (VNTSD_ERR_NO_MEM);
+		}
+
+		assert(groupp->conspq == NULL);
+		/* queue group to vntsdp */
+		(void) mutex_lock(&vntsdp->lock);
+		rv = vntsd_que_append(&vntsdp->grouppq, groupp);
+		(void) mutex_unlock(&vntsdp->lock);
+
+		if (rv != VNTSD_SUCCESS) {
+			return (rv);
+		}
+
+		*new_groupp = groupp;
+	}
+
+	/* initialize console */
+	if (alloc_cons(groupp, consp) == NULL) {
+		/* no memory. NOTE(review): the test below checks the out-param pointer (always non-NULL) — likely meant (*new_groupp != NULL), else a pre-existing group gets freed here; verify */
+		if (new_groupp != NULL) {
+			/* clean up new group */
+			(void) cond_destroy(&groupp->cvp);
+			(void) mutex_destroy(&groupp->lock);
+			free(groupp);
+		}
+
+		return (VNTSD_ERR_NO_MEM);
+	}
+
+	return (VNTSD_SUCCESS);
+
+}
+
+
+/* create listen thread */
+static boolean_t
+create_listen_thread(vntsd_group_t *groupp)
+{
+
+	char err_msg[VNTSD_LINE_LEN];
+	int rv;
+
+	assert(groupp);
+
+	(void) mutex_lock(&groupp->lock);
+	assert(groupp->num_cons);
+
+	D1(stderr, "t@%d create_listen:%lld\n", thr_self(), groupp->tcp_port);
+
+	if ((rv = thr_create(NULL, 0, (thr_func_t)vntsd_listen_thread,
+			    (void *)groupp, THR_DETACHED, &groupp->listen_tid))
+	    != 0) {
+		(void) (void) snprintf(err_msg, sizeof (err_msg),
+		    "Can not create listen thread for"
+		    "group %s tcp %llx\n", groupp->group_name,
+		    groupp->tcp_port);
+		vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, err_msg);
+
+		/* clean up group queue */
+		vntsd_free_que(&groupp->conspq, (clean_func_t)free_cons);
+		groupp->listen_tid = (thread_t)-1;
+	}
+
+	(void) mutex_unlock(&groupp->lock);
+
+	return (rv != 0);
+}
+
+/* delete a console if the console exists in the vntsd */
+static void
+delete_cons_before_add(vntsd_t *vntsdp, uint64_t tcp_port, uint_t cons_no)
+{
+	vntsd_group_t	    *groupp;
+	vntsd_cons_t	    *consp;
+
+	/* group exists? */
+	(void) mutex_lock(&vntsdp->lock);
+	groupp = vntsd_que_find(vntsdp->grouppq, (compare_func_t)grp_by_tcp,
+	    (void *)&(tcp_port));
+	(void) mutex_unlock(&vntsdp->lock);
+
+	if (groupp == NULL) {
+		/* no such group */
+		return;
+	}
+
+	/* group exists, if console exists? */
+	(void) mutex_lock(&groupp->lock);
+	consp = vntsd_que_find(groupp->conspq,
+	    (compare_func_t)vntsd_cons_by_consno, &cons_no);
+
+	if (consp == NULL) {
+		/* no such console */
+		(void) mutex_unlock(&groupp->lock);
+		return;
+	}
+	/* console exists - delete console */
+
+	(void) mutex_lock(&consp->lock);
+
+	consp->status |= VNTSD_CONS_DELETED;
+	groupp->status |= VNTSD_GROUP_CLEAN_CONS;
+
+	(void) mutex_unlock(&consp->lock);
+
+	(void) mutex_unlock(&groupp->lock);
+
+	vntsd_delete_cons(vntsdp);
+}
+
+/* add a console */
+static void
+do_add_cons(vntsd_t *vntsdp, int cons_no)
+{
+	vcc_console_t	console;
+	vntsd_group_t	*groupp;
+	int		rv;
+	char		err_msg[VNTSD_LINE_LEN];
+
+
+	(void) snprintf(err_msg, sizeof (err_msg),
+	    "do_add_cons():Can not add console=%d", cons_no);
+
+	/* get console configuration from vcc */
+
+	if ((rv = vntsd_vcc_ioctl(VCC_CONS_INFO, cons_no, (void *)&console))
+	    != VNTSD_SUCCESS) {
+		vntsd_log(rv, err_msg);
+		return;
+	}
+
+	/* clean up the console if console was deleted and added again */
+	delete_cons_before_add(vntsdp, console.tcp_port, console.cons_no);
+
+	/* initialize console */
+
+	if ((rv = alloc_cons_with_group(vntsdp, &console, &groupp)) !=
+	    VNTSD_SUCCESS) {
+		/* no memory to add this new console */
+		vntsd_log(rv, err_msg);
+		return;
+	}
+
+	if (groupp != NULL) {
+		/* new group */
+		/* create listen thread; NOTE(review): on failure groupp is freed below while still queued on vntsdp->grouppq (appended in alloc_cons_with_group) — verify */
+		if (create_listen_thread(groupp)) {
+			vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, err_msg);
+			(void) cond_destroy(&groupp->cvp);
+			(void) mutex_destroy(&groupp->lock);
+			free(groupp);
+		}
+
+	}
+}
+
+/* daemon wake up */
+void
+vntsd_daemon_wakeup(vntsd_t *vntsdp)
+{
+
+	vcc_response_t	inq_data;
+
+	/* reason to wake up  */
+	if (vntsd_vcc_ioctl(VCC_INQUIRY, 0, (void *)&inq_data) !=
+	    VNTSD_SUCCESS) {
+		vntsd_log(VNTSD_ERR_VCC_IOCTL, "vntsd_daemon_wakeup()");
+		return;
+	}
+
+	D1(stderr, "t@%d vntsd_daemon_wakup:msg %d port %x\n", thr_self(),
+	    inq_data.reason, inq_data.cons_no);
+
+	switch (inq_data.reason) {
+
+	case VCC_CONS_ADDED:
+		do_add_cons(vntsdp, inq_data.cons_no);
+		break;
+
+	default:
+		DERR(stderr, "t@%d daemon_wakeup:ioctl_unknown %d\n",
+		    thr_self(), inq_data.reason);
+		vntsd_log(VNTSD_ERR_UNKNOWN_CMD, "from vcc\n");
+		break;
+	}
+}
+
+/* initial console configuration */
+void
+vntsd_get_config(vntsd_t *vntsdp)
+{
+
+	int		i;
+	int		num_cons;
+	vcc_console_t	*consp;
+	vntsd_group_t	*groupp;
+
+	/* num of consoles */
+	num_cons = 0;
+
+	if (vntsd_vcc_ioctl(VCC_NUM_CONSOLE, 0, (void *)&num_cons) !=
+	    VNTSD_SUCCESS) {
+		vntsd_log(VNTSD_ERR_VCC_IOCTL, "VCC_NUM_CONSOLE failed\n");
+		return;
+	}
+
+	D3(stderr, "get_config:num_cons=%d", num_cons);
+
+	if (num_cons == 0) {
+		return;
+	}
+
+	/* allocate memory for all consoles */
+	consp = malloc(num_cons*sizeof (vcc_console_t));
+
+	if (consp == NULL) {
+		vntsd_log(VNTSD_ERR_NO_MEM, "for console table.");
+		return;
+	}
+
+	/* get console table. NOTE(review): consp is never freed on either the error or success path below — TODO confirm and free */
+	if (vntsd_vcc_ioctl(VCC_CONS_TBL, 0, (void *)consp) != VNTSD_SUCCESS) {
+		vntsd_log(VNTSD_ERR_VCC_IOCTL, " VCC_CONS_TBL "
+		    "for console table\n");
+		return;
+	}
+
+	/* initialize groups and consoles */
+	for (i = 0; i < num_cons; i++) {
+		if (alloc_cons_with_group(vntsdp, &consp[i], &groupp)
+		    != VNTSD_SUCCESS) {
+			vntsd_log(VNTSD_ERR_ADD_CONS_FAILED, "get_config");
+		}
+	}
+
+	/* create listen thread for each group */
+	(void) mutex_lock(&vntsdp->lock);
+
+	for (; ; ) {
+		groupp = vntsd_que_walk(vntsdp->grouppq,
+		    (el_func_t)create_listen_thread);
+		if (groupp == NULL) {
+			break;
+		}
+		vntsd_log(VNTSD_ERR_CREATE_LISTEN_THR, "get config()");
+	}
+
+	(void) mutex_unlock(&vntsdp->lock);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/cmd/vntsd/write.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * write thread - read from vcc console and write to tcp client. There is one
+ * writer and multiple readers per console. The first client that connects to
+ * a console gets write access.
+ * The writer thread writes vcc data to all tcp clients that are connected
+ * to the console.
+ */
+
+#include <stdio.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <thread.h>
+#include <synch.h>
+#include <signal.h>
+#include <assert.h>
+#include <poll.h>
+#include <syslog.h>
+#include <libintl.h>
+#include "vntsd.h"
+#include "chars.h"
+
+/*
+ * Check the state of the write thread. Exit if no more clients are
+ * connected to the console.
+ */
+static void
+write_chk_status(vntsd_cons_t *consp, int status)
+{
+
+	if ((consp->status & VNTSD_CONS_DELETED) || (consp->clientpq == NULL)) {
+		thr_exit(0);
+	}
+
+	switch (status) {
+	case VNTSD_STATUS_VCC_IO_ERR:
+		assert(consp->group != NULL);
+		if (vntsd_vcc_err(consp) != VNTSD_STATUS_CONTINUE) {
+			thr_exit(0);
+		}
+		break;
+	case VNTSD_STATUS_INTR:
+		thr_exit(0);
+	default:
+		break;
+
+	}
+}
+
+/*
+ * skip_terminal_null()
+ * Scan for the terminal null character sequence (0x5e 0x40) and
+ * return the number of characters in the buffer after skipping any
+ * terminal null sequence.
+ */
+static int
+skip_terminal_null(char *buf, int buf_sz, int sz)
+{
+	int	    i, j;
+	static int  term_null_seq = 0;
+
+	assert(sz >= 0);
+
+	if (buf_sz < sz+1) {
+		return (-1);
+	}
+
+	if (term_null_seq) {
+		/* a 0x5e was skipped previously */
+		term_null_seq = 0;
+
+		if (buf[0] != 0x40) {
+			/* not a terminal null sequence; put 0x5e back */
+			for (i = sz; i > 0; i--) {
+				buf[i] = buf[i-1];
+			}
+
+			buf[0] = 0x5e;
+
+			sz++;
+		} else {
+			/* skip terminal null sequence */
+			sz--;
+
+			if (sz == 0) {
+				return (sz);
+			}
+
+			for (i = 0; i < sz; i++) {
+				buf[i] = buf[i+1];
+			}
+		}
+	}
+
+	for (; ; ) {
+		for (i = 0; i < sz; i++) {
+			if (buf[i]  == '\0') {
+				return (i);
+			}
+
+			if (buf[i] == 0x5e) {
+				/* possible terminal null sequence */
+				if (i == sz -1) {
+					/* last character in buffer */
+					term_null_seq = 1;
+					sz--;
+					buf[i] = 0;
+					return (sz);
+				}
+
+				if (buf[i+1] == 0x40) {
+					/* found terminal null sequence */
+					sz -= 2;
+					for (j = i; j < sz -i; j++) {
+						buf[j] = buf[j+2];
+					}
+					break;
+				}
+
+				if (buf[i+1] == '\0') {
+					buf[i] = 0;
+					term_null_seq = 1;
+					return (i);
+				}
+
+			}
+		}
+
+		if (i == sz) {
+			/* end of scan */
+			return (sz);
+		}
+	}
+}
+
+/* read data from vcc */
+static int
+read_vcc(vntsd_cons_t *consp, char *buf, ssize_t *sz)
+{
+	/* read from vcc */
+	*sz = read(consp->vcc_fd, buf, VNTSD_MAX_BUF_SIZE);
+
+	if (errno == EINTR) {
+		return (VNTSD_STATUS_INTR);
+	}
+
+	if ((*sz > 0)) {
+		return (VNTSD_SUCCESS);
+	}
+	return (VNTSD_STATUS_VCC_IO_ERR);
+}
+
+static int s_sz;
+/* write to a client */
+static boolean_t
+write_all_clients(vntsd_client_t *clientp, char *buf)
+{
+	int rv;
+
+	rv = vntsd_write_client(clientp, buf, s_sz);
+	if (rv != VNTSD_SUCCESS) {
+		(void) mutex_lock(&clientp->lock);
+		clientp->status |= VNTSD_CLIENT_IO_ERR;
+		assert(clientp->cons);
+		(void) thr_kill(clientp->cons_tid, NULL);
+		(void) mutex_unlock(&clientp->lock);
+	}
+	return (B_FALSE);
+
+}
+
+/* vntsd_write_thread() */
+void*
+vntsd_write_thread(vntsd_cons_t *consp)
+{
+	char		buf[VNTSD_MAX_BUF_SIZE+1];
+	int		sz;
+	int		rv;
+
+	D1(stderr, "t@%d vntsd_write@%d\n", thr_self(), consp->vcc_fd);
+
+	assert(consp);
+	write_chk_status(consp, VNTSD_SUCCESS);
+
+	for (; ; ) {
+		bzero(buf,  VNTSD_MAX_BUF_SIZE +1);
+
+		/* read data */
+		rv = read_vcc(consp, buf, &sz);
+
+		write_chk_status(consp, rv);
+
+		if (sz <= 0) {
+			continue;
+		}
+
+		/* has data */
+		if ((s_sz = skip_terminal_null(buf, sz+1, sz)) == 0) {
+			/* terminal null sequence */
+			continue;
+		}
+
+		assert(s_sz > 0);
+
+		/*
+		 * output data to all clients connected
+		 * to this console
+		 */
+
+		(void) mutex_lock(&consp->lock);
+		(void) vntsd_que_find(consp->clientpq,
+		    (compare_func_t)write_all_clients, buf);
+		(void) mutex_unlock(&consp->lock);
+
+		write_chk_status(consp, VNTSD_SUCCESS);
+
+	}
+
+	/*NOTREACHED*/
+	return (NULL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/mdesc/mdesc_diff.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,602 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else /* _KERNEL */
+#include <string.h>
+#include <strings.h>
+#endif /* _KERNEL */
+#include <sys/note.h>
+#include <sys/mdesc.h>
+#include <sys/mdesc_impl.h>
+
+#define	MDD_FREE_CHECK(mdp, ptr, sz)		\
+	do {					\
+		if (ptr) mdp->freep(ptr, sz);	\
+	_NOTE(CONSTCOND) } while (0)
+
+#define	MD_DIFF_MAGIC			0x4D445F4449464621ull	/* 'MD_DIFF!' */
+#define	MD_DIFF_NOMATCH			(-1)
+#define	MD_DIFF_MATCH			(1)
+
+typedef struct {
+	mde_cookie_t	*mdep;
+	uint_t		nelem;
+} md_diff_t;
+
+typedef struct {
+	uint64_t	mdd_magic;
+	md_diff_t	added;
+	md_diff_t	removed;
+	md_diff_t	match1;
+	md_diff_t	match2;
+	void 		*(*allocp)(size_t);
+	void		(*freep)(void *, size_t);
+} md_diff_impl_t;
+
+/*
+ * Internal utility functions
+ */
+static int mdd_scan_for_nodes(md_t *mdp, mde_cookie_t start,
+    char *compnodep, int *countp, mde_cookie_t **nodespp);
+
+static boolean_t mdd_any_dup_nodes(md_impl_t *mdp, md_prop_match_t *pmp,
+    int count, mde_cookie_t *nodesp);
+
+static int mdd_node_list_match(md_impl_t *md1, md_impl_t *md2,
+    md_element_t *match_nodep, mde_cookie_t *match_listp,
+    uint8_t *match_seenp, int start, int end, md_prop_match_t *match_elemsp);
+
+static int mdd_node_compare(md_impl_t *mdap, md_impl_t *mdbp,
+    md_element_t *nodeap, md_element_t *nodebp, md_prop_match_t *match_elemsp);
+
+/*
+ * Given two DAGs and information about how to uniquely identify
+ * the nodes of interest, determine which nodes have been added
+ * to the second MD, removed from the first MD, or exist in both
+ * MDs. This information is recorded and can be accessed using the
+ * opaque cookie returned to the caller.
+ */
+md_diff_cookie_t
+md_diff_init(md_t *md1p, mde_cookie_t start1, md_t *md2p, mde_cookie_t start2,
+    char *compnodep, md_prop_match_t *match_fieldsp)
+{
+	int		idx;
+	md_impl_t	*md1 = (md_impl_t *)md1p;
+	md_impl_t	*md2 = (md_impl_t *)md2p;
+	mde_cookie_t	*md1nodesp = NULL;
+	mde_cookie_t	*md2nodesp = NULL;
+	int		md1count = 0;
+	int		md2count = 0;
+	uint8_t		*seenp = NULL;
+
+	/* variables used to gather results */
+	md_diff_impl_t	*diff_res;
+	mde_cookie_t	*mde_add_scr;
+	mde_cookie_t	*mde_rem_scr;
+	mde_cookie_t	*mde_match1_scr;
+	mde_cookie_t	*mde_match2_scr;
+	int		nadd = 0;
+	int		nrem = 0;
+	int		nmatch = 0;
+
+	/* sanity check params */
+	if ((md1p == NULL) || (md2p == NULL))
+		return (MD_INVAL_DIFF_COOKIE);
+
+	if ((start1 == MDE_INVAL_ELEM_COOKIE) ||
+	    (start2 == MDE_INVAL_ELEM_COOKIE))
+		return (MD_INVAL_DIFF_COOKIE);
+
+	if ((compnodep == NULL) || (match_fieldsp == NULL))
+		return (MD_INVAL_DIFF_COOKIE);
+
+	/*
+	 * Prepare an array of the matching nodes from the first MD.
+	 */
+	if (mdd_scan_for_nodes(md1p,
+	    start1, compnodep, &md1count, &md1nodesp) == -1)
+		return (MD_INVAL_DIFF_COOKIE);
+
+	/* sanity check that all nodes are unique */
+	if (md1nodesp &&
+	    mdd_any_dup_nodes(md1, match_fieldsp, md1count, md1nodesp)) {
+		MDD_FREE_CHECK(md1, md1nodesp, sizeof (mde_cookie_t) *
+		    md1count);
+		return (MD_INVAL_DIFF_COOKIE);
+	}
+
+
+	/*
+	 * Prepare an array of the matching nodes from the second MD.
+	 */
+	if (mdd_scan_for_nodes(md2p,
+	    start2, compnodep, &md2count, &md2nodesp) == -1)
+		return (MD_INVAL_DIFF_COOKIE);
+
+	/* sanity check that all nodes are unique */
+	if (md2nodesp &&
+	    mdd_any_dup_nodes(md2, match_fieldsp, md2count, md2nodesp)) {
+		MDD_FREE_CHECK(md1, md1nodesp, sizeof (mde_cookie_t) *
+		    md1count);
+		MDD_FREE_CHECK(md2, md2nodesp, sizeof (mde_cookie_t) *
+		    md2count);
+		return (MD_INVAL_DIFF_COOKIE);
+	}
+
+	/* setup our result structure */
+	diff_res = md1->allocp(sizeof (md_diff_impl_t));
+	bzero(diff_res, sizeof (md_diff_impl_t));
+	diff_res->allocp = md1->allocp;
+	diff_res->freep = md1->freep;
+	diff_res->mdd_magic = MD_DIFF_MAGIC;
+
+	/*
+	 * Special cases for empty lists
+	 */
+	if ((md1count == 0) && (md2count != 0)) {
+		/* all the nodes found were added */
+		diff_res->added.mdep = md2nodesp;
+		diff_res->added.nelem = md2count;
+		return ((mde_cookie_t)diff_res);
+	}
+
+	if ((md1count != 0) && (md2count == 0)) {
+		/* all the nodes found were removed */
+		diff_res->removed.mdep = md1nodesp;
+		diff_res->removed.nelem = md1count;
+		return ((mde_cookie_t)diff_res);
+	}
+
+	if ((md1count == 0) && (md2count == 0))
+		/* no nodes found */
+		return ((mde_cookie_t)diff_res);
+
+	/*
+	 * Both lists have some elements. Allocate some scratch
+	 * buffers to sort them into our three categories, added,
+	 * removed, and matched pairs.
+	 */
+	mde_add_scr = diff_res->allocp(sizeof (mde_cookie_t) * md2count);
+	mde_rem_scr = diff_res->allocp(sizeof (mde_cookie_t) * md1count);
+	mde_match1_scr = diff_res->allocp(sizeof (mde_cookie_t) * md1count);
+	mde_match2_scr = diff_res->allocp(sizeof (mde_cookie_t) * md2count);
+
+	/* array of seen flags only needed for md2 */
+	seenp = (uint8_t *)diff_res->allocp(sizeof (uint8_t) * md2count);
+	bzero(seenp, sizeof (uint8_t) * md2count);
+
+	/*
+	 * Make a pass through the md1 node array. Make note of
+	 * any nodes not in the md2 array, indicating that they
+	 * have been removed. Also keep track of the nodes that
+	 * are present in both arrays for the matched pair results.
+	 */
+	for (idx = 0; idx < md1count; idx++) {
+
+		md_element_t *elem = &(md1->mdep[md1nodesp[idx]]);
+
+		int match = mdd_node_list_match(md1, md2, elem, md2nodesp,
+		    seenp, 0, md2count - 1, match_fieldsp);
+
+		if (match == MD_DIFF_NOMATCH)
+			/* record deleted node */
+			mde_rem_scr[nrem++] = md1nodesp[idx];
+		else {
+			/* record matched node pair */
+			mde_match1_scr[nmatch] = md1nodesp[idx];
+			mde_match2_scr[nmatch] = md2nodesp[match];
+			nmatch++;
+
+			/* mark that this match has been recorded */
+			seenp[match] = 1;
+		}
+	}
+
+	/*
+	 * Make a pass through the md2 array. Any nodes that have
+	 * not been marked as seen have been added.
+	 */
+	for (idx = 0; idx < md2count; idx++) {
+		if (!seenp[idx])
+			/* record added node */
+			mde_add_scr[nadd++] = md2nodesp[idx];
+	}
+
+	/* fill in the added node list */
+	if (nadd) {
+		int addsz = sizeof (mde_cookie_t) * nadd;
+		diff_res->added.mdep = (mde_cookie_t *)diff_res->allocp(addsz);
+
+		bcopy(mde_add_scr, diff_res->added.mdep, addsz);
+
+		diff_res->added.nelem = nadd;
+	}
+
+	/* fill in the removed node list */
+	if (nrem) {
+		int remsz = sizeof (mde_cookie_t) * nrem;
+		diff_res->removed.mdep =
+		    (mde_cookie_t *)diff_res->allocp(remsz);
+
+		bcopy(mde_rem_scr, diff_res->removed.mdep, remsz);
+		diff_res->removed.nelem = nrem;
+	}
+
+	/* fill in the matching node lists */
+	if (nmatch) {
+		int matchsz = sizeof (mde_cookie_t) * nmatch;
+		diff_res->match1.mdep =
+		    (mde_cookie_t *)diff_res->allocp(matchsz);
+		diff_res->match2.mdep =
+		    (mde_cookie_t *)diff_res->allocp(matchsz);
+
+		bcopy(mde_match1_scr, diff_res->match1.mdep, matchsz);
+		bcopy(mde_match2_scr, diff_res->match2.mdep, matchsz);
+		diff_res->match1.nelem = nmatch;
+		diff_res->match2.nelem = nmatch;
+	}
+
+	/* clean up */
+	md1->freep(md1nodesp, sizeof (mde_cookie_t) * md1count);
+	md2->freep(md2nodesp, sizeof (mde_cookie_t) * md2count);
+
+	diff_res->freep(mde_add_scr, sizeof (mde_cookie_t) * md2count);
+	diff_res->freep(mde_rem_scr, sizeof (mde_cookie_t) * md1count);
+	diff_res->freep(mde_match1_scr, sizeof (mde_cookie_t) * md1count);
+	diff_res->freep(mde_match2_scr, sizeof (mde_cookie_t) * md2count);
+
+	diff_res->freep(seenp, sizeof (uint8_t) * md2count);
+
+	return ((md_diff_cookie_t)diff_res);
+}
+
+/*
+ * Returns an array of the nodes added to the second MD in a
+ * previous md_diff_init() call. Returns the number of elements
+ * in the returned array. If the value is zero, the pointer
+ * passed back will be NULL.
+ */
+int
+md_diff_added(md_diff_cookie_t mdd, mde_cookie_t **mde_addedp)
+{
+	md_diff_impl_t	*mddp = (md_diff_impl_t *)mdd;
+
+	if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC))
+		return (-1);
+
+	*mde_addedp = mddp->added.mdep;
+
+	return (mddp->added.nelem);
+}
+
+/*
+ * Returns an array of the nodes removed from the first MD in a
+ * previous md_diff_init() call. Returns the number of elements
+ * in the returned array. If the value is zero, the pointer
+ * passed back will be NULL.
+ */
+int
+md_diff_removed(md_diff_cookie_t mdd, mde_cookie_t **mde_removedp)
+{
+	md_diff_impl_t	*mddp = (md_diff_impl_t *)mdd;
+
+	if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC))
+		return (-1);
+
+	*mde_removedp = mddp->removed.mdep;
+
+	return (mddp->removed.nelem);
+}
+
+/*
+ * Returns a pair of parallel arrays that contain nodes that were
+ * considered matching based on the match criteria passed in to
+ * a previous md_diff_init() call. Returns the number of elements
+ * in the arrays. If the value is zero, both pointers passed back
+ * will be NULL.
+ */
+int
+md_diff_matched(md_diff_cookie_t mdd, mde_cookie_t **mde_match1p,
+    mde_cookie_t **mde_match2p)
+{
+	md_diff_impl_t	*mddp = (md_diff_impl_t *)mdd;
+
+	if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC))
+		return (-1);
+
+	*mde_match1p = mddp->match1.mdep;
+	*mde_match2p = mddp->match2.mdep;
+
+	return (mddp->match1.nelem);
+}
+
+/*
+ * Deallocate any storage used to store results of a previous
+ * md_diff_init() call. Returns 0 on success and -1 on failure.
+ */
+int
+md_diff_fini(md_diff_cookie_t mdd)
+{
+	md_diff_impl_t	*mddp = (md_diff_impl_t *)mdd;
+
+	if ((mddp == NULL) || (mddp->mdd_magic != MD_DIFF_MAGIC))
+		return (-1);
+
+	mddp->mdd_magic = 0;
+
+	MDD_FREE_CHECK(mddp, mddp->added.mdep, mddp->added.nelem *
+	    sizeof (mde_cookie_t));
+
+	MDD_FREE_CHECK(mddp, mddp->removed.mdep, mddp->removed.nelem *
+	    sizeof (mde_cookie_t));
+
+	MDD_FREE_CHECK(mddp, mddp->match1.mdep, mddp->match1.nelem *
+	    sizeof (mde_cookie_t));
+
+	MDD_FREE_CHECK(mddp, mddp->match2.mdep, mddp->match2.nelem *
+	    sizeof (mde_cookie_t));
+
+	mddp->freep(mddp, sizeof (md_diff_impl_t));
+
+	return (0);
+}
+
+/*
+ * Walk the "fwd" DAG in an MD and return an array of nodes that are
+ * of the specified type. The start param is used to start the walk
+ * from an arbitrary location in the DAG. Returns an array of nodes
+ * as well as a count of the number of nodes in the array.  If the
+ * count is zero, the node pointer will be passed back as NULL.
+ *
+ * Returns: 0 success; -1 failure
+ */
+static int
+mdd_scan_for_nodes(md_t *mdp,
+    mde_cookie_t start, char *compnodep, int *countp, mde_cookie_t **nodespp)
+{
+	mde_str_cookie_t	cname;
+	mde_str_cookie_t	aname;
+	md_impl_t		*mdip = (md_impl_t *)mdp;
+
+	if (mdip == NULL)
+		return (-1);
+
+	cname = md_find_name(mdp, compnodep);
+	aname = md_find_name(mdp, "fwd");
+
+	/* get the number of nodes of interest in the DAG */
+	*countp = md_scan_dag(mdp, start, cname, aname, NULL);
+	if (*countp == 0) {
+		*nodespp = NULL;
+		return (0);
+	}
+
+	/* allocate the storage */
+	*nodespp = mdip->allocp(sizeof (mde_cookie_t) * (*countp));
+
+	/* populate our array with the matching nodes */
+	(void) md_scan_dag(mdp, start, cname, aname, *nodespp);
+
+	return (0);
+}
+
+/*
+ * Walk an array of nodes and check if there are any duplicate
+ * nodes. A duplicate is determined based on the specified match
+ * criteria. Returns B_TRUE if there are any duplicates and B_FALSE
+ * otherwise.
+ */
+static boolean_t
+mdd_any_dup_nodes(md_impl_t *mdp, md_prop_match_t *pmp, int count,
+    mde_cookie_t *nodesp)
+{
+	int		idx;
+	int		match;
+	md_element_t	*elem;
+
+	ASSERT(count > 0 || nodesp == NULL);
+
+	for (idx = 0; idx < count; idx++) {
+		elem = &(mdp->mdep[nodesp[idx]]);
+
+		match = mdd_node_list_match(mdp, mdp, elem, nodesp, NULL,
+		    idx + 1, count - 1, pmp);
+
+		if (match != MD_DIFF_NOMATCH)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Given a node and an array of nodes, compare the node to all elements
+ * in the specified start-end range of the array. If the node matches
+ * one of the nodes in the array, return the index of that node. Otherwise
+ * return MD_DIFF_NOMATCH.
+ *
+ * The optional seen array parameter can be used to optimize repeated
+ * calls to this function. If the seen array indicates that an element
+ * has already been matched, the full comparison is not necessary.
+ */
+static int
+mdd_node_list_match(md_impl_t *md1, md_impl_t *md2, md_element_t *match_nodep,
+    mde_cookie_t *match_listp, uint8_t *match_seenp, int start, int end,
+    md_prop_match_t *match_elemsp)
+{
+	int		match;
+	int		idx;
+	md_element_t	*elem;
+
+	for (idx = start; idx <= end; idx++) {
+
+		if ((match_seenp != NULL) && (match_seenp[idx]))
+			continue;
+
+		elem = &(md2->mdep[match_listp[idx]]);
+
+		match = mdd_node_compare(md1, md2, match_nodep, elem,
+		    match_elemsp);
+		if (match == MD_DIFF_MATCH)
+			return (idx);
+	}
+
+	return (MD_DIFF_NOMATCH);
+}
+
+/*
+ * Given two nodes and a list of properties, compare the nodes.
+ * A match is concluded if both nodes have all of the specified
+ * properties and all the values of those properties are the
+ * same. Returns MD_DIFF_NOMATCH if the nodes do not match and
+ * MD_DIFF_MATCH otherwise.
+ */
+static int
+mdd_node_compare(md_impl_t *mdap, md_impl_t *mdbp, md_element_t *nodeap,
+    md_element_t *nodebp, md_prop_match_t *match_elemsp)
+{
+	md_element_t	*ap;
+	md_element_t	*bp;
+	boolean_t	nodea_interest;
+	boolean_t	nodeb_interest;
+	int		idx;
+
+	/* make sure we are starting at the beginning of the nodes */
+	if ((MDE_TAG(nodeap) != MDET_NODE) || (MDE_TAG(nodebp) != MDET_NODE))
+		return (MD_DIFF_NOMATCH);
+
+	for (idx = 0; match_elemsp[idx].type != MDET_LIST_END; idx++) {
+
+		int type;
+
+		nodea_interest = B_FALSE;
+		nodeb_interest = B_FALSE;
+
+		type = match_elemsp[idx].type;
+
+		/*
+		 * Check node A for the property of interest
+		 */
+		for (ap = nodeap; MDE_TAG(ap) != MDET_NODE_END; ap++) {
+			char *elemname;
+
+			if (MDE_TAG(ap) != type)
+				continue;
+
+			elemname = mdap->namep + MDE_NAME(ap);
+
+			if (strcmp(elemname, match_elemsp[idx].namep) == 0) {
+				/* found the property of interest */
+				nodea_interest = B_TRUE;
+				break;
+			}
+		}
+
+		/* node A is not of interest */
+		if (!nodea_interest)
+			return (MD_DIFF_NOMATCH);
+
+		/*
+		 * Check node B for the property of interest
+		 */
+		for (bp = nodebp; MDE_TAG(bp) != MDET_NODE_END; bp++) {
+			char *elemname;
+
+			if (MDE_TAG(bp) != type)
+				continue;
+
+			elemname = mdbp->namep + MDE_NAME(bp);
+
+			if (strcmp(elemname, match_elemsp[idx].namep) == 0) {
+				nodeb_interest = B_TRUE;
+				break;
+			}
+		}
+
+		/* node B is not of interest */
+		if (!nodeb_interest)
+			return (MD_DIFF_NOMATCH);
+
+		/*
+		 * Both nodes have the property of interest. The
+		 * nodes are not a match unless the values of that
+		 * property match.
+		 */
+		switch (type) {
+		case MDET_PROP_VAL:
+			if (MDE_PROP_VALUE(ap) != MDE_PROP_VALUE(bp))
+				return (MD_DIFF_NOMATCH);
+			break;
+
+		case MDET_PROP_STR: {
+			char *stra = (char *)(mdap->datap +
+			    MDE_PROP_DATA_OFFSET(ap));
+			char *strb = (char *)(mdbp->datap +
+			    MDE_PROP_DATA_OFFSET(bp));
+
+			if (strcmp(stra, strb) != 0)
+				return (MD_DIFF_NOMATCH);
+			break;
+		}
+
+		case MDET_PROP_DAT: {
+
+			caddr_t dataa;
+			caddr_t datab;
+
+			if (MDE_PROP_DATA_LEN(ap) != MDE_PROP_DATA_LEN(bp))
+				return (MD_DIFF_NOMATCH);
+
+			dataa = (caddr_t)(mdap->datap +
+			    MDE_PROP_DATA_OFFSET(ap));
+			datab = (caddr_t)(mdbp->datap +
+			    MDE_PROP_DATA_OFFSET(bp));
+
+			if (memcmp(dataa, datab, MDE_PROP_DATA_LEN(ap)) != 0)
+				return (MD_DIFF_NOMATCH);
+
+			break;
+		}
+
+		default:
+			/* unsupported prop type */
+			return (MD_DIFF_NOMATCH);
+		}
+	}
+
+	/*
+	 * All the specified properties exist in both
+	 * nodes and have the same value. The two nodes
+	 * match.
+	 */
+
+	return (MD_DIFF_MATCH);
+}
--- a/usr/src/common/mdesc/mdesc_fini.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/common/mdesc/mdesc_fini.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,6 +30,10 @@
 #include <sys/mdesc.h>
 #include <sys/mdesc_impl.h>
 
+/*
+ * Clean up the internal MD structure. Does not
+ * deallocate the buffer holding the MD.
+ */
 int
 md_fini(md_t *ptr)
 {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/mdesc/mdesc_getbinsize.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/mdesc.h>
+#include <sys/mdesc_impl.h>
+
+size_t
+md_get_bin_size(md_t *ptr)
+{
+	md_impl_t *mdp;
+
+	mdp = (md_impl_t *)ptr;
+
+	if (mdp == NULL)
+		return (0);
+
+	return (mdp->size);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/common/mdesc/mdesc_getgen.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/mdesc.h>
+#include <sys/mdesc_impl.h>
+
+uint64_t
+md_get_gen(md_t *ptr)
+{
+	md_impl_t *mdp;
+
+	mdp = (md_impl_t *)ptr;
+
+	if (mdp == NULL)
+		return (MDESC_INVAL_GEN);
+
+	return (mdp->gen);
+}
--- a/usr/src/common/mdesc/mdesc_init_intern.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/common/mdesc/mdesc_init_intern.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -32,23 +32,25 @@
 #include <sys/mdesc_impl.h>
 
 md_t *
-md_init_intern(uint64_t *ptr, void *(*allocp)(size_t),
-			void (*freep)(void *, size_t))
+md_init_intern(uint64_t *ptr,  void *(*allocp)(size_t),
+	    void (*freep)(void *, size_t))
 {
 	md_impl_t	*mdp;
 	int		idx;
 	int		count;
 	int		done;
+	uint64_t	gen;
 	mde_str_cookie_t root_name;
 
 	/*
 	 * Very basic checkup for alignment to avoid
 	 * bus error issues.
 	 */
-	if ((((uintptr_t)ptr)&7) != 0)
+	if ((((uintptr_t)ptr) & 7) != 0)
 		return (NULL);
 
 	mdp = (md_impl_t *)allocp(sizeof (md_impl_t));
+
 	if (mdp == NULL)
 		return (NULL);
 
@@ -60,6 +62,7 @@
 	/*
 	 * setup internal structures
 	 */
+
 	mdp->headerp = (md_header_t *)mdp->caddr;
 
 	if (mdtoh32(mdp->headerp->transport_version) != MD_TRANSPORT_VERSION) {
@@ -70,13 +73,13 @@
 	mdp->name_blk_size = mdtoh32(mdp->headerp->name_blk_sz);
 	mdp->data_blk_size = mdtoh32(mdp->headerp->data_blk_sz);
 
-	mdp->size = MD_HEADER_SIZE+mdp->node_blk_size+
-	    mdp->name_blk_size+mdp->data_blk_size;
+	mdp->size = MD_HEADER_SIZE + mdp->node_blk_size +
+	    mdp->name_blk_size + mdp->data_blk_size;
 
-	mdp->mdep = (md_element_t *)(mdp->caddr+MD_HEADER_SIZE);
-	mdp->namep = (char *)(mdp->caddr+MD_HEADER_SIZE+mdp->node_blk_size);
-	mdp->datap = (uint8_t *)(mdp->caddr+MD_HEADER_SIZE+mdp->name_blk_size+
-	    mdp->node_blk_size);
+	mdp->mdep = (md_element_t *)(mdp->caddr + MD_HEADER_SIZE);
+	mdp->namep = (char *)(mdp->caddr + MD_HEADER_SIZE + mdp->node_blk_size);
+	mdp->datap = (uint8_t *)(mdp->caddr + MD_HEADER_SIZE +
+	    mdp->name_blk_size + mdp->node_blk_size);
 
 	mdp->root_node = MDE_INVAL_ELEM_COOKIE;
 
@@ -123,7 +126,7 @@
 				mdp->root_node = (mde_cookie_t)idx;
 			}
 			idx = MDE_PROP_INDEX(np);
-			count ++;
+			count++;
 			break;
 
 		default:
@@ -142,25 +145,35 @@
 	 * Register the counts
 	 */
 
-	mdp->element_count = idx+1;	/* include LIST_END */
+	mdp->element_count = idx + 1;	/* include LIST_END */
 	mdp->node_count = count;
 
 	/*
 	 * Final sanity check that everything adds up
 	 */
-	if (mdp->element_count != (mdp->node_blk_size/MD_ELEMENT_SIZE))
+	if (mdp->element_count != (mdp->node_blk_size / MD_ELEMENT_SIZE))
 		goto cleanup;
 
 	mdp->md_magic = LIBMD_MAGIC;
 
+	/*
+	 * Setup MD generation
+	 */
+	if (md_get_prop_val((md_t *)mdp, mdp->root_node,
+	    "md-generation#", &gen) != 0)
+		mdp->gen = MDESC_INVAL_GEN;
+	else
+		mdp->gen = gen;
+
 	return ((md_t *)mdp);
 
-cleanup:;
+cleanup:
 	/*
 	 * Clean up here - including a name hash if
 	 * we build one.
 	 */
-cleanup_nohash:;
+
+cleanup_nohash:
 	mdp->freep(mdp, sizeof (md_impl_t));
 	return (NULL);
 }
--- a/usr/src/common/mdesc/mdesc_rootnode.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/common/mdesc/mdesc_rootnode.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -38,7 +38,7 @@
 	mdp = (md_impl_t *)ptr;
 
 	if (mdp->md_magic != LIBMD_MAGIC)
-		return (-1);
+		return (MDE_INVAL_ELEM_COOKIE);
 
 	return (mdp->root_node);
 }
--- a/usr/src/common/mdesc/mdesc_scandag.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/common/mdesc/mdesc_scandag.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -121,7 +121,8 @@
 
 
 
-static int mdl_scan_dag(md_impl_t *mdp,
+static int
+mdl_scan_dag(md_impl_t *mdp,
 	int nodeidx,
 	mde_str_cookie_t node_name_cookie,
 	mde_str_cookie_t arc_name_cookie,
--- a/usr/src/lib/libpcp/common/libpcp.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/lib/libpcp/common/libpcp.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -48,6 +47,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/glvc.h>
+#include <sys/vldc.h>
+#include <sys/ldc.h>
 #include <netinet/in.h>
 
 #include "libpcp.h"
@@ -81,6 +82,11 @@
 static uint16_t checksum(uint16_t *addr, int32_t count);
 static int pcp_cleanup(int channel_fd);
 
+static int vldc_read(int fd, uint8_t *bufp, int size);
+static int vldc_write(int fd, uint8_t *bufp, int size);
+static int pcp_update_read_area(int byte_cnt);
+static int pcp_vldc_frame_error_handle(void);
+
 /*
  * local channel (glvc) file descriptor set by pcp_send_recv()
  */
@@ -156,6 +162,19 @@
 /* To restore old SIGALRM signal handler */
 static struct sigaction old_act;
 
+/*
+ * Variables to support vldc based streaming transport
+ */
+typedef enum {
+	GLVC_NON_STREAM,
+	VLDC_STREAMING
+} xport_t;
+
+static int xport_type = GLVC_NON_STREAM;
+#define	CHANNEL_DEV	"channel-devices"
+
+#define	VLDC_MTU_SIZE	(2048)
+
 static void
 glvc_timeout_handler(void)
 {
@@ -178,6 +197,7 @@
 
 	if (channel_name == NULL)
 		return (PCPL_INVALID_ARGS);
+
 	/*
 	 * Open virtual channel name.
 	 */
@@ -186,12 +206,33 @@
 	}
 
 	/*
-	 * Get the Channel MTU size
+	 * Check if the channel-name points to a vldc node
+	 * or a glvc node
 	 */
+	if (strstr(channel_name, CHANNEL_DEV) != NULL) {
+		vldc_opt_op_t op;
+
+		xport_type  = VLDC_STREAMING;
+		mtu_size = VLDC_MTU_SIZE;
 
-	if (pcp_get_prop(channel_fd, GLVC_XPORT_OPT_MTU_SZ, &mtu_size) != 0) {
-		(void) close(channel_fd);
-		return (PCPL_GLVC_ERROR);
+		op.op_sel = VLDC_OP_SET;
+		op.opt_sel = VLDC_OPT_MODE;
+		op.opt_val = LDC_MODE_STREAM;
+		if (ioctl(channel_fd, VLDC_IOCTL_OPT_OP, &op) != 0) {
+			(void) close(channel_fd);
+			return (PCPL_GLVC_ERROR);
+		}
+	} else {
+		xport_type  = GLVC_NON_STREAM;
+		/*
+		 * Get the Channel MTU size
+		 */
+
+		if (pcp_get_prop(channel_fd, GLVC_XPORT_OPT_MTU_SZ,
+				&mtu_size) != 0) {
+			(void) close(channel_fd);
+			return (PCPL_GLVC_ERROR);
+		}
 	}
 
 	/*
@@ -233,7 +274,8 @@
 {
 
 	if (channel_fd >= 0) {
-		(void) pcp_cleanup(channel_fd);
+		if (xport_type  == GLVC_NON_STREAM)
+			(void) pcp_cleanup(channel_fd);
 		(void) close(channel_fd);
 	} else {
 		return (-1);
@@ -631,7 +673,6 @@
 	(void) memcpy(buf, peek_area, m);
 
 	return (m);
-
 }
 
 /*
@@ -648,13 +689,19 @@
 		return (PCPL_INVALID_ARGS);
 	}
 
-	(void) alarm(glvc_timeout);
+	if (xport_type == GLVC_NON_STREAM) {
+		(void) alarm(glvc_timeout);
 
-	if ((ret = write(chnl_fd, buf, byte_cnt)) < 0) {
+		if ((ret = write(chnl_fd, buf, byte_cnt)) < 0) {
+			(void) alarm(0);
+			return (ret);
+		}
 		(void) alarm(0);
-		return (ret);
+	} else {
+		if ((ret = vldc_write(chnl_fd, buf, byte_cnt)) <= 0) {
+			return (ret);
+		}
 	}
-	(void) alarm(0);
 
 	return (ret);
 }
@@ -718,17 +765,28 @@
 	 * do a peek to see how much data is available and read complete data.
 	 */
 
-	if ((m = pcp_peek(read_tail, mtu_size)) < 0) {
-		return (m);
-	}
+	if (xport_type == GLVC_NON_STREAM) {
+		if ((m = pcp_peek(read_tail, mtu_size)) < 0) {
+			return (m);
+		}
+
+		(void) alarm(glvc_timeout);
+		if ((ret = read(chnl_fd, read_tail, m)) < 0) {
+			(void) alarm(0);
+			return (ret);
+		}
 
-	(void) alarm(glvc_timeout);
-	if ((ret = read(chnl_fd, read_tail, m)) < 0) {
 		(void) alarm(0);
-		return (ret);
+	} else {
+		/*
+		 * Read the extra number of bytes
+		 */
+		m = byte_cnt - (read_tail - read_head);
+		if ((ret = vldc_read(chnl_fd,
+				read_tail, m)) <= 0) {
+			return (ret);
+		}
 	}
-
-	(void) alarm(0);
 	read_tail += ret;
 
 	/*
@@ -743,6 +801,69 @@
 }
 
 /*
+ * Issue read from the driver until byet_cnt number
+ * of bytes are present in read buffer. Do not
+ * move the read head.
+ */
+static int
+pcp_update_read_area(int byte_cnt)
+{
+	int			ret;
+	int			n, i;
+
+	if (byte_cnt < 0 || byte_cnt > mtu_size) {
+		return (PCPL_INVALID_ARGS);
+	}
+
+	/*
+	 * initialization of local read buffer
+	 * from which the stream read requests are serviced.
+	 */
+	if (read_area == NULL) {
+		read_area = (uint8_t *)umem_zalloc(READ_AREA_SIZE,
+							UMEM_DEFAULT);
+		if (read_area == NULL) {
+			return (PCPL_MALLOC_FAIL);
+		}
+		read_head = read_area;
+		read_tail = read_area;
+	}
+
+	/*
+	 * if we already have sufficient data in the buffer,
+	 * just return
+	 */
+	if (byte_cnt <= (read_tail - read_head)) {
+		return (byte_cnt);
+	}
+
+	/*
+	 * if the request is not satisfied from the buffered data, then move the
+	 * remaining data to front of the buffer and read new data.
+	 */
+	for (i = 0; i < (read_tail - read_head); ++i) {
+		read_area[i] = read_head[i];
+	}
+	read_head = read_area;
+	read_tail = read_head + i;
+
+	n = byte_cnt - (read_tail - read_head);
+
+	if ((ret = vldc_read(chnl_fd,
+			read_tail, n)) <= 0) {
+		return (ret);
+	}
+	read_tail += ret;
+
+	/*
+	 * Return the number of bytes we could read
+	 */
+	n = MIN(byte_cnt, (read_tail - read_head));
+
+	return (n);
+}
+
+/*
  * This function is slight different from pcp_peek. The peek requests are first
  * serviced from local read buffer, if data is available. If the peek request
  * is not serviceble from local read buffer, then the data is peeked from
@@ -798,7 +919,6 @@
 	if ((m = pcp_peek(peek_read_tail, mtu_size)) < 0) {
 		return (m);
 	}
-
 	peek_read_tail += m;
 
 	/*
@@ -874,7 +994,12 @@
 	 * (magic seq) or if an error happens while reading data from
 	 * channel.
 	 */
-	if ((ret = pcp_frame_error_handle()) != 0)
+	if (xport_type  == GLVC_NON_STREAM)
+		ret = pcp_frame_error_handle();
+	else
+		ret = pcp_vldc_frame_error_handle();
+
+	if (ret != 0)
 		return (PCPL_FRAME_ERROR);
 
 	/* read magic number first */
@@ -1059,6 +1184,55 @@
 }
 
 /*
+ * This function handles channel framing errors. It waits until proper
+ * frame with starting sequence as magic numder (0xAFBCAFA0)
+ * is arrived. It removes unexpected data (before the magic number sequence)
+ * on the channel. It returns when proper magic number sequence is seen
+ * or when any failure happens while reading/peeking the channel.
+ */
+static int
+pcp_vldc_frame_error_handle(void)
+{
+	uint8_t		magic_num_buf[4];
+	uint32_t	net_magic_num; /* magic byte in network byte order */
+	uint32_t	host_magic_num = PCP_MAGIC_NUM;
+	int		found_magic = 0;
+
+	net_magic_num =  htonl(host_magic_num);
+	(void) memcpy(magic_num_buf, (uint8_t *)&net_magic_num, 4);
+
+	/*
+	 * For vldc, we need to read whatever data is available and
+	 * advance the read pointer one byte at a time until we get
+	 * the magic word. When this function is invoked, we do not
+	 * have any byte in the read buffer.
+	 */
+
+	/*
+	 * Keep reading until we find the matching magic number
+	 */
+	while (!found_magic) {
+		while ((read_tail - read_head) < sizeof (host_magic_num)) {
+			if (pcp_update_read_area(sizeof (host_magic_num)) < 0)
+				return (-1);
+		}
+
+		/*
+		 * We should have at least 4 bytes in read buffer. Check
+		 * if the magic number can be matched
+		 */
+		if (memcmp(read_head, magic_num_buf,
+				sizeof (host_magic_num))) {
+			read_head += 1;
+		} else {
+			found_magic = 1;
+		}
+	}
+
+	return (0);
+}
+
+/*
  * checks whether certain byte sequence is present in the data stream.
  */
 static int
@@ -1188,3 +1362,81 @@
 	umem_free(buf, mtu_size);
 	return (ret);
 }
+
+static int
+vldc_write(int fd, uint8_t *bufp, int size)
+{
+	int res;
+	int left = size;
+	pollfd_t pollfd;
+
+	pollfd.events = POLLOUT;
+	pollfd.revents = 0;
+	pollfd.fd = fd;
+
+	/*
+	 * Poll for the vldc channel to be ready
+	 */
+	if (poll(&pollfd, 1, glvc_timeout * MILLISEC) <= 0) {
+		return (-1);
+	}
+
+	do {
+		if ((res = write(fd, bufp, left)) <= 0) {
+			if (errno != EWOULDBLOCK) {
+				return (res);
+			}
+		} else {
+			bufp += res;
+			left -= res;
+		}
+	} while (left > 0);
+
+	/*
+	 * Return number of bytes actually written
+	 */
+	return (size - left);
+}
+
+/*
+ * Keep reading until we get the specified number of bytes
+ */
+static int
+vldc_read(int fd, uint8_t *bufp, int size)
+{
+	int res;
+	int left = size;
+
+	struct pollfd fds[1];
+
+	fds[0].events = POLLIN | POLLPRI;
+	fds[0].revents = 0;
+	fds[0].fd = fd;
+
+	if (poll(fds, 1, glvc_timeout * MILLISEC) <= 0) {
+		return (-1);
+	}
+
+	while (left > 0) {
+		res = read(fd, bufp, left);
+			/* return on error or short read */
+		if ((res == 0) || ((res < 0) &&
+			(errno == EAGAIN))) {
+				/* poll until the read is unblocked */
+				if ((poll(fds, 1, glvc_timeout * MILLISEC)) < 0)
+					return (-1);
+
+				continue;
+		} else
+		if (res < 0) {
+			/* unrecoverable error */
+
+			return (-1);
+		} else {
+			bufp += res;
+			left -= res;
+		}
+	}
+
+	return (size - left);
+}
--- a/usr/src/pkgdefs/Makefile	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/pkgdefs/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -69,6 +69,8 @@
 	SUNWkvm.u  \
 	SUNWkvm.v  \
 	SUNWkvmt200.v \
+	SUNWldomr.v  \
+	SUNWldomu.v  \
 	SUNWluxd.u \
 	SUNWluxl \
 	SUNWonmtst.u \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,38 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+DATAFILES += depend
+
+.KEEP_STATE:
+
+all: $(FILES)
+
+install: all pkg
+
+include ../Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/i.manifest	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# i.manifest - smf(5) service manifest install class action script
+#
+
+repfile=$PKG_INSTALL_ROOT/etc/svc/repository.db
+export repfile
+
+#
+# If the repository does not yet exist, create it from the appropriate seed.  If
+# for some reason the seeds do not exist, svccfg(1M) will create the repository
+# automatically.
+#
+if [ ! -f $repfile ]; then
+	if [ -n "$SUNW_PKG_INSTALL_ZONENAME" -a \
+	    "$SUNW_PKG_INSTALL_ZONENAME" != "global" ]; then
+		[ -f $PKG_INSTALL_ROOT/lib/svc/seed/nonglobal.db ] && \
+		/usr/bin/cp $PKG_INSTALL_ROOT/lib/svc/seed/nonglobal.db \
+		    $repfile
+	else
+		[ -f $PKG_INSTALL_ROOT/lib/svc/seed/global.db ] && \
+		/usr/bin/cp $PKG_INSTALL_ROOT/lib/svc/seed/global.db \
+		    $repfile
+	fi
+	/usr/bin/chmod 0600 $repfile
+	/usr/bin/chown root:sys $repfile
+fi
+
+if [ ! -r $PKG_INSTALL_ROOT/etc/svc/volatile/repository_door ]; then
+	#
+	# smf(5) is not presently running for the destination environment.
+	# Since we presently cannot refresh without a running svc.startd(1M), we
+	# cannot consistently handle dependent placement.  Defer to next boot.
+	#
+	while read src dst; do
+		/usr/bin/cp -p $src $dst
+	done
+else
+	#
+	# Local package install.
+	#
+	while read src dst; do
+		/usr/bin/cp -p $src $dst
+
+		[ "$PKG_INSTALL_ROOT" = "" -o "$PKG_INSTALL_ROOT" = "/" ] && \
+		    SVCCFG_CHECKHASH=1 /usr/sbin/svccfg import $dst
+	done
+fi
+
+exit 0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/pkginfo.tmpl	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,55 @@
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWldomr"
+NAME="Solaris Logical Domains (Root)"
+ARCH="sparc.sun4v"
+VERSION="ONVERS,REV=0.0.0"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+SUNW_PKGTYPE="root"
+MAXINST="1000"
+CATEGORY="system"
+DESC="Solaris Logical Domains Configuration Files"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+CLASSES="none"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
+SUNW_PKG_ALLZONES="true"
+SUNW_PKG_HOLLOW="true"
+SUNW_PKG_THISZONE="false"
+#VSTOCK="<reserved by Release Engineering for package part #>"
+#ISTATES="<developer defined>"
+#RSTATES='<developer defined>'
+#ULIMIT="<developer defined>"
+#ORDER="<developer defined>"
+#PSTAMP="<developer defined>"
+#INTONLY="<developer defined>"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/postinstall	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,136 @@
+#!/sbin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+# Function: check_add_drv()
+#
+# This function will check if the module has an entry in etc/name_to_major
+# If not simply calls add_drv with the arguments given. If there is
+# such an entry in name_to_major file, it adds entries in driver_aliases
+# driver_classes and minor_perm if necessary.
+# The syntax of this function is the same as add_drv. 
+
+check_add_drv()
+{
+	if [ "$BASEDIR" = "" ]
+	then
+		BASEDIR=/  
+	fi
+	alias=""
+	class=""
+	ADD_ALIAS=0
+	ADD_CLASS=0
+	ADD_MINOR=0
+	OPTIND=1
+	IS_NET_DRIVER=0
+
+	cmd="add_drv"
+
+	NO_CMD=
+	while getopts i:b:m:c:N  opt
+	do
+		case $opt in
+			N )	NO_CMD=1;;
+			i )	ADD_ALIAS=1	
+				alias=$OPTARG
+				cmd=$cmd" -i '$alias'"
+				;;
+			m )	ADD_MINOR=1
+				minor=$OPTARG
+				cmd=$cmd" -m '$minor'"
+				;;
+			c)	ADD_CLASS=1
+				class=$OPTARG
+				cmd=$cmd" -c $class"
+				;;
+			b)	BASEDIR=$OPTARG
+				cmd=$cmd" -b $BASEDIR"
+				;;
+			\?) 	echo "check_add_drv can not handle this option"
+				return
+				;;
+			esac
+	done 
+	shift `/usr/bin/expr $OPTIND - 1`
+	
+	drvname=$1
+
+	cmd=$cmd" "$drvname
+
+	drvname=`echo $drvname | /usr/bin/sed 's;.*/;;g'`
+
+	/usr/bin/grep "^$drvname[ 	]" $BASEDIR/etc/name_to_major >  /dev/null 2>&1
+
+	if [ "$NO_CMD" = "" -a $? -ne 0 ] 
+	then
+		eval $cmd
+	else	
+		# entry already in name_to_major, add alias, class, minorperm
+		# if necessary
+		if [ $ADD_ALIAS = 1 ]	
+		then
+			for i in $alias
+			do
+				/usr/bin/egrep "^$drvname[ 	]+$i" $BASEDIR/etc/driver_aliases>/dev/null 2>&1
+				if [ $? -ne 0 ]
+				then
+					echo "$drvname $i" >> $BASEDIR/etc/driver_aliases	
+				fi
+			done
+		fi
+
+		if [ $ADD_CLASS = 1 ]
+		then
+			/usr/bin/egrep "^$drvname[ 	]+$class( |	|$)" $BASEDIR/etc/driver_classes > /dev/null 2>&1
+			if [ $? -ne 0 ]
+			then 
+				echo "$drvname\t$class" >> $BASEDIR/etc/driver_classes
+			fi
+		fi
+
+		if [ $ADD_MINOR = 1 ]
+		then
+			/usr/bin/grep "^$drvname:" $BASEDIR/etc/minor_perm > /dev/null 2>&1
+			if [ $? -ne 0 ]
+			then 
+				minorentry="$drvname:$minor"
+				echo $minorentry >> $BASEDIR/etc/minor_perm
+			fi
+		fi
+
+	fi
+
+
+}
+
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-channel-devices"' cnex
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-console-concentrator"' vcc
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-disk"' vdc
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-disk-server"' vds
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-channel"' vldc
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-network"' vnet
+check_add_drv -b "${BASEDIR}" -i '"SUNW,sun4v-network-switch"' vsw
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/preremove	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,58 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+PATH=/usr/bin:/usr/sbin:${PATH}
+export PATH
+
+EXIT=0
+
+not_installed()
+{
+	driver=$1
+
+	grep "^${driver} " ${BASEDIR}/etc/name_to_major > /dev/null 2>&1
+
+	if [ "$?" -eq 0 ]; then
+		return 1
+	else
+		return 0
+	fi
+}
+
+# 
+# Unload and remove drivers
+#
+not_installed cnex || rem_drv -b "${BASEDIR}" cnex || EXIT=1
+not_installed vcc  || rem_drv -b "${BASEDIR}" vcc  || EXIT=1
+not_installed vdc  || rem_drv -b "${BASEDIR}" vdc  || EXIT=1
+not_installed vds  || rem_drv -b "${BASEDIR}" vds  || EXIT=1
+not_installed vldc || rem_drv -b "${BASEDIR}" vldc || EXIT=1
+not_installed vnet || rem_drv -b "${BASEDIR}" vnet || EXIT=1
+not_installed vsw  || rem_drv -b "${BASEDIR}" vsw  || EXIT=1
+
+exit ${EXIT}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/prototype_com	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,52 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+#
+
+#!search <pathname pathname ...>	# where to find pkg objects
+#!include <filename>			# include another 'prototype' file
+#!default <mode> <owner> <group>	# default used if not specified on entry
+#!<param>=<value>			# puts parameter in pkg environment
+
+# packaging files
+i pkginfo
+i copyright
+i depend
+i postinstall
+i preremove
+i i.manifest
+i r.manifest
+
+#
+# source locations relative to the prototype file
+#
+# SUNWldomr.v
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/prototype_sparc	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,79 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+#
+
+#!search <pathname pathname ...>	# where to find pkg objects
+#!include <filename>			# include another 'prototype' file
+#!default <mode> <owner> <group>	# default used if not specified on entry
+#!<param>=<value>			# puts parameter in pkg environment
+
+#
+# Include ISA independent files (prototype_com)
+#
+!include prototype_com
+
+#
+# List files which are SPARC specific here
+#
+# source locations relative to the prototype file
+#
+# SUNWldomr.v
+#
+d none lib 755 root bin
+d none lib/svc 0755 root bin
+d none lib/svc/method 0755 root bin
+f none lib/svc/method/svc-vntsd 0555 root bin
+d none platform 755 root sys
+d none platform/sun4v 755 root sys
+d none platform/sun4v/kernel 755 root sys
+d none platform/sun4v/kernel/drv 755 root sys
+d none platform/sun4v/kernel/drv/sparcv9 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/cnex 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vcc 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vdc 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vds 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vldc 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vnet 755 root sys
+f none platform/sun4v/kernel/drv/sparcv9/vsw 755 root sys
+d none platform/sun4v/kernel/misc 755 root sys
+d none platform/sun4v/kernel/misc/sparcv9 755 root sys
+f none platform/sun4v/kernel/misc/sparcv9/dr_cpu 755 root sys
+f none platform/sun4v/kernel/misc/sparcv9/ds 755 root sys
+f none platform/sun4v/kernel/misc/sparcv9/fault_iso 755 root sys
+f none platform/sun4v/kernel/misc/sparcv9/ldc 755 root sys
+f none platform/sun4v/kernel/misc/sparcv9/platsvc 755 root sys
+d none var 755 root sys
+d none var/svc 755 root sys
+d none var/svc/manifest 755 root sys
+d none var/svc/manifest/platform 755 root sys
+d none var/svc/manifest/platform/sun4v 755 root sys
+f manifest var/svc/manifest/platform/sun4v/vntsd.xml 0444 root sys
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomr.v/r.manifest	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,83 @@
+#!/bin/sh
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License").  You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# r.manifest - smf(5) manifest remove class action script
+#
+
+if [ "$PKG_INSTALL_ROOT" != "" -a "$PKG_INSTALL_ROOT" != "/" ]; then
+	#
+	# We can't safely disable the service in this case.
+	#
+	smf_alive=no
+else
+	#
+	# We can verify if the service is disabled prior to
+	# removal.
+	#
+	if [ -r /etc/svc/volatile/repository_door ]; then
+		smf_alive=yes
+	fi
+fi
+
+MFSTSCAN=/lib/svc/bin/mfstscan
+SVCCFG=/usr/sbin/svccfg
+SVCPROP=/usr/bin/svcprop
+
+while read mfst; do
+	if [ "$smf_alive" = "yes" ]; then
+		ENTITIES=`$SVCCFG inventory $mfst`
+
+		for fmri in $ENTITIES; do
+			#
+			# Determine whether any of our instances are
+			# enabled.
+			#
+			en_p=`$SVCPROP -C -p general/enabled $fmri 2>/dev/null`
+			en_o=`$SVCPROP -C -p general_ovr/enabled $fmri 2>/dev/null`
+
+			if [ "$en_p" = "true" -o "$en_o" = "true" ]; then
+				echo "$fmri remains enabled; aborting"
+				exit 1
+			fi
+
+			$SVCCFG delete $fmri
+		done
+
+		#
+		# Delete the manifest hash value.
+		#
+		pg_name=`$MFSTSCAN -t $mfst`
+		if $SVCPROP -q -p $pg_name smf/manifest; then
+			$SVCCFG -s smf/manifest delpg $pg_name
+		fi
+	fi
+
+	/usr/bin/rm $mfst
+done
+
+exit 0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomu.v/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,38 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+include ../Makefile.com
+
+DATAFILES +=  i.manifest r.manifest
+
+.KEEP_STATE:
+
+all: $(FILES)
+
+install: all pkg
+
+include ../Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomu.v/depend	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,56 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+#
+# This package information file defines software dependencies associated
+# with the pkg.  You can define three types of pkg dependencies with this file:
+#	 P indicates a prerequisite for installation
+#	 I indicates an incompatible package
+#	 R indicates a reverse dependency
+# <pkg.abbr> see pkginfo(4), PKG parameter
+# <name> see pkginfo(4), NAME parameter
+# <version> see pkginfo(4), VERSION parameter
+# <arch> see pkginfo(4), ARCH parameter
+# <type> <pkg.abbr> <name>
+#	(<arch>)<version>
+#	(<arch>)<version>
+#	...
+# <type> <pkg.abbr> <name>
+# ...
+#
+
+P SUNWcar	Core Architecture, (Root)
+P SUNWcakr	Core Solaris Kernel Architecture (Root)
+P SUNWkvm	Core Architecture, (Kvm)
+P SUNWcsr	Core Solaris, (Root)
+P SUNWckr	Core Solaris Kernel (Root)
+P SUNWcnetr	Core Solaris Network Infrastructure (Root)
+P SUNWcsu	Core Solaris, (Usr)
+P SUNWcsd	Core Solaris Devices
+P SUNWcsl	Core Solaris Libraries
+P SUNWldomr	Solaris Logical Domains (Root)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomu.v/pkginfo.tmpl	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,55 @@
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file describes characteristics of the
+# package, such as package abbreviation, full package name, package version,
+# and package architecture.
+#
+PKG="SUNWldomu"
+NAME="Solaris Logical Domains (Usr)"
+ARCH="sparc.sun4v"
+VERSION="ONVERS,REV=0.0.0"
+SUNW_PRODNAME="SunOS"
+SUNW_PRODVERS="RELEASE/VERSION"
+SUNW_PKGTYPE="usr"
+MAXINST="1000"
+CATEGORY="system"
+DESC="Solaris Logical Domains Configuration and Administration"
+VENDOR="Sun Microsystems, Inc."
+HOTLINE="Please contact your local service provider"
+EMAIL=""
+CLASSES="none"
+BASEDIR=/
+SUNW_PKGVERS="1.0"
+SUNW_PKG_ALLZONES="true"
+SUNW_PKG_HOLLOW="true"
+SUNW_PKG_THISZONE="false"
+#VSTOCK="<reserved by Release Engineering for package part #>"
+#ISTATES="<developer defined>"
+#RSTATES='<developer defined>'
+#ULIMIT="<developer defined>"
+#ORDER="<developer defined>"
+#PSTAMP="<developer defined>"
+#INTONLY="<developer defined>"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomu.v/prototype_com	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,48 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+#
+
+#!search <pathname pathname ...>	# where to find pkg objects
+#!include <filename>			# include another 'prototype' file
+#!default <mode> <owner> <group>	# default used if not specified on entry
+#!<param>=<value>			# puts parameter in pkg environment
+
+# packaging files
+i pkginfo
+i copyright
+i depend
+
+#
+# source locations relative to the prototype file
+#
+# SUNWldomu.v
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/pkgdefs/SUNWldomu.v/prototype_sparc	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,54 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This required package information file contains a list of package contents.
+# The 'pkgmk' command uses this file to identify the contents of a package
+# and their location on the development machine when building the package.
+# Can be created via a text editor or through use of the 'pkgproto' command.
+#
+
+#!search <pathname pathname ...>	# where to find pkg objects
+#!include <filename>			# include another 'prototype' file
+#!default <mode> <owner> <group>	# default used if not specified on entry
+#!<param>=<value>			# puts parameter in pkg environment
+
+#
+# Include ISA independent files (prototype_com)
+#
+!include prototype_com
+
+#
+# List files which are SPARC specific here
+#
+# source locations relative to the prototype file
+#
+# SUNWldomu.v
+#
+d none usr 755 root sys
+d none usr/lib 755 root bin
+d none usr/lib/ldoms 755 root bin
+f none usr/lib/ldoms/vntsd 555 root bin
--- a/usr/src/pkgdefs/SUNWmdb/prototype_sparc	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_sparc	Tue May 16 16:05:21 2006 -0700
@@ -90,3 +90,4 @@
 d none usr/platform/sun4v/lib/mdb/kvm 755 root sys
 d none usr/platform/sun4v/lib/mdb/kvm/sparcv9 755 root sys
 f none usr/platform/sun4v/lib/mdb/kvm/sparcv9/unix.so 555 root sys
+f none usr/platform/sun4v/lib/mdb/kvm/sparcv9/vdsk.so 555 root sys
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_sparc	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_sparc	Tue May 16 16:05:21 2006 -0700
@@ -75,3 +75,4 @@
 d none platform/sun4v/kernel/kmdb 755 root sys
 d none platform/sun4v/kernel/kmdb/sparcv9 755 root sys
 f none platform/sun4v/kernel/kmdb/sparcv9/unix 555 root sys
+f none platform/sun4v/kernel/kmdb/sparcv9/vdsk 555 root sys
--- a/usr/src/pkgdefs/etc/exception_list_i386	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/pkgdefs/etc/exception_list_i386	Tue May 16 16:05:21 2006 -0700
@@ -642,6 +642,7 @@
 usr/include/librestart_priv.h		i386
 usr/include/libcontract_priv.h		i386
 var/svc/manifest/platform/sun4u		i386
+var/svc/manifest/platform/sun4v		i386
 var/svc/profile/platform_SUNW,SPARC-Enterprise.xml	i386
 var/svc/profile/platform_SUNW,Sun-Fire.xml	i386
 var/svc/profile/platform_SUNW,Sun-Fire-880.xml	i386
--- a/usr/src/tools/scripts/bfu.sh	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/tools/scripts/bfu.sh	Tue May 16 16:05:21 2006 -0700
@@ -325,6 +325,7 @@
 	lib/svc/method/svc-scheduler
 	lib/svc/method/svc-sckmd
 	lib/svc/method/svc-syseventd
+	lib/svc/method/svc-vntsd
 	lib/svc/method/svc-zones
 	platform/*/kernel
 	platform/SUNW,Sun-Fire-15000/lib/cvcd
@@ -357,6 +358,7 @@
 	usr/include/netinet/ipl.h
 	usr/include/sys/dcam
 	usr/lib/devfsadm/linkmod/SUNW_dcam1394_link.so
+	usr/lib/ldoms
 	usr/platform/SUNW,SPARC-Enterprise/lib/dscp.ppp.options
 	usr/platform/SUNW,SPARC-Enterprise/lib/libdscp.so
 	usr/platform/SUNW,SPARC-Enterprise/lib/libdscp.so.1
@@ -376,6 +378,7 @@
 	var/svc/manifest/platform/sun4u/efdaemon.xml
 	var/svc/manifest/platform/sun4u/sckmd.xml
 	var/svc/manifest/platform/sun4u/sf880drd.xml
+	var/svc/manifest/platform/sun4v/vntsd.xml
 	var/svc/manifest/system/cvc.xml
 	var/svc/manifest/system/dumpadm.xml
 	var/svc/manifest/system/fmd.xml
--- a/usr/src/uts/common/sys/mdesc.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/common/sys/mdesc.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -87,22 +87,39 @@
 
 #ifndef _ASM	/* { */
 
-typedef uint64_t mde_cookie_t;
+/*
+ * Opaque handles for use in external interfaces
+ */
+
+typedef void			*md_t;
+
+typedef uint64_t		mde_cookie_t;
 #define	MDE_INVAL_ELEM_COOKIE	((mde_cookie_t)-1)
 
 typedef	uint32_t		mde_str_cookie_t;
 #define	MDE_INVAL_STR_COOKIE	((mde_str_cookie_t)-1)
 
+typedef uint64_t		md_diff_cookie_t;
+#define	MD_INVAL_DIFF_COOKIE	((md_diff_cookie_t)-1)
 
-	/* Opaque structure for handling in functions */
-typedef void * md_t;
+#define	MDESC_INVAL_GEN		(0)
+
+/*
+ * External structure for MD diff interface
+ */
+typedef struct {
+	uint8_t		type;		/* property type */
+	char		*namep;		/* property name */
+} md_prop_match_t;
 
 
-
+/*
+ * External Interface
+ */
 
-extern md_t		*md_init(void *);
-extern md_t		*md_init_intern(uint64_t *, void*(*)(size_t),
-				void (*)(void*, size_t));
+extern md_t		*md_init_intern(uint64_t *,
+				void *(*allocp)(size_t),
+				void (*freep)(void *, size_t));
 
 extern int		md_fini(md_t *);
 
@@ -112,6 +129,10 @@
 
 extern mde_cookie_t	md_root_node(md_t *);
 
+extern uint64_t		md_get_gen(md_t *);
+
+extern size_t		md_get_bin_size(md_t *);
+
 extern int		md_scan_dag(md_t *,
 				mde_cookie_t,
 				mde_str_cookie_t,
@@ -134,6 +155,24 @@
 				uint8_t **,
 				int *);
 
+extern md_diff_cookie_t	md_diff_init(md_t *,
+				mde_cookie_t,
+				md_t *,
+				mde_cookie_t,
+				char *,
+				md_prop_match_t *);
+
+extern int		md_diff_added(md_diff_cookie_t,
+				mde_cookie_t **);
+
+extern int		md_diff_removed(md_diff_cookie_t,
+				mde_cookie_t **);
+
+extern int		md_diff_matched(md_diff_cookie_t,
+				mde_cookie_t **,
+				mde_cookie_t **);
+
+extern int		md_diff_fini(md_diff_cookie_t);
 
 
 #endif	/* } _ASM */
@@ -150,7 +189,6 @@
 #define	MDESCIOCSSZ	(MDESCIOC | 2)   /* Set new quote buffer size */
 #define	MDESCIOCDISCARD	(MDESCIOC | 3)   /* Discard quotes and reset */
 
-
 #ifdef __cplusplus
 }
 #endif
--- a/usr/src/uts/common/sys/mdesc_impl.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/common/sys/mdesc_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -115,7 +115,7 @@
 	caddr_t		caddr;
 
 	void		*(*allocp)(size_t);
-	void		(*freep)(void*, size_t);
+	void		(*freep)(void *, size_t);
 
 	md_header_t	*headerp;
 	md_element_t	*mdep;
@@ -132,6 +132,7 @@
 	mde_cookie_t	root_node;
 
 	int		size;
+	uint64_t	gen;
 
 	uint64_t	md_magic;
 };
@@ -152,7 +153,6 @@
 				mde_cookie_t,
 				mde_str_cookie_t,
 				int);
-
 #endif	/* _ASM */
 
 #ifdef __cplusplus
--- a/usr/src/uts/sfmmu/ml/sfmmu_kdi.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sfmmu/ml/sfmmu_kdi.s	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -303,8 +303,11 @@
 	ldx	[%g2], %g2		/* VA %g1, sfmmup %g2 */
 
 	mov	1, %g3			/* VA %g1, sfmmup %g2, idx %g3 */
-1:	mov	HBLK_RANGE_SHIFT, %g4
-	mulx	%g3, 3, %g4
+	mov	HBLK_RANGE_SHIFT, %g4
+	ba	3f
+	nop
+
+1:	mulx	%g3, 3, %g4		/* 3: see TTE_BSZS_SHIFT */
 	add	%g4, MMU_PAGESHIFT, %g4
 
 3:	KDI_HME_HASH_FUNCTION		/* %g1, %g2, %g4 => hash in %g4 */
@@ -321,11 +324,9 @@
 4:	ba,a	6f
 
 5:	add	%g3, 1, %g3
-#ifdef sun4v
-	cmp	%g3, MAX_HASHCNT	        
-#else                
-	cmp	%g3, DEFAULT_MAX_HASHCNT	/* no 32/256M kernel pages */
-#endif	
+	set	mmu_hashcnt, %g4
+	lduw	[%g4], %g4
+	cmp	%g3, %g4
 	ble	1b
 	nop
 
--- a/usr/src/uts/sun4/io/trapstat.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4/io/trapstat.c	Tue May 16 16:05:21 2006 -0700
@@ -1712,8 +1712,25 @@
 		break;
 
 	case CPU_UNCONFIG:
-		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED)
+		if (tcpu->tcpu_flags & TSTAT_CPU_ENABLED) {
 			tcpu->tcpu_flags &= ~TSTAT_CPU_ENABLED;
+#ifdef	sun4v
+			/*
+			 * A power-off, causes the cpu mondo queues to be
+			 * unconfigured on sun4v. Since we can't teardown
+			 * trapstat's mappings on the cpu that is going away,
+			 * we simply mark it as not allocated. This will
+			 * prevent a teardown on a cpu with the same cpu id
+			 * that might have been added while trapstat is running.
+			 */
+			if (tcpu->tcpu_flags & TSTAT_CPU_ALLOCATED) {
+				tcpu->tcpu_pfn = NULL;
+				tcpu->tcpu_instr = NULL;
+				tcpu->tcpu_data = NULL;
+				tcpu->tcpu_flags &= ~TSTAT_CPU_ALLOCATED;
+			}
+#endif
+		}
 		break;
 
 	default:
--- a/usr/src/uts/sun4/os/ddi_impl.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4/os/ddi_impl.c	Tue May 16 16:05:21 2006 -0700
@@ -53,6 +53,7 @@
 #include <sys/fs/dv_node.h>
 #include <sys/fs/snode.h>
 #include <sys/ddi_isa.h>
+#include <sys/modhash.h>
 
 dev_info_t *get_intr_parent(dev_info_t *, dev_info_t *,
     ddi_intr_handle_impl_t *);
@@ -1968,3 +1969,831 @@
 
 	return (err);
 }
+
+/*
+ * Platform independent DR routines
+ */
+
+static int
+ndi2errno(int n)
+{
+	int err = 0;
+
+	switch (n) {
+		case NDI_NOMEM:
+			err = ENOMEM;
+			break;
+		case NDI_BUSY:
+			err = EBUSY;
+			break;
+		case NDI_FAULT:
+			err = EFAULT;
+			break;
+		case NDI_FAILURE:
+			err = EIO;
+			break;
+		case NDI_SUCCESS:
+			break;
+		case NDI_BADHANDLE:
+		default:
+			err = EINVAL;
+			break;
+	}
+	return (err);
+}
+
+/*
+ * Prom tree node list
+ */
+struct ptnode {
+	pnode_t		nodeid;
+	struct ptnode	*next;
+};
+
+/*
+ * Prom tree walk arg
+ */
+struct pta {
+	dev_info_t	*pdip;
+	devi_branch_t	*bp;
+	uint_t		flags;
+	dev_info_t	*fdip;
+	struct ptnode	*head;
+};
+
+static void
+visit_node(pnode_t nodeid, struct pta *ap)
+{
+	struct ptnode	**nextp;
+	int		(*select)(pnode_t, void *, uint_t);
+
+	ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE);
+
+	select = ap->bp->create.prom_branch_select;
+
+	ASSERT(select);
+
+	if (select(nodeid, ap->bp->arg, 0) == DDI_SUCCESS) {
+
+		for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next)
+			;
+
+		*nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP);
+
+		(*nextp)->nodeid = nodeid;
+	}
+
+	if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD)
+		return;
+
+	nodeid = prom_childnode(nodeid);
+	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
+		visit_node(nodeid, ap);
+		nodeid = prom_nextnode(nodeid);
+	}
+}
+
+/*ARGSUSED*/
+static int
+set_dip_offline(dev_info_t *dip, void *arg)
+{
+	ASSERT(dip);
+
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	if (!DEVI_IS_DEVICE_OFFLINE(dip))
+		DEVI_SET_DEVICE_OFFLINE(dip);
+	mutex_exit(&(DEVI(dip)->devi_lock));
+
+	return (DDI_WALK_CONTINUE);
+}
+
+/*ARGSUSED*/
+static int
+create_prom_branch(void *arg, int has_changed)
+{
+	int		circ, c;
+	int		exists, rv;
+	pnode_t		nodeid;
+	struct ptnode	*tnp;
+	dev_info_t	*dip;
+	struct pta	*ap = arg;
+	devi_branch_t	*bp;
+
+	ASSERT(ap);
+	ASSERT(ap->fdip == NULL);
+	ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip));
+
+	bp = ap->bp;
+
+	nodeid = ddi_get_nodeid(ap->pdip);
+	if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) {
+		cmn_err(CE_WARN, "create_prom_branch: invalid "
+		    "nodeid: 0x%x", nodeid);
+		return (EINVAL);
+	}
+
+	ap->head = NULL;
+
+	nodeid = prom_childnode(nodeid);
+	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
+		visit_node(nodeid, ap);
+		nodeid = prom_nextnode(nodeid);
+	}
+
+	if (ap->head == NULL)
+		return (ENODEV);
+
+	rv = 0;
+	while ((tnp = ap->head) != NULL) {
+		ap->head = tnp->next;
+
+		ndi_devi_enter(ap->pdip, &circ);
+
+		/*
+		 * Check if the branch already exists.
+		 */
+		exists = 0;
+		dip = e_ddi_nodeid_to_dip(tnp->nodeid);
+		if (dip != NULL) {
+			exists = 1;
+
+			/* Parent is held busy, so release hold */
+			ndi_rele_devi(dip);
+#ifdef	DEBUG
+			cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists"
+			    " for nodeid 0x%x", (void *)dip, tnp->nodeid);
+#endif
+		} else {
+			dip = i_ddi_create_branch(ap->pdip, tnp->nodeid);
+		}
+
+		kmem_free(tnp, sizeof (struct ptnode));
+
+		if (dip == NULL) {
+			ndi_devi_exit(ap->pdip, circ);
+			rv = EIO;
+			continue;
+		}
+
+		ASSERT(ddi_get_parent(dip) == ap->pdip);
+
+		/*
+		 * Hold the branch if it is not already held
+		 */
+		if (!exists)
+			e_ddi_branch_hold(dip);
+
+		ASSERT(e_ddi_branch_held(dip));
+
+		/*
+		 * Set all dips in the branch offline so that
+		 * only a "configure" operation can attach
+		 * the branch
+		 */
+		(void) set_dip_offline(dip, NULL);
+
+		ndi_devi_enter(dip, &c);
+		ddi_walk_devs(ddi_get_child(dip), set_dip_offline, NULL);
+		ndi_devi_exit(dip, c);
+
+		ndi_devi_exit(ap->pdip, circ);
+
+		if (ap->flags & DEVI_BRANCH_CONFIGURE) {
+			int error = e_ddi_branch_configure(dip, &ap->fdip, 0);
+			if (error && rv == 0)
+				rv = error;
+		}
+
+		/*
+		 * Invoke devi_branch_callback() (if it exists) only for
+		 * newly created branches
+		 */
+		if (bp->devi_branch_callback && !exists)
+			bp->devi_branch_callback(dip, bp->arg, 0);
+	}
+
+	return (rv);
+}
+
+static int
+sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp)
+{
+	int			rv, circ, len;
+	int			i, flags;
+	dev_info_t		*dip;
+	char			*nbuf;
+	static const char	*noname = "<none>";
+
+	ASSERT(pdip);
+	ASSERT(DEVI_BUSY_OWNED(pdip));
+
+	flags = 0;
+
+	/*
+	 * Creating the root of a branch ?
+	 */
+	if (rdipp) {
+		*rdipp = NULL;
+		flags = DEVI_BRANCH_ROOT;
+	}
+
+	ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip);
+	rv = bp->create.sid_branch_create(dip, bp->arg, flags);
+
+	nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP);
+
+	if (rv == DDI_WALK_ERROR) {
+		cmn_err(CE_WARN, "e_ddi_branch_create: Error setting"
+		    " properties on devinfo node %p",  (void *)dip);
+		goto fail;
+	}
+
+	len = OBP_MAXDRVNAME;
+	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len)
+	    != DDI_PROP_SUCCESS) {
+		cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has"
+		    "no name property", (void *)dip);
+		goto fail;
+	}
+
+	ASSERT(i_ddi_node_state(dip) == DS_PROTO);
+	if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) {
+		cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)"
+		    " for devinfo node %p", nbuf, (void *)dip);
+		goto fail;
+	}
+
+	kmem_free(nbuf, OBP_MAXDRVNAME);
+
+	/*
+	 * Ignore bind failures just like boot does
+	 */
+	(void) ndi_devi_bind_driver(dip, 0);
+
+	switch (rv) {
+	case DDI_WALK_CONTINUE:
+	case DDI_WALK_PRUNESIB:
+		ndi_devi_enter(dip, &circ);
+
+		i = DDI_WALK_CONTINUE;
+		for (; i == DDI_WALK_CONTINUE; ) {
+			i = sid_node_create(dip, bp, NULL);
+		}
+
+		ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB);
+		if (i == DDI_WALK_ERROR)
+			rv = i;
+		/*
+		 * If PRUNESIB stop creating siblings
+		 * of dip's child. Subsequent walk behavior
+		 * is determined by rv returned by dip.
+		 */
+
+		ndi_devi_exit(dip, circ);
+		break;
+	case DDI_WALK_TERMINATE:
+		/*
+		 * Don't create children and ask our parent
+		 * to not create siblings either.
+		 */
+		rv = DDI_WALK_PRUNESIB;
+		break;
+	case DDI_WALK_PRUNECHILD:
+		/*
+		 * Don't create children, but ask parent to continue
+		 * with siblings.
+		 */
+		rv = DDI_WALK_CONTINUE;
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+
+	if (rdipp)
+		*rdipp = dip;
+
+	/*
+	 * Set device offline - only the "configure" op should cause an attach
+	 */
+	(void) set_dip_offline(dip, NULL);
+
+	return (rv);
+fail:
+	(void) ndi_devi_free(dip);
+	kmem_free(nbuf, OBP_MAXDRVNAME);
+	return (DDI_WALK_ERROR);
+}
+
+static int
+create_sid_branch(
+	dev_info_t	*pdip,
+	devi_branch_t	*bp,
+	dev_info_t	**dipp,
+	uint_t		flags)
+{
+	int		rv = 0, state = DDI_WALK_CONTINUE;
+	dev_info_t	*rdip;
+
+	while (state == DDI_WALK_CONTINUE) {
+		int	circ;
+
+		ndi_devi_enter(pdip, &circ);
+
+		state = sid_node_create(pdip, bp, &rdip);
+		if (rdip == NULL) {
+			ndi_devi_exit(pdip, circ);
+			ASSERT(state == DDI_WALK_ERROR);
+			break;
+		}
+
+		e_ddi_branch_hold(rdip);
+
+		ndi_devi_exit(pdip, circ);
+
+		if (flags & DEVI_BRANCH_CONFIGURE) {
+			int error = e_ddi_branch_configure(rdip, dipp, 0);
+			if (error && rv == 0)
+				rv = error;
+		}
+
+		/*
+		 * devi_branch_callback() is optional
+		 */
+		if (bp->devi_branch_callback)
+			bp->devi_branch_callback(rdip, bp->arg, 0);
+	}
+
+	ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB);
+
+	return (state == DDI_WALK_ERROR ? EIO : rv);
+}
+
+int
+e_ddi_branch_create(
+	dev_info_t	*pdip,
+	devi_branch_t	*bp,
+	dev_info_t	**dipp,
+	uint_t		flags)
+{
+	int prom_devi, sid_devi, error;
+
+	if (pdip == NULL || bp == NULL || bp->type == 0)
+		return (EINVAL);
+
+	prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0;
+	sid_devi = (bp->type == DEVI_BRANCH_SID) ? 1 : 0;
+
+	if (prom_devi && bp->create.prom_branch_select == NULL)
+		return (EINVAL);
+	else if (sid_devi && bp->create.sid_branch_create == NULL)
+		return (EINVAL);
+	else if (!prom_devi && !sid_devi)
+		return (EINVAL);
+
+	if (flags & DEVI_BRANCH_EVENT)
+		return (EINVAL);
+
+	if (prom_devi) {
+		struct pta pta = {0};
+
+		pta.pdip = pdip;
+		pta.bp = bp;
+		pta.flags = flags;
+
+		error = prom_tree_access(create_prom_branch, &pta, NULL);
+
+		if (dipp)
+			*dipp = pta.fdip;
+		else if (pta.fdip)
+			ndi_rele_devi(pta.fdip);
+	} else {
+		error = create_sid_branch(pdip, bp, dipp, flags);
+	}
+
+	return (error);
+}
+
+int
+e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags)
+{
+	int		circ, rv;
+	char		*devnm;
+	dev_info_t	*pdip;
+
+	if (dipp)
+		*dipp = NULL;
+
+	if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT))
+		return (EINVAL);
+
+	pdip = ddi_get_parent(rdip);
+
+	ndi_devi_enter(pdip, &circ);
+
+	if (!e_ddi_branch_held(rdip)) {
+		ndi_devi_exit(pdip, circ);
+		cmn_err(CE_WARN, "e_ddi_branch_configure: "
+		    "dip(%p) not held", (void *)rdip);
+		return (EINVAL);
+	}
+
+	if (i_ddi_node_state(rdip) < DS_INITIALIZED) {
+		/*
+		 * First attempt to bind a driver. If we fail, return
+		 * success (On some platforms, dips for some device
+		 * types (CPUs) may not have a driver)
+		 */
+		if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) {
+			ndi_devi_exit(pdip, circ);
+			return (0);
+		}
+
+		if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) {
+			rv = NDI_FAILURE;
+			goto out;
+		}
+	}
+
+	ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED);
+
+	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
+
+	(void) ddi_deviname(rdip, devnm);
+
+	if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip,
+	    NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) {
+		/* release hold from ndi_devi_config_one() */
+		ndi_rele_devi(rdip);
+	}
+
+	kmem_free(devnm, MAXNAMELEN + 1);
+out:
+	if (rv != NDI_SUCCESS && dipp) {
+		ndi_hold_devi(rdip);
+		*dipp = rdip;
+	}
+	ndi_devi_exit(pdip, circ);
+	return (ndi2errno(rv));
+}
+
+void
+e_ddi_branch_hold(dev_info_t *rdip)
+{
+	if (e_ddi_branch_held(rdip)) {
+		cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held");
+		return;
+	}
+
+	mutex_enter(&DEVI(rdip)->devi_lock);
+	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) {
+		DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD;
+		DEVI(rdip)->devi_ref++;
+	}
+	ASSERT(DEVI(rdip)->devi_ref > 0);
+	mutex_exit(&DEVI(rdip)->devi_lock);
+}
+
+int
+e_ddi_branch_held(dev_info_t *rdip)
+{
+	int rv = 0;
+
+	mutex_enter(&DEVI(rdip)->devi_lock);
+	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) &&
+	    DEVI(rdip)->devi_ref > 0) {
+		rv = 1;
+	}
+	mutex_exit(&DEVI(rdip)->devi_lock);
+
+	return (rv);
+}
+void
+e_ddi_branch_rele(dev_info_t *rdip)
+{
+	mutex_enter(&DEVI(rdip)->devi_lock);
+	DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD;
+	DEVI(rdip)->devi_ref--;
+	mutex_exit(&DEVI(rdip)->devi_lock);
+}
+
+int
+e_ddi_branch_unconfigure(
+	dev_info_t *rdip,
+	dev_info_t **dipp,
+	uint_t flags)
+{
+	int	circ, rv;
+	int	destroy;
+	char	*devnm;
+	uint_t	nflags;
+	dev_info_t *pdip;
+
+	if (dipp)
+		*dipp = NULL;
+
+	if (rdip == NULL)
+		return (EINVAL);
+
+	pdip = ddi_get_parent(rdip);
+
+	ASSERT(pdip);
+
+	/*
+	 * Check if caller holds pdip busy - can cause deadlocks during
+	 * devfs_clean()
+	 */
+	if (DEVI_BUSY_OWNED(pdip)) {
+		cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent"
+		    " devinfo node(%p) is busy held", (void *)pdip);
+		return (EINVAL);
+	}
+
+	destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0;
+
+	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
+
+	ndi_devi_enter(pdip, &circ);
+	(void) ddi_deviname(rdip, devnm);
+	ndi_devi_exit(pdip, circ);
+
+	/*
+	 * ddi_deviname() returns a component name with / prepended.
+	 */
+	rv = devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE);
+	if (rv) {
+		kmem_free(devnm, MAXNAMELEN + 1);
+		return (rv);
+	}
+
+	ndi_devi_enter(pdip, &circ);
+
+	/*
+	 * Recreate device name as it may have changed state (init/uninit)
+	 * when parent busy lock was dropped for devfs_clean()
+	 */
+	(void) ddi_deviname(rdip, devnm);
+
+	if (!e_ddi_branch_held(rdip)) {
+		kmem_free(devnm, MAXNAMELEN + 1);
+		ndi_devi_exit(pdip, circ);
+		cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held",
+		    destroy ? "destroy" : "unconfigure", (void *)rdip);
+		return (EINVAL);
+	}
+
+	/*
+	 * Release hold on the branch. This is ok since we are holding the
+	 * parent busy. If rdip is not removed, we must do a hold on the
+	 * branch before returning.
+	 */
+	e_ddi_branch_rele(rdip);
+
+	nflags = NDI_DEVI_OFFLINE;
+	if (destroy || (flags & DEVI_BRANCH_DESTROY)) {
+		nflags |= NDI_DEVI_REMOVE;
+		destroy = 1;
+	} else {
+		nflags |= NDI_UNCONFIG;		/* uninit but don't remove */
+	}
+
+	if (flags & DEVI_BRANCH_EVENT)
+		nflags |= NDI_POST_EVENT;
+
+	if (i_ddi_devi_attached(pdip) &&
+	    (i_ddi_node_state(rdip) >= DS_INITIALIZED)) {
+		rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags);
+	} else {
+		rv = e_ddi_devi_unconfig(rdip, dipp, nflags);
+		if (rv == NDI_SUCCESS) {
+			ASSERT(!destroy || ddi_get_child(rdip) == NULL);
+			rv = ndi_devi_offline(rdip, nflags);
+		}
+	}
+
+	if (!destroy || rv != NDI_SUCCESS) {
+		/* The dip still exists, so do a hold */
+		e_ddi_branch_hold(rdip);
+	}
+out:
+	kmem_free(devnm, MAXNAMELEN + 1);
+	ndi_devi_exit(pdip, circ);
+	return (ndi2errno(rv));
+}
+
+int
+e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag)
+{
+	return (e_ddi_branch_unconfigure(rdip, dipp,
+	    flag|DEVI_BRANCH_DESTROY));
+}
+
+/*
+ * Number of chains for hash table
+ */
+#define	NUMCHAINS	17
+
+/*
+ * Devinfo busy arg
+ */
+struct devi_busy {
+	int dv_total;
+	int s_total;
+	mod_hash_t *dv_hash;
+	mod_hash_t *s_hash;
+	int (*callback)(dev_info_t *, void *, uint_t);
+	void *arg;
+};
+
+static int
+visit_dip(dev_info_t *dip, void *arg)
+{
+	uintptr_t sbusy, dvbusy, ref;
+	struct devi_busy *bsp = arg;
+
+	ASSERT(bsp->callback);
+
+	/*
+	 * A dip cannot be busy if its reference count is 0
+	 */
+	if ((ref = e_ddi_devi_holdcnt(dip)) == 0) {
+		return (bsp->callback(dip, bsp->arg, 0));
+	}
+
+	if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy))
+		dvbusy = 0;
+
+	/*
+	 * To catch device opens currently maintained on specfs common snodes.
+	 */
+	if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
+		sbusy = 0;
+
+#ifdef	DEBUG
+	if (ref < sbusy || ref < dvbusy) {
+		cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu "
+		    "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref);
+	}
+#endif
+
+	dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy;
+
+	return (bsp->callback(dip, bsp->arg, dvbusy));
+}
+
+static int
+visit_snode(struct snode *sp, void *arg)
+{
+	uintptr_t sbusy;
+	dev_info_t *dip;
+	int count;
+	struct devi_busy *bsp = arg;
+
+	ASSERT(sp);
+
+	/*
+	 * The stable lock is held. This prevents
+	 * the snode and its associated dip from
+	 * going away.
+	 */
+	dip = NULL;
+	count = spec_devi_open_count(sp, &dip);
+
+	if (count <= 0)
+		return (DDI_WALK_CONTINUE);
+
+	ASSERT(dip);
+
+	if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
+		sbusy = count;
+	else
+		sbusy += count;
+
+	if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) {
+		cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, "
+		    "sbusy = %lu", "e_ddi_branch_referenced",
+		    (void *)dip, sbusy);
+	}
+
+	bsp->s_total += count;
+
+	return (DDI_WALK_CONTINUE);
+}
+
+static void
+visit_dvnode(struct dv_node *dv, void *arg)
+{
+	uintptr_t dvbusy;
+	uint_t count;
+	struct vnode *vp;
+	struct devi_busy *bsp = arg;
+
+	ASSERT(dv && dv->dv_devi);
+
+	vp = DVTOV(dv);
+
+	mutex_enter(&vp->v_lock);
+	count = vp->v_count;
+	mutex_exit(&vp->v_lock);
+
+	if (!count)
+		return;
+
+	if (mod_hash_remove(bsp->dv_hash, dv->dv_devi,
+	    (mod_hash_val_t *)&dvbusy))
+		dvbusy = count;
+	else
+		dvbusy += count;
+
+	if (mod_hash_insert(bsp->dv_hash, dv->dv_devi,
+	    (mod_hash_val_t)dvbusy)) {
+		cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, "
+		    "dvbusy=%lu", "e_ddi_branch_referenced",
+		    (void *)dv->dv_devi, dvbusy);
+	}
+
+	bsp->dv_total += count;
+}
+
+/*
+ * Returns reference count on success or -1 on failure.
+ */
+int
+e_ddi_branch_referenced(
+	dev_info_t *rdip,
+	int (*callback)(dev_info_t *dip, void *arg, uint_t ref),
+	void *arg)
+{
+	int circ;
+	char *path;
+	dev_info_t *pdip;
+	struct devi_busy bsa = {0};
+
+	ASSERT(rdip);
+
+	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+	ndi_hold_devi(rdip);
+
+	pdip = ddi_get_parent(rdip);
+
+	ASSERT(pdip);
+
+	/*
+	 * Check if caller holds pdip busy - can cause deadlocks during
+	 * devfs_walk()
+	 */
+	if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) {
+		cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: "
+		    "devinfo branch(%p) not held or parent busy held",
+		    (void *)rdip);
+		ndi_rele_devi(rdip);
+		kmem_free(path, MAXPATHLEN);
+		return (-1);
+	}
+
+	ndi_devi_enter(pdip, &circ);
+	(void) ddi_pathname(rdip, path);
+	ndi_devi_exit(pdip, circ);
+
+	bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS,
+	    mod_hash_null_valdtor, sizeof (struct dev_info));
+
+	bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS,
+	    mod_hash_null_valdtor, sizeof (struct snode));
+
+	if (devfs_walk(path, visit_dvnode, &bsa)) {
+		cmn_err(CE_WARN, "e_ddi_branch_referenced: "
+		    "devfs walk failed for: %s", path);
+		kmem_free(path, MAXPATHLEN);
+		bsa.s_total = bsa.dv_total = -1;
+		goto out;
+	}
+
+	kmem_free(path, MAXPATHLEN);
+
+	/*
+	 * Walk the snode table to detect device opens, which are currently
+	 * maintained on specfs common snodes.
+	 */
+	spec_snode_walk(visit_snode, &bsa);
+
+	if (callback == NULL)
+		goto out;
+
+	bsa.callback = callback;
+	bsa.arg = arg;
+
+	if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) {
+		ndi_devi_enter(rdip, &circ);
+		ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa);
+		ndi_devi_exit(rdip, circ);
+	}
+
+out:
+	ndi_rele_devi(rdip);
+	mod_hash_destroy_ptrhash(bsa.s_hash);
+	mod_hash_destroy_ptrhash(bsa.dv_hash);
+	return (bsa.s_total > bsa.dv_total ? bsa.s_total : bsa.dv_total);
+}
--- a/usr/src/uts/sun4/os/mlsetup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4/os/mlsetup.c	Tue May 16 16:05:21 2006 -0700
@@ -71,6 +71,7 @@
  */
 extern void map_wellknown_devices(void);
 extern void hsvc_setup(void);
+extern void mach_descrip_startup_init(void);
 
 int	dcache_size;
 int	dcache_linesize;
@@ -242,6 +243,13 @@
 	ctlp->d.limit = TRAP_TSIZE;		/* XXX dynamic someday */
 	ctlp->d.paddr_base = va_to_pa(trap_tr0);
 #endif /* TRAPTRACE */
+
+	/*
+	 * Initialize the Machine Description kernel framework
+	 */
+
+	mach_descrip_startup_init();
+
 	/*
 	 * initialize HV trap trace buffer for the boot cpu
 	 */
--- a/usr/src/uts/sun4/os/mp_startup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4/os/mp_startup.c	Tue May 16 16:05:21 2006 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -45,6 +46,7 @@
 #include <sys/cpu_sgnblk_defs.h>
 
 extern void cpu_intrq_setup(struct cpu *);
+extern void cpu_intrq_cleanup(struct cpu *);
 extern void cpu_intrq_register(struct cpu *);
 
 struct cpu	*cpus;	/* pointer to other cpus; dynamically allocate */
@@ -469,6 +471,11 @@
 	cleanup_intr_pool(cp);
 
 	/*
+	 * Clean any machine specific interrupt states.
+	 */
+	cpu_intrq_cleanup(cp);
+
+	/*
 	 * At this point, the only threads bound to this CPU should be
 	 * special per-cpu threads: it's idle thread, it's pause thread,
 	 * and it's interrupt threads.  Clean these up.
--- a/usr/src/uts/sun4/os/startup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4/os/startup.c	Tue May 16 16:05:21 2006 -0700
@@ -71,16 +71,21 @@
 extern void contig_mem_init(void);
 extern void mach_dump_buffer_init(void);
 extern void mach_descrip_init(void);
+extern void mach_descrip_startup_fini(void);
 extern void mach_memscrub(void);
 extern void mach_fpras(void);
 extern void mach_cpu_halt_idle(void);
 extern void mach_hw_copy_limit(void);
+extern void load_mach_drivers(void);
 extern void load_tod_module(void);
 #pragma weak load_tod_module
 
 extern int ndata_alloc_mmfsa(struct memlist *ndata);
 #pragma weak ndata_alloc_mmfsa
 
+extern void cif_init(void);
+#pragma weak cif_init
+
 extern void parse_idprom(void);
 extern void add_vx_handler(char *, int, void (*)(cell_t *));
 extern void mem_config_init(void);
@@ -1748,6 +1753,13 @@
 	extern int bop_io_quiesced;
 
 	/*
+	 * Destroy the MD initialized at startup
+	 * The startup initializes the MD framework
+	 * using prom and BOP alloc free it now.
+	 */
+	mach_descrip_startup_fini();
+
+	/*
 	 * Call back into boot and release boots resources.
 	 */
 	BOP_QUIESCE_IO(bootops);
@@ -2198,6 +2210,10 @@
 	 */
 	(void) modload("fs", "procfs");
 
+	/* load machine class specific drivers */
+	load_mach_drivers();
+
+	/* load platform specific drivers */
 	if (&load_platform_drivers)
 		load_platform_drivers();
 
@@ -2214,6 +2230,9 @@
 #ifdef	PTL1_PANIC_DEBUG
 	init_ptl1_thread();
 #endif	/* PTL1_PANIC_DEBUG */
+
+	if (&cif_init)
+		cif_init();
 }
 
 #ifdef	PTL1_PANIC_DEBUG
--- a/usr/src/uts/sun4u/os/mach_ddi_impl.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4u/os/mach_ddi_impl.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -35,7 +35,6 @@
 #include <sys/ethernet.h>
 #include <sys/idprom.h>
 #include <sys/machsystm.h>
-#include <sys/modhash.h>
 #include <sys/promif.h>
 #include <sys/prom_plat.h>
 #include <sys/sunndi.h>
@@ -397,831 +396,3 @@
 	}
 	return (DDI_FAILURE);
 }
-
-/*
- * Platform independent DR routines
- */
-
-static int
-ndi2errno(int n)
-{
-	int err = 0;
-
-	switch (n) {
-		case NDI_NOMEM:
-			err = ENOMEM;
-			break;
-		case NDI_BUSY:
-			err = EBUSY;
-			break;
-		case NDI_FAULT:
-			err = EFAULT;
-			break;
-		case NDI_FAILURE:
-			err = EIO;
-			break;
-		case NDI_SUCCESS:
-			break;
-		case NDI_BADHANDLE:
-		default:
-			err = EINVAL;
-			break;
-	}
-	return (err);
-}
-
-/*
- * Prom tree node list
- */
-struct ptnode {
-	pnode_t		nodeid;
-	struct ptnode	*next;
-};
-
-/*
- * Prom tree walk arg
- */
-struct pta {
-	dev_info_t	*pdip;
-	devi_branch_t	*bp;
-	uint_t		flags;
-	dev_info_t	*fdip;
-	struct ptnode	*head;
-};
-
-static void
-visit_node(pnode_t nodeid, struct pta *ap)
-{
-	struct ptnode	**nextp;
-	int		(*select)(pnode_t, void *, uint_t);
-
-	ASSERT(nodeid != OBP_NONODE && nodeid != OBP_BADNODE);
-
-	select = ap->bp->create.prom_branch_select;
-
-	ASSERT(select);
-
-	if (select(nodeid, ap->bp->arg, 0) == DDI_SUCCESS) {
-
-		for (nextp = &ap->head; *nextp; nextp = &(*nextp)->next)
-			;
-
-		*nextp = kmem_zalloc(sizeof (struct ptnode), KM_SLEEP);
-
-		(*nextp)->nodeid = nodeid;
-	}
-
-	if ((ap->flags & DEVI_BRANCH_CHILD) == DEVI_BRANCH_CHILD)
-		return;
-
-	nodeid = prom_childnode(nodeid);
-	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
-		visit_node(nodeid, ap);
-		nodeid = prom_nextnode(nodeid);
-	}
-}
-
-/*ARGSUSED*/
-static int
-set_dip_offline(dev_info_t *dip, void *arg)
-{
-	ASSERT(dip);
-
-	mutex_enter(&(DEVI(dip)->devi_lock));
-	if (!DEVI_IS_DEVICE_OFFLINE(dip))
-		DEVI_SET_DEVICE_OFFLINE(dip);
-	mutex_exit(&(DEVI(dip)->devi_lock));
-
-	return (DDI_WALK_CONTINUE);
-}
-
-/*ARGSUSED*/
-static int
-create_prom_branch(void *arg, int has_changed)
-{
-	int		circ, c;
-	int		exists, rv;
-	pnode_t		nodeid;
-	struct ptnode	*tnp;
-	dev_info_t	*dip;
-	struct pta	*ap = arg;
-	devi_branch_t	*bp;
-
-	ASSERT(ap);
-	ASSERT(ap->fdip == NULL);
-	ASSERT(ap->pdip && ndi_dev_is_prom_node(ap->pdip));
-
-	bp = ap->bp;
-
-	nodeid = ddi_get_nodeid(ap->pdip);
-	if (nodeid == OBP_NONODE || nodeid == OBP_BADNODE) {
-		cmn_err(CE_WARN, "create_prom_branch: invalid "
-		    "nodeid: 0x%x", nodeid);
-		return (EINVAL);
-	}
-
-	ap->head = NULL;
-
-	nodeid = prom_childnode(nodeid);
-	while (nodeid != OBP_NONODE && nodeid != OBP_BADNODE) {
-		visit_node(nodeid, ap);
-		nodeid = prom_nextnode(nodeid);
-	}
-
-	if (ap->head == NULL)
-		return (ENODEV);
-
-	rv = 0;
-	while ((tnp = ap->head) != NULL) {
-		ap->head = tnp->next;
-
-		ndi_devi_enter(ap->pdip, &circ);
-
-		/*
-		 * Check if the branch already exists.
-		 */
-		exists = 0;
-		dip = e_ddi_nodeid_to_dip(tnp->nodeid);
-		if (dip != NULL) {
-			exists = 1;
-
-			/* Parent is held busy, so release hold */
-			ndi_rele_devi(dip);
-#ifdef	DEBUG
-			cmn_err(CE_WARN, "create_prom_branch: dip(%p) exists"
-			    " for nodeid 0x%x", (void *)dip, tnp->nodeid);
-#endif
-		} else {
-			dip = i_ddi_create_branch(ap->pdip, tnp->nodeid);
-		}
-
-		kmem_free(tnp, sizeof (struct ptnode));
-
-		if (dip == NULL) {
-			ndi_devi_exit(ap->pdip, circ);
-			rv = EIO;
-			continue;
-		}
-
-		ASSERT(ddi_get_parent(dip) == ap->pdip);
-
-		/*
-		 * Hold the branch if it is not already held
-		 */
-		if (!exists)
-			e_ddi_branch_hold(dip);
-
-		ASSERT(e_ddi_branch_held(dip));
-
-		/*
-		 * Set all dips in the branch offline so that
-		 * only a "configure" operation can attach
-		 * the branch
-		 */
-		(void) set_dip_offline(dip, NULL);
-
-		ndi_devi_enter(dip, &c);
-		ddi_walk_devs(ddi_get_child(dip), set_dip_offline, NULL);
-		ndi_devi_exit(dip, c);
-
-		ndi_devi_exit(ap->pdip, circ);
-
-		if (ap->flags & DEVI_BRANCH_CONFIGURE) {
-			int error = e_ddi_branch_configure(dip, &ap->fdip, 0);
-			if (error && rv == 0)
-				rv = error;
-		}
-
-		/*
-		 * Invoke devi_branch_callback() (if it exists) only for
-		 * newly created branches
-		 */
-		if (bp->devi_branch_callback && !exists)
-			bp->devi_branch_callback(dip, bp->arg, 0);
-	}
-
-	return (rv);
-}
-
-static int
-sid_node_create(dev_info_t *pdip, devi_branch_t *bp, dev_info_t **rdipp)
-{
-	int			rv, circ, len;
-	int			i, flags;
-	dev_info_t		*dip;
-	char			*nbuf;
-	static const char	*noname = "<none>";
-
-	ASSERT(pdip);
-	ASSERT(DEVI_BUSY_OWNED(pdip));
-
-	flags = 0;
-
-	/*
-	 * Creating the root of a branch ?
-	 */
-	if (rdipp) {
-		*rdipp = NULL;
-		flags = DEVI_BRANCH_ROOT;
-	}
-
-	ndi_devi_alloc_sleep(pdip, (char *)noname, DEVI_SID_NODEID, &dip);
-	rv = bp->create.sid_branch_create(dip, bp->arg, flags);
-
-	nbuf = kmem_alloc(OBP_MAXDRVNAME, KM_SLEEP);
-
-	if (rv == DDI_WALK_ERROR) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: Error setting"
-		    " properties on devinfo node %p",  (void *)dip);
-		goto fail;
-	}
-
-	len = OBP_MAXDRVNAME;
-	if (ddi_getlongprop_buf(DDI_DEV_T_ANY, dip,
-	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, "name", nbuf, &len)
-	    != DDI_PROP_SUCCESS) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: devinfo node %p has"
-		    "no name property", (void *)dip);
-		goto fail;
-	}
-
-	ASSERT(i_ddi_node_state(dip) == DS_PROTO);
-	if (ndi_devi_set_nodename(dip, nbuf, 0) != NDI_SUCCESS) {
-		cmn_err(CE_WARN, "e_ddi_branch_create: cannot set name (%s)"
-		    " for devinfo node %p", nbuf, (void *)dip);
-		goto fail;
-	}
-
-	kmem_free(nbuf, OBP_MAXDRVNAME);
-
-	/*
-	 * Ignore bind failures just like boot does
-	 */
-	(void) ndi_devi_bind_driver(dip, 0);
-
-	switch (rv) {
-	case DDI_WALK_CONTINUE:
-	case DDI_WALK_PRUNESIB:
-		ndi_devi_enter(dip, &circ);
-
-		i = DDI_WALK_CONTINUE;
-		for (; i == DDI_WALK_CONTINUE; ) {
-			i = sid_node_create(dip, bp, NULL);
-		}
-
-		ASSERT(i == DDI_WALK_ERROR || i == DDI_WALK_PRUNESIB);
-		if (i == DDI_WALK_ERROR)
-			rv = i;
-		/*
-		 * If PRUNESIB stop creating siblings
-		 * of dip's child. Subsequent walk behavior
-		 * is determined by rv returned by dip.
-		 */
-
-		ndi_devi_exit(dip, circ);
-		break;
-	case DDI_WALK_TERMINATE:
-		/*
-		 * Don't create children and ask our parent
-		 * to not create siblings either.
-		 */
-		rv = DDI_WALK_PRUNESIB;
-		break;
-	case DDI_WALK_PRUNECHILD:
-		/*
-		 * Don't create children, but ask parent to continue
-		 * with siblings.
-		 */
-		rv = DDI_WALK_CONTINUE;
-		break;
-	default:
-		ASSERT(0);
-		break;
-	}
-
-	if (rdipp)
-		*rdipp = dip;
-
-	/*
-	 * Set device offline - only the "configure" op should cause an attach
-	 */
-	(void) set_dip_offline(dip, NULL);
-
-	return (rv);
-fail:
-	(void) ndi_devi_free(dip);
-	kmem_free(nbuf, OBP_MAXDRVNAME);
-	return (DDI_WALK_ERROR);
-}
-
-static int
-create_sid_branch(
-	dev_info_t	*pdip,
-	devi_branch_t	*bp,
-	dev_info_t	**dipp,
-	uint_t		flags)
-{
-	int		rv = 0, state = DDI_WALK_CONTINUE;
-	dev_info_t	*rdip;
-
-	while (state == DDI_WALK_CONTINUE) {
-		int	circ;
-
-		ndi_devi_enter(pdip, &circ);
-
-		state = sid_node_create(pdip, bp, &rdip);
-		if (rdip == NULL) {
-			ndi_devi_exit(pdip, circ);
-			ASSERT(state == DDI_WALK_ERROR);
-			break;
-		}
-
-		e_ddi_branch_hold(rdip);
-
-		ndi_devi_exit(pdip, circ);
-
-		if (flags & DEVI_BRANCH_CONFIGURE) {
-			int error = e_ddi_branch_configure(rdip, dipp, 0);
-			if (error && rv == 0)
-				rv = error;
-		}
-
-		/*
-		 * devi_branch_callback() is optional
-		 */
-		if (bp->devi_branch_callback)
-			bp->devi_branch_callback(rdip, bp->arg, 0);
-	}
-
-	ASSERT(state == DDI_WALK_ERROR || state == DDI_WALK_PRUNESIB);
-
-	return (state == DDI_WALK_ERROR ? EIO : rv);
-}
-
-int
-e_ddi_branch_create(
-	dev_info_t	*pdip,
-	devi_branch_t	*bp,
-	dev_info_t	**dipp,
-	uint_t		flags)
-{
-	int prom_devi, sid_devi, error;
-
-	if (pdip == NULL || bp == NULL || bp->type == 0)
-		return (EINVAL);
-
-	prom_devi = (bp->type == DEVI_BRANCH_PROM) ? 1 : 0;
-	sid_devi = (bp->type == DEVI_BRANCH_SID) ? 1 : 0;
-
-	if (prom_devi && bp->create.prom_branch_select == NULL)
-		return (EINVAL);
-	else if (sid_devi && bp->create.sid_branch_create == NULL)
-		return (EINVAL);
-	else if (!prom_devi && !sid_devi)
-		return (EINVAL);
-
-	if (flags & DEVI_BRANCH_EVENT)
-		return (EINVAL);
-
-	if (prom_devi) {
-		struct pta pta = {0};
-
-		pta.pdip = pdip;
-		pta.bp = bp;
-		pta.flags = flags;
-
-		error = prom_tree_access(create_prom_branch, &pta, NULL);
-
-		if (dipp)
-			*dipp = pta.fdip;
-		else if (pta.fdip)
-			ndi_rele_devi(pta.fdip);
-	} else {
-		error = create_sid_branch(pdip, bp, dipp, flags);
-	}
-
-	return (error);
-}
-
-int
-e_ddi_branch_configure(dev_info_t *rdip, dev_info_t **dipp, uint_t flags)
-{
-	int		circ, rv;
-	char		*devnm;
-	dev_info_t	*pdip;
-
-	if (dipp)
-		*dipp = NULL;
-
-	if (rdip == NULL || flags != 0 || (flags & DEVI_BRANCH_EVENT))
-		return (EINVAL);
-
-	pdip = ddi_get_parent(rdip);
-
-	ndi_devi_enter(pdip, &circ);
-
-	if (!e_ddi_branch_held(rdip)) {
-		ndi_devi_exit(pdip, circ);
-		cmn_err(CE_WARN, "e_ddi_branch_configure: "
-		    "dip(%p) not held", (void *)rdip);
-		return (EINVAL);
-	}
-
-	if (i_ddi_node_state(rdip) < DS_INITIALIZED) {
-		/*
-		 * First attempt to bind a driver. If we fail, return
-		 * success (On some platforms, dips for some device
-		 * types (CPUs) may not have a driver)
-		 */
-		if (ndi_devi_bind_driver(rdip, 0) != NDI_SUCCESS) {
-			ndi_devi_exit(pdip, circ);
-			return (0);
-		}
-
-		if (ddi_initchild(pdip, rdip) != DDI_SUCCESS) {
-			rv = NDI_FAILURE;
-			goto out;
-		}
-	}
-
-	ASSERT(i_ddi_node_state(rdip) >= DS_INITIALIZED);
-
-	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
-
-	(void) ddi_deviname(rdip, devnm);
-
-	if ((rv = ndi_devi_config_one(pdip, devnm+1, &rdip,
-	    NDI_DEVI_ONLINE | NDI_CONFIG)) == NDI_SUCCESS) {
-		/* release hold from ndi_devi_config_one() */
-		ndi_rele_devi(rdip);
-	}
-
-	kmem_free(devnm, MAXNAMELEN + 1);
-out:
-	if (rv != NDI_SUCCESS && dipp) {
-		ndi_hold_devi(rdip);
-		*dipp = rdip;
-	}
-	ndi_devi_exit(pdip, circ);
-	return (ndi2errno(rv));
-}
-
-void
-e_ddi_branch_hold(dev_info_t *rdip)
-{
-	if (e_ddi_branch_held(rdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_hold: branch already held");
-		return;
-	}
-
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) == 0) {
-		DEVI(rdip)->devi_flags |= DEVI_BRANCH_HELD;
-		DEVI(rdip)->devi_ref++;
-	}
-	ASSERT(DEVI(rdip)->devi_ref > 0);
-	mutex_exit(&DEVI(rdip)->devi_lock);
-}
-
-int
-e_ddi_branch_held(dev_info_t *rdip)
-{
-	int rv = 0;
-
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	if ((DEVI(rdip)->devi_flags & DEVI_BRANCH_HELD) &&
-	    DEVI(rdip)->devi_ref > 0) {
-		rv = 1;
-	}
-	mutex_exit(&DEVI(rdip)->devi_lock);
-
-	return (rv);
-}
-void
-e_ddi_branch_rele(dev_info_t *rdip)
-{
-	mutex_enter(&DEVI(rdip)->devi_lock);
-	DEVI(rdip)->devi_flags &= ~DEVI_BRANCH_HELD;
-	DEVI(rdip)->devi_ref--;
-	mutex_exit(&DEVI(rdip)->devi_lock);
-}
-
-int
-e_ddi_branch_unconfigure(
-	dev_info_t *rdip,
-	dev_info_t **dipp,
-	uint_t flags)
-{
-	int	circ, rv;
-	int	destroy;
-	char	*devnm;
-	uint_t	nflags;
-	dev_info_t *pdip;
-
-	if (dipp)
-		*dipp = NULL;
-
-	if (rdip == NULL)
-		return (EINVAL);
-
-	pdip = ddi_get_parent(rdip);
-
-	ASSERT(pdip);
-
-	/*
-	 * Check if caller holds pdip busy - can cause deadlocks during
-	 * devfs_clean()
-	 */
-	if (DEVI_BUSY_OWNED(pdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_unconfigure: failed: parent"
-		    " devinfo node(%p) is busy held", (void *)pdip);
-		return (EINVAL);
-	}
-
-	destroy = (flags & DEVI_BRANCH_DESTROY) ? 1 : 0;
-
-	devnm = kmem_alloc(MAXNAMELEN + 1, KM_SLEEP);
-
-	ndi_devi_enter(pdip, &circ);
-	(void) ddi_deviname(rdip, devnm);
-	ndi_devi_exit(pdip, circ);
-
-	/*
-	 * ddi_deviname() returns a component name with / prepended.
-	 */
-	rv = devfs_clean(pdip, devnm + 1, DV_CLEAN_FORCE);
-	if (rv) {
-		kmem_free(devnm, MAXNAMELEN + 1);
-		return (rv);
-	}
-
-	ndi_devi_enter(pdip, &circ);
-
-	/*
-	 * Recreate device name as it may have changed state (init/uninit)
-	 * when parent busy lock was dropped for devfs_clean()
-	 */
-	(void) ddi_deviname(rdip, devnm);
-
-	if (!e_ddi_branch_held(rdip)) {
-		kmem_free(devnm, MAXNAMELEN + 1);
-		ndi_devi_exit(pdip, circ);
-		cmn_err(CE_WARN, "e_ddi_%s_branch: dip(%p) not held",
-		    destroy ? "destroy" : "unconfigure", (void *)rdip);
-		return (EINVAL);
-	}
-
-	/*
-	 * Release hold on the branch. This is ok since we are holding the
-	 * parent busy. If rdip is not removed, we must do a hold on the
-	 * branch before returning.
-	 */
-	e_ddi_branch_rele(rdip);
-
-	nflags = NDI_DEVI_OFFLINE;
-	if (destroy || (flags & DEVI_BRANCH_DESTROY)) {
-		nflags |= NDI_DEVI_REMOVE;
-		destroy = 1;
-	} else {
-		nflags |= NDI_UNCONFIG;		/* uninit but don't remove */
-	}
-
-	if (flags & DEVI_BRANCH_EVENT)
-		nflags |= NDI_POST_EVENT;
-
-	if (i_ddi_devi_attached(pdip) &&
-	    (i_ddi_node_state(rdip) >= DS_INITIALIZED)) {
-		rv = ndi_devi_unconfig_one(pdip, devnm+1, dipp, nflags);
-	} else {
-		rv = e_ddi_devi_unconfig(rdip, dipp, nflags);
-		if (rv == NDI_SUCCESS) {
-			ASSERT(!destroy || ddi_get_child(rdip) == NULL);
-			rv = ndi_devi_offline(rdip, nflags);
-		}
-	}
-
-	if (!destroy || rv != NDI_SUCCESS) {
-		/* The dip still exists, so do a hold */
-		e_ddi_branch_hold(rdip);
-	}
-out:
-	kmem_free(devnm, MAXNAMELEN + 1);
-	ndi_devi_exit(pdip, circ);
-	return (ndi2errno(rv));
-}
-
-int
-e_ddi_branch_destroy(dev_info_t *rdip, dev_info_t **dipp, uint_t flag)
-{
-	return (e_ddi_branch_unconfigure(rdip, dipp,
-	    flag|DEVI_BRANCH_DESTROY));
-}
-
-/*
- * Number of chains for hash table
- */
-#define	NUMCHAINS	17
-
-/*
- * Devinfo busy arg
- */
-struct devi_busy {
-	int dv_total;
-	int s_total;
-	mod_hash_t *dv_hash;
-	mod_hash_t *s_hash;
-	int (*callback)(dev_info_t *, void *, uint_t);
-	void *arg;
-};
-
-static int
-visit_dip(dev_info_t *dip, void *arg)
-{
-	uintptr_t sbusy, dvbusy, ref;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(bsp->callback);
-
-	/*
-	 * A dip cannot be busy if its reference count is 0
-	 */
-	if ((ref = e_ddi_devi_holdcnt(dip)) == 0) {
-		return (bsp->callback(dip, bsp->arg, 0));
-	}
-
-	if (mod_hash_find(bsp->dv_hash, dip, (mod_hash_val_t *)&dvbusy))
-		dvbusy = 0;
-
-	/*
-	 * To catch device opens currently maintained on specfs common snodes.
-	 */
-	if (mod_hash_find(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
-		sbusy = 0;
-
-#ifdef	DEBUG
-	if (ref < sbusy || ref < dvbusy) {
-		cmn_err(CE_WARN, "dip(%p): sopen = %lu, dvopen = %lu "
-		    "dip ref = %lu\n", (void *)dip, sbusy, dvbusy, ref);
-	}
-#endif
-
-	dvbusy = (sbusy > dvbusy) ? sbusy : dvbusy;
-
-	return (bsp->callback(dip, bsp->arg, dvbusy));
-}
-
-static int
-visit_snode(struct snode *sp, void *arg)
-{
-	uintptr_t sbusy;
-	dev_info_t *dip;
-	int count;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(sp);
-
-	/*
-	 * The stable lock is held. This prevents
-	 * the snode and its associated dip from
-	 * going away.
-	 */
-	dip = NULL;
-	count = spec_devi_open_count(sp, &dip);
-
-	if (count <= 0)
-		return (DDI_WALK_CONTINUE);
-
-	ASSERT(dip);
-
-	if (mod_hash_remove(bsp->s_hash, dip, (mod_hash_val_t *)&sbusy))
-		sbusy = count;
-	else
-		sbusy += count;
-
-	if (mod_hash_insert(bsp->s_hash, dip, (mod_hash_val_t)sbusy)) {
-		cmn_err(CE_WARN, "%s: s_hash insert failed: dip=0x%p, "
-		    "sbusy = %lu", "e_ddi_branch_referenced",
-		    (void *)dip, sbusy);
-	}
-
-	bsp->s_total += count;
-
-	return (DDI_WALK_CONTINUE);
-}
-
-static void
-visit_dvnode(struct dv_node *dv, void *arg)
-{
-	uintptr_t dvbusy;
-	uint_t count;
-	struct vnode *vp;
-	struct devi_busy *bsp = arg;
-
-	ASSERT(dv && dv->dv_devi);
-
-	vp = DVTOV(dv);
-
-	mutex_enter(&vp->v_lock);
-	count = vp->v_count;
-	mutex_exit(&vp->v_lock);
-
-	if (!count)
-		return;
-
-	if (mod_hash_remove(bsp->dv_hash, dv->dv_devi,
-	    (mod_hash_val_t *)&dvbusy))
-		dvbusy = count;
-	else
-		dvbusy += count;
-
-	if (mod_hash_insert(bsp->dv_hash, dv->dv_devi,
-	    (mod_hash_val_t)dvbusy)) {
-		cmn_err(CE_WARN, "%s: dv_hash insert failed: dip=0x%p, "
-		    "dvbusy=%lu", "e_ddi_branch_referenced",
-		    (void *)dv->dv_devi, dvbusy);
-	}
-
-	bsp->dv_total += count;
-}
-
-/*
- * Returns reference count on success or -1 on failure.
- */
-int
-e_ddi_branch_referenced(
-	dev_info_t *rdip,
-	int (*callback)(dev_info_t *dip, void *arg, uint_t ref),
-	void *arg)
-{
-	int circ;
-	char *path;
-	dev_info_t *pdip;
-	struct devi_busy bsa = {0};
-
-	ASSERT(rdip);
-
-	path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-
-	ndi_hold_devi(rdip);
-
-	pdip = ddi_get_parent(rdip);
-
-	ASSERT(pdip);
-
-	/*
-	 * Check if caller holds pdip busy - can cause deadlocks during
-	 * devfs_walk()
-	 */
-	if (!e_ddi_branch_held(rdip) || DEVI_BUSY_OWNED(pdip)) {
-		cmn_err(CE_WARN, "e_ddi_branch_referenced: failed: "
-		    "devinfo branch(%p) not held or parent busy held",
-		    (void *)rdip);
-		ndi_rele_devi(rdip);
-		kmem_free(path, MAXPATHLEN);
-		return (-1);
-	}
-
-	ndi_devi_enter(pdip, &circ);
-	(void) ddi_pathname(rdip, path);
-	ndi_devi_exit(pdip, circ);
-
-	bsa.dv_hash = mod_hash_create_ptrhash("dv_node busy hash", NUMCHAINS,
-	    mod_hash_null_valdtor, sizeof (struct dev_info));
-
-	bsa.s_hash = mod_hash_create_ptrhash("snode busy hash", NUMCHAINS,
-	    mod_hash_null_valdtor, sizeof (struct snode));
-
-	if (devfs_walk(path, visit_dvnode, &bsa)) {
-		cmn_err(CE_WARN, "e_ddi_branch_referenced: "
-		    "devfs walk failed for: %s", path);
-		kmem_free(path, MAXPATHLEN);
-		bsa.s_total = bsa.dv_total = -1;
-		goto out;
-	}
-
-	kmem_free(path, MAXPATHLEN);
-
-	/*
-	 * Walk the snode table to detect device opens, which are currently
-	 * maintained on specfs common snodes.
-	 */
-	spec_snode_walk(visit_snode, &bsa);
-
-	if (callback == NULL)
-		goto out;
-
-	bsa.callback = callback;
-	bsa.arg = arg;
-
-	if (visit_dip(rdip, &bsa) == DDI_WALK_CONTINUE) {
-		ndi_devi_enter(rdip, &circ);
-		ddi_walk_devs(ddi_get_child(rdip), visit_dip, &bsa);
-		ndi_devi_exit(rdip, circ);
-	}
-
-out:
-	ndi_rele_devi(rdip);
-	mod_hash_destroy_ptrhash(bsa.s_hash);
-	mod_hash_destroy_ptrhash(bsa.dv_hash);
-	return (bsa.s_total > bsa.dv_total ? bsa.s_total : bsa.dv_total);
-}
--- a/usr/src/uts/sun4u/os/mach_startup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4u/os/mach_startup.c	Tue May 16 16:05:21 2006 -0700
@@ -402,6 +402,13 @@
 
 /*ARGSUSED*/
 void
+cpu_intrq_cleanup(struct cpu *cp)
+{
+	/* Interrupt mondo queues not applicable to sun4u */
+}
+
+/*ARGSUSED*/
+void
 cpu_intrq_register(struct cpu *cp)
 {
 	/* Interrupt/error queues not applicable to sun4u */
@@ -429,9 +436,29 @@
 }
 
 void
+mach_descrip_startup_init(void)
+{
+	/*
+	 * Only for sun4v.
+	 * Initialize Machine description framework during startup.
+	 */
+}
+void
+mach_descrip_startup_fini(void)
+{
+	/*
+	 * Only for sun4v.
+	 * Clean up Machine Description framework during startup.
+	 */
+}
+
+void
 mach_descrip_init(void)
 {
-	/* Obtain Machine description - only for sun4v */
+	/*
+	 * Only for sun4v.
+	 * Initialize Machine description framework.
+	 */
 }
 
 void
@@ -440,6 +467,12 @@
 	/* Setup hypervisor services, not applicable to sun4u */
 }
 
+void
+load_mach_drivers(void)
+{
+	/* Currently no machine class (sun4u) specific drivers to load */
+}
+
 /*
  * Return true if the machine we're running on is a Positron.
  * (Positron is an unsupported developers platform.)
--- a/usr/src/uts/sun4v/Makefile.files	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/Makefile.files	Tue May 16 16:05:21 2006 -0700
@@ -34,6 +34,7 @@
 #	object lists
 #
 CORE_OBJS +=	bootops.o
+CORE_OBJS +=	prom_alloc.o
 CORE_OBJS +=	cmp.o
 CORE_OBJS +=	cpc_hwreg.o
 CORE_OBJS +=	cpc_subr.o
@@ -44,11 +45,13 @@
 CORE_OBJS +=	hat_sfmmu.o
 CORE_OBJS +=	hat_kdi.o
 CORE_OBJS +=	hsvc.o
+CORE_OBJS +=	lpad.o
 CORE_OBJS +=	mach_cpu_states.o
 CORE_OBJS +=	mach_ddi_impl.o
-CORE_OBJS +=    mach_descrip.o
+CORE_OBJS +=	mach_descrip.o
 CORE_OBJS +=	mach_mp_startup.o
 CORE_OBJS +=	mach_mp_states.o
+CORE_OBJS +=	mach_proc_init.o
 CORE_OBJS +=	mach_sfmmu.o
 CORE_OBJS +=	mach_startup.o
 CORE_OBJS +=	mach_subr_asm.o
@@ -59,13 +62,30 @@
 CORE_OBJS +=	mem_config.o
 CORE_OBJS +=	memlist_new.o
 CORE_OBJS +=	ppage.o
+CORE_OBJS +=	promif_asr.o
+CORE_OBJS +=	promif_cpu.o
+CORE_OBJS +=	promif_emul.o
+CORE_OBJS +=	promif_mon.o
+CORE_OBJS +=	promif_io.o
+CORE_OBJS +=	promif_interp.o
+CORE_OBJS +=	promif_key.o
+CORE_OBJS +=	promif_power_off.o
+CORE_OBJS +=	promif_prop.o
+CORE_OBJS +=	promif_node.o
+CORE_OBJS +=	promif_reboot.o
+CORE_OBJS +=	promif_stree.o
+CORE_OBJS +=	promif_test.o
+CORE_OBJS +=	promif_version.o
 CORE_OBJS +=	sfmmu_kdi.o
 CORE_OBJS +=	swtch.o
 CORE_OBJS +=	xhat_sfmmu.o
 
+CORE_OBJS +=	mdesc_diff.o
 CORE_OBJS +=	mdesc_findname.o
 CORE_OBJS +=	mdesc_findnodeprop.o
 CORE_OBJS +=	mdesc_fini.o
+CORE_OBJS +=	mdesc_getbinsize.o
+CORE_OBJS +=	mdesc_getgen.o
 CORE_OBJS +=	mdesc_getpropdata.o
 CORE_OBJS +=	mdesc_getpropstr.o
 CORE_OBJS +=	mdesc_getpropval.o
@@ -109,14 +129,26 @@
 #
 QCN_OBJS	= qcn.o
 VNEX_OBJS	= vnex.o
+CNEX_OBJS	= cnex.o
 GLVC_OBJS	= glvc.o glvc_hcall.o
 MDESC_OBJS	= mdesc.o
+LDC_OBJS	= ldc.o
+VLDC_OBJS	= vldc.o
+VCC_OBJS	= vcc.o
+VNET_OBJS	= vnet.o vnet_gen.o
+VSW_OBJS	= vsw.o
+VDC_OBJS	= vdc.o
+VDS_OBJS	= vds.o
 
 #
 #			Misc modules
 #
+BOOTDEV_OBJS	+= bootdev.o
+DR_CPU_OBJS	+= dr_cpu.o dr_util.o
+DS_OBJS		= ds.o
+FAULT_ISO_OBJS	= fault_iso.o
 OBPSYM_OBJS	+= obpsym.o obpsym_1275.o
-BOOTDEV_OBJS	+= bootdev.o
+PLATSVC_OBJS	= platsvc.o mdeg.o
 
 #
 #			Performance Counter BackEnd (PCBE) Modules
@@ -163,4 +195,3 @@
 #
 
 ARCFOUR_OBJS	+= arcfour.o arcfour_crypt.o
-
--- a/usr/src/uts/sun4v/Makefile.rules	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/Makefile.rules	Tue May 16 16:05:21 2006 -0700
@@ -62,6 +62,10 @@
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
 
+$(OBJS_DIR)/%.o:		$(UTSBASE)/sun4v/promif/%.c
+	$(COMPILE.c) -o $@ $<
+	$(CTFCONVERT_O)
+
 $(OBJS_DIR)/%.o:		$(UTSBASE)/sun4v/io/px/%.c
 	$(COMPILE.c) -o $@ $<
 	$(CTFCONVERT_O)
@@ -163,6 +167,9 @@
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/sun4v/pcbe/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
+$(LINTS_DIR)/%.ln:		$(UTSBASE)/sun4v/promif/%.c
+	@($(LHEAD) $(LINT.c) $< $(LTAIL))
+
 $(LINTS_DIR)/%.ln:		$(UTSBASE)/sun4v/vm/%.c
 	@($(LHEAD) $(LINT.c) $< $(LTAIL))
 
--- a/usr/src/uts/sun4v/Makefile.sun4v.shared	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/Makefile.sun4v.shared	Tue May 16 16:05:21 2006 -0700
@@ -23,7 +23,7 @@
 # Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
-# ident	"%Z%%M%	%I%	%E% SMI"
+#ident	"%Z%%M%	%I%	%E% SMI"
 #
 #	This makefile contains the common definitions for the sun4v unix
 #	and all sun4v implementation architecture dependent modules.
@@ -309,19 +309,26 @@
 #
 #	Machine Specific Driver Modules (/kernel/drv):
 #
-DRV_KMODS	+= vnex
-DRV_KMODS	+= qcn
-DRV_KMODS	+= dma
+DRV_KMODS	+= bge
+DRV_KMODS	+= cnex
 DRV_KMODS	+= cpc
-DRV_KMODS	+= rootnex
-DRV_KMODS	+= trapstat
-DRV_KMODS	+= px
+DRV_KMODS	+= dma
+DRV_KMODS	+= ebus
 DRV_KMODS	+= fpc
 DRV_KMODS	+= glvc
-DRV_KMODS	+= bge
 DRV_KMODS	+= mdesc
-DRV_KMODS	+= ebus
-DRV_KMODS	+= su
+DRV_KMODS	+= px
+DRV_KMODS	+= qcn
+DRV_KMODS	+= rootnex
+DRV_KMODS       += su
+DRV_KMODS	+= trapstat
+DRV_KMODS	+= vcc 
+DRV_KMODS	+= vdc
+DRV_KMODS	+= vds
+DRV_KMODS	+= vldc
+DRV_KMODS	+= vnet
+DRV_KMODS	+= vnex
+DRV_KMODS	+= vsw
 
 $(CLOSED_BUILD)CLOSED_DRV_KMODS	+= memtest
 $(CLOSED_BUILD)CLOSED_DRV_KMODS	+= ncp
@@ -354,8 +361,16 @@
 #
 #	'User' Modules (/kernel/misc):
 #
-MISC_KMODS	+= obpsym bootdev vis platmod
-
+MISC_KMODS	+= bootdev 
+MISC_KMODS	+= dr_cpu 
+MISC_KMODS	+= ds
+MISC_KMODS	+= fault_iso
+MISC_KMODS	+= ldc 
+MISC_KMODS	+= obpsym 
+MISC_KMODS	+= platmod 
+MISC_KMODS	+= platsvc 
+MISC_KMODS	+= vis 
+ 
 #	md5 optimized for Niagara
 #
 MISC_KMODS	+= md5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/cnex/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,99 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# uts/sun4v/cnex/Makefile
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the cnex driver kernel module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= cnex 
+OBJECTS		= $(CNEX_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(CNEX_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Module dependencies
+#
+LDFLAGS	+= -dy -Nmisc/ldc
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- a/usr/src/uts/sun4v/cpu/common_asm.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/cpu/common_asm.s	Tue May 16 16:05:21 2006 -0700
@@ -1050,8 +1050,16 @@
 	ta	FAST_TRAP
 	brz,pt	%o0, 1f
 	nop
-	ba	ptl1_panic
+
 	mov	PTL1_BAD_HCALL, %g1
+
+	cmp	%o0, H_ENOMAP
+	move	%xcc, PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP, %g1
+	
+	cmp	%o0, H_EINVAL 
+	move	%xcc, PTL1_BAD_HCALL_UNMAP_PERM_EINVAL, %g1
+
+	ba,a	ptl1_panic
 1:
 	mov	%g6, %o5
 	mov	%g5, %o2
--- a/usr/src/uts/sun4v/cpu/generic.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/cpu/generic.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -54,140 +53,77 @@
 #include <sys/panic.h>
 #include <sys/dtrace.h>
 #include <vm/seg_spt.h>
-
-#define	S_VAC_SIZE	MMU_PAGESIZE /* XXXQ? */
+#include <sys/simulate.h>
+#include <sys/fault.h>
 
-/*
- * Maximum number of contexts
- */
-#define	MAX_NCTXS	(1 << 13)
 
 uint_t root_phys_addr_lo_mask = 0xffffffffU;
 
 void
 cpu_setup(void)
 {
-	extern int at_flags;
-	extern int disable_delay_tlb_flush, delay_tlb_flush;
 	extern int mmu_exported_pagesize_mask;
-	extern int get_cpu_pagesizes(void);
+	char *generic_isa_set[] = {
+	    "sparcv9+vis",
+	    "sparcv8plus+vis",
+	    NULL
+	};
+
+	/*
+	 * The setup common to all CPU modules is done in cpu_setup_common
+	 * routine.
+	 */
+	cpu_setup_common(generic_isa_set);
 
 	cache |= (CACHE_PTAG | CACHE_IOCOHERENT);
 
-	at_flags = EF_SPARC_32PLUS | EF_SPARC_SUN_US1; /* XXXQ */
-
-	/*
-	 * Use the maximum number of contexts available for Spitfire unless
-	 * it has been tuned for debugging.
-	 * We are checking against 0 here since this value can be patched
-	 * while booting.  It can not be patched via /etc/system since it
-	 * will be patched too late and thus cause the system to panic.
-	 */
-	if (nctxs == 0)
-		nctxs = MAX_NCTXS;
-
-	if (use_page_coloring) {
-		do_pg_coloring = 1;
-		if (use_virtual_coloring)
-			do_virtual_coloring = 1;
-	}
-	/*
-	 * Initalize supported page sizes information before the PD.
-	 * If no information is available, then initialize the
-	 * mmu_exported_pagesize_mask to a reasonable value for that processor.
-	 */
-	mmu_exported_pagesize_mask = get_cpu_pagesizes();
-	if (mmu_exported_pagesize_mask <= 0) {
-		mmu_exported_pagesize_mask = (1 << TTE8K) | (1 << TTE64K) |
-		    (1 << TTE4M);
+	if (broken_md_flag) {
+		/*
+		 * Turn on the missing bits supported by sun4v architecture in
+		 * MMU pagesize mask returned by MD.
+		 */
+		mmu_exported_pagesize_mask |= DEFAULT_SUN4V_MMU_PAGESIZE_MASK;
+	} else {
+		/*
+		 * According to sun4v architecture each processor must
+		 * support 8K, 64K and 4M page sizes. If any of these page
+		 * sizes is missing from the page size mask, then panic.
+		 */
+		if ((mmu_exported_pagesize_mask &
+		    DEFAULT_SUN4V_MMU_PAGESIZE_MASK) !=
+		    DEFAULT_SUN4V_MMU_PAGESIZE_MASK)
+			cmn_err(CE_PANIC, "machine description"
+			    " does not have required sun4v page sizes"
+			    " 8K, 64K and 4M: MD mask is 0x%x",
+			    mmu_exported_pagesize_mask);
 	}
 
 	/*
-	 * Tune pp_slots to use up to 1/8th of the tlb entries.
-	 */
-	pp_slots = MIN(8, MAXPP_SLOTS);
-
-	/*
-	 * Block stores invalidate all pages of the d$ so pagecopy
-	 * et. al. do not need virtual translations with virtual
-	 * coloring taken into consideration.
-	 */
-	pp_consistent_coloring = 0;
-	isa_list =
-	    "sparcv9+vis sparcv9 "
-	    "sparcv8plus+vis sparcv8plus "
-	    "sparcv8 sparcv8-fsmuld sparcv7 sparc";
-
-	/*
-	 * On Spitfire, there's a hole in the address space
-	 * that we must never map (the hardware only support 44-bits of
-	 * virtual address).  Later CPUs are expected to have wider
-	 * supported address ranges.
-	 *
-	 * See address map on p23 of the UltraSPARC 1 user's manual.
+	 * If processor supports the subset of full 64-bit virtual
+	 * address space, then set VA hole accordingly.
 	 */
-/* XXXQ get from machine description */
-	hole_start = (caddr_t)0x80000000000ull;
-	hole_end = (caddr_t)0xfffff80000000000ull;
-
-	/*
-	 * The kpm mapping window.
-	 * kpm_size:
-	 *	The size of a single kpm range.
-	 *	The overall size will be: kpm_size * vac_colors.
-	 * kpm_vbase:
-	 *	The virtual start address of the kpm range within the kernel
-	 *	virtual address space. kpm_vbase has to be kpm_size aligned.
-	 */
-	kpm_size = (size_t)(2ull * 1024 * 1024 * 1024 * 1024); /* 2TB */
-	kpm_size_shift = 41;
-	kpm_vbase = (caddr_t)0xfffffa0000000000ull; /* 16EB - 6TB */
-
-	/*
-	 * The traptrace code uses either %tick or %stick for
-	 * timestamping.  We have %stick so we can use it.
-	 */
-	traptrace_use_stick = 1;
-
-	/*
-	 * sun4v provides demap_all
-	 */
-	if (!disable_delay_tlb_flush)
-		delay_tlb_flush = 1;
+	if (va_bits < VA_ADDRESS_SPACE_BITS) {
+		hole_start = (caddr_t)(1ull << (va_bits - 1));
+		hole_end = (caddr_t)(0ull - (1ull << (va_bits - 1)));
+	} else {
+		hole_start = hole_end = 0;
+	}
 }
 
-/*
- * Set the magic constants of the implementation.
- */
 void
 cpu_fiximp(struct cpu_node *cpunode)
 {
-	extern int vac_size, vac_shift;
-	extern uint_t vac_mask;
-	int i, a;
-
 	/*
-	 * The assumption here is that fillsysinfo will eventually
-	 * have code to fill this info in from the PD.
-	 * We hard code this for now.
-	 * Once the PD access library is done this code
-	 * might need to be changed to get the info from the PD
+	 * The Cache node is optional in MD. Therefore in case "Cache"
+	 * does not exist in MD, set the default L2 cache associativity,
+	 * size, linesize for generic CPU module.
 	 */
-	/*
-	 * Page Coloring defaults for sun4v
-	 */
-	ecache_setsize = 0x100000;
-	ecache_alignsize = 64;
-	cpunode->ecache_setsize =  0x100000;
-
-	vac_size = S_VAC_SIZE;
-	vac_mask = MMU_PAGEMASK & (vac_size - 1);
-	i = 0; a = vac_size;
-	while (a >>= 1)
-		++i;
-	vac_shift = i;
-	shm_alignment = vac_size;
-	vac = 0;
+	if (cpunode->ecache_size == 0)
+		cpunode->ecache_size = 0x100000;
+	if (cpunode->ecache_linesize == 0)
+		cpunode->ecache_linesize = 64;
+	if (cpunode->ecache_associativity == 0)
+		cpunode->ecache_associativity = 1;
 }
 
 void
@@ -220,7 +156,9 @@
 	 * unit sharing information from the Machine Description table.
 	 * It defaults to the CPU id in the absence of such information.
 	 */
-	cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id);
+	cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping;
+	if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND)
+		cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id);
 }
 
 void
@@ -246,6 +184,96 @@
 }
 
 /*
+ * The sun4v kernel must emulate instructions that a generic sun4v
+ * processor may not support, i.e. VIS1 and VIS2.
+ */
+#define	IS_FLOAT(i) (((i) & 0x1000000) != 0)
+#define	IS_IBIT_SET(x)	(x & 0x2000)
+#define	IS_VIS1(op, op3)(op == 2 && op3 == 0x36)
+#define	IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(op, op3, asi)		\
+		(op == 3 && (op3 == IOP_V8_LDDFA ||		\
+		op3 == IOP_V8_STDFA) &&	asi > ASI_SNFL)
+int
+vis1_partial_support(struct regs *rp, k_siginfo_t *siginfo, uint_t *fault)
+{
+	char *badaddr;
+	int instr;
+	uint_t	optype, op3, asi;
+	uint_t	rd, ignor;
+
+	if (!USERMODE(rp->r_tstate))
+		return (-1);
+
+	instr = fetch_user_instr((caddr_t)rp->r_pc);
+
+	rd = (instr >> 25) & 0x1f;
+	optype = (instr >> 30) & 0x3;
+	op3 = (instr >> 19) & 0x3f;
+	ignor = (instr >> 5) & 0xff;
+	if (IS_IBIT_SET(instr)) {
+		asi = (uint32_t)((rp->r_tstate >> TSTATE_ASI_SHIFT) &
+		    TSTATE_ASI_MASK);
+	} else {
+		asi = ignor;
+	}
+
+	if (!IS_VIS1(optype, op3) &&
+	    !IS_PARTIAL_OR_SHORT_FLOAT_LD_ST(optype, op3, asi)) {
+		return (-1);
+	}
+	switch (simulate_unimp(rp, &badaddr)) {
+	case SIMU_RETRY:
+		break;	/* regs are already set up */
+		/*NOTREACHED*/
+
+	case SIMU_SUCCESS:
+		/*
+		 * skip the successfully
+		 * simulated instruction
+		 */
+		rp->r_pc = rp->r_npc;
+		rp->r_npc += 4;
+		break;
+		/*NOTREACHED*/
+
+	case SIMU_FAULT:
+		siginfo->si_signo = SIGSEGV;
+		siginfo->si_code = SEGV_MAPERR;
+		siginfo->si_addr = badaddr;
+		*fault = FLTBOUNDS;
+		break;
+
+	case SIMU_DZERO:
+		siginfo->si_signo = SIGFPE;
+		siginfo->si_code = FPE_INTDIV;
+		siginfo->si_addr = (caddr_t)rp->r_pc;
+		*fault = FLTIZDIV;
+		break;
+
+	case SIMU_UNALIGN:
+		siginfo->si_signo = SIGBUS;
+		siginfo->si_code = BUS_ADRALN;
+		siginfo->si_addr = badaddr;
+		*fault = FLTACCESS;
+		break;
+
+	case SIMU_ILLEGAL:
+	default:
+		siginfo->si_signo = SIGILL;
+		op3 = (instr >> 19) & 0x3F;
+		if ((IS_FLOAT(instr) && (op3 == IOP_V8_STQFA) ||
+		    (op3 == IOP_V8_STDFA)))
+			siginfo->si_code = ILL_ILLADR;
+		else
+			siginfo->si_code = ILL_ILLOPC;
+		siginfo->si_addr = (caddr_t)rp->r_pc;
+		*fault = FLTILL;
+		break;
+	}
+	return (0);
+}
+
+/*
  * Trapstat support for generic sun4v processor
  */
 int
--- a/usr/src/uts/sun4v/cpu/niagara.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/cpu/niagara.c	Tue May 16 16:05:21 2006 -0700
@@ -58,12 +58,8 @@
 #include <sys/trapstat.h>
 #include <sys/hsvc.h>
 
-#define	S_VAC_SIZE	MMU_PAGESIZE /* XXXQ? */
-
-/*
- * Maximum number of contexts
- */
-#define	MAX_NCTXS	(1 << 13)
+#define	NI_MMU_PAGESIZE_MASK	((1 << TTE8K) | (1 << TTE64K) | (1 << TTE4M) \
+				    | (1 << TTE256M))
 
 uint_t root_phys_addr_lo_mask = 0xffffffffU;
 static niagara_mmustat_t *cpu_tstat_va;		/* VA of mmustat buffer */
@@ -82,12 +78,16 @@
 void
 cpu_setup(void)
 {
-	extern int at_flags;
-	extern int disable_delay_tlb_flush, delay_tlb_flush;
 	extern int mmu_exported_pagesize_mask;
-	extern int get_cpu_pagesizes(void);
 	extern int cpc_has_overflow_intr;
 	int status;
+	char *ni_isa_set[] = {
+	    "sparcv9+vis",
+	    "sparcv9+vis2",
+	    "sparcv8plus+vis",
+	    "sparcv8plus+vis2",
+	    NULL
+	};
 
 	/*
 	 * Negotiate the API version for Niagara specific hypervisor
@@ -102,49 +102,29 @@
 		niagara_hsvc_available = B_FALSE;
 	}
 
+	/*
+	 * The setup common to all CPU modules is done in cpu_setup_common
+	 * routine.
+	 */
+	cpu_setup_common(ni_isa_set);
+
 	cache |= (CACHE_PTAG | CACHE_IOCOHERENT);
-	at_flags = EF_SPARC_SUN_US3 | EF_SPARC_32PLUS | EF_SPARC_SUN_US1;
 
-	/*
-	 * Use the maximum number of contexts available for Spitfire unless
-	 * it has been tuned for debugging.
-	 * We are checking against 0 here since this value can be patched
-	 * while booting.  It can not be patched via /etc/system since it
-	 * will be patched too late and thus cause the system to panic.
-	 */
-	if (nctxs == 0)
-		nctxs = MAX_NCTXS;
-
-	if (use_page_coloring) {
-		do_pg_coloring = 1;
-		if (use_virtual_coloring)
-			do_virtual_coloring = 1;
+	if (broken_md_flag) {
+		/*
+		 * Turn on the missing bits supported by Niagara CPU in
+		 * MMU pagesize mask returned by MD.
+		 */
+		mmu_exported_pagesize_mask |= NI_MMU_PAGESIZE_MASK;
+	} else {
+		if ((mmu_exported_pagesize_mask &
+		    DEFAULT_SUN4V_MMU_PAGESIZE_MASK) !=
+		    DEFAULT_SUN4V_MMU_PAGESIZE_MASK)
+			cmn_err(CE_PANIC, "machine description"
+			    " does not have required sun4v page sizes"
+			    " 8K, 64K and 4M: MD mask is 0x%x",
+			    mmu_exported_pagesize_mask);
 	}
-	/*
-	 * Initalize supported page sizes information before the PD.
-	 * If no information is available, then initialize the
-	 * mmu_exported_pagesize_mask to a reasonable value for that processor.
-	 */
-	mmu_exported_pagesize_mask = get_cpu_pagesizes();
-	if (mmu_exported_pagesize_mask <= 0) {
-		mmu_exported_pagesize_mask = (1 << TTE8K) | (1 << TTE64K) |
-		    (1 << TTE4M) | (1 << TTE256M);
-	}
-
-	/*
-	 * Tune pp_slots to use up to 1/8th of the tlb entries.
-	 */
-	pp_slots = MIN(8, MAXPP_SLOTS);
-
-	/*
-	 * Block stores invalidate all pages of the d$ so pagecopy
-	 * et. al. do not need virtual translations with virtual
-	 * coloring taken into consideration.
-	 */
-	pp_consistent_coloring = 0;
-	isa_list =
-	    "sparcv9 sparcv8plus sparcv8 sparcv8-fsmuld sparcv7 "
-	    "sparc sparcv9+vis sparcv9+vis2 sparcv8plus+vis sparcv8plus+vis2";
 
 	cpu_hwcap_flags |= AV_SPARC_ASI_BLK_INIT;
 
@@ -155,84 +135,34 @@
 	 * and must never be mapped. In addition, software must not use
 	 * pages within 4GB of the VA hole as instruction pages to
 	 * avoid problems with prefetching into the VA hole.
-	 *
-	 * VA hole information should be obtained from the machine
-	 * description.
 	 */
-	hole_start = (caddr_t)(0x800000000000ul - (1ul << 32));
-	hole_end = (caddr_t)(0xffff800000000000ul + (1ul << 32));
+	hole_start = (caddr_t)((1ull << (va_bits - 1)) - (1ull << 32));
+	hole_end = (caddr_t)((0ull - (1ull << (va_bits - 1))) + (1ull << 32));
 
 	/*
-	 * The kpm mapping window.
-	 * kpm_size:
-	 *	The size of a single kpm range.
-	 *	The overall size will be: kpm_size * vac_colors.
-	 * kpm_vbase:
-	 *	The virtual start address of the kpm range within the kernel
-	 *	virtual address space. kpm_vbase has to be kpm_size aligned.
-	 */
-	kpm_size = (size_t)(2ull * 1024 * 1024 * 1024 * 1024); /* 2TB */
-	kpm_size_shift = 41;
-	kpm_vbase = (caddr_t)0xfffffa0000000000ull; /* 16EB - 6TB */
-
-	/*
-	 * The traptrace code uses either %tick or %stick for
-	 * timestamping.  We have %stick so we can use it.
-	 */
-	traptrace_use_stick = 1;
-
-	/*
-	 * sun4v provides demap_all
-	 */
-	if (!disable_delay_tlb_flush)
-		delay_tlb_flush = 1;
-	/*
 	 * Niagara has a performance counter overflow interrupt
 	 */
 	cpc_has_overflow_intr = 1;
 }
 
-#define	MB	 * 1024 * 1024
+#define	MB(n)	((n) * 1024 * 1024)
 /*
  * Set the magic constants of the implementation.
  */
 void
 cpu_fiximp(struct cpu_node *cpunode)
 {
-	extern int vac_size, vac_shift;
-	extern uint_t vac_mask;
-	int i, a;
-
 	/*
-	 * The assumption here is that fillsysinfo will eventually
-	 * have code to fill this info in from the PD.
-	 * We hard code this for niagara now.
-	 * Once the PD access library is done this code
-	 * might need to be changed to get the info from the PD
+	 * The Cache node is optional in MD. Therefore in case "Cache"
+	 * node does not exist in MD, set the default L2 cache associativity,
+	 * size, linesize.
 	 */
 	if (cpunode->ecache_size == 0)
-		cpunode->ecache_size = 3 MB;
+		cpunode->ecache_size = MB(3);
 	if (cpunode->ecache_linesize == 0)
 		cpunode->ecache_linesize = 64;
 	if (cpunode->ecache_associativity == 0)
 		cpunode->ecache_associativity = 12;
-
-	cpunode->ecache_setsize =
-	    cpunode->ecache_size / cpunode->ecache_associativity;
-
-	if (ecache_setsize == 0)
-		ecache_setsize = cpunode->ecache_setsize;
-	if (ecache_alignsize == 0)
-		ecache_alignsize = cpunode->ecache_linesize;
-
-	vac_size = S_VAC_SIZE;
-	vac_mask = MMU_PAGEMASK & (vac_size - 1);
-	i = 0; a = vac_size;
-	while (a >>= 1)
-		++i;
-	vac_shift = i;
-	shm_alignment = vac_size;
-	vac = 0;
 }
 
 static int niagara_cpucnt;
@@ -243,13 +173,13 @@
 	extern int niagara_kstat_init(void);
 
 	/*
-	 * This code change assumes that the virtual cpu ids are identical
-	 * to the physical cpu ids which is true for ontario but not for
-	 * niagara in general.
-	 * This is a temporary fix which will later be modified to obtain
-	 * the execution unit sharing information from MD table.
+	 * The cpu_ipipe field is initialized based on the execution
+	 * unit sharing information from the MD. It defaults to the
+	 * virtual CPU id in the absence of such information.
 	 */
-	cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id / 4);
+	cp->cpu_m.cpu_ipipe = cpunodes[cp->cpu_id].exec_unit_mapping;
+	if (cp->cpu_m.cpu_ipipe == NO_EU_MAPPING_FOUND)
+		cp->cpu_m.cpu_ipipe = (id_t)(cp->cpu_id);
 
 	ASSERT(MUTEX_HELD(&cpu_lock));
 	if (niagara_cpucnt++ == 0 && niagara_hsvc_available == B_TRUE) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/dr_cpu/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,93 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE		= dr_cpu
+OBJECTS		= $(DR_CPU_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(DR_CPU_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS		+= -v
+
+#
+# Turn on doubleword alignment for 64 bit registers
+#
+CFLAGS		+= -dalign
+
+#
+# Module Dependencies
+#
+LDFLAGS		+= -dy -Nmisc/ds
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/ds/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,97 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This makefile drives the production of the ds kernel module.
+#
+# sun4v implementation architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE		= ds
+OBJECTS		= $(DS_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(DS_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS		+= -v
+
+#
+# Turn on doubleword alignment for 64 bit registers
+#
+CFLAGS		+= -dalign
+
+#
+# Module Dependencies
+#
+LDFLAGS		+= -dy -Nmisc/ldc
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/fault_iso/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,97 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This makefile drives the production of the fault_iso kernel module.
+#
+# sun4v implementation architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE		= fault_iso
+OBJECTS		= $(FAULT_ISO_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(FAULT_ISO_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS		+= -v
+
+#
+# Turn on doubleword alignment for 64 bit registers
+#
+CFLAGS		+= -dalign
+
+#
+# Module Dependencies
+#
+LDFLAGS		+= -dy -Nmisc/ds
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/cnex.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,1133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Logical domain channel devices are devices implemented entirely
+ * in software; cnex is the nexus for channel-devices. They use
+ * the HV channel interfaces via the LDC transport module to send
+ * and receive data and to register callbacks.
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/devops.h>
+#include <sys/instance.h>
+#include <sys/modctl.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/systm.h>
+#include <sys/mkdev.h>
+#include <sys/machsystm.h>
+#include <sys/intr.h>
+#include <sys/ddi_intr_impl.h>
+#include <sys/ivintr.h>
+#include <sys/hypervisor_api.h>
+#include <sys/ldc.h>
+#include <sys/cnex.h>
+#include <sys/mach_descrip.h>
+
+/*
+ * Internal functions/information
+ */
+static struct cnex_pil_map cnex_class_to_pil[] = {
+	{LDC_DEV_GENERIC,	PIL_3},
+	{LDC_DEV_BLK,		PIL_4},
+	{LDC_DEV_BLK_SVC,	PIL_3},
+	{LDC_DEV_NT,		PIL_6},
+	{LDC_DEV_NT_SVC,	PIL_4},
+	{LDC_DEV_SERIAL,	PIL_6}
+};
+#define	CNEX_MAX_DEVS (sizeof (cnex_class_to_pil) / \
+				sizeof (cnex_class_to_pil[0]))
+
+#define	SUN4V_REG_SPEC2CFG_HDL(x)	((x >> 32) & ~(0xfull << 28))
+
+static hrtime_t cnex_pending_tmout = 2ull * NANOSEC; /* 2 secs in nsecs */
+static void *cnex_state;
+
+static void cnex_intr_redist(void *arg);
+static uint_t cnex_intr_wrapper(caddr_t arg);
+
+/*
+ * Debug info
+ */
+#ifdef DEBUG
+
+/*
+ * Print debug messages
+ *
+ * set cnexdbg to 0xf for enabling all msgs
+ * 0x8 - Errors
+ * 0x4 - Warnings
+ * 0x2 - All debug messages
+ * 0x1 - Minimal debug messages
+ */
+
+int cnexdbg = 0x8;
+
+static void
+cnexdebug(const char *fmt, ...)
+{
+	char buf[512];
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vsprintf(buf, fmt, ap);
+	va_end(ap);
+
+	cmn_err(CE_CONT, "%s\n", buf);
+}
+
+#define	D1		\
+if (cnexdbg & 0x01)	\
+	cnexdebug
+
+#define	D2		\
+if (cnexdbg & 0x02)	\
+	cnexdebug
+
+#define	DWARN		\
+if (cnexdbg & 0x04)	\
+	cnexdebug
+
+#define	DERR		\
+if (cnexdbg & 0x08)	\
+	cnexdebug
+
+#else
+
+#define	D1
+#define	D2
+#define	DWARN
+#define	DERR
+
+#endif
+
+/*
+ * Config information
+ */
+static int cnex_attach(dev_info_t *, ddi_attach_cmd_t);
+static int cnex_detach(dev_info_t *, ddi_detach_cmd_t);
+static int cnex_open(dev_t *, int, int, cred_t *);
+static int cnex_close(dev_t, int, int, cred_t *);
+static int cnex_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static int cnex_ctl(dev_info_t *, dev_info_t *, ddi_ctl_enum_t, void *,
+    void *);
+
+static struct bus_ops cnex_bus_ops = {
+	BUSO_REV,
+	nullbusmap,		/* bus_map */
+	NULL,			/* bus_get_intrspec */
+	NULL,			/* bus_add_intrspec */
+	NULL,			/* bus_remove_intrspec */
+	i_ddi_map_fault,	/* bus_map_fault */
+	ddi_no_dma_map,		/* bus_dma_map */
+	ddi_no_dma_allochdl,	/* bus_dma_allochdl */
+	NULL,			/* bus_dma_freehdl */
+	NULL,			/* bus_dma_bindhdl */
+	NULL,			/* bus_dma_unbindhdl */
+	NULL,			/* bus_dma_flush */
+	NULL,			/* bus_dma_win */
+	NULL,			/* bus_dma_ctl */
+	cnex_ctl,		/* bus_ctl */
+	ddi_bus_prop_op,	/* bus_prop_op */
+	0,			/* bus_get_eventcookie */
+	0,			/* bus_add_eventcall */
+	0,			/* bus_remove_eventcall	*/
+	0,			/* bus_post_event */
+	NULL,			/* bus_intr_ctl */
+	NULL,			/* bus_config */
+	NULL,			/* bus_unconfig */
+	NULL,			/* bus_fm_init */
+	NULL,			/* bus_fm_fini */
+	NULL,			/* bus_fm_access_enter */
+	NULL,			/* bus_fm_access_exit */
+	NULL,			/* bus_power */
+	NULL			/* bus_intr_op */
+};
+
+static struct cb_ops cnex_cb_ops = {
+	cnex_open,			/* open */
+	cnex_close,			/* close */
+	nodev,				/* strategy */
+	nodev,				/* print */
+	nodev,				/* dump */
+	nodev,				/* read */
+	nodev,				/* write */
+	cnex_ioctl,			/* ioctl */
+	nodev,				/* devmap */
+	nodev,				/* mmap */
+	nodev,				/* segmap */
+	nochpoll,			/* poll */
+	ddi_prop_op,			/* cb_prop_op */
+	0,				/* streamtab  */
+	D_MP | D_NEW | D_HOTPLUG	/* Driver compatibility flag */
+};
+
+static struct dev_ops cnex_ops = {
+	DEVO_REV,		/* devo_rev, */
+	0,			/* refcnt  */
+	ddi_getinfo_1to1,	/* info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	cnex_attach,		/* attach */
+	cnex_detach,		/* detach */
+	nodev,			/* reset */
+	&cnex_cb_ops,		/* driver operations */
+	&cnex_bus_ops,		/* bus operations */
+	nulldev			/* power */
+};
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"sun4v channel-devices nexus driver v%I%",
+	&cnex_ops,
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modldrv, NULL
+};
+
+int
+_init(void)
+{
+	int err;
+
+	if ((err = ddi_soft_state_init(&cnex_state,
+		sizeof (cnex_soft_state_t), 0)) != 0) {
+		return (err);
+	}
+	if ((err = mod_install(&modlinkage)) != 0) {
+		ddi_soft_state_fini(&cnex_state);
+		return (err);
+	}
+	return (0);
+}
+
+int
+_fini(void)
+{
+	int err;
+
+	if ((err = mod_remove(&modlinkage)) != 0)
+		return (err);
+	ddi_soft_state_fini(&cnex_state);
+	return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Callback function invoked by the interrupt redistribution
+ * framework. This will redirect interrupts at CPUs that are
+ * currently available in the system.
+ */
+static void
+cnex_intr_redist(void *arg)
+{
+	cnex_ldc_t		*cldcp;
+	cnex_soft_state_t	*cnex_ssp = arg;
+	int			intr_state;
+	hrtime_t 		start;
+	uint64_t		cpuid;
+	int 			rv;
+
+	ASSERT(cnex_ssp != NULL);
+	mutex_enter(&cnex_ssp->clist_lock);
+
+	cldcp = cnex_ssp->clist;
+	while (cldcp != NULL) {
+
+		mutex_enter(&cldcp->lock);
+
+		if (cldcp->tx.hdlr) {
+			/*
+			 * Don't do anything for disabled interrupts.
+			 */
+			rv = hvldc_intr_getvalid(cnex_ssp->cfghdl,
+			    cldcp->tx.ino, &intr_state);
+			if (rv) {
+				DWARN("cnex_intr_redist: tx ino=0x%llx, "
+				    "can't get valid\n", cldcp->tx.ino);
+				mutex_exit(&cldcp->lock);
+				mutex_exit(&cnex_ssp->clist_lock);
+				return;
+			}
+			if (intr_state == HV_INTR_NOTVALID) {
+				cldcp = cldcp->next;
+				continue;
+			}
+
+			cpuid = intr_dist_cpuid();
+
+			/* disable interrupts */
+			rv = hvldc_intr_setvalid(cnex_ssp->cfghdl,
+			    cldcp->tx.ino, HV_INTR_NOTVALID);
+			if (rv) {
+				DWARN("cnex_intr_redist: tx ino=0x%llx, "
+				    "can't set valid\n", cldcp->tx.ino);
+				mutex_exit(&cldcp->lock);
+				mutex_exit(&cnex_ssp->clist_lock);
+				return;
+			}
+
+			/*
+			 * Make a best effort to wait for pending interrupts
+			 * to finish. There is not much we can do if we timeout.
+			 */
+			start = gethrtime();
+
+			do {
+				rv = hvldc_intr_getstate(cnex_ssp->cfghdl,
+				    cldcp->tx.ino, &intr_state);
+				if (rv) {
+					DWARN("cnex_intr_redist: tx ino=0x%llx,"
+					    "can't get state\n", cldcp->tx.ino);
+					mutex_exit(&cldcp->lock);
+					mutex_exit(&cnex_ssp->clist_lock);
+					return;
+				}
+
+				if ((gethrtime() - start) > cnex_pending_tmout)
+					break;
+
+			} while (!panicstr &&
+			    intr_state == HV_INTR_DELIVERED_STATE);
+
+			(void) hvldc_intr_settarget(cnex_ssp->cfghdl,
+			    cldcp->tx.ino, cpuid);
+			(void) hvldc_intr_setvalid(cnex_ssp->cfghdl,
+			    cldcp->tx.ino, HV_INTR_VALID);
+		}
+
+		if (cldcp->rx.hdlr) {
+			/*
+			 * Don't do anything for disabled interrupts.
+			 */
+			rv = hvldc_intr_getvalid(cnex_ssp->cfghdl,
+			    cldcp->rx.ino, &intr_state);
+			if (rv) {
+				DWARN("cnex_intr_redist: rx ino=0x%llx, "
+				    "can't get valid\n", cldcp->rx.ino);
+				mutex_exit(&cldcp->lock);
+				mutex_exit(&cnex_ssp->clist_lock);
+				return;
+			}
+			if (intr_state == HV_INTR_NOTVALID) {
+				cldcp = cldcp->next;
+				continue;
+			}
+
+			cpuid = intr_dist_cpuid();
+
+			/* disable interrupts */
+			rv = hvldc_intr_setvalid(cnex_ssp->cfghdl,
+			    cldcp->rx.ino, HV_INTR_NOTVALID);
+			if (rv) {
+				DWARN("cnex_intr_redist: rx ino=0x%llx, "
+				    "can't set valid\n", cldcp->rx.ino);
+				mutex_exit(&cldcp->lock);
+				mutex_exit(&cnex_ssp->clist_lock);
+				return;
+			}
+
+			/*
+			 * Make a best effort to wait for pending interrupts
+			 * to finish. There is not much we can do if we timeout.
+			 */
+			start = gethrtime();
+
+			do {
+				rv = hvldc_intr_getstate(cnex_ssp->cfghdl,
+				    cldcp->rx.ino, &intr_state);
+				if (rv) {
+					DWARN("cnex_intr_redist: rx ino=0x%llx,"
+					    "can't set state\n", cldcp->rx.ino);
+					mutex_exit(&cldcp->lock);
+					mutex_exit(&cnex_ssp->clist_lock);
+					return;
+				}
+
+				if ((gethrtime() - start) > cnex_pending_tmout)
+					break;
+
+			} while (!panicstr &&
+			    intr_state == HV_INTR_DELIVERED_STATE);
+
+			(void) hvldc_intr_settarget(cnex_ssp->cfghdl,
+			    cldcp->rx.ino, cpuid);
+			(void) hvldc_intr_setvalid(cnex_ssp->cfghdl,
+			    cldcp->rx.ino, HV_INTR_VALID);
+		}
+
+		mutex_exit(&cldcp->lock);
+
+		/* next channel */
+		cldcp = cldcp->next;
+	}
+
+	mutex_exit(&cnex_ssp->clist_lock);
+}
+
+/*
+ * Exported interface to register a LDC endpoint with
+ * the channel nexus
+ *
+ * Looks up the channel's Tx/Rx inos in the machine description (MD)
+ * and links a new cnex_ldc_t onto the nexus channel list.
+ * Returns 0 on success, EINVAL if the channel id is already registered
+ * or no channel-endpoint nodes exist, ENXIO on MD access failure.
+ */
+static int
+cnex_reg_chan(dev_info_t *dip, uint64_t id, ldc_dev_t devclass)
+{
+	int		idx;
+	cnex_ldc_t	*cldcp;
+	int		listsz, num_nodes, num_channels;
+	md_t		*mdp = NULL;
+	mde_cookie_t	rootnode, *listp = NULL;
+	uint64_t	tmp_id, rxino, txino;
+	cnex_soft_state_t *cnex_ssp;
+	int		status, instance;
+
+	/* Get device instance and structure */
+	instance = ddi_get_instance(dip);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/* Check to see if channel is already registered */
+	mutex_enter(&cnex_ssp->clist_lock);
+	cldcp = cnex_ssp->clist;
+	while (cldcp) {
+		if (cldcp->id == id) {
+			DWARN("cnex_reg_chan: channel 0x%llx exists\n", id);
+			mutex_exit(&cnex_ssp->clist_lock);
+			return (EINVAL);
+		}
+		cldcp = cldcp->next;
+	}
+
+	/* Get the Tx/Rx inos from the MD */
+	if ((mdp = md_get_handle()) == NULL) {
+		DWARN("cnex_reg_chan: cannot init MD\n");
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (ENXIO);
+	}
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes > 0);
+
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = (mde_cookie_t *)kmem_zalloc(listsz, KM_SLEEP);
+
+	rootnode = md_root_node(mdp);
+
+	/* search for all channel_endpoint nodes */
+	num_channels = md_scan_dag(mdp, rootnode,
+	    md_find_name(mdp, "channel-endpoint"),
+	    md_find_name(mdp, "fwd"), listp);
+	if (num_channels <= 0) {
+		DWARN("cnex_reg_chan: invalid channel id\n");
+		kmem_free(listp, listsz);
+		(void) md_fini_handle(mdp);
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (EINVAL);
+	}
+
+	for (idx = 0; idx < num_channels; idx++) {
+
+		/* Get the channel ID */
+		status = md_get_prop_val(mdp, listp[idx], "id", &tmp_id);
+		if (status) {
+			DWARN("cnex_reg_chan: cannot read LDC ID\n");
+			kmem_free(listp, listsz);
+			(void) md_fini_handle(mdp);
+			mutex_exit(&cnex_ssp->clist_lock);
+			return (ENXIO);
+		}
+		if (tmp_id != id)
+			continue;
+
+		/* Get the Tx and Rx ino */
+		status = md_get_prop_val(mdp, listp[idx], "tx-ino", &txino);
+		if (status) {
+			DWARN("cnex_reg_chan: cannot read Tx ino\n");
+			kmem_free(listp, listsz);
+			(void) md_fini_handle(mdp);
+			mutex_exit(&cnex_ssp->clist_lock);
+			return (ENXIO);
+		}
+		status = md_get_prop_val(mdp, listp[idx], "rx-ino", &rxino);
+		if (status) {
+			DWARN("cnex_reg_chan: cannot read Rx ino\n");
+			kmem_free(listp, listsz);
+			(void) md_fini_handle(mdp);
+			mutex_exit(&cnex_ssp->clist_lock);
+			return (ENXIO);
+		}
+	}
+	/*
+	 * NOTE(review): if no endpoint node in the scan above matched
+	 * 'id', txino/rxino are still uninitialized here yet the channel
+	 * is registered below with those garbage inos. Confirm whether a
+	 * missing endpoint should instead fail with an error return.
+	 */
+	kmem_free(listp, listsz);
+	(void) md_fini_handle(mdp);
+
+	/* Allocate a new channel structure */
+	cldcp = kmem_zalloc(sizeof (*cldcp), KM_SLEEP);
+
+	/* Initialize the channel */
+	mutex_init(&cldcp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	cldcp->id = id;
+	cldcp->tx.ino = txino;
+	cldcp->rx.ino = rxino;
+	cldcp->devclass = devclass;
+
+	/* add channel to nexus channel list (head insertion) */
+	cldcp->next = cnex_ssp->clist;
+	cnex_ssp->clist = cldcp;
+
+	mutex_exit(&cnex_ssp->clist_lock);
+
+	return (0);
+}
+
+/*
+ * Add Tx/Rx interrupt handler for the channel
+ *
+ * Records the client handler in the channel's tx or rx cnex_intr_t,
+ * generates an interrupt cookie, registers cnex_intr_wrapper with the
+ * Solaris ivec table at a PIL derived from the channel's devclass, and
+ * programs the cookie/target/state/valid settings in the hypervisor.
+ * Returns 0 on success, EINVAL for bad channel/type/duplicate handler,
+ * ENXIO if a hypervisor call fails.
+ */
+static int
+cnex_add_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype,
+    uint_t (*hdlr)(), caddr_t arg1, caddr_t arg2)
+{
+	int		rv, idx, pil;
+	cnex_ldc_t	*cldcp;
+	cnex_intr_t	*iinfo;
+	uint64_t	cpuid;
+	cnex_soft_state_t *cnex_ssp;
+	int		instance;
+
+	/* Get device instance and structure */
+	instance = ddi_get_instance(dip);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/* get channel info */
+	mutex_enter(&cnex_ssp->clist_lock);
+	cldcp = cnex_ssp->clist;
+	while (cldcp) {
+		if (cldcp->id == id)
+			break;
+		cldcp = cldcp->next;
+	}
+	if (cldcp == NULL) {
+		DWARN("cnex_add_intr: channel 0x%llx does not exist\n", id);
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (EINVAL);
+	}
+	mutex_exit(&cnex_ssp->clist_lock);
+
+	/* get channel lock */
+	mutex_enter(&cldcp->lock);
+
+	/* get interrupt type */
+	if (itype == CNEX_TX_INTR) {
+		iinfo = &(cldcp->tx);
+	} else if (itype == CNEX_RX_INTR) {
+		iinfo = &(cldcp->rx);
+	} else {
+		/*
+		 * NOTE(review): this DWARN passes 'id' but the format
+		 * string has no conversion for it -- stray argument.
+		 */
+		DWARN("cnex_add_intr: invalid interrupt type\n", id);
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	/* check if a handler is already added */
+	if (iinfo->hdlr != 0) {
+		DWARN("cnex_add_intr: interrupt handler exists\n");
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	/* save interrupt handler info */
+	iinfo->hdlr = hdlr;
+	iinfo->arg1 = arg1;
+	iinfo->arg2 = arg2;
+
+	iinfo->ssp = cnex_ssp;
+
+	/*
+	 * FIXME - generate the interrupt cookie
+	 * using the interrupt registry
+	 */
+	iinfo->icookie = cnex_ssp->cfghdl | iinfo->ino;
+
+	D1("cnex_add_intr: add hdlr, cfghdl=0x%llx, ino=0x%llx, "
+	    "cookie=0x%llx\n", cnex_ssp->cfghdl, iinfo->ino, iinfo->icookie);
+
+	/* Pick a PIL on the basis of the channel's devclass */
+	for (idx = 0, pil = PIL_3; idx < CNEX_MAX_DEVS; idx++) {
+		if (cldcp->devclass == cnex_class_to_pil[idx].devclass) {
+			pil = cnex_class_to_pil[idx].pil;
+			break;
+		}
+	}
+
+	/* add interrupt to solaris ivec table */
+	VERIFY(add_ivintr(iinfo->icookie, pil, cnex_intr_wrapper,
+		(caddr_t)iinfo, NULL) == 0);
+
+	/*
+	 * set the cookie in the HV
+	 * NOTE(review): this rv is never checked -- it is overwritten by
+	 * the settarget call below; confirm a setcookie failure should
+	 * not also take the hv_error path.
+	 */
+	rv = hvldc_intr_setcookie(cnex_ssp->cfghdl, iinfo->ino, iinfo->icookie);
+
+	/* pick next CPU in the domain for this channel */
+	cpuid = intr_dist_cpuid();
+
+	/* set the target CPU and then enable interrupts */
+	rv = hvldc_intr_settarget(cnex_ssp->cfghdl, iinfo->ino, cpuid);
+	if (rv) {
+		DWARN("cnex_add_intr: ino=0x%llx, cannot set target cpu\n",
+		    iinfo->ino);
+		goto hv_error;
+	}
+	rv = hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino,
+	    HV_INTR_IDLE_STATE);
+	if (rv) {
+		DWARN("cnex_add_intr: ino=0x%llx, cannot set state\n",
+		    iinfo->ino);
+		goto hv_error;
+	}
+	rv = hvldc_intr_setvalid(cnex_ssp->cfghdl, iinfo->ino, HV_INTR_VALID);
+	if (rv) {
+		DWARN("cnex_add_intr: ino=0x%llx, cannot set valid\n",
+		    iinfo->ino);
+		goto hv_error;
+	}
+
+	mutex_exit(&cldcp->lock);
+	return (0);
+
+hv_error:
+	/* back out the ivec table registration made above */
+	(void) rem_ivintr(iinfo->icookie, NULL);
+	mutex_exit(&cldcp->lock);
+	return (ENXIO);
+}
+
+
+/*
+ * Exported interface to unregister a LDC endpoint with
+ * the channel nexus
+ *
+ * Unlinks the channel from the nexus list and frees it. Fails with
+ * EINVAL if the channel id is unknown, ENXIO if a Tx or Rx interrupt
+ * handler is still installed (caller must remove handlers first).
+ */
+static int
+cnex_unreg_chan(dev_info_t *dip, uint64_t id)
+{
+	cnex_ldc_t	*cldcp, *prev_cldcp;
+	cnex_soft_state_t *cnex_ssp;
+	int		instance;
+
+	/* Get device instance and structure */
+	instance = ddi_get_instance(dip);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/* find and remove channel from list */
+	mutex_enter(&cnex_ssp->clist_lock);
+	prev_cldcp = NULL;
+	cldcp = cnex_ssp->clist;
+	while (cldcp) {
+		if (cldcp->id == id)
+			break;
+		prev_cldcp = cldcp;
+		cldcp = cldcp->next;
+	}
+
+	if (cldcp == 0) {
+		DWARN("cnex_unreg_chan: invalid channel %d\n", id);
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (EINVAL);
+	}
+
+	/* refuse to unregister while interrupt handlers remain */
+	if (cldcp->tx.hdlr || cldcp->rx.hdlr) {
+		DWARN("cnex_unreg_chan: handlers still exist\n");
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (ENXIO);
+	}
+
+	/* unlink; head of list if there is no predecessor */
+	if (prev_cldcp)
+		prev_cldcp->next = cldcp->next;
+	else
+		cnex_ssp->clist = cldcp->next;
+
+	mutex_exit(&cnex_ssp->clist_lock);
+
+	/* destroy mutex */
+	mutex_destroy(&cldcp->lock);
+
+	/* free channel */
+	kmem_free(cldcp, sizeof (*cldcp));
+
+	return (0);
+}
+
+/*
+ * Remove Tx/Rx interrupt handler for the channel
+ *
+ * Invalidates the interrupt in the hypervisor, waits (best effort,
+ * bounded by cnex_pending_tmout) for a delivered interrupt to drain,
+ * removes the ivec table entry and clears the cnex_intr_t. Returns 0
+ * on success, EINVAL for bad channel/type/missing handler, ENXIO if
+ * the hypervisor setvalid call fails.
+ */
+static int
+cnex_rem_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype)
+{
+	int			rv;
+	cnex_ldc_t		*cldcp;
+	cnex_intr_t		*iinfo;
+	cnex_soft_state_t	*cnex_ssp;
+	hrtime_t 		start;
+	int			instance, istate;
+
+	/* Get device instance and structure */
+	instance = ddi_get_instance(dip);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/* get channel info */
+	mutex_enter(&cnex_ssp->clist_lock);
+	cldcp = cnex_ssp->clist;
+	while (cldcp) {
+		if (cldcp->id == id)
+			break;
+		cldcp = cldcp->next;
+	}
+	if (cldcp == NULL) {
+		DWARN("cnex_rem_intr: channel 0x%llx does not exist\n", id);
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (EINVAL);
+	}
+	mutex_exit(&cnex_ssp->clist_lock);
+
+	/* get rid of the channel intr handler */
+	mutex_enter(&cldcp->lock);
+
+	/* get interrupt type */
+	if (itype == CNEX_TX_INTR) {
+		iinfo = &(cldcp->tx);
+	} else if (itype == CNEX_RX_INTR) {
+		iinfo = &(cldcp->rx);
+	} else {
+		DWARN("cnex_rem_intr: invalid interrupt type\n");
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	D1("cnex_rem_intr: interrupt ino=0x%x\n", iinfo->ino);
+
+	/* check if a handler is already added */
+	if (iinfo->hdlr == 0) {
+		DWARN("cnex_rem_intr: interrupt handler does not exist\n");
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	D1("cnex_rem_intr: set intr to invalid ino=0x%x\n", iinfo->ino);
+	rv = hvldc_intr_setvalid(cnex_ssp->cfghdl,
+	    iinfo->ino, HV_INTR_NOTVALID);
+	if (rv) {
+		DWARN("cnex_rem_intr: cannot set valid ino=%x\n", iinfo->ino);
+		mutex_exit(&cldcp->lock);
+		return (ENXIO);
+	}
+
+	/*
+	 * Make a best effort to wait for pending interrupts
+	 * to finish. There is not much we can do if we timeout.
+	 */
+	start = gethrtime();
+	do {
+		rv = hvldc_intr_getstate(cnex_ssp->cfghdl, iinfo->ino, &istate);
+		if (rv) {
+			DWARN("cnex_rem_intr: ino=0x%llx, cannot get state\n",
+			    iinfo->ino);
+		}
+
+		if (rv || ((gethrtime() - start) > cnex_pending_tmout))
+			break;
+
+	} while (!panicstr && istate == HV_INTR_DELIVERED_STATE);
+
+	/*
+	 * if interrupts are still pending print warning
+	 * NOTE(review): if the very first getstate call above failed,
+	 * 'istate' is read here uninitialized -- confirm it should be
+	 * preset (e.g. to HV_INTR_IDLE_STATE) before the loop.
+	 */
+	if (istate != HV_INTR_IDLE_STATE) {
+		DWARN("cnex_rem_intr: cannot remove intr busy ino=%x\n",
+		    iinfo->ino);
+		/* clear interrupt state */
+		(void) hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino,
+		    HV_INTR_IDLE_STATE);
+	}
+
+	/* remove interrupt */
+	rem_ivintr(iinfo->icookie, NULL);
+
+	/* clear interrupt info */
+	bzero(iinfo, sizeof (*iinfo));
+
+	mutex_exit(&cldcp->lock);
+
+	return (0);
+}
+
+
+/*
+ * Clear pending Tx/Rx interrupt
+ *
+ * Resets the interrupt state to HV_INTR_IDLE_STATE in the hypervisor
+ * for the channel's Tx or Rx ino. Returns 0 on success (even if the
+ * hypervisor setstate call fails -- only a warning is printed), or
+ * EINVAL for a bad channel, interrupt type, or missing handler.
+ *
+ * NOTE(review): several DWARN/D1 messages below say "cnex_rem_intr"
+ * and "cnex_intr_wrapper" -- apparent copy/paste; they should name
+ * cnex_clr_intr. (Strings left unchanged here.)
+ */
+static int
+cnex_clr_intr(dev_info_t *dip, uint64_t id, cnex_intrtype_t itype)
+{
+	int			rv;
+	cnex_ldc_t		*cldcp;
+	cnex_intr_t		*iinfo;
+	cnex_soft_state_t	*cnex_ssp;
+	int			instance;
+
+	/* Get device instance and structure */
+	instance = ddi_get_instance(dip);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/* get channel info */
+	mutex_enter(&cnex_ssp->clist_lock);
+	cldcp = cnex_ssp->clist;
+	while (cldcp) {
+		if (cldcp->id == id)
+			break;
+		cldcp = cldcp->next;
+	}
+	if (cldcp == NULL) {
+		DWARN("cnex_clr_intr: channel 0x%llx does not exist\n", id);
+		mutex_exit(&cnex_ssp->clist_lock);
+		return (EINVAL);
+	}
+	mutex_exit(&cnex_ssp->clist_lock);
+
+	mutex_enter(&cldcp->lock);
+
+	/* get interrupt type */
+	if (itype == CNEX_TX_INTR) {
+		iinfo = &(cldcp->tx);
+	} else if (itype == CNEX_RX_INTR) {
+		iinfo = &(cldcp->rx);
+	} else {
+		DWARN("cnex_rem_intr: invalid interrupt type\n");
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	D1("cnex_rem_intr: interrupt ino=0x%x\n", iinfo->ino);
+
+	/* check if a handler is already added */
+	if (iinfo->hdlr == 0) {
+		DWARN("cnex_clr_intr: interrupt handler does not exist\n");
+		mutex_exit(&cldcp->lock);
+		return (EINVAL);
+	}
+
+	/* idle the interrupt in the hypervisor; failure is non-fatal */
+	rv = hvldc_intr_setstate(cnex_ssp->cfghdl, iinfo->ino,
+	    HV_INTR_IDLE_STATE);
+	if (rv) {
+		DWARN("cnex_intr_wrapper: cannot clear interrupt state\n");
+	}
+
+	mutex_exit(&cldcp->lock);
+
+	return (0);
+}
+
+/*
+ * Channel nexus interrupt handler wrapper
+ *
+ * Installed via add_ivintr() for every channel interrupt; 'arg' is the
+ * channel's cnex_intr_t. Simply forwards the interrupt to the client
+ * handler registered by cnex_add_intr() and returns its result.
+ */
+static uint_t
+cnex_intr_wrapper(caddr_t arg)
+{
+	int 			res;
+	uint_t 			(*handler)();
+	caddr_t 		handler_arg1;
+	caddr_t 		handler_arg2;
+	cnex_intr_t 		*iinfo = (cnex_intr_t *)arg;
+
+	ASSERT(iinfo != NULL);
+
+	/* snapshot handler and args before invoking the client */
+	handler = iinfo->hdlr;
+	handler_arg1 = iinfo->arg1;
+	handler_arg2 = iinfo->arg2;
+
+	D1("cnex_intr_wrapper: ino=0x%llx invoke client handler\n", iinfo->ino);
+	res = (*handler)(handler_arg1, handler_arg2);
+
+	return (res);
+}
+
+/*ARGSUSED*/
+/*
+ * DDI attach entry point.
+ *
+ * Allocates per-instance soft state, extracts the sun4v config handle
+ * from the "reg" property, registers the channel ops vector with the
+ * LDC module, creates the devctl minor node and hooks this instance
+ * into interrupt redistribution.
+ */
+static int
+cnex_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+{
+	int 		rv, instance, reglen;
+	cnex_regspec_t	*reg_p;
+	ldc_cnex_t	cinfo;
+	cnex_soft_state_t *cnex_ssp;
+
+	switch (cmd) {
+	case DDI_ATTACH:
+		break;
+	case DDI_RESUME:
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Get the instance specific soft state structure.
+	 * Save the devi for this instance in the soft_state data.
+	 */
+	instance = ddi_get_instance(devi);
+	if (ddi_soft_state_zalloc(cnex_state, instance) != DDI_SUCCESS)
+		return (DDI_FAILURE);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	cnex_ssp->devi = devi;
+	cnex_ssp->clist = NULL;
+
+	/*
+	 * NOTE(review): this failure path returns without freeing the
+	 * soft state allocated above -- confirm whether a
+	 * ddi_soft_state_free() is needed here.
+	 */
+	if (ddi_getlongprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
+		"reg", (caddr_t)&reg_p, &reglen) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+
+	/* get the sun4v config handle for this device */
+	cnex_ssp->cfghdl = SUN4V_REG_SPEC2CFG_HDL(reg_p->physaddr);
+	kmem_free(reg_p, reglen);
+
+	D1("cnex_attach: cfghdl=0x%llx\n", cnex_ssp->cfghdl);
+
+	/* init channel list mutex */
+	mutex_init(&cnex_ssp->clist_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Register with LDC module */
+	cinfo.dip = devi;
+	cinfo.reg_chan = cnex_reg_chan;
+	cinfo.unreg_chan = cnex_unreg_chan;
+	cinfo.add_intr = cnex_add_intr;
+	cinfo.rem_intr = cnex_rem_intr;
+	cinfo.clr_intr = cnex_clr_intr;
+
+	/*
+	 * LDC register will fail if an nexus instance had already
+	 * registered with the LDC framework
+	 */
+	rv = ldc_register(&cinfo);
+	if (rv) {
+		DWARN("cnex_attach: unable to register with LDC\n");
+		/*
+		 * NOTE(review): the soft state is freed before the mutex
+		 * embedded in it is destroyed -- ordering looks inverted;
+		 * confirm mutex_destroy should precede the free.
+		 */
+		ddi_soft_state_free(cnex_state, instance);
+		mutex_destroy(&cnex_ssp->clist_lock);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * NOTE(review): on minor-node failure the LDC registration made
+	 * above is not undone (no ldc_unregister) -- confirm intended.
+	 */
+	if (ddi_create_minor_node(devi, "devctl", S_IFCHR, instance,
+	    DDI_NT_NEXUS, 0) != DDI_SUCCESS) {
+		ddi_remove_minor_node(devi, NULL);
+		ddi_soft_state_free(cnex_state, instance);
+		mutex_destroy(&cnex_ssp->clist_lock);
+		return (DDI_FAILURE);
+	}
+
+	/* Add interrupt redistribution callback. */
+	intr_dist_add(cnex_intr_redist, cnex_ssp);
+
+	ddi_report_dev(devi);
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+/*
+ * DDI detach entry point.
+ *
+ * Refuses to detach while channels are still registered; otherwise
+ * unregisters from the LDC module, removes the interrupt
+ * redistribution callback and tears down the instance soft state.
+ */
+static int
+cnex_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
+{
+	int 		instance;
+	ldc_cnex_t	cinfo;
+	cnex_soft_state_t *cnex_ssp;
+
+	switch (cmd) {
+	case DDI_DETACH:
+		break;
+	case DDI_SUSPEND:
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	instance = ddi_get_instance(devi);
+	cnex_ssp = ddi_get_soft_state(cnex_state, instance);
+
+	/*
+	 * check if there are any channels still registered
+	 * NOTE(review): clist is read without clist_lock here, and the
+	 * message spells "dettach" -- minor typo in the warning text.
+	 */
+	if (cnex_ssp->clist) {
+		cmn_err(CE_WARN, "?cnex_dettach: channels registered %d\n",
+		    ddi_get_instance(devi));
+		return (DDI_FAILURE);
+	}
+
+	/* Unregister with LDC module */
+	cinfo.dip = devi;
+	(void) ldc_unregister(&cinfo);
+
+	/* Remove interrupt redistribution callback. */
+	intr_dist_rem(cnex_intr_redist, cnex_ssp);
+
+	/* destroy mutex */
+	mutex_destroy(&cnex_ssp->clist_lock);
+
+	/* free soft state structure */
+	ddi_soft_state_free(cnex_state, instance);
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+/*
+ * Character device open entry point. Only validates that the open is
+ * of character type and that soft state exists for the minor/instance.
+ */
+static int
+cnex_open(dev_t *devp, int flags, int otyp, cred_t *credp)
+{
+	int instance;
+
+	if (otyp != OTYP_CHR)
+		return (EINVAL);
+
+	/* minor number encodes the instance (see cnex_attach) */
+	instance = getminor(*devp);
+	if (ddi_get_soft_state(cnex_state, instance) == NULL)
+		return (ENXIO);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+/*
+ * Character device close entry point. Mirror of cnex_open; performs
+ * only the same type/soft-state validation and no teardown.
+ */
+static int
+cnex_close(dev_t dev, int flags, int otyp, cred_t *credp)
+{
+	int instance;
+
+	if (otyp != OTYP_CHR)
+		return (EINVAL);
+
+	instance = getminor(dev);
+	if (ddi_get_soft_state(cnex_state, instance) == NULL)
+		return (ENXIO);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+/*
+ * ioctl entry point for the devctl minor node; all requests are
+ * forwarded to the generic nexus devctl handler.
+ */
+static int
+cnex_ioctl(dev_t dev,
+    int cmd, intptr_t arg, int mode, cred_t *cred_p, int *rval_p)
+{
+	int instance;
+	cnex_soft_state_t *cnex_ssp;
+
+	instance = getminor(dev);
+	if ((cnex_ssp = ddi_get_soft_state(cnex_state, instance)) == NULL)
+		return (ENXIO);
+	ASSERT(cnex_ssp->devi);
+	return (ndi_devctl_ioctl(cnex_ssp->devi, cmd, arg, mode, 0));
+}
+
+/*
+ * Bus ctl entry point for the channel nexus.
+ *
+ * Handles child init/uninit (unit address derived from the child's
+ * "reg" property) and REPORTDEV; rejects ops that make no sense for
+ * channel devices; passes everything else up to ddi_ctlops().
+ */
+static int
+cnex_ctl(dev_info_t *dip, dev_info_t *rdip, ddi_ctl_enum_t ctlop,
+    void *arg, void *result)
+{
+	char		name[MAXNAMELEN];
+	uint32_t	reglen;
+	int		*cnex_regspec;
+
+	switch (ctlop) {
+	case DDI_CTLOPS_REPORTDEV:
+		if (rdip == NULL)
+			return (DDI_FAILURE);
+		cmn_err(CE_CONT, "?channel-device: %s%d\n",
+		    ddi_driver_name(rdip), ddi_get_instance(rdip));
+		return (DDI_SUCCESS);
+
+	case DDI_CTLOPS_INITCHILD:
+	{
+		dev_info_t *child = (dev_info_t *)arg;
+
+		if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, child,
+			DDI_PROP_DONTPASS, "reg",
+			&cnex_regspec, &reglen) != DDI_SUCCESS) {
+			return (DDI_FAILURE);
+		}
+
+		/* unit address is the first "reg" entry, in hex */
+		(void) snprintf(name, sizeof (name), "%x", *cnex_regspec);
+		ddi_set_name_addr(child, name);
+		ddi_set_parent_data(child, NULL);
+		ddi_prop_free(cnex_regspec);
+		return (DDI_SUCCESS);
+	}
+
+	case DDI_CTLOPS_UNINITCHILD:
+	{
+		dev_info_t *child = (dev_info_t *)arg;
+
+		NDI_CONFIG_DEBUG((CE_NOTE,
+		    "DDI_CTLOPS_UNINITCHILD(%s, instance=%d)",
+		    ddi_driver_name(child), DEVI(child)->devi_instance));
+
+		ddi_set_name_addr(child, NULL);
+
+		return (DDI_SUCCESS);
+	}
+
+	case DDI_CTLOPS_DMAPMAPC:
+	case DDI_CTLOPS_REPORTINT:
+	case DDI_CTLOPS_REGSIZE:
+	case DDI_CTLOPS_NREGS:
+	case DDI_CTLOPS_SIDDEV:
+	case DDI_CTLOPS_SLAVEONLY:
+	case DDI_CTLOPS_AFFINITY:
+	case DDI_CTLOPS_POKE:
+	case DDI_CTLOPS_PEEK:
+		/*
+		 * These ops correspond to functions that "shouldn't" be called
+		 * by a channel-device driver.  So we whine when we're called.
+		 */
+		cmn_err(CE_WARN, "%s%d: invalid op (%d) from %s%d\n",
+		    ddi_driver_name(dip), ddi_get_instance(dip), ctlop,
+		    ddi_driver_name(rdip), ddi_get_instance(rdip));
+		return (DDI_FAILURE);
+
+	case DDI_CTLOPS_ATTACH:
+	case DDI_CTLOPS_BTOP:
+	case DDI_CTLOPS_BTOPR:
+	case DDI_CTLOPS_DETACH:
+	case DDI_CTLOPS_DVMAPAGESIZE:
+	case DDI_CTLOPS_IOMIN:
+	case DDI_CTLOPS_POWER:
+	case DDI_CTLOPS_PTOB:
+	default:
+		/*
+		 * Everything else (e.g. PTOB/BTOP/BTOPR requests) we pass up
+		 */
+		return (ddi_ctlops(dip, rdip, ctlop, arg, result));
+	}
+}
+
+/* -------------------------------------------------------------------------- */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/dr_cpu.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,1151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v CPU DR Module
+ */
+
+#include <sys/modctl.h>
+#include <sys/processor.h>
+#include <sys/cpuvar.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/note.h>
+#include <sys/sysevent/dr.h>
+#include <sys/hypervisor_api.h>
+#include <sys/mach_descrip.h>
+#include <sys/mdesc.h>
+#include <sys/ds.h>
+#include <sys/dr_util.h>
+#include <sys/dr_cpu.h>
+#include <sys/promif.h>
+#include <sys/machsystm.h>
+
+
+/* Misc-module linkage for the sun4v CPU DR module */
+static struct modlmisc modlmisc = {
+	&mod_miscops,
+	"sun4v CPU DR %I%"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modlmisc,
+	NULL
+};
+
+/*
+ * Global DS Handle
+ *
+ * Set by dr_cpu_reg_handler() when the domain service registers and
+ * reset to DS_INVALID_HDL by dr_cpu_unreg_handler(); used by
+ * dr_cpu_data_handler() to send responses.
+ */
+static ds_svc_hdl_t ds_handle;
+
+/*
+ * Supported DS Capability Versions
+ */
+static ds_ver_t		dr_cpu_vers[] = { { 1, 0 } };
+#define	DR_CPU_NVERS	(sizeof (dr_cpu_vers) / sizeof (dr_cpu_vers[0]))
+
+/*
+ * DS Capability Description
+ */
+static ds_capability_t dr_cpu_cap = {
+	DR_CPU_DS_ID,		/* svc_id */
+	dr_cpu_vers,		/* vers */
+	DR_CPU_NVERS		/* nvers */
+};
+
+/*
+ * DS Callbacks
+ */
+static void dr_cpu_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
+static void dr_cpu_unreg_handler(ds_cb_arg_t arg);
+static void dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
+
+/*
+ * DS Client Ops Vector
+ */
+static ds_clnt_ops_t dr_cpu_ops = {
+	dr_cpu_reg_handler,	/* ds_reg_cb */
+	dr_cpu_unreg_handler,	/* ds_unreg_cb */
+	dr_cpu_data_handler,	/* ds_data_cb */
+	NULL			/* cb_arg */
+};
+
+/*
+ * Internal Functions
+ */
+static int dr_cpu_init(void);
+static int dr_cpu_fini(void);
+
+static int dr_cpu_list_configure(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
+static int dr_cpu_list_unconfigure(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
+static int dr_cpu_list_status(dr_cpu_hdr_t *, dr_cpu_hdr_t **, int *);
+
+static int dr_cpu_unconfigure(processorid_t, int *status, boolean_t force);
+static int dr_cpu_configure(processorid_t, int *status);
+static int dr_cpu_status(processorid_t, int *status);
+
+static int dr_cpu_probe(processorid_t newcpuid);
+static int dr_cpu_deprobe(processorid_t cpuid);
+
+static dev_info_t *dr_cpu_find_node(processorid_t cpuid);
+static mde_cookie_t dr_cpu_find_node_md(processorid_t, md_t *, mde_cookie_t *);
+
+
+/*
+ * Module load entry point. Refuses to load when CPU DR is disabled,
+ * registers the DR CPU domain service, then installs the module;
+ * the DS capability is torn down again if mod_install fails.
+ */
+int
+_init(void)
+{
+	int	status;
+
+	/* check that CPU DR is enabled */
+	if (dr_is_disabled(DR_TYPE_CPU)) {
+		cmn_err(CE_CONT, "!CPU DR is disabled\n");
+		return (-1);
+	}
+
+	if ((status = dr_cpu_init()) != 0) {
+		cmn_err(CE_NOTE, "CPU DR initialization failed");
+		return (status);
+	}
+
+	if ((status = mod_install(&modlinkage)) != 0) {
+		(void) dr_cpu_fini();
+	}
+
+	return (status);
+}
+
+/* Module info entry point -- standard modlinkage boilerplate. */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/* Patchable knob: unload is refused unless this is set non-zero. */
+int dr_cpu_allow_unload;
+
+/*
+ * Module unload entry point. Gated by dr_cpu_allow_unload; on a
+ * successful mod_remove the DS capability is unregistered.
+ */
+int
+_fini(void)
+{
+	int	status;
+
+	if (dr_cpu_allow_unload == 0)
+		return (EBUSY);
+
+	if ((status = mod_remove(&modlinkage)) == 0) {
+		(void) dr_cpu_fini();
+	}
+
+	return (status);
+}
+
+/*
+ * Register the DR CPU capability with the Domain Services framework.
+ * Returns 0 on success, -1 on ds_cap_init failure.
+ */
+static int
+dr_cpu_init(void)
+{
+	int	rv;
+
+	if ((rv = ds_cap_init(&dr_cpu_cap, &dr_cpu_ops)) != 0) {
+		cmn_err(CE_NOTE, "ds_cap_init failed: %d", rv);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * Unregister the DR CPU capability from the Domain Services framework.
+ * Returns 0 on success, -1 on ds_cap_fini failure.
+ */
+static int
+dr_cpu_fini(void)
+{
+	int	rv;
+
+	if ((rv = ds_cap_fini(&dr_cpu_cap)) != 0) {
+		cmn_err(CE_NOTE, "ds_cap_fini failed: %d", rv);
+		return (-1);
+	}
+
+	return (0);
+}
+
+/*
+ * DS registration callback: records the service handle used by
+ * dr_cpu_data_handler() to send responses.
+ */
+static void
+dr_cpu_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
+{
+	DR_DBG_CPU("reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n", arg,
+	    ver->major, ver->minor, hdl);
+
+	ds_handle = hdl;
+}
+
+/*
+ * DS unregistration callback: invalidates the cached service handle.
+ */
+static void
+dr_cpu_unreg_handler(ds_cb_arg_t arg)
+{
+	DR_DBG_CPU("unreg_handler: arg=0x%p\n", arg);
+
+	ds_handle = DS_INVALID_HDL;
+}
+
+/*
+ * DS data callback: entry point for incoming DR CPU requests.
+ *
+ * Validates the message (minimum length, non-NULL, 1..NCPU records),
+ * dispatches on msg_type to the configure/unconfigure/status list
+ * handlers, and sends back either the handler-built response or a
+ * stack-local DR_CPU_ERROR response when validation or the handler
+ * failed. Handler-allocated responses are freed after sending.
+ */
+static void
+dr_cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	dr_cpu_hdr_t	*req = buf;
+	dr_cpu_hdr_t	err_resp;
+	dr_cpu_hdr_t	*resp = &err_resp;	/* error resp until a handler succeeds */
+	int		resp_len = 0;
+	int		rv;
+
+	/*
+	 * Sanity check the message
+	 */
+	if (buflen < sizeof (dr_cpu_hdr_t)) {
+		DR_DBG_CPU("incoming message short: expected at least %ld "
+		    "bytes, received %ld\n", sizeof (dr_cpu_hdr_t), buflen);
+		goto done;
+	}
+
+	if (req == NULL) {
+		DR_DBG_CPU("empty message: expected at least %ld bytes\n",
+		    sizeof (dr_cpu_hdr_t));
+		goto done;
+	}
+
+	DR_DBG_CPU("incoming request:\n");
+	DR_DBG_DUMP_MSG(buf, buflen);
+
+	if (req->num_records > NCPU) {
+		DR_DBG_CPU("CPU list too long: %d when %d is the maximum\n",
+		    req->num_records, NCPU);
+		goto done;
+	}
+
+	if (req->num_records == 0) {
+		DR_DBG_CPU("No CPU specified for operation\n");
+		goto done;
+	}
+
+	/*
+	 * Process the command
+	 */
+	switch (req->msg_type) {
+	case DR_CPU_CONFIGURE:
+		if ((rv = dr_cpu_list_configure(req, &resp, &resp_len)) != 0)
+			DR_DBG_CPU("dr_cpu_list_configure failed (%d)\n", rv);
+		break;
+
+	case DR_CPU_UNCONFIGURE:
+	case DR_CPU_FORCE_UNCONFIG:
+		if ((rv = dr_cpu_list_unconfigure(req, &resp, &resp_len)) != 0)
+			DR_DBG_CPU("dr_cpu_list_unconfigure failed (%d)\n", rv);
+		break;
+
+	case DR_CPU_STATUS:
+		if ((rv = dr_cpu_list_status(req, &resp, &resp_len)) != 0)
+			DR_DBG_CPU("dr_cpu_list_status failed (%d)\n", rv);
+		break;
+
+	default:
+		cmn_err(CE_NOTE, "unsupported DR operation (%d)",
+		    req->msg_type);
+		break;
+	}
+
+done:
+	/* check if an error occurred (resp still points at err_resp) */
+	if (resp == &err_resp) {
+		resp->req_num = (req) ? req->req_num : 0;
+		resp->msg_type = DR_CPU_ERROR;
+		resp->num_records = 0;
+		resp_len = sizeof (dr_cpu_hdr_t);
+	}
+
+	/* send back the response */
+	if (ds_cap_send(ds_handle, resp, resp_len) != 0) {
+		DR_DBG_CPU("ds_send failed\n");
+	}
+
+	/* free any allocated memory */
+	if (resp != &err_resp) {
+		kmem_free(resp, resp_len);
+	}
+}
+
+/*
+ * Do not modify result buffer or length on error.
+ *
+ * Configure each cpuid in the request's trailing uint32_t array,
+ * building a DR_CPU_OK response with one dr_cpu_stat_t per record.
+ * The response is kmem_zalloc'd here; the caller frees it after
+ * sending. Always returns 0.
+ */
+static int
+dr_cpu_list_configure(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
+{
+	int		idx;
+	int		result;
+	int		status;
+	int		rlen;
+	uint32_t	*cpuids;
+	dr_cpu_hdr_t	*rp;
+	dr_cpu_stat_t	*stat;
+
+	/* the incoming array of cpuids to configure */
+	cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t));
+
+	/* allocate a response message */
+	rlen = sizeof (dr_cpu_hdr_t);
+	rlen += req->num_records * sizeof (dr_cpu_stat_t);
+	rp = kmem_zalloc(rlen, KM_SLEEP);
+
+	/* fill in the known data */
+	rp->req_num = req->req_num;
+	rp->msg_type = DR_CPU_OK;
+	rp->num_records = req->num_records;
+
+	/* stat array for the response */
+	stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t));
+
+	/* configure each of the CPUs */
+	for (idx = 0; idx < req->num_records; idx++) {
+
+		result = dr_cpu_configure(cpuids[idx], &status);
+
+		/* save off results of the configure */
+		stat[idx].cpuid = cpuids[idx];
+		stat[idx].result = result;
+		stat[idx].status = status;
+	}
+
+	*resp = rp;
+	*resp_len = rlen;
+
+	/* notify userland of the (possible) topology change */
+	dr_generate_event(DR_TYPE_CPU, SE_HINT_INSERT);
+
+	return (0);
+}
+
+/*
+ * Pre-unconfigure check: for each requested cpuid that has a CPU
+ * structure, walk all active non-system processes looking for threads
+ * bound to that CPU. Any CPU with bound threads has its stat entry
+ * marked DR_CPU_RES_BLOCKED / DR_CPU_STAT_CONFIGURED so the caller
+ * skips it. Takes cpu_lock, pidlock and each p_lock in turn.
+ */
+static void
+dr_cpu_check_cpus(uint32_t *cpuids, int ncpus, dr_cpu_stat_t *stat)
+{
+	int		idx;
+	kthread_t	*tp;
+	proc_t		*pp;
+
+	DR_DBG_CPU("dr_cpu_check_cpus...\n");
+
+	mutex_enter(&cpu_lock);
+
+	/* process each cpu that is part of the request */
+	for (idx = 0; idx < ncpus; idx++) {
+
+		/* no CPU struct -> nothing to check for this cpuid */
+		if (cpu_get(cpuids[idx]) == NULL)
+			continue;
+
+		mutex_enter(&pidlock);
+
+		/*
+		 * Walk the active processes, checking if each
+		 * thread belonging to the process is bound.
+		 */
+		for (pp = practive; pp != NULL; pp = pp->p_next) {
+			mutex_enter(&pp->p_lock);
+			tp = pp->p_tlist;
+
+			/* skip system processes and empty thread lists */
+			if (tp == NULL || (pp->p_flag & SSYS)) {
+				mutex_exit(&pp->p_lock);
+				continue;
+			}
+
+			do {
+				if (tp->t_bind_cpu != cpuids[idx])
+					continue;
+
+				DR_DBG_CPU("thread(s) bound to cpu %d\n",
+				    cpuids[idx]);
+
+				stat[idx].cpuid = cpuids[idx];
+				stat[idx].result = DR_CPU_RES_BLOCKED;
+				stat[idx].status = DR_CPU_STAT_CONFIGURED;
+				break;
+
+			} while ((tp = tp->t_forw) != pp->p_tlist);
+			mutex_exit(&pp->p_lock);
+		}
+
+		mutex_exit(&pidlock);
+	}
+
+	mutex_exit(&cpu_lock);
+}
+
+/*
+ * Do not modify result buffer or length on error.
+ *
+ * Unconfigure each cpuid in the request. Unless this is a
+ * DR_CPU_FORCE_UNCONFIG request, dr_cpu_check_cpus() first marks
+ * CPUs with bound threads as blocked and those entries are skipped.
+ * Builds a DR_CPU_OK response with a stat entry per record; the
+ * caller frees it after sending. Always returns 0.
+ */
+static int
+dr_cpu_list_unconfigure(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
+{
+	int		idx;
+	int		result;
+	int		status;
+	int		rlen;
+	uint32_t	*cpuids;
+	dr_cpu_hdr_t	*rp;
+	dr_cpu_stat_t	*stat;
+	boolean_t	force;
+
+	/* the incoming array of cpuids to configure */
+	cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t));
+
+	/* check if this is a forced unconfigured */
+	force = (req->msg_type == DR_CPU_FORCE_UNCONFIG) ? B_TRUE : B_FALSE;
+
+	/* allocate a response message */
+	rlen = sizeof (dr_cpu_hdr_t);
+	rlen += req->num_records * sizeof (dr_cpu_stat_t);
+	rp = kmem_zalloc(rlen, KM_SLEEP);
+
+	/* fill in the known data */
+	rp->req_num = req->req_num;
+	rp->msg_type = DR_CPU_OK;
+	rp->num_records = req->num_records;
+
+	/* stat array for the response (zeroed by kmem_zalloc above) */
+	stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t));
+
+	/*
+	 * If the operation is not a forced unconfigure,
+	 * perform secondary checks for things that would
+	 * prevent an operation.
+	 */
+	if (!force)
+		dr_cpu_check_cpus(cpuids, req->num_records, stat);
+
+	/* unconfigure each of the CPUs */
+	for (idx = 0; idx < req->num_records; idx++) {
+
+		/*
+		 * skip this cpu if it is already marked as blocked
+		 * NOTE(review): relies on the zeroed stat array, i.e. on
+		 * DR_CPU_RES_BLOCKED being non-zero -- confirm.
+		 */
+		if (stat[idx].result == DR_CPU_RES_BLOCKED)
+			continue;
+
+		result = dr_cpu_unconfigure(cpuids[idx], &status, force);
+
+		/* save off results of the unconfigure */
+		stat[idx].cpuid = cpuids[idx];
+		stat[idx].result = result;
+		stat[idx].status = status;
+	}
+
+	*resp = rp;
+	*resp_len = rlen;
+
+	/* notify userland of the (possible) topology change */
+	dr_generate_event(DR_TYPE_CPU, SE_HINT_REMOVE);
+
+	return (0);
+}
+
+/*
+ * Do not modify result buffer or length on error.
+ *
+ * Report the status of each cpuid in the request. CPUs without an OS
+ * CPU structure (dr_cpu_status returns DR_CPU_RES_FAILURE) trigger a
+ * second pass that consults the machine description to distinguish
+ * "present but unconfigured" from "not present". Builds a
+ * DR_CPU_STATUS response; the caller frees it. Always returns 0.
+ */
+static int
+dr_cpu_list_status(dr_cpu_hdr_t *req, dr_cpu_hdr_t **resp, int *resp_len)
+{
+	int		idx;
+	int		result;
+	int		status;
+	int		rlen;
+	uint32_t	*cpuids;
+	dr_cpu_hdr_t	*rp;
+	dr_cpu_stat_t	*stat;
+	md_t		*mdp = NULL;
+	int		num_nodes;
+	int		listsz;
+	mde_cookie_t	*listp = NULL;
+	mde_cookie_t	cpunode;
+	boolean_t	walk_md = B_FALSE;
+
+	/* the incoming array of cpuids to configure */
+	cpuids = (uint32_t *)((caddr_t)req + sizeof (dr_cpu_hdr_t));
+
+	/* allocate a response message */
+	rlen = sizeof (dr_cpu_hdr_t);
+	rlen += req->num_records * sizeof (dr_cpu_stat_t);
+	rp = kmem_zalloc(rlen, KM_SLEEP);
+
+	/* fill in the known data */
+	rp->req_num = req->req_num;
+	rp->msg_type = DR_CPU_STATUS;
+	rp->num_records = req->num_records;
+
+	/* stat array for the response */
+	stat = (dr_cpu_stat_t *)((caddr_t)rp + sizeof (dr_cpu_hdr_t));
+
+	/* get the status for each of the CPUs */
+	for (idx = 0; idx < req->num_records; idx++) {
+
+		result = dr_cpu_status(cpuids[idx], &status);
+
+		/* failure means no CPU struct -- resolve via the MD below */
+		if (result == DR_CPU_RES_FAILURE)
+			walk_md = B_TRUE;
+
+		/* save off results of the status */
+		stat[idx].cpuid = cpuids[idx];
+		stat[idx].result = result;
+		stat[idx].status = status;
+	}
+
+	if (walk_md == B_FALSE)
+		goto done;
+
+	/*
+	 * At least one of the cpus did not have a CPU
+	 * structure. So, consult the MD to determine if
+	 * they are present.
+	 */
+
+	if ((mdp = md_get_handle()) == NULL) {
+		DR_DBG_CPU("unable to initialize MD\n");
+		goto done;
+	}
+
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes > 0);
+
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+
+	for (idx = 0; idx < req->num_records; idx++) {
+
+		if (stat[idx].result != DR_CPU_RES_FAILURE)
+			continue;
+
+		/* check the MD for the current cpuid */
+		cpunode = dr_cpu_find_node_md(stat[idx].cpuid, mdp, listp);
+
+		stat[idx].result = DR_CPU_RES_OK;
+
+		if (cpunode == MDE_INVAL_ELEM_COOKIE) {
+			stat[idx].status = DR_CPU_STAT_NOT_PRESENT;
+		} else {
+			stat[idx].status = DR_CPU_STAT_UNCONFIGURED;
+		}
+	}
+
+	kmem_free(listp, listsz);
+
+	(void) md_fini_handle(mdp);
+
+done:
+	*resp = rp;
+	*resp_len = rlen;
+
+	return (0);
+}
+
+/*
+ * Configure (add) a single CPU into the running system.
+ *
+ * Steps: probe a device-tree node for the CPU, then under cpu_lock:
+ * cpu_configure() if no CPU struct exists, cpu_poweron() if powered
+ * off, and cpu_online() if offline. Returns a DR_CPU_RES_* code and
+ * fills *status with the resulting DR_CPU_STAT_* state.
+ */
+static int
+dr_cpu_configure(processorid_t cpuid, int *status)
+{
+	struct cpu	*cp;
+	int		rv = 0;
+
+	DR_DBG_CPU("dr_cpu_configure...\n");
+
+	/*
+	 * Build device tree node for the CPU
+	 */
+	if ((rv = dr_cpu_probe(cpuid)) != 0) {
+		DR_DBG_CPU("failed to probe CPU %d (%d)\n", cpuid, rv);
+		/* EINVAL from probe means the CPU is not in the MD */
+		if (rv == EINVAL) {
+			*status = DR_CPU_STAT_NOT_PRESENT;
+			return (DR_CPU_RES_NOT_IN_MD);
+		}
+		*status = DR_CPU_STAT_UNCONFIGURED;
+		return (DR_CPU_RES_FAILURE);
+	}
+
+	mutex_enter(&cpu_lock);
+
+	/*
+	 * Configure the CPU
+	 */
+	if ((cp = cpu_get(cpuid)) == NULL) {
+
+		if ((rv = cpu_configure(cpuid)) != 0) {
+			DR_DBG_CPU("failed to configure CPU %d (%d)\n",
+			    cpuid, rv);
+			rv = DR_CPU_RES_FAILURE;
+			*status = DR_CPU_STAT_UNCONFIGURED;
+			goto done;
+		}
+
+		DR_DBG_CPU("CPU %d configured\n", cpuid);
+
+		/* CPU struct should exist now */
+		cp = cpu_get(cpuid);
+	}
+
+	ASSERT(cp);
+
+	/*
+	 * Power on the CPU. In sun4v, this brings the stopped
+	 * CPU into the guest from the Hypervisor.
+	 */
+	if (cpu_is_poweredoff(cp)) {
+
+		if ((rv = cpu_poweron(cp)) != 0) {
+			DR_DBG_CPU("failed to power on CPU %d (%d)\n",
+			    cpuid, rv);
+			rv = DR_CPU_RES_FAILURE;
+			*status = DR_CPU_STAT_UNCONFIGURED;
+			goto done;
+		}
+
+		DR_DBG_CPU("CPU %d powered on\n", cpuid);
+	}
+
+	/*
+	 * Online the CPU
+	 */
+	if (cpu_is_offline(cp)) {
+
+		if ((rv = cpu_online(cp)) != 0) {
+			DR_DBG_CPU("failed to online CPU %d (%d)\n",
+			    cpuid, rv);
+			rv = DR_CPU_RES_FAILURE;
+			/* offline is still configured */
+			*status = DR_CPU_STAT_CONFIGURED;
+			goto done;
+		}
+
+		DR_DBG_CPU("CPU %d online\n", cpuid);
+	}
+
+	rv = DR_CPU_RES_OK;
+	*status = DR_CPU_STAT_CONFIGURED;
+
+done:
+	mutex_exit(&cpu_lock);
+
+	return (rv);
+}
+
+/*
+ * Transition the CPU specified by 'cpuid' to the unconfigured
+ * state: offline it, power it off, destroy its OS state, and tear
+ * down its device tree node. 'force' selects a forced offline
+ * (CPU_FORCED). On return, '*status' holds the resulting
+ * DR_CPU_STAT_* state and the return value is a DR_CPU_RES_*
+ * result code.
+ */
+static int
+dr_cpu_unconfigure(processorid_t cpuid, int *status, boolean_t force)
+{
+	struct cpu	*cp;
+	int		rv = 0;
+	int		cpu_flags;
+
+	DR_DBG_CPU("dr_cpu_unconfigure%s...\n", (force) ? " (force)" : "");
+
+	mutex_enter(&cpu_lock);
+
+	cp = cpu_get(cpuid);
+
+	if (cp == NULL) {
+
+		/*
+		 * The OS CPU structures are already torn down.
+		 * Attempt to deprobe the CPU to make sure the
+		 * device tree is up to date.
+		 */
+		if (dr_cpu_deprobe(cpuid) != 0) {
+			DR_DBG_CPU("failed to deprobe CPU %d\n", cpuid);
+			rv = DR_CPU_RES_FAILURE;
+			*status = DR_CPU_STAT_UNCONFIGURED;
+			goto done;
+		}
+
+		/* nothing else to do; rv is still 0 (DR_CPU_RES_OK) */
+		goto done;
+	}
+
+	ASSERT(cp->cpu_id == cpuid);
+
+	/*
+	 * Offline the CPU
+	 */
+	if (cpu_is_active(cp)) {
+
+		/* set the force flag correctly */
+		cpu_flags = (force) ? CPU_FORCED : 0;
+
+		if ((rv = cpu_offline(cp, cpu_flags)) != 0) {
+			DR_DBG_CPU("failed to offline CPU %d (%d)\n",
+			    cpuid, rv);
+
+			rv = DR_CPU_RES_FAILURE;
+			*status = DR_CPU_STAT_CONFIGURED;
+			goto done;
+		}
+
+		DR_DBG_CPU("CPU %d offline\n", cpuid);
+	}
+
+	/*
+	 * Power off the CPU. In sun4v, this puts the running
+	 * CPU into the stopped state in the Hypervisor.
+	 */
+	if (!cpu_is_poweredoff(cp)) {
+
+		if ((rv = cpu_poweroff(cp)) != 0) {
+			DR_DBG_CPU("failed to power off CPU %d (%d)\n",
+			    cpuid, rv);
+			rv = DR_CPU_RES_FAILURE;
+			*status = DR_CPU_STAT_CONFIGURED;
+			goto done;
+		}
+
+		DR_DBG_CPU("CPU %d powered off\n", cpuid);
+	}
+
+	/*
+	 * Unconfigure the CPU
+	 */
+	if ((rv = cpu_unconfigure(cpuid)) != 0) {
+		DR_DBG_CPU("failed to unconfigure CPU %d (%d)\n", cpuid, rv);
+		rv = DR_CPU_RES_FAILURE;
+		*status = DR_CPU_STAT_UNCONFIGURED;
+		goto done;
+	}
+
+	DR_DBG_CPU("CPU %d unconfigured\n", cpuid);
+
+	/*
+	 * Tear down device tree.
+	 */
+	if ((rv = dr_cpu_deprobe(cpuid)) != 0) {
+		DR_DBG_CPU("failed to deprobe CPU %d (%d)\n", cpuid, rv);
+		rv = DR_CPU_RES_FAILURE;
+		*status = DR_CPU_STAT_UNCONFIGURED;
+		goto done;
+	}
+
+	rv = DR_CPU_RES_OK;
+	*status = DR_CPU_STAT_UNCONFIGURED;
+
+done:
+	mutex_exit(&cpu_lock);
+
+	return (rv);
+}
+
+/*
+ * Determine the state of a CPU. If the CPU structure is not present,
+ * it does not attempt to determine whether or not the CPU is in the
+ * MD. It is more efficient to do this at the higher level for all
+ * CPUs since it may not even be necessary to search the MD if all
+ * the CPUs are accounted for. Returns DR_CPU_RES_OK if the CPU
+ * structure is present, and DR_CPU_RES_FAILURE otherwise as a signal
+ * that an MD walk is necessary.
+ */
+static int
+dr_cpu_status(processorid_t cpuid, int *status)
+{
+	int		rv;
+	struct cpu	*cp;
+
+	DR_DBG_CPU("dr_cpu_status...\n");
+
+	/* CPU state is examined under cpu_lock */
+	mutex_enter(&cpu_lock);
+
+	if ((cp = cpu_get(cpuid)) == NULL) {
+		/* need to check if cpu is in the MD */
+		rv = DR_CPU_RES_FAILURE;
+		goto done;
+	}
+
+	if (cpu_is_poweredoff(cp)) {
+		/*
+		 * The CPU is powered off, so it is considered
+		 * unconfigured from the service entity point of
+		 * view. The CPU is not available to the system
+		 * and intervention by the service entity would
+		 * be required to change that.
+		 */
+		*status = DR_CPU_STAT_UNCONFIGURED;
+	} else {
+		/*
+		 * The CPU is powered on, so it is considered
+		 * configured from the service entity point of
+		 * view. It is available for use by the system
+		 * and service entities are not concerned about
+		 * the operational status (offline, online, etc.)
+		 * of the CPU in terms of DR.
+		 */
+		*status = DR_CPU_STAT_CONFIGURED;
+	}
+
+	rv = DR_CPU_RES_OK;
+
+done:
+	mutex_exit(&cpu_lock);
+
+	return (rv);
+}
+
+/*
+ * Argument passed to the new_cpu_node() branch create callback.
+ * Carries the MD handle and the CPU's MD node cookie in, and
+ * returns the newly created device node out.
+ */
+typedef struct {
+	md_t		*mdp;		/* MD handle */
+	mde_cookie_t	cpunode;	/* MD cookie for the CPU node */
+	dev_info_t	*dip;		/* created device node (output) */
+} cb_arg_t;
+
+/* maximum number of strings stored for the 'compatible' property */
+#define	STR_ARR_LEN	5
+
+/*
+ * Branch create callback (sid_branch_create) used by dr_cpu_probe().
+ * Initializes a new CPU device node with the 'name', 'compatible',
+ * 'device_type', 'clock-frequency' and 'reg' properties, using
+ * values read from the CPU's MD node (passed in via a cb_arg_t).
+ * Returns DDI_WALK_TERMINATE on success, DDI_WALK_ERROR on failure.
+ */
+static int
+new_cpu_node(dev_info_t *new_node, void *arg, uint_t flags)
+{
+	_NOTE(ARGUNUSED(flags))
+
+	char		*compat;
+	uint64_t	freq;
+	uint64_t	cpuid = 0;
+	int		regbuf[4];
+	int		len = 0;
+	cb_arg_t	*cba;
+	char		*str_arr[STR_ARR_LEN];
+	char		*curr;
+	int		idx = 0;
+
+	DR_DBG_CPU("new_cpu_node...\n");
+
+	cba = (cb_arg_t *)arg;
+
+	/*
+	 * Add 'name' property
+	 */
+	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
+	    "name", "cpu") != DDI_SUCCESS) {
+		DR_DBG_CPU("new_cpu_node: failed to create 'name' property\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	/*
+	 * Add 'compatible' property
+	 */
+	if (md_get_prop_data(cba->mdp, cba->cpunode, "compatible",
+	    (uint8_t **)(&compat), &len)) {
+		DR_DBG_CPU("new_cpu_node: failed to read 'compatible' property "
+		    "from MD\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	DR_DBG_CPU("'compatible' len is %d\n", len);
+
+	/*
+	 * Parse the MD string array. The MD property value is a
+	 * sequence of NUL-terminated strings packed back to back.
+	 */
+	curr = compat;
+	while (curr < (compat + len)) {
+
+		DR_DBG_CPU("adding '%s' to 'compatible' property\n", curr);
+
+		str_arr[idx++] = curr;
+		curr += strlen(curr) + 1;
+
+		/* cap the array at STR_ARR_LEN entries */
+		if (idx == STR_ARR_LEN) {
+			DR_DBG_CPU("exceeded str_arr len (%d)\n", STR_ARR_LEN);
+			break;
+		}
+	}
+
+	if (ndi_prop_update_string_array(DDI_DEV_T_NONE, new_node,
+	    "compatible", str_arr, idx) != DDI_SUCCESS) {
+		DR_DBG_CPU("new_cpu_node: failed to create 'compatible' "
+		    "property\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	/*
+	 * Add 'device_type' property
+	 */
+	if (ndi_prop_update_string(DDI_DEV_T_NONE, new_node,
+	    "device_type", "cpu") != DDI_SUCCESS) {
+		DR_DBG_CPU("new_cpu_node: failed to create 'device_type' "
+		    "property\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	/*
+	 * Add 'clock-frequency' property
+	 */
+	if (md_get_prop_val(cba->mdp, cba->cpunode, "clock-frequency", &freq)) {
+		DR_DBG_CPU("new_cpu_node: failed to read 'clock-frequency' "
+		    "property from MD\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	if (ndi_prop_update_int(DDI_DEV_T_NONE, new_node,
+	    "clock-frequency", freq) != DDI_SUCCESS) {
+		DR_DBG_CPU("new_cpu_node: failed to create 'clock-frequency' "
+		    "property\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	/*
+	 * Add 'reg' (cpuid) property
+	 */
+	if (md_get_prop_val(cba->mdp, cba->cpunode, "id", &cpuid)) {
+		DR_DBG_CPU("new_cpu_node: failed to read 'id' property "
+		    "from MD\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	DR_DBG_CPU("new cpuid=0x%lx\n", cpuid);
+
+	bzero(regbuf, 4 * sizeof (int));
+	/*
+	 * Encode the cpuid into the first 'reg' cell; this is the
+	 * encoding that dr_cpu_check_node() reverses with
+	 * PROM_CFGHDL_TO_CPUID.
+	 */
+	regbuf[0] = 0xc0000000 | cpuid;
+
+	if (ndi_prop_update_int_array(DDI_DEV_T_NONE, new_node,
+	    "reg", regbuf, 4) != DDI_SUCCESS) {
+		DR_DBG_CPU("new_cpu_node: failed to create 'reg' property\n");
+		return (DDI_WALK_ERROR);
+	}
+
+	/* hand the new node back to the caller */
+	cba->dip = new_node;
+
+	return (DDI_WALK_TERMINATE);
+}
+
+/*
+ * Create a device tree node for the CPU specified by 'cpuid' using
+ * properties from the corresponding MD node. Returns 0 on success
+ * (including when the node already exists), EINVAL if the CPU is
+ * not present in the MD, and -1 on any other failure.
+ */
+static int
+dr_cpu_probe(processorid_t cpuid)
+{
+	dev_info_t	*pdip;
+	dev_info_t	*dip;
+	devi_branch_t	br;
+	md_t		*mdp = NULL;
+	int		num_nodes;
+	int		rv = 0;
+	int		listsz;
+	mde_cookie_t	*listp = NULL;
+	cb_arg_t	cba;
+	mde_cookie_t	cpunode;
+
+	if ((dip = dr_cpu_find_node(cpuid)) != NULL) {
+		/* nothing to do; drop the hold taken by the lookup */
+		e_ddi_branch_rele(dip);
+		return (0);
+	}
+
+	if ((mdp = md_get_handle()) == NULL) {
+		DR_DBG_CPU("unable to initialize machine description\n");
+		return (-1);
+	}
+
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes > 0);
+
+	/* scratch array sized for the worst case: every node in the MD */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+
+	cpunode = dr_cpu_find_node_md(cpuid, mdp, listp);
+
+	if (cpunode == MDE_INVAL_ELEM_COOKIE) {
+		rv = EINVAL;
+		goto done;
+	}
+
+	/* pass in MD cookie for CPU */
+	cba.mdp = mdp;
+	cba.cpunode = cpunode;
+
+	/* create the node as a child of the root, via new_cpu_node() */
+	br.arg = (void *)&cba;
+	br.type = DEVI_BRANCH_SID;
+	br.create.sid_branch_create = new_cpu_node;
+	br.devi_branch_callback = NULL;
+	pdip = ddi_root_node();
+
+	if ((rv = e_ddi_branch_create(pdip, &br, NULL, 0))) {
+		DR_DBG_CPU("e_ddi_branch_create failed: %d\n", rv);
+		rv = -1;
+		goto done;
+	}
+
+	DR_DBG_CPU("CPU %d probed\n", cpuid);
+
+	rv = 0;
+
+done:
+	if (listp)
+		kmem_free(listp, listsz);
+
+	if (mdp)
+		(void) md_fini_handle(mdp);
+
+	return (rv);
+}
+
+/*
+ * Destroy the device tree branch for the CPU specified by 'cpuid'.
+ * Returns 0 if the node is already gone or was destroyed
+ * successfully, and -1 if the branch destroy failed.
+ */
+static int
+dr_cpu_deprobe(processorid_t cpuid)
+{
+	dev_info_t	*fdip = NULL;
+	dev_info_t	*dip;
+
+	if ((dip = dr_cpu_find_node(cpuid)) == NULL) {
+		DR_DBG_CPU("cpuid %d already deprobed\n", cpuid);
+		return (0);
+	}
+
+	/* dr_cpu_find_node() returns the branch held */
+	ASSERT(e_ddi_branch_held(dip));
+
+	if (e_ddi_branch_destroy(dip, &fdip, 0)) {
+		char *path = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+		/*
+		 * If non-NULL, fdip is held and must be released.
+		 */
+		if (fdip != NULL) {
+			(void) ddi_pathname(fdip, path);
+			ddi_release_devi(fdip);
+		} else {
+			(void) ddi_pathname(dip, path);
+		}
+		cmn_err(CE_NOTE, "node removal failed: %s (%p)",
+		    path, (fdip) ? (void *)fdip : (void *)dip);
+
+		kmem_free(path, MAXPATHLEN);
+
+		return (-1);
+	}
+
+	DR_DBG_CPU("CPU %d deprobed\n", cpuid);
+
+	return (0);
+}
+
+/* argument for the dr_cpu_check_node() device tree walk */
+typedef struct {
+	processorid_t	cpuid;	/* CPU being searched for */
+	dev_info_t	*dip;	/* matching node, if found (output) */
+} dr_search_arg_t;
+
+/*
+ * ddi_walk_devs() callback used by dr_cpu_find_node(). Prunes any
+ * branch that is not a "cpu" node. On a cpuid match, stores the
+ * node (held) in the search argument and terminates the walk.
+ */
+static int
+dr_cpu_check_node(dev_info_t *dip, void *arg)
+{
+	char 		*name;
+	processorid_t	cpuid;
+	dr_search_arg_t	*sarg = (dr_search_arg_t *)arg;
+
+	if (dip == ddi_root_node()) {
+		return (DDI_WALK_CONTINUE);
+	}
+
+	name = ddi_node_name(dip);
+
+	if (strcmp(name, "cpu") != 0) {
+		return (DDI_WALK_PRUNECHILD);
+	}
+
+	/* the 'reg' property holds a config handle encoding of the cpuid */
+	cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "reg", -1);
+
+	cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
+
+	DR_DBG_CPU("found cpuid=0x%x, looking for 0x%x\n", cpuid, sarg->cpuid);
+
+	if (cpuid == sarg->cpuid) {
+		DR_DBG_CPU("matching node\n");
+
+		/* matching node must be returned held */
+		if (!e_ddi_branch_held(dip))
+			e_ddi_branch_hold(dip);
+
+		sarg->dip = dip;
+		return (DDI_WALK_TERMINATE);
+	}
+
+	return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * Walk the device tree to find the dip corresponding to the cpuid
+ * passed in. If present, the dip is returned held. The caller must
+ * release the hold on the dip once it is no longer required. If no
+ * matching node if found, NULL is returned.
+ */
+static dev_info_t *
+dr_cpu_find_node(processorid_t cpuid)
+{
+	dr_search_arg_t	arg;
+
+	DR_DBG_CPU("dr_cpu_find_node...\n");
+
+	/* seed the search argument for the tree walk */
+	arg.dip = NULL;
+	arg.cpuid = cpuid;
+
+	ddi_walk_devs(ddi_root_node(), dr_cpu_check_node, &arg);
+
+	/* any node found by the walk callback is returned held */
+	ASSERT((arg.dip == NULL) || (e_ddi_branch_held(arg.dip)));
+
+	return (arg.dip);
+}
+
+/*
+ * Look up a particular cpuid in the MD. Returns the mde_cookie_t
+ * representing that CPU if present, and MDE_INVAL_ELEM_COOKIE
+ * otherwise. It is assumed the scratch array has already been
+ * allocated so that it can accommodate the worst case scenario,
+ * every node in the MD.
+ */
+static mde_cookie_t
+dr_cpu_find_node_md(processorid_t cpuid, md_t *mdp, mde_cookie_t *listp)
+{
+	int		idx;
+	int		nnodes;
+	mde_cookie_t	rootnode;
+	uint64_t	cpuid_prop;
+	mde_cookie_t	result = MDE_INVAL_ELEM_COOKIE;
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	/*
+	 * Scan the DAG for all the CPU nodes
+	 */
+	nnodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"),
+	    md_find_name(mdp, "fwd"), listp);
+
+	if (nnodes < 0) {
+		DR_DBG_CPU("Scan for CPUs failed\n");
+		return (result);
+	}
+
+	DR_DBG_CPU("dr_cpu_find_node_md: found %d CPUs in the MD\n", nnodes);
+
+	/*
+	 * Find the CPU of interest
+	 */
+	for (idx = 0; idx < nnodes; idx++) {
+
+		if (md_get_prop_val(mdp, listp[idx], "id", &cpuid_prop)) {
+			DR_DBG_CPU("Missing 'id' property for CPU node %d\n",
+			    idx);
+			/* abandon the scan; the CPU is reported not found */
+			break;
+		}
+
+		if (cpuid_prop == cpuid) {
+			/* found a match */
+			DR_DBG_CPU("dr_cpu_find_node_md: found CPU %d "
+			    "in MD\n", cpuid);
+			result = listp[idx];
+			break;
+		}
+	}
+
+	if (result == MDE_INVAL_ELEM_COOKIE) {
+		DR_DBG_CPU("CPU %d not in MD\n", cpuid);
+	}
+
+	return (result);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/dr_util.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,206 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v DR Utility functions
+ */
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/sunddi.h>
+#include <sys/note.h>
+#include <sys/sysevent.h>
+#include <sys/sysevent/dr.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/ldoms.h>
+
+#include <sys/dr_util.h>
+
+/*
+ * Returns B_TRUE if DR operations are currently disabled,
+ * B_FALSE otherwise.
+ */
+boolean_t
+dr_is_disabled(dr_type_t type)
+{
+	/*
+	 * The type argument is currently unused. However, it
+	 * keeps the interface flexible enough to allow for
+	 * only disabling certain types of DR.
+	 */
+	_NOTE(ARGUNUSED(type))
+
+	/*
+	 * DR requires that the kernel is using its own CIF
+	 * handler. If that is not the case, either because
+	 * domaining has been explicitly disabled, or because
+	 * the firmware does not support it, the system must
+	 * remain static and DR must be disabled.
+	 */
+	if (!domaining_enabled) {
+		cmn_err(CE_NOTE, "!Kernel CIF handler is not enabled, DR "
+		    "is not available\n");
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Generate a DR sysevent based on the type of resource and
+ * sysevent hint specified. The hint indicates whether the
+ * resource was added or removed.
+ */
+void
+dr_generate_event(dr_type_t type, int se_hint)
+{
+	int			rv;
+	sysevent_id_t		eid;
+	sysevent_t		*ev = NULL;
+	sysevent_attr_list_t	*evnt_attr_list = NULL;
+	sysevent_value_t	evnt_val;
+	static char		pubname[] = SUNW_KERN_PUB"dr";
+
+	DR_DBG_ALL("generate_event: type=%s, hint=%s\n", DR_TYPE2STR(type),
+	    SE_HINT2STR(se_hint));
+
+	/*
+	 * Add the attachment point attribute
+	 */
+	ev = sysevent_alloc(EC_DR, ESC_DR_AP_STATE_CHANGE, pubname, KM_SLEEP);
+	evnt_val.value_type = SE_DATA_TYPE_STRING;
+	evnt_val.value.sv_string = DR_TYPE2STR(type);
+
+	rv = sysevent_add_attr(&evnt_attr_list, DR_AP_ID, &evnt_val, KM_SLEEP);
+	if (rv != 0) {
+		DR_DBG_ALL("generate_event: failed to add attr '%s' for "
+		    "'%s' event\n", DR_AP_ID, EC_DR);
+		goto done;
+	}
+
+	/*
+	 * Add the DR hint attribute
+	 */
+	evnt_val.value_type = SE_DATA_TYPE_STRING;
+	evnt_val.value.sv_string = SE_HINT2STR(se_hint);
+
+	rv = sysevent_add_attr(&evnt_attr_list, DR_HINT, &evnt_val, KM_SLEEP);
+	if (rv != 0) {
+		DR_DBG_ALL("generate_event: failed to add attr '%s' for "
+		    "'%s' event\n", DR_HINT, EC_DR);
+		/* the list is not yet attached to the event; free it here */
+		sysevent_free_attr(evnt_attr_list);
+		goto done;
+	}
+
+	/*
+	 * Attach the attribute list to the event. On success the
+	 * list is owned by the event and freed with it in 'done'.
+	 */
+	rv = sysevent_attach_attributes(ev, evnt_attr_list);
+	if (rv != 0) {
+		DR_DBG_ALL("generate_event: failed to add attr list for "
+		    "'%s' event\n", EC_DR);
+		sysevent_free_attr(evnt_attr_list);
+		goto done;
+	}
+
+	/*
+	 * Log the event
+	 */
+	rv = log_sysevent(ev, KM_NOSLEEP, &eid);
+	if (rv != 0) {
+		DR_DBG_ALL("generate_event: failed to log event (%d)\n", rv);
+	}
+
+done:
+	if (ev != NULL)
+		sysevent_free(ev);
+}
+
+/*
+ * Debugging Features
+ */
+#ifdef DEBUG
+
+uint_t dr_debug = 0x0;
+
+#define	BYTESPERLINE    8
+#define	LINEWIDTH	((BYTESPERLINE * 3) + (BYTESPERLINE + 2) + 1)
+#define	ASCIIOFFSET	((BYTESPERLINE * 3) + 2)
+#define	ISPRINT(c)	((c >= ' ') && (c <= '~'))
+
+/*
+ * Output a buffer formatted with a set number of bytes on
+ * each line. Append each line with the ASCII equivalent of
+ * each byte if it falls within the printable ASCII range,
+ * and '.' otherwise.
+ */
+void
+dr_dbg_dump_msg(void *buf, size_t len)
+{
+	int	i, j;
+	char	*msg = buf;
+	char	*curr;
+	char	*aoff;
+	char	line[LINEWIDTH];
+
+	/* abort if not debugging transport */
+	if (!(dr_debug & DR_DBG_FLAG_TRANS)) {
+		return;
+	}
+
+	/* walk the buffer one line at a time */
+	for (i = 0; i < len; i += BYTESPERLINE) {
+
+		bzero(line, LINEWIDTH);
+
+		curr = line;
+		aoff = line + ASCIIOFFSET;
+
+		/*
+		 * Walk the bytes in the current line, storing
+		 * the hex value for the byte as well as the
+		 * ASCII representation in a temporary buffer.
+		 * All ASCII values are placed at the end of
+		 * the line.
+		 */
+		for (j = 0; (j < BYTESPERLINE) && ((i + j) < len); j++) {
+			/*
+			 * Mask the byte to 8 bits before formatting.
+			 * 'char' may be signed, and a sign extended
+			 * value >= 0x80 would make %02x emit up to
+			 * eight digits, overflowing the three
+			 * characters reserved per byte and
+			 * overrunning 'line'.
+			 */
+			(void) sprintf(curr, " %02x", msg[i + j] & 0xff);
+			*aoff = (ISPRINT(msg[i + j])) ? msg[i + j] : '.';
+			curr += 3;
+			aoff++;
+		}
+
+		/*
+		 * Fill in to the start of the ASCII translation
+		 * with spaces. This will only be necessary if
+		 * this is the last line and there are not enough
+		 * bytes to fill the whole line.
+		 */
+		while (curr != (line + ASCIIOFFSET))
+			*curr++ = ' ';
+
+		DR_DBG_TRANS("%s\n", line);
+	}
+}
+#endif /* DEBUG */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/ds.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,2728 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Domain Services Module
+ *
+ * The Domain Services (DS) module is responsible for communication
+ * with external service entities. It provides an API for clients to
+ * publish capabilities and handles the low level communication and
+ * version negotiation required to export those capabilities to any
+ * interested service entity. Once a capability has been successfully
+ * registered with a service entity, the DS module facilitates all
+ * data transfers between the service entity and the client providing
+ * that particular capability.
+ */
+
+#include <sys/modctl.h>
+#include <sys/ksynch.h>
+#include <sys/taskq.h>
+#include <sys/disp.h>
+#include <sys/cmn_err.h>
+#include <sys/note.h>
+
+#include <sys/mach_descrip.h>
+#include <sys/mdesc.h>
+#include <sys/ldc.h>
+
+#include <sys/ds.h>
+#include <sys/ds_impl.h>
+
+/*
+ * All DS ports in the system
+ *
+ * The list of DS ports is read in from the MD when the DS module is
+ * initialized and is never modified. This eliminates the need for
+ * locking to access the port array itself. Access to the individual
+ * ports are synchronized at the port level.
+ */
+static ds_port_t	ds_ports[DS_MAX_PORTS];
+static ds_portset_t	ds_allports;	/* all DS ports in the system */
+
+/*
+ * Table of registered services
+ *
+ * Locking: Accesses to the table of services are synchronized using
+ *   a RW lock. The reader lock must be held when looking up service
+ *   information in the table. The writer lock must be held when any
+ *   service information is being modified.
+ */
+static struct ds_svcs {
+	ds_svc_t	**tbl;		/* the table itself */
+	krwlock_t	rwlock;		/* table lock */
+	uint_t		maxsvcs;	/* size of the table */
+	uint_t		nsvcs;		/* current number of items */
+} ds_svcs;
+
+/* initial size of the table */
+#define	DS_MAXSVCS_INIT		32
+
+/*
+ * Taskq for internal task processing
+ */
+static taskq_t *ds_taskq;
+static boolean_t ds_enabled;	/* enable/disable taskq processing */
+
+/*
+ * The actual required number of parallel threads is not expected
+ * to be very large. Use the maximum number of CPUs in the system
+ * as a rough upper bound.
+ */
+#define	DS_MAX_TASKQ_THR	NCPU
+#define	DS_DISPATCH(fn, arg)	taskq_dispatch(ds_taskq, fn, arg, TQ_SLEEP)
+
+/*
+ * Supported versions of the DS message protocol
+ *
+ * The version array must be sorted in order from the highest
+ * supported version to the lowest. Support for a particular
+ * <major>.<minor> version implies all lower minor versions of
+ * that same major version are supported as well.
+ */
+static ds_ver_t ds_vers[] = { { 1, 0 } };
+
+#define	DS_NUM_VER	(sizeof (ds_vers) / sizeof (ds_vers[0]))
+
+/*
+ * Results of checking version array with ds_vers_isvalid()
+ */
+typedef enum {
+	DS_VERS_OK,
+	DS_VERS_INCREASING_MAJOR_ERR,
+	DS_VERS_INCREASING_MINOR_ERR
+} ds_vers_check_t;
+
+/* incoming message handling functions */
+typedef void (*ds_msg_handler_t)(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_init_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_reg_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_req(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_ack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_unreg_nack(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_data(ds_port_t *port, caddr_t buf, size_t len);
+static void ds_handle_nack(ds_port_t *port, caddr_t buf, size_t len);
+
+/*
+ * DS Message Handler Dispatch Table
+ *
+ * A table used to dispatch all incoming messages. This table
+ * contains handlers for all the fixed message types, as well as
+ * the messages defined in the 1.0 version of the DS protocol.
+ */
+static const ds_msg_handler_t ds_msg_handlers[] = {
+	ds_handle_init_req,		/* DS_INIT_REQ */
+	ds_handle_init_ack,		/* DS_INIT_ACK */
+	ds_handle_init_nack,		/* DS_INIT_NACK */
+	ds_handle_reg_req,		/* DS_REG_REQ */
+	ds_handle_reg_ack,		/* DS_REG_ACK */
+	ds_handle_reg_nack,		/* DS_REG_NACK */
+	ds_handle_unreg_req,		/* DS_UNREG */
+	ds_handle_unreg_ack,		/* DS_UNREG_ACK */
+	ds_handle_unreg_nack,		/* DS_UNREG_NACK */
+	ds_handle_data,			/* DS_DATA */
+	ds_handle_nack			/* DS_NACK */
+};
+
+/*
+ * DS message log
+ *
+ * Locking: The message log is protected by a single mutex. This
+ *   protects all fields in the log structure itself as well as
+ *   everything in the entry structures on both the log and the
+ *   free list.
+ */
+static struct log {
+	ds_log_entry_t		*head;		/* head of the log */
+	ds_log_entry_t		*freelist;	/* head of the free list */
+	size_t			size;		/* size of the log in bytes */
+	uint32_t		nentry;		/* number of entries */
+	kmutex_t		lock;		/* log lock */
+} ds_log;
+
+/* log soft limit */
+uint_t ds_log_sz = DS_LOG_DEFAULT_SZ;
+
+/* initial pool of log entry structures */
+static ds_log_entry_t ds_log_entry_pool[DS_LOG_NPOOL];
+
+/*
+ * Debugging Features
+ */
+#ifdef DEBUG
+
+#define	DS_DBG_FLAG_LDC			0x1
+#define	DS_DBG_FLAG_LOG			0x2
+#define	DS_DBG_FLAG_ALL			0xf
+
+#define	DS_DBG				if (ds_debug) printf
+#define	DS_DBG_LDC			if (ds_debug & DS_DBG_FLAG_LDC) printf
+#define	DS_DBG_LOG			if (ds_debug & DS_DBG_FLAG_LOG) printf
+#define	DS_DUMP_LDC_MSG(buf, len)	ds_dump_ldc_msg(buf, len)
+
+uint_t ds_debug = 0;
+static void ds_dump_ldc_msg(void *buf, size_t len);
+
+#else /* DEBUG */
+
+#define	DS_DBG				_NOTE(CONSTCOND) if (0) printf
+#define	DS_DBG_LDC			DS_DBG
+#define	DS_DBG_LOG			DS_DBG
+#define	DS_DUMP_LDC_MSG(buf, len)
+
+#endif /* DEBUG */
+
+
+/* initialization functions */
+static void ds_init(void);
+static void ds_fini(void);
+static int ds_ports_init(void);
+static int ds_ports_fini(void);
+static int ds_ldc_init(ds_port_t *port);
+static int ds_ldc_fini(ds_port_t *port);
+
+/* event processing functions */
+static uint_t ds_ldc_cb(uint64_t event, caddr_t arg);
+static void ds_dispatch_event(void *arg);
+static void ds_handle_ldc_event(ds_port_t *port, int newstate);
+static int ds_recv_msg(ldc_handle_t ldc_hdl, caddr_t msgp, size_t msglen);
+static void ds_handle_recv(void *arg);
+
+/* message sending functions */
+static int ds_send_msg(ds_port_t *port, caddr_t msg, size_t msglen);
+static void ds_send_init_req(ds_port_t *port);
+static int ds_send_reg_req(ds_svc_t *svc);
+static int ds_send_unreg_req(ds_svc_t *svc);
+static void ds_send_unreg_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl);
+static void ds_send_data_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl);
+
+/* walker functions */
+typedef int (*svc_cb_t)(ds_svc_t *svc, void *arg);
+static int ds_walk_svcs(svc_cb_t svc_cb, void *arg);
+static int ds_svc_isfree(ds_svc_t *svc, void *arg);
+static int ds_svc_ismatch(ds_svc_t *svc, void *arg);
+static int ds_svc_free(ds_svc_t *svc, void *arg);
+static int ds_svc_register(ds_svc_t *svc, void *arg);
+static int ds_svc_unregister(ds_svc_t *svc, void *arg);
+static int ds_svc_port_up(ds_svc_t *svc, void *arg);
+
+/* service utilities */
+static ds_svc_t *ds_alloc_svc(void);
+static void ds_reset_svc(ds_svc_t *svc, ds_port_t *port);
+static ds_svc_t *ds_get_svc(ds_svc_hdl_t hdl);
+
+/* port utilities */
+static int ds_port_add(md_t *mdp, mde_cookie_t port, mde_cookie_t chan);
+static void ds_port_reset(ds_port_t *port);
+
+/* misc utilities */
+static ds_vers_check_t ds_vers_isvalid(ds_ver_t *vers, int nvers);
+
+/* log functions */
+static void ds_log_init(void);
+static void ds_log_fini(void);
+static int ds_log_add_msg(int32_t dest, uint8_t *msg, size_t sz);
+static int ds_log_remove(void);
+static void ds_log_purge(void *arg);
+
+
+static struct modlmisc modlmisc = {
+	&mod_miscops,
+	"Domain Services %I%"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modlmisc,
+	NULL
+};
+
+/*
+ * Standard module installation entry point. Internal DS state is
+ * set up before the ports so that channel events can be handled
+ * as soon as a port comes up.
+ */
+int
+_init(void)
+{
+	int	rv;
+
+	/*
+	 * Perform all internal setup before initializing
+	 * the DS ports. This ensures that events can be
+	 * processed as soon as the port comes up.
+	 */
+	ds_init();
+
+	if ((rv = ds_ports_init()) != 0) {
+		cmn_err(CE_WARN, "Domain Services initialization failed");
+		ds_fini();
+		return (rv);
+	}
+
+	if ((rv = mod_install(&modlinkage)) != 0) {
+		/* undo all setup on installation failure */
+		(void) ds_ports_fini();
+		ds_fini();
+	}
+
+	return (rv);
+}
+
+/* standard module information entry point */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Standard module removal entry point. DS state is torn down
+ * only if the module can actually be removed.
+ */
+int
+_fini(void)
+{
+	int	rv;
+
+	if ((rv = mod_remove(&modlinkage)) == 0) {
+		(void) ds_ports_fini();
+		ds_fini();
+	}
+
+	return (rv);
+}
+
+/*
+ * One-time internal setup: the service table, the message log,
+ * and the taskq used for event processing. Must complete before
+ * any DS port is brought up.
+ */
+static void
+ds_init(void)
+{
+	int	tblsz;
+
+	/*
+	 * Initialize table of registered service classes
+	 */
+	ds_svcs.maxsvcs = DS_MAXSVCS_INIT;
+
+	tblsz = ds_svcs.maxsvcs * sizeof (ds_svc_t *);
+	ds_svcs.tbl = kmem_zalloc(tblsz, KM_SLEEP);
+
+	rw_init(&ds_svcs.rwlock, NULL, RW_DRIVER, NULL);
+
+	ds_svcs.nsvcs = 0;
+
+	/*
+	 * Initialize the message log.
+	 */
+	ds_log_init();
+
+	/*
+	 * Create taskq for internal processing threads. This
+	 * includes processing incoming request messages and
+	 * sending out of band registration messages.
+	 */
+	ds_taskq = taskq_create("ds_taskq", 1, minclsyspri, 1,
+	    DS_MAX_TASKQ_THR, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+	/* allow the LDC callback to dispatch events from now on */
+	ds_enabled = B_TRUE;
+
+	/* catch problems with the version array */
+	ASSERT(ds_vers_isvalid(ds_vers, DS_NUM_VER) == DS_VERS_OK);
+}
+
+/*
+ * Tear down all internal DS state: disable event dispatch,
+ * destroy the taskq and message log, and free the service table.
+ * Inverse of ds_init().
+ */
+static void
+ds_fini(void)
+{
+	int	idx;
+
+	/*
+	 * Flip the enabled switch to make sure that no
+	 * incoming events get dispatched while things
+	 * are being torn down.
+	 */
+	ds_enabled = B_FALSE;
+
+	/*
+	 * Destroy the taskq.
+	 */
+	taskq_destroy(ds_taskq);
+
+	/*
+	 * Destroy the message log.
+	 */
+	ds_log_fini();
+
+	/*
+	 * Deallocate the table of registered services
+	 */
+
+	/* clear out all entries */
+	rw_enter(&ds_svcs.rwlock, RW_WRITER);
+	idx = ds_walk_svcs(ds_svc_free, NULL);
+	rw_exit(&ds_svcs.rwlock);
+
+	/* should have gone through the whole table */
+	ASSERT(idx == ds_svcs.maxsvcs);
+
+	/* destroy the table itself */
+	kmem_free(ds_svcs.tbl, ds_svcs.maxsvcs * sizeof (ds_svc_t *));
+	rw_destroy(&ds_svcs.rwlock);
+	bzero(&ds_svcs, sizeof (ds_svcs));
+}
+
+/*
+ * Initialize the list of ports based on the MD.
+ */
+/*
+ * Scan the MD for DS port nodes, add each one to the port table,
+ * and then initialize the LDC channel of every port found.
+ * Returns 0 on success, -1 on failure (after tearing down any
+ * ports that were already set up).
+ */
+static int
+ds_ports_init(void)
+{
+	int		idx;
+	int		rv;
+	md_t		*mdp;
+	int		num_nodes;
+	int		listsz;
+	mde_cookie_t	rootnode;
+	mde_cookie_t	*portp = NULL;
+	mde_cookie_t	*chanp = NULL;
+	int		nport;
+	int		nchan;
+	ds_port_t	*port;
+
+	if ((mdp = md_get_handle()) == NULL) {
+		cmn_err(CE_WARN, "unable to initialize machine description");
+		return (-1);
+	}
+
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes > 0);
+
+	/* scratch arrays sized for the worst case: every node in the MD */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+
+	/* allocate temporary storage for MD scans */
+	portp = kmem_zalloc(listsz, KM_SLEEP);
+	chanp = kmem_zalloc(listsz, KM_SLEEP);
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	/* find all the DS ports in the MD */
+	nport = md_scan_dag(mdp, rootnode, md_find_name(mdp, DS_MD_PORT_NAME),
+	    md_find_name(mdp, "fwd"), portp);
+
+	if (nport <= 0) {
+		cmn_err(CE_NOTE, "No '%s' nodes in MD", DS_MD_PORT_NAME);
+		rv = -1;
+		goto done;
+	}
+
+	/*
+	 * Initialize all the ports found in the MD.
+	 */
+	for (idx = 0; idx < nport; idx++) {
+
+		/* get the channels for this port */
+		nchan = md_scan_dag(mdp, portp[idx],
+		    md_find_name(mdp, DS_MD_CHAN_NAME),
+		    md_find_name(mdp, "fwd"), chanp);
+
+		if (nchan <= 0) {
+			cmn_err(CE_NOTE, "No '%s' node for DS port",
+			    DS_MD_CHAN_NAME);
+			rv = -1;
+			goto done;
+		}
+
+		/* expecting only one channel; extras are ignored below */
+		if (nchan != 1) {
+			DS_DBG("expected 1 '%s' node for DS port, found %d\n",
+			    DS_MD_CHAN_NAME, nchan);
+		}
+
+		if (ds_port_add(mdp, portp[idx], chanp[0]) != 0) {
+			rv = -1;
+			goto done;
+		}
+	}
+
+	/*
+	 * Initialize the LDC channel for each port.
+	 */
+	for (idx = 0; idx < DS_MAX_PORTS; idx++) {
+
+		if (!DS_PORT_IN_SET(ds_allports, idx))
+			continue;
+
+		port = &ds_ports[idx];
+
+		mutex_enter(&port->lock);
+
+		if (ds_ldc_init(port)) {
+			cmn_err(CE_WARN, "ds@%lx: ports_init: failed to "
+			    "initialize LDC %ld", port->id, port->ldc.id);
+		} else {
+			DS_DBG("ds@%lx: ports_init: initialization complete\n",
+			    port->id);
+		}
+
+		mutex_exit(&port->lock);
+	}
+
+	rv = 0;
+
+done:
+	/* on failure, tear down any ports that were already added */
+	if (rv != 0)
+		(void) ds_ports_fini();
+
+	kmem_free(portp, listsz);
+	kmem_free(chanp, listsz);
+
+	(void) md_fini_handle(mdp);
+
+	return (rv);
+}
+
+/*
+ * Tear down every port in the active port set: shut down its LDC
+ * channel (if initialized), mark it free, and remove it from the
+ * set. Always returns 0.
+ */
+static int
+ds_ports_fini(void)
+{
+	int		idx;
+	ds_port_t	*port;
+
+	/*
+	 * Tear down each initialized port.
+	 */
+	for (idx = 0; idx < DS_MAX_PORTS; idx++) {
+
+		if (!DS_PORT_IN_SET(ds_allports, idx))
+			continue;
+
+		port = &ds_ports[idx];
+
+		mutex_enter(&port->lock);
+
+		if (port->state >= DS_PORT_LDC_INIT) {
+			/* shut down the LDC for this port */
+			(void) ds_ldc_fini(port);
+		}
+
+		port->state = DS_PORT_FREE;
+
+		mutex_exit(&port->lock);
+
+		/* clean up the port structure */
+		mutex_destroy(&port->lock);
+		DS_PORTSET_DEL(ds_allports, idx);
+	}
+
+	return (0);
+}
+
+/*
+ * Initialize and bring up the LDC channel for a port: init the
+ * channel, register the event callback, open the channel, and
+ * attempt to bring it up. If the channel is already up, kick off
+ * the DS handshake by sending an init request. The caller must
+ * hold the port lock. Returns 0 on success or the ldc_* error.
+ */
+static int
+ds_ldc_init(ds_port_t *port)
+{
+	int		rv;
+	ldc_attr_t	ldc_attr;
+	caddr_t		cb_arg = (caddr_t)port;
+
+	ASSERT(MUTEX_HELD(&port->lock));
+
+	DS_DBG("ds@%lx: ldc_init: ldc_id=%ld\n", port->id, port->ldc.id);
+
+	ldc_attr.devclass = LDC_DEV_GENERIC;
+	ldc_attr.instance = 0;
+	ldc_attr.mode = LDC_MODE_STREAM;
+	ldc_attr.qlen = DS_QUEUE_LEN;
+
+	if ((rv = ldc_init(port->ldc.id, &ldc_attr, &port->ldc.hdl)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_init error (%d)",
+		    port->id, rv);
+		goto done;
+	}
+
+	/* register the LDC callback */
+	if ((rv = ldc_reg_callback(port->ldc.hdl, ds_ldc_cb, cb_arg)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_reg_callback error "
+		    "(%d)", port->id, rv);
+		goto done;
+	}
+
+	if ((rv = ldc_open(port->ldc.hdl)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_init: ldc_open error (%d)",
+		    port->id, rv);
+		goto done;
+	}
+
+	/* best effort; the callback handles the channel coming up later */
+	(void) ldc_up(port->ldc.hdl)
+
+	(void) ldc_status(port->ldc.hdl, &port->ldc.state);
+
+	DS_DBG_LDC("ds@%lx: ldc_init: initial LDC state 0x%x\n",
+	    port->id, port->ldc.state);
+
+	port->state = DS_PORT_LDC_INIT;
+
+	/* if port is up, send init message */
+	if (port->ldc.state == LDC_UP) {
+		ds_send_init_req(port);
+	}
+
+done:
+	return (rv);
+}
+
+/*
+ * Shut down a port's LDC channel: close it, unregister the receive
+ * callback, then release the channel with ldc_fini().  Stops at the
+ * first failing step and returns its error; returns 0 when all
+ * three steps succeed.
+ *
+ * The only caller (ds_ports_fini) holds port->lock; only the port
+ * state is ASSERTed here.
+ */
+static int
+ds_ldc_fini(ds_port_t *port)
+{
+	int	rv;
+
+	ASSERT(port->state >= DS_PORT_LDC_INIT);
+
+	DS_DBG("ds@%lx: ldc_fini: ldc_id=%ld\n", port->id, port->ldc.id);
+
+	if ((rv = ldc_close(port->ldc.hdl)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_close error (%d)",
+		    port->id, rv);
+		return (rv);
+	}
+
+	if ((rv = ldc_unreg_callback(port->ldc.hdl)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_unreg_callback error "
+		    "(%d)", port->id, rv);
+		return (rv);
+	}
+
+	if ((rv = ldc_fini(port->ldc.hdl)) != 0) {
+		cmn_err(CE_WARN, "ds@%lx: ldc_fini: ldc_fini error (%d)",
+		    port->id, rv);
+		return (rv);
+	}
+
+	return (rv);
+}
+
+/*
+ * A DS event consists of a buffer on a port.  Events are allocated
+ * by ds_handle_recv() and freed by ds_dispatch_event() once the
+ * message has been handled; the message buffer itself is retained
+ * by the message log (see the ownership comment in ds_handle_recv).
+ */
+typedef struct ds_event {
+	ds_port_t	*port;		/* port the message arrived on */
+	char		*buf;		/* full message, header included */
+	size_t		buflen;		/* total length of buf in bytes */
+} ds_event_t;
+
+/*
+ * LDC callback, registered per port by ds_ldc_init().  Read events
+ * are handed off to a separate thread via DS_DISPATCH so that this
+ * callback never blocks on channel I/O; UP/DOWN transitions are
+ * forwarded to ds_handle_ldc_event() under the port lock.  Always
+ * returns LDC_SUCCESS regardless of outcome.
+ */
+static uint_t
+ds_ldc_cb(uint64_t event, caddr_t arg)
+{
+	ldc_status_t	ldc_state;
+	int		rv;
+	ds_port_t	*port = (ds_port_t *)arg;
+	ldc_handle_t	ldc_hdl;
+
+	DS_DBG("ds@%lx: ds_ldc_cb...\n", port->id);
+
+	/* ds_enabled gates all callback processing (e.g. during teardown) */
+	if (!ds_enabled) {
+		DS_DBG("ds@%lx: callback handling is disabled\n", port->id);
+		return (LDC_SUCCESS);
+	}
+
+	ldc_hdl = port->ldc.hdl;
+
+	/*
+	 * Check the LDC event.
+	 */
+
+	if (event & LDC_EVT_READ) {
+		/* dispatch a thread to handle the read event */
+		if (DS_DISPATCH(ds_handle_recv, port) == NULL) {
+			cmn_err(CE_WARN, "error initiating event handler");
+		}
+		return (LDC_SUCCESS);
+	}
+
+	/* only check status if not a read event */
+	if ((rv = ldc_status(ldc_hdl, &ldc_state)) != 0) {
+		DS_DBG("ds@%lx: ldc_status error (%d)", port->id, rv);
+		return (LDC_SUCCESS);
+	}
+
+	if (event & LDC_EVT_DOWN || event & LDC_EVT_UP) {
+		mutex_enter(&port->lock);
+		ds_handle_ldc_event(port, ldc_state);
+		mutex_exit(&port->lock);
+		return (LDC_SUCCESS);
+	}
+
+	/* reset/write events need no action beyond a debug trace */
+	if (event & LDC_EVT_RESET || event & LDC_EVT_WRITE) {
+		DS_DBG("ds@%lx: LDC event (%lx) received", port->id, event);
+		return (LDC_SUCCESS);
+	}
+
+	cmn_err(CE_NOTE, "ds@%lx: Unexpected LDC event (%lx) received",
+	    port->id, event);
+
+	return (LDC_SUCCESS);
+}
+
+/*
+ * Read exactly msglen bytes from the channel into msgp.  ldc_read()
+ * may return fewer bytes than requested, so loop until the full
+ * message has been consumed.  EAGAIN is retried up to 1000 times
+ * with a 10 ms delay between attempts (~10 seconds total) before
+ * giving up; any other error is returned immediately.  Returns 0
+ * once msglen bytes have been read.
+ */
+static int
+ds_recv_msg(ldc_handle_t ldc_hdl, caddr_t msgp, size_t msglen)
+{
+	int	rv = 0;
+	size_t	amt_left = msglen;
+	int	loopcnt = 0;
+
+	while (msglen > 0) {
+		if ((rv = ldc_read(ldc_hdl, msgp, &amt_left)) != 0) {
+			if ((rv == EAGAIN) && (loopcnt++ < 1000)) {
+				/*
+				 * Try again.  If the data still has not
+				 * arrived when the retry budget runs out,
+				 * something is wrong with the channel.
+				 */
+				delay(drv_usectohz(10000)); /* 10 ms */
+			} else {
+				/* fail */
+				return (rv);
+			}
+		} else {
+			/* on success amt_left holds the bytes just read */
+			msgp += amt_left;
+			msglen -= amt_left;
+			amt_left = msglen;
+		}
+	} /* while (msglen > 0) */
+
+	return (rv);
+}
+
+/*
+ * Taskq worker dispatched by ds_ldc_cb() for read events.  Drains
+ * the channel under the port lock: for each queued message, read
+ * the fixed-size header, allocate a buffer sized from the header's
+ * payload_len, read the body, log the message, and hand it off to
+ * ds_dispatch_event() in a fresh thread.  A failed read drops the
+ * current message and continues with the next one.
+ */
+static void
+ds_handle_recv(void *arg)
+{
+	ds_port_t	*port = (ds_port_t *)arg;
+	char		*hbuf;
+	size_t		len;
+	size_t		read_size;
+	boolean_t	isempty;
+	ds_hdr_t	hdr;
+	uint8_t		*msg;
+	char		*currp;
+	int		rv;
+	ldc_handle_t	ldc_hdl;
+	ds_event_t	*devent;
+
+	/* debug prefix fixed: previously printed "ds_ldc_cb" */
+	DS_DBG("ds@%lx: ds_handle_recv...\n", port->id);
+
+	ldc_hdl = port->ldc.hdl;
+
+	mutex_enter(&port->lock);
+	while ((ldc_chkq(ldc_hdl, &isempty) == 0) && (!isempty)) {
+
+
+		DS_DBG("ds@%lx: reading next message\n", port->id);
+
+		/*
+		 * Read in the next message.
+		 */
+		hbuf = (char *)&hdr;
+		bzero(hbuf, DS_HDR_SZ);
+		read_size = DS_HDR_SZ;
+		currp = hbuf;
+
+		/* read in the message header */
+
+		if ((rv = ds_recv_msg(ldc_hdl, currp, read_size)) != 0) {
+			/*
+			 * failed to read message drop it and see if there
+			 * are anymore messages
+			 */
+			cmn_err(CE_NOTE, "ldc_read returned %d", rv);
+			continue;
+		}
+
+		len = read_size;
+
+		/* get payload size and alloc a buffer */
+
+		read_size = ((ds_hdr_t *)hbuf)->payload_len;
+		msg = kmem_zalloc((DS_HDR_SZ + read_size), KM_SLEEP);
+
+		/* move message header into buffer */
+
+		bcopy(hbuf, msg, DS_HDR_SZ);
+		currp = (char *)(msg) + DS_HDR_SZ;
+
+		/* read in the message body */
+
+		if ((rv = ds_recv_msg(ldc_hdl, currp, read_size)) != 0) {
+			/*
+			 * failed to read message drop it and see if there
+			 * are anymore messages
+			 */
+			kmem_free(msg, (DS_HDR_SZ + read_size));
+			cmn_err(CE_NOTE, "ldc_read returned %d", rv);
+			continue;
+		}
+
+		len += read_size;
+		DS_DUMP_LDC_MSG(msg, len);
+
+		/*
+		 * Send the message for processing, and store it
+		 * in the log. The memory is deallocated only when
+		 * the message is removed from the log.
+		 */
+
+		devent = kmem_zalloc(sizeof (ds_event_t), KM_SLEEP);
+		devent->port = port;
+		devent->buf = (char *)msg;
+		devent->buflen = len;
+
+		/* log the message */
+		(void) ds_log_add_msg(DS_LOG_IN(port->id), msg, len);
+
+		/* send the message off to get processed in a new thread */
+		if (DS_DISPATCH(ds_dispatch_event, devent) == NULL) {
+			cmn_err(CE_WARN, "error initiating event handler");
+			continue;
+		}
+
+	}
+	mutex_exit(&port->lock);
+}
+
+/*
+ * Taskq worker that routes one received message to the handler for
+ * its message type (ds_msg_handlers table).  The ds_event_t wrapper
+ * is freed here on every path; the message buffer itself is owned
+ * by the message log (see ds_handle_recv) and is not freed here.
+ */
+static void
+ds_dispatch_event(void *arg)
+{
+	ds_event_t	*event = (ds_event_t *)arg;
+	ds_hdr_t	*hdr;
+	ds_port_t	*port;
+
+	port = event->port;
+
+	hdr = (ds_hdr_t *)event->buf;
+
+	if (!DS_MSG_TYPE_VALID(hdr->msg_type)) {
+		cmn_err(CE_NOTE, "ds@%lx: dispatch_event: invalid msg "
+		    "type (%d)", port->id, hdr->msg_type);
+		/* previously leaked the event wrapper on this path */
+		kmem_free(event, sizeof (ds_event_t));
+		return;
+	}
+
+	DS_DBG("ds@%lx: dispatch_event: msg_type=%d\n", port->id,
+	    hdr->msg_type);
+
+	(*ds_msg_handlers[hdr->msg_type])(port, event->buf, event->buflen);
+
+	kmem_free(event, sizeof (ds_event_t));
+}
+
+/*
+ * Process an LDC state transition for a port and record the new
+ * state in port->ldc.state.  A transition to UP from OPEN/READY
+ * starts the DS version handshake; any transition away from UP
+ * resets the port.  Caller must hold port->lock.
+ */
+static void
+ds_handle_ldc_event(ds_port_t *port, int newstate)
+{
+	ldc_status_t oldstate = port->ldc.state;
+
+	ASSERT(MUTEX_HELD(&port->lock));
+
+	DS_DBG_LDC("ds@%lx: LDC state change: 0x%x -> 0x%x\n",
+	    port->id, oldstate, newstate);
+
+	switch (newstate) {
+	case LDC_UP:
+		if ((oldstate == LDC_OPEN) || (oldstate == LDC_READY)) {
+			/* start the version negotiation */
+			ds_send_init_req(port);
+		} else {
+			DS_DBG_LDC("unsupported LDC state change\n");
+		}
+		break;
+
+	case LDC_READY:
+	case LDC_OPEN:
+		if (oldstate != LDC_UP) {
+			/* not worried about this state change */
+			break;
+		}
+
+		/* dropping out of UP: treat like any other teardown */
+		_NOTE(FALLTHROUGH)
+
+	default:
+		if (oldstate == LDC_UP) {
+			ds_port_reset(port);
+		} else {
+			DS_DBG_LDC("unsupported LDC state change\n");
+		}
+		break;
+	}
+
+	port->ldc.state = newstate;
+}
+
+/*
+ * Version negotiation is always initiated by the guest. Any
+ * attempt by a remote party to initiate the handshake gets
+ * nack'd with a major number equal to zero. This indicates
+ * that no version is supported since an init request is not
+ * expected.
+ */
+static void
+ds_handle_init_req(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_hdr_t	*hdr;
+	ds_init_nack_t	*nack;
+	char		*msg;
+	size_t		msglen;
+	ds_init_req_t	*req;
+	size_t		explen = DS_MSG_LEN(ds_init_req_t);
+
+	req = (ds_init_req_t *)(buf + DS_HDR_SZ);
+
+	/* sanity check the incoming message; a nack is sent either way */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <init_req: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+	} else {
+		DS_DBG("ds@%lx: <init_req: ver=%d.%d\n", port->id,
+		    req->major_vers, req->minor_vers);
+	}
+
+	DS_DBG("ds@%lx: init_nack>: major=0\n", port->id);
+
+	msglen = DS_MSG_LEN(ds_init_nack_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_INIT_NACK;
+	hdr->payload_len = sizeof (ds_init_nack_t);
+
+	/* major 0 == no supported version; see block comment above */
+	nack = (ds_init_nack_t *)(msg + DS_HDR_SZ);
+	nack->major_vers = 0;
+
+	/*
+	 * send message; msg is not freed here -- presumably retained by
+	 * the outbound message log (ds_log_add_msg in ds_send_msg), as
+	 * with inbound messages.  NOTE(review): confirm ownership.
+	 */
+	mutex_enter(&port->lock);
+	(void) ds_send_msg(port, msg, msglen);
+	mutex_exit(&port->lock);
+}
+
+/*
+ * Handle the ack for a locally initiated version negotiation.
+ * Settle on the agreed major version and the lower of the requested
+ * and acked minor versions, mark the port ready, then propagate the
+ * port-up event to all services and attempt to register any that
+ * are not yet registered.
+ */
+static void
+ds_handle_init_ack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_init_ack_t	*ack;
+	ds_ver_t	*ver;
+	size_t		explen = DS_MSG_LEN(ds_init_ack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <init_ack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	ack = (ds_init_ack_t *)(buf + DS_HDR_SZ);
+
+	mutex_enter(&port->lock);
+
+	/* an ack is only meaningful while an init request is outstanding */
+	if (port->state != DS_PORT_INIT_REQ) {
+		cmn_err(CE_NOTE, "ds@%lx: <init_ack: invalid state for msg "
+		    "(%d)", port->id, port->state);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	/* the version we most recently requested */
+	ver = &(ds_vers[port->ver_idx]);
+
+	DS_DBG("ds@%lx: <init_ack: req=v%d.%d, ack=v%d.%d\n", port->id,
+	    ver->major, ver->minor, ver->major, ack->minor_vers);
+
+	/* agreed upon a major version */
+	port->ver.major = ver->major;
+
+	/*
+	 * If the returned minor version is larger than
+	 * the requested minor version, use the lower of
+	 * the two, i.e. the requested version.
+	 */
+	if (ack->minor_vers >= ver->minor) {
+		/*
+		 * Use the minor version specified in the
+		 * original request.
+		 */
+		port->ver.minor = ver->minor;
+	} else {
+		/*
+		 * Use the lower minor version returned in
+		 * the ack. By defninition, all lower minor
+		 * versions must be supported.
+		 */
+		port->ver.minor = ack->minor_vers;
+	}
+
+	port->state = DS_PORT_READY;
+
+	DS_DBG("ds@%lx: <init_ack: port ready v%d.%d\n", port->id,
+	    port->ver.major, port->ver.minor);
+
+	mutex_exit(&port->lock);
+
+	/*
+	 * The port came up, so update all the services
+	 * with this information. Follow that up with an
+	 * attempt to register any service that is not
+	 * already registered.
+	 */
+	rw_enter(&ds_svcs.rwlock, RW_WRITER);
+
+	(void) ds_walk_svcs(ds_svc_port_up, port);
+	(void) ds_walk_svcs(ds_svc_register, NULL);
+
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle a nack of a locally initiated version negotiation.  If the
+ * remote end advertised a non-zero major version, search our version
+ * list (from the last index tried) for the closest major version not
+ * greater than the advertised one, and restart the handshake with
+ * it.  A major version of zero means DS is not supported at all.
+ */
+static void
+ds_handle_init_nack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	int		idx;
+	ds_init_nack_t	*nack;
+	ds_ver_t	*ver;
+	size_t		explen = DS_MSG_LEN(ds_init_nack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <init_nack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	nack = (ds_init_nack_t *)(buf + DS_HDR_SZ);
+
+	mutex_enter(&port->lock);
+
+	/* a nack is only meaningful while an init request is outstanding */
+	if (port->state != DS_PORT_INIT_REQ) {
+		cmn_err(CE_NOTE, "ds@%lx: <init_nack: invalid state for msg "
+		    "(%d)", port->id, port->state);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	ver = &(ds_vers[port->ver_idx]);
+
+	DS_DBG("ds@%lx: <init_nack: req=v%d.%d, nack=v%d.x\n", port->id,
+	    ver->major, ver->minor, nack->major_vers);
+
+	if (nack->major_vers == 0) {
+		/* no supported protocol version */
+		cmn_err(CE_NOTE, "ds@%lx: <init_nack: DS not supported",
+		    port->id);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	/*
+	 * Walk the version list, looking for a major version
+	 * that is as close to the requested major version as
+	 * possible.
+	 */
+	for (idx = port->ver_idx; idx < DS_NUM_VER; idx++) {
+		if (ds_vers[idx].major <= nack->major_vers) {
+			/* found a version to try */
+			goto done;
+		}
+	}
+
+	/*
+	 * The loop can only fall through with idx == DS_NUM_VER;
+	 * a successful match jumps past this check via "done".
+	 */
+	if (idx == DS_NUM_VER) {
+		/* no supported version */
+		cmn_err(CE_NOTE, "ds@%lx: <init_nack: DS v%d.x not supported",
+		    port->id, nack->major_vers);
+
+		mutex_exit(&port->lock);
+		return;
+	}
+
+done:
+	/* start the handshake again */
+	port->ver_idx = idx;
+	port->state = DS_PORT_LDC_INIT;
+
+	ds_send_init_req(port);
+
+	mutex_exit(&port->lock);
+}
+
+/*
+ * Handle a service registration request from the remote end.  As
+ * with init requests, registrations are only ever initiated locally,
+ * so any inbound request is nack'd with major version 0 (meaning
+ * "no version supported").
+ */
+static void
+ds_handle_reg_req(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_hdr_t	*hdr;
+	ds_reg_req_t	*req;
+	ds_reg_nack_t	*nack;
+	char		*msg;
+	size_t		msglen;
+	size_t		explen = DS_MSG_LEN(ds_reg_req_t);
+
+	/* the request information */
+	req = (ds_reg_req_t *)(buf + DS_HDR_SZ);
+
+	/* sanity check the incoming message; a nack is sent either way */
+	if (len < explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_req: invalid message length "
+		    "(%ld), expected at least %ld", port->id, len, explen);
+	} else {
+		DS_DBG("ds@%lx: <reg_req: id='%s', ver=%d.%d, hdl=0x%lx\n",
+		    port->id, req->svc_id, req->major_vers, req->minor_vers,
+		    req->svc_handle);
+	}
+
+	DS_DBG("ds@%lx: reg_nack>: major=0\n", port->id);
+
+	msglen = DS_MSG_LEN(ds_reg_nack_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_REG_NACK;
+	hdr->payload_len = sizeof (ds_reg_nack_t);
+
+	/*
+	 * NOTE(review): req->svc_handle is read even when the length
+	 * check above failed, so the echoed handle may be garbage for
+	 * a short message -- confirm this is acceptable.
+	 */
+	nack = (ds_reg_nack_t *)(msg + DS_HDR_SZ);
+	nack->svc_handle = req->svc_handle;
+	nack->result = DS_REG_VER_NACK;
+	nack->major_vers = 0;
+
+	/* send message */
+	mutex_enter(&port->lock);
+	(void) ds_send_msg(port, msg, msglen);
+	mutex_exit(&port->lock);
+}
+
+/*
+ * Handle the ack for a locally initiated service registration.
+ * Settle on the agreed version (lower of requested and acked minor),
+ * mark the service active, and invoke the client's registration
+ * callback with a copy of the negotiated version.
+ */
+static void
+ds_handle_reg_ack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_reg_ack_t	*ack;
+	ds_ver_t	*ver;
+	ds_ver_t	tmpver;
+	ds_svc_t	*svc;
+	size_t		explen = DS_MSG_LEN(ds_reg_ack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	ack = (ds_reg_ack_t *)(buf + DS_HDR_SZ);
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/* lookup appropriate client */
+	if ((svc = ds_get_svc(ack->svc_handle)) == NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid handle 0x%lx",
+		    port->id, ack->svc_handle);
+		goto done;
+	}
+
+	/* make sure the message makes sense */
+	if (svc->state != DS_SVC_REG_PENDING) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_ack: invalid state for message "
+		    "(%d)", port->id, svc->state);
+		goto done;
+	}
+
+	/* the version we most recently requested for this service */
+	ver = &(svc->cap.vers[svc->ver_idx]);
+
+	DS_DBG("ds@%lx: <reg_ack: hdl=0x%lx, ack=v%d.%d\n", port->id,
+	    ack->svc_handle, ver->major, ack->minor_vers);
+
+	/* major version has been agreed upon */
+	svc->ver.major = ver->major;
+
+	if (ack->minor_vers >= ver->minor) {
+		/*
+		 * Use the minor version specified in the
+		 * original request.
+		 */
+		svc->ver.minor = ver->minor;
+	} else {
+		/*
+		 * Use the lower minor version returned in
+		 * the ack. By defninition, all lower minor
+		 * versions must be supported.
+		 */
+		svc->ver.minor = ack->minor_vers;
+	}
+
+	svc->state = DS_SVC_ACTIVE;
+
+	DS_DBG("ds@%lx: <reg_ack: %s v%d.%d ready, hdl=0x%lx\n", port->id,
+	    svc->cap.svc_id, svc->ver.major, svc->ver.minor, svc->hdl);
+
+	/* notify the client that registration is complete */
+	if (svc->ops.ds_reg_cb) {
+		/*
+		 * Use a temporary version structure so that
+		 * the copy in the svc structure cannot be
+		 * modified by the client.
+		 */
+		tmpver.major = svc->ver.major;
+		tmpver.minor = svc->ver.minor;
+
+		(*svc->ops.ds_reg_cb)(svc->ops.cb_arg, &tmpver, svc->hdl);
+	}
+
+done:
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle a nack of a locally initiated service registration.  A
+ * duplicate-registration result is logged and dropped; a major
+ * version of zero means the service is unsupported on this port and
+ * the service is reset.  Otherwise, search the service's version
+ * list for the closest acceptable major version and retry the
+ * registration with it; if none remains, mark the service inactive.
+ */
+static void
+ds_handle_reg_nack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_reg_nack_t	*nack;
+	ds_svc_t	*svc;
+	int		idx;
+	size_t		explen = DS_MSG_LEN(ds_reg_nack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	nack = (ds_reg_nack_t *)(buf + DS_HDR_SZ);
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/* lookup appropriate client */
+	if ((svc = ds_get_svc(nack->svc_handle)) == NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid handle 0x%lx",
+		    port->id, nack->svc_handle);
+		goto done;
+	}
+
+	/* make sure the message makes sense */
+	if (svc->state != DS_SVC_REG_PENDING) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_nack: invalid state for message "
+		    "(%d)", port->id, svc->state);
+		goto done;
+	}
+
+	if (nack->result == DS_REG_DUP) {
+		cmn_err(CE_NOTE, "ds@%lx: <reg_nack: duplicate registration "
+		    "for %s", port->id, svc->cap.svc_id);
+		goto done;
+	}
+
+	/*
+	 * A major version of zero indicates that the
+	 * service is not supported at all.
+	 */
+	if (nack->major_vers == 0) {
+		DS_DBG("ds@%lx: <reg_nack: %s not supported\n", port->id,
+		    svc->cap.svc_id);
+		ds_reset_svc(svc, port);
+		goto done;
+	}
+
+	DS_DBG("ds@%lx: <reg_nack: hdl=0x%lx, nack=%d.x\n", port->id,
+	    nack->svc_handle, nack->major_vers);
+
+	/*
+	 * Walk the version list for the service, looking for
+	 * a major version that is as close to the requested
+	 * major version as possible.
+	 */
+	for (idx = svc->ver_idx; idx < svc->cap.nvers; idx++) {
+		if (svc->cap.vers[idx].major <= nack->major_vers) {
+			/* found a version to try */
+			break;
+		}
+	}
+
+	if (idx == svc->cap.nvers) {
+		/* no supported version */
+		DS_DBG("ds@%lx: <reg_nack: %s v%d.x not supported\n",
+		    port->id, svc->cap.svc_id, nack->major_vers);
+		svc->state = DS_SVC_INACTIVE;
+		goto done;
+	}
+
+	/* start the handshake again */
+	svc->state = DS_SVC_INACTIVE;
+	svc->ver_idx = idx;
+
+	(void) ds_svc_register(svc, NULL);
+
+done:
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle a service unregister request from the remote end.  If the
+ * handle is unknown, send an unregister nack; otherwise unregister
+ * the service locally and acknowledge with an unregister ack.
+ *
+ * NOTE(review): ds_svc_unregister() is invoked while holding
+ * ds_svcs.rwlock as READER although it mutates the service --
+ * confirm its locking expectations.
+ */
+static void
+ds_handle_unreg_req(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_hdr_t	*hdr;
+	ds_unreg_req_t	*req;
+	ds_unreg_ack_t	*ack;
+	ds_svc_t	*svc;
+	char		*msg;
+	size_t		msglen;
+	size_t		explen = DS_MSG_LEN(ds_unreg_req_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_req: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	/* the request information */
+	req = (ds_unreg_req_t *)(buf + DS_HDR_SZ);
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/* lookup appropriate client */
+	if ((svc = ds_get_svc(req->svc_handle)) == NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_req: invalid handle "
+		    "0x%lx", port->id, req->svc_handle);
+		ds_send_unreg_nack(port, req->svc_handle);
+		goto done;
+	}
+
+	/* unregister the service */
+	(void) ds_svc_unregister(svc, svc->port);
+
+	DS_DBG("ds@%lx: unreg_ack>: hdl=0x%lx\n", port->id, req->svc_handle);
+
+	msglen = DS_HDR_SZ + sizeof (ds_unreg_ack_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_UNREG_ACK;
+	hdr->payload_len = sizeof (ds_unreg_ack_t);
+
+	ack = (ds_unreg_ack_t *)(msg + DS_HDR_SZ);
+	ack->svc_handle = req->svc_handle;
+
+	/* send message */
+	mutex_enter(&port->lock);
+	(void) ds_send_msg(port, msg, msglen);
+	/* was mutex_enter: re-acquiring the held lock would self-deadlock */
+	mutex_exit(&port->lock);
+
+done:
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle an acknowledgement of a locally initiated unregister
+ * request.  The service entry was already torn down when the
+ * request was sent, so all that remains is a sanity check that the
+ * handle no longer resolves in the service table.
+ */
+static void
+ds_handle_unreg_ack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	size_t		explen = DS_MSG_LEN(ds_unreg_ack_t);
+	ds_unreg_ack_t	*unreg;
+
+	/* reject messages that are not exactly the expected size */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_ack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	unreg = (ds_unreg_ack_t *)(buf + DS_HDR_SZ);
+
+	DS_DBG("ds@%lx: <unreg_ack: hdl=0x%lx\n", port->id,
+	    unreg->svc_handle);
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/* the handle should already be gone; complain if it is not */
+	if (ds_get_svc(unreg->svc_handle) != NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_ack: handle 0x%lx still "
+		    "in use", port->id, unreg->svc_handle);
+	}
+
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle a nack of a locally initiated unregister request.  As with
+ * the ack case, the service entry was already torn down when the
+ * request was sent, so only a sanity check on the handle remains.
+ */
+static void
+ds_handle_unreg_nack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_unreg_nack_t	*nack;
+	size_t		explen = DS_MSG_LEN(ds_unreg_nack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_nack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	nack = (ds_unreg_nack_t *)(buf + DS_HDR_SZ);
+
+	DS_DBG("ds@%lx: <unreg_nack: hdl=0x%lx\n", port->id,
+	    nack->svc_handle);
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/*
+	 * Since the unregister request was initiated locally,
+	 * the service structure has already been torn down.
+	 * Just perform a sanity check to make sure the message
+	 * is appropriate.
+	 */
+	if (ds_get_svc(nack->svc_handle) != NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <unreg_nack: handle 0x%lx still "
+		    "in use", port->id, nack->svc_handle);
+	}
+
+	rw_exit(&ds_svcs.rwlock);
+}
+
+/*
+ * Handle an inbound data message: strip the DS header, look up the
+ * owning service by handle, and deliver the payload to the client's
+ * data callback.  An unknown handle is answered with a data nack.
+ *
+ * NOTE(review): svc is dereferenced after ds_svcs.rwlock is dropped;
+ * confirm services cannot be torn down concurrently here.
+ */
+static void
+ds_handle_data(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_data_handle_t	*data;
+	ds_svc_t		*svc;
+	char			*msg;
+	int			msgsz;
+	int			hdrsz;
+	size_t			explen = DS_MSG_LEN(ds_data_handle_t);
+
+	/* sanity check the incoming message */
+	if (len < explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <data: invalid message length "
+		    "(%ld), expected at least %ld", port->id, len, explen);
+		return;
+	}
+
+	data = (ds_data_handle_t *)(buf + DS_HDR_SZ);
+
+	hdrsz = DS_HDR_SZ + sizeof (ds_data_handle_t);
+	msgsz = len - hdrsz;
+
+	/* strip off the header for the client */
+	msg = (msgsz) ? (buf + hdrsz) : NULL;
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	/* lookup appropriate client */
+	if ((svc = ds_get_svc(data->svc_handle)) == NULL) {
+		cmn_err(CE_NOTE, "ds@%lx: <data: invalid handle 0x%lx",
+		    port->id, data->svc_handle);
+		/* previously returned with the reader lock still held */
+		rw_exit(&ds_svcs.rwlock);
+		ds_send_data_nack(port, data->svc_handle);
+		return;
+	}
+
+	rw_exit(&ds_svcs.rwlock);
+
+	DS_DBG("ds@%lx: <data: client=%s hdl=0x%lx\n", port->id,
+	    (svc->cap.svc_id) ? svc->cap.svc_id : "NULL", svc->hdl);
+
+	/* dispatch this message to the client */
+	(*svc->ops.ds_data_cb)(svc->ops.cb_arg, msg, msgsz);
+}
+
+/*
+ * Handle an inbound data nack.  If the remote end reports that the
+ * handle we used is invalid (DS_INV_HDL), unregister the service
+ * locally so it can go through registration again; other results
+ * are only traced.
+ */
+static void
+ds_handle_nack(ds_port_t *port, caddr_t buf, size_t len)
+{
+	ds_svc_t	*svc;
+	ds_data_nack_t	*nack;
+	size_t		explen = DS_MSG_LEN(ds_data_nack_t);
+
+	/* sanity check the incoming message */
+	if (len != explen) {
+		cmn_err(CE_NOTE, "ds@%lx: <data_nack: invalid message length "
+		    "(%ld), expected %ld", port->id, len, explen);
+		return;
+	}
+
+	nack = (ds_data_nack_t *)(buf + DS_HDR_SZ);
+
+	DS_DBG("ds@%lx: data_nack: hdl=0x%lx, result=0x%lx\n", port->id,
+	    nack->svc_handle, nack->result);
+
+	if (nack->result == DS_INV_HDL) {
+
+		rw_enter(&ds_svcs.rwlock, RW_READER);
+
+		/* handle already gone locally; nothing to do */
+		if ((svc = ds_get_svc(nack->svc_handle)) == NULL) {
+			rw_exit(&ds_svcs.rwlock);
+			return;
+		}
+
+		cmn_err(CE_NOTE, "ds@%lx: <data_nack: handle 0x%lx reported "
+		    "as invalid", port->id, nack->svc_handle);
+
+		(void) ds_svc_unregister(svc, svc->port);
+
+		rw_exit(&ds_svcs.rwlock);
+	}
+}
+
+/*
+ * Write an entire message to the port's channel, looping until all
+ * msglen bytes are accepted (ldc_write may take fewer).  EWOULDBLOCK
+ * is retried up to 1000 times with a 10 ms delay (~10 seconds); the
+ * retry count resets whenever any progress is made.  The message is
+ * recorded in the outbound log before sending.  Returns 0 on
+ * success or the ldc_write error.
+ */
+static int
+ds_send_msg(ds_port_t *port, caddr_t msg, size_t msglen)
+{
+	int	rv;
+	caddr_t	currp = msg;
+	size_t	amt_left = msglen;
+	int	loopcnt = 0;
+
+	DS_DUMP_LDC_MSG(msg, msglen);
+	(void) ds_log_add_msg(DS_LOG_OUT(port->id), (uint8_t *)msg, msglen);
+
+	/*
+	 * ensure that no other messages can be sent on this port in case
+	 * the write doesn't get sent with one write to guarantee that the
+	 * message doesn't become fragmented.
+	 */
+	ASSERT(MUTEX_HELD(&port->lock));
+
+	/* send the message */
+	do {
+		if ((rv = ldc_write(port->ldc.hdl, currp, &msglen)) != 0) {
+			if ((rv == EWOULDBLOCK) && (loopcnt++ < 1000)) {
+				/*
+				 * Retry; if the channel stays full past
+				 * the retry budget, something is wrong
+				 * with it.
+				 */
+				delay(drv_usectohz(10000)); /* 10 ms */
+			} else {
+				cmn_err(CE_WARN,
+				    "ds@%lx: send_msg: ldc_write failed (%d)",
+				    port->id, rv);
+				return (rv);
+			}
+		} else {
+			/* on success msglen holds the bytes just written */
+			amt_left -= msglen;
+			currp += msglen;
+			msglen = amt_left;
+			loopcnt = 0;
+		}
+	} while (amt_left > 0);
+
+	return (rv);
+}
+
+/*
+ * Send a version negotiation (init) request for the version at
+ * port->ver_idx.  On a successful send the port advances to
+ * DS_PORT_INIT_REQ.  Caller must hold port->lock, and the port
+ * must be in DS_PORT_LDC_INIT state.
+ *
+ * NOTE(review): hdr is not freed here -- presumably retained by the
+ * outbound message log (ds_log_add_msg in ds_send_msg); confirm
+ * ownership.
+ */
+static void
+ds_send_init_req(ds_port_t *port)
+{
+	ds_hdr_t	*hdr;
+	ds_init_req_t	*init_req;
+	size_t		nbytes;
+	ds_ver_t	*vers = &ds_vers[port->ver_idx];
+
+	ASSERT(MUTEX_HELD(&port->lock));
+
+	if (port->state != DS_PORT_LDC_INIT) {
+		cmn_err(CE_NOTE, "ds@%lx: init_req>: invalid port state (%d)",
+		    port->id, port->state);
+		return;
+	}
+
+	DS_DBG("ds@%lx: init_req>: req=v%d.%d\n", port->id, vers->major,
+	    vers->minor);
+
+	nbytes = DS_HDR_SZ + sizeof (ds_init_req_t);
+	hdr = kmem_zalloc(nbytes, KM_SLEEP);
+
+	hdr->msg_type = DS_INIT_REQ;
+	hdr->payload_len = sizeof (ds_init_req_t);
+
+	init_req = (ds_init_req_t *)((caddr_t)hdr + DS_HDR_SZ);
+	init_req->major_vers = vers->major;
+	init_req->minor_vers = vers->minor;
+
+	/* send the message; only advance state if the send succeeded */
+	if (ds_send_msg(port, (caddr_t)hdr, nbytes) == 0) {
+		port->state = DS_PORT_INIT_REQ;
+	}
+}
+
+/*
+ * Send a registration request for a service on its assigned port,
+ * using the version at svc->ver_idx.  On a successful send the
+ * service moves to DS_SVC_REG_PENDING.  Returns 0 on success, -1 if
+ * the channel/port is not ready or the send failed.
+ *
+ * NOTE(review): port->lock is dropped between the readiness checks
+ * and the send below, so the port state could change in between --
+ * confirm this window is benign.
+ */
+static int
+ds_send_reg_req(ds_svc_t *svc)
+{
+	ds_port_t	*port = svc->port;
+	ds_ver_t	*ver;
+	ds_hdr_t	*hdr;
+	caddr_t		msg;
+	size_t		msglen;
+	size_t		nbytes;
+	ds_reg_req_t	*req;
+	size_t		idlen;
+
+	/* assumes some checking has already occurred */
+	ASSERT(svc->state == DS_SVC_INACTIVE);
+
+	mutex_enter(&port->lock);
+
+	/* check on the LDC to Zeus */
+	if (port->ldc.state != LDC_UP) {
+		/* can not send message */
+		DS_DBG("ds@%lx: reg_req>: channel %ld is not up\n", port->id,
+		    port->ldc.id);
+		mutex_exit(&port->lock);
+		return (-1);
+	}
+
+	/* make sure port is ready */
+	if (port->state != DS_PORT_READY) {
+		/* can not send message */
+		DS_DBG("ds@%lx: reg_req>: port is not ready\n", port->id);
+		mutex_exit(&port->lock);
+		return (-1);
+	}
+
+	mutex_exit(&port->lock);
+
+	/* allocate the message buffer; payload carries the id string */
+	idlen = strlen(svc->cap.svc_id);
+	msglen = DS_HDR_SZ + sizeof (ds_reg_req_t) + idlen;
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	/* copy in the header data */
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_REG_REQ;
+	hdr->payload_len = sizeof (ds_reg_req_t) + idlen;
+
+	req = (ds_reg_req_t *)(msg + DS_HDR_SZ);
+	req->svc_handle = svc->hdl;
+	ver = &(svc->cap.vers[svc->ver_idx]);
+	req->major_vers = ver->major;
+	req->minor_vers = ver->minor;
+
+	/* copy in the service id */
+	bcopy(svc->cap.svc_id, req->svc_id, idlen + 1);
+
+	/* send the message */
+	DS_DBG("ds@%lx: reg_req>: id='%s', ver=%d.%d, hdl=0x%lx\n", port->id,
+	    svc->cap.svc_id, ver->major, ver->minor, svc->hdl);
+
+	nbytes = msglen;
+	mutex_enter(&port->lock);
+	if (ds_send_msg(port, msg, nbytes) != 0) {
+		mutex_exit(&port->lock);
+		return (-1);
+	} else {
+		svc->state = DS_SVC_REG_PENDING;
+	}
+	mutex_exit(&port->lock);
+
+	return (0);
+}
+
+/*
+ * Send an unregister request for a service on its assigned port.
+ * Returns 0 on success, -1 if the service has no port, the
+ * channel/port is not ready, or the send failed.
+ */
+static int
+ds_send_unreg_req(ds_svc_t *svc)
+{
+	caddr_t		msg;
+	size_t		msglen;
+	size_t		nbytes;
+	ds_hdr_t	*hdr;
+	ds_unreg_req_t	*req;
+	ds_port_t	*port = svc->port;
+
+	if (port == NULL) {
+		DS_DBG("send_unreg_req: service '%s' not associated with "
+		    "a port\n", svc->cap.svc_id);
+		return (-1);
+	}
+
+	mutex_enter(&port->lock);
+
+	/* check on the LDC to Zeus */
+	if (port->ldc.state != LDC_UP) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: unreg_req>: channel %ld is not up",
+		    port->id, port->ldc.id);
+		mutex_exit(&port->lock);
+		return (-1);
+	}
+
+	/* make sure port is ready */
+	if (port->state != DS_PORT_READY) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: unreg_req>: port is not ready",
+		    port->id);
+		mutex_exit(&port->lock);
+		return (-1);
+	}
+
+	mutex_exit(&port->lock);
+
+	msglen = DS_HDR_SZ + sizeof (ds_unreg_req_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	/* copy in the header data */
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_UNREG;
+	hdr->payload_len = sizeof (ds_unreg_req_t);
+
+	req = (ds_unreg_req_t *)(msg + DS_HDR_SZ);
+	req->svc_handle = svc->hdl;
+
+	/* send the message */
+	DS_DBG("ds@%lx: unreg_req>: id='%s', hdl=0x%lx\n", port->id,
+	    (svc->cap.svc_id) ? svc->cap.svc_id : "NULL", svc->hdl);
+
+	nbytes = msglen;
+	mutex_enter(&port->lock);
+	if (ds_send_msg(port, msg, nbytes) != 0) {
+		mutex_exit(&port->lock);
+		return (-1);
+	}
+	mutex_exit(&port->lock);
+
+	return (0);
+}
+
+/*
+ * Send an unregister nack echoing the offending handle.  Used when
+ * an inbound unregister request names a handle we do not know.
+ * Silently does nothing if the channel or port is not ready.
+ */
+static void
+ds_send_unreg_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl)
+{
+	caddr_t		msg;
+	size_t		msglen;
+	size_t		nbytes;
+	ds_hdr_t	*hdr;
+	ds_unreg_nack_t	*nack;
+
+	mutex_enter(&port->lock);
+
+	/* check on the LDC to Zeus */
+	if (port->ldc.state != LDC_UP) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: unreg_nack>: channel %ld is not up",
+		    port->id, port->ldc.id);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	/* make sure port is ready */
+	if (port->state != DS_PORT_READY) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: unreg_nack>: port is not ready",
+		    port->id);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	mutex_exit(&port->lock);
+
+	msglen = DS_HDR_SZ + sizeof (ds_unreg_nack_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	/* copy in the header data */
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_UNREG_NACK;
+	hdr->payload_len = sizeof (ds_unreg_nack_t);
+
+	nack = (ds_unreg_nack_t *)(msg + DS_HDR_SZ);
+	nack->svc_handle = bad_hdl;
+
+	/* send the message */
+	DS_DBG("ds@%lx: unreg_nack>: hdl=0x%lx\n", port->id, bad_hdl);
+
+	nbytes = msglen;
+	mutex_enter(&port->lock);
+	(void) ds_send_msg(port, msg, nbytes);
+	mutex_exit(&port->lock);
+}
+
+/*
+ * Send a data nack (DS_NACK with result DS_INV_HDL) echoing the
+ * offending handle.  Used when an inbound data message names a
+ * handle we do not know.  Silently does nothing if the channel or
+ * port is not ready.
+ */
+static void
+ds_send_data_nack(ds_port_t *port, ds_svc_hdl_t bad_hdl)
+{
+	caddr_t		msg;
+	size_t		msglen;
+	size_t		nbytes;
+	ds_hdr_t	*hdr;
+	ds_data_nack_t	*nack;
+
+	mutex_enter(&port->lock);
+
+	/* check on the LDC to Zeus */
+	if (port->ldc.state != LDC_UP) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: data_nack>: channel %ld is not up",
+		    port->id, port->ldc.id);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	/* make sure port is ready */
+	if (port->state != DS_PORT_READY) {
+		/* can not send message */
+		cmn_err(CE_NOTE, "ds@%lx: data_nack>: port is not ready",
+		    port->id);
+		mutex_exit(&port->lock);
+		return;
+	}
+
+	mutex_exit(&port->lock);
+
+	msglen = DS_HDR_SZ + sizeof (ds_data_nack_t);
+	msg = kmem_zalloc(msglen, KM_SLEEP);
+
+	/* copy in the header data */
+	hdr = (ds_hdr_t *)msg;
+	hdr->msg_type = DS_NACK;
+	hdr->payload_len = sizeof (ds_data_nack_t);
+
+	nack = (ds_data_nack_t *)(msg + DS_HDR_SZ);
+	nack->svc_handle = bad_hdl;
+	nack->result = DS_INV_HDL;
+
+	/* send the message */
+	DS_DBG("ds@%lx: data_nack>: hdl=0x%lx\n", port->id, bad_hdl);
+
+	nbytes = msglen;
+	mutex_enter(&port->lock);
+	(void) ds_send_msg(port, msg, nbytes);
+	mutex_exit(&port->lock);
+}
+
+#ifdef DEBUG
+
+#define	BYTESPERLINE	8	/* data bytes rendered per output line */
+#define	LINEWIDTH	((BYTESPERLINE * 3) + (BYTESPERLINE + 2) + 1)
+#define	ASCIIOFFSET	((BYTESPERLINE * 3) + 2)
+/* NOTE(review): argument not parenthesized; fine for current uses */
+#define	ISPRINT(c)	((c >= ' ') && (c <= '~'))
+
+/*
+ * Output a buffer formatted with a set number of bytes on
+ * each line. Append each line with the ASCII equivalent of
+ * each byte if it falls within the printable ASCII range,
+ * and '.' otherwise.  Emits nothing unless DS_DBG_FLAG_LDC
+ * debugging is enabled.
+ */
+static void
+ds_dump_ldc_msg(void *vbuf, size_t len)
+{
+	int	i, j;
+	char	*curr;
+	char	*aoff;
+	char	line[LINEWIDTH];
+	uint8_t	*buf = vbuf;
+
+	/* abort if not debugging ldc */
+	if (!(ds_debug & DS_DBG_FLAG_LDC)) {
+		return;
+	}
+
+	/* walk the buffer one line at a time */
+	for (i = 0; i < len; i += BYTESPERLINE) {
+
+		bzero(line, LINEWIDTH);
+
+		curr = line;
+		aoff = line + ASCIIOFFSET;
+
+		/*
+		 * Walk the bytes in the current line, storing
+		 * the hex value for the byte as well as the
+		 * ASCII representation in a temporary buffer.
+		 * All ASCII values are placed at the end of
+		 * the line.
+		 */
+		for (j = 0; (j < BYTESPERLINE) && ((i + j) < len); j++) {
+			(void) sprintf(curr, " %02x", buf[i + j]);
+			*aoff = (ISPRINT(buf[i + j])) ? buf[i + j] : '.';
+			curr += 3;
+			aoff++;
+		}
+
+		/*
+		 * Fill in to the start of the ASCII translation
+		 * with spaces. This will only be necessary if
+		 * this is the last line and there are not enough
+		 * bytes to fill the whole line.
+		 */
+		while (curr != (line + ASCIIOFFSET))
+			*curr++ = ' ';
+
+		DS_DBG_LDC("%s\n", line);
+	}
+}
+#endif /* DEBUG */
+
+
+/*
+ * Visit every slot in the registered-services table, invoking the
+ * supplied callback on each entry (including NULL slots).  A
+ * non-zero callback return stops the walk early -- it signals a
+ * match, not an error.  Returns the index of the last slot visited.
+ * Caller must hold ds_svcs.rwlock.
+ */
+static int
+ds_walk_svcs(svc_cb_t svc_cb, void *arg)
+{
+	int	i;
+
+	ASSERT(RW_LOCK_HELD(&ds_svcs.rwlock));
+
+	/* visit slots in order, stopping when the callback says so */
+	for (i = 0; i < ds_svcs.maxsvcs; i++) {
+		if ((*svc_cb)(ds_svcs.tbl[i], arg) != 0)
+			break;
+	}
+
+	return (i);
+}
+
+static int
+ds_svc_isfree(ds_svc_t *svc, void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	/*
+	 * Looking for a free service. This may be a NULL entry
+	 * in the table, or an unused structure that could be
+	 * reused.
+	 */
+
+	if (DS_SVC_ISFREE(svc)) {
+		/* yes, it is free */
+		return (1);
+	}
+
+	/* not a candidate */
+	return (0);
+}
+
+static int
+ds_svc_ismatch(ds_svc_t *svc, void *arg)
+{
+	if (DS_SVC_ISFREE(svc)) {
+		return (0);
+	}
+
+	if (strcmp(svc->cap.svc_id, arg) == 0) {
+		/* found a match */
+		return (1);
+	}
+
+	return (0);
+}
+
+static int
+ds_svc_free(ds_svc_t *svc, void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	if (svc == NULL) {
+		return (0);
+	}
+
+	if (svc->cap.svc_id) {
+		kmem_free(svc->cap.svc_id, strlen(svc->cap.svc_id) + 1);
+		svc->cap.svc_id = NULL;
+	}
+
+	if (svc->cap.vers) {
+		kmem_free(svc->cap.vers, svc->cap.nvers * sizeof (ds_ver_t));
+		svc->cap.vers = NULL;
+	}
+
+	kmem_free(svc, sizeof (ds_svc_t));
+
+	return (0);
+}
+
+static int
+ds_svc_register(ds_svc_t *svc, void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	int	idx;
+
+	/* check the state of the service */
+	if (DS_SVC_ISFREE(svc) || (svc->state != DS_SVC_INACTIVE))
+		return (0);
+
+	/* check if there are any ports to try */
+	if (DS_PORTSET_ISNULL(svc->avail))
+		return (0);
+
+	/*
+	 * Attempt to register the service. Start with the lowest
+	 * numbered port and continue until a registration message
+	 * is sent successfully, or there are no ports left to try.
+	 */
+	for (idx = 0; idx < DS_MAX_PORTS; idx++) {
+
+		/*
+		 * If the port is not in the available list,
+		 * it is not a candidate for registration.
+		 */
+		if (!DS_PORT_IN_SET(svc->avail, idx)) {
+			continue;
+		}
+
+		svc->port = &ds_ports[idx];
+		if (ds_send_reg_req(svc) == 0) {
+			/* register sent successfully */
+			break;
+		}
+
+		/* reset the service to try the next port */
+		ds_reset_svc(svc, svc->port);
+	}
+
+	return (0);
+}
+
+static int
+ds_svc_unregister(ds_svc_t *svc, void *arg)
+{
+	ds_port_t *port = (ds_port_t *)arg;
+
+	if (DS_SVC_ISFREE(svc)) {
+		return (0);
+	}
+
+	/* make sure the service is using this port */
+	if (svc->port != port) {
+		return (0);
+	}
+
+	/* reset the service structure */
+	ds_reset_svc(svc, port);
+
+	/* increment the count in the handle to prevent reuse */
+	svc->hdl = DS_ALLOC_HDL(DS_HDL2IDX(svc->hdl), DS_HDL2COUNT(svc->hdl));
+
+	/* call the client unregister callback */
+	if (svc->ops.ds_unreg_cb)
+		(*svc->ops.ds_unreg_cb)(svc->ops.cb_arg);
+
+	/* try to initiate a new registration */
+	(void) ds_svc_register(svc, NULL);
+
+	return (0);
+}
+
+static int
+ds_svc_port_up(ds_svc_t *svc, void *arg)
+{
+	ds_port_t *port = (ds_port_t *)arg;
+
+	if (DS_SVC_ISFREE(svc)) {
+		/* nothing to do */
+		return (0);
+	}
+
+	DS_PORTSET_ADD(svc->avail, port->id);
+
+	return (0);
+}
+
+static ds_svc_t *
+ds_alloc_svc(void)
+{
+	int		idx;
+	uint_t		newmaxsvcs;
+	ds_svc_t	**newtbl;
+	ds_svc_t	*newsvc;
+
+	ASSERT(RW_WRITE_HELD(&ds_svcs.rwlock));
+
+	idx = ds_walk_svcs(ds_svc_isfree, NULL);
+
+	if (idx != ds_svcs.maxsvcs) {
+		goto found;
+	}
+
+	/*
+	 * There was no free space in the table. Grow
+	 * the table to double its current size.
+	 */
+	newmaxsvcs = ds_svcs.maxsvcs * 2;
+	newtbl = kmem_zalloc(newmaxsvcs * sizeof (ds_svc_t *), KM_SLEEP);
+
+	/* copy old table data to the new table */
+	for (idx = 0; idx < ds_svcs.maxsvcs; idx++) {
+		newtbl[idx] = ds_svcs.tbl[idx];
+	}
+
+	/* clean up the old table */
+	kmem_free(ds_svcs.tbl, ds_svcs.maxsvcs * sizeof (ds_svc_t *));
+	ds_svcs.tbl = newtbl;
+	ds_svcs.maxsvcs = newmaxsvcs;
+
+	/* search for a free space again */
+	idx = ds_walk_svcs(ds_svc_isfree, NULL);
+
+	/* the table is locked so we should find a free slot */
+	ASSERT(idx != ds_svcs.maxsvcs);
+
+found:
+	/* allocate a new svc structure if necessary */
+	if ((newsvc = ds_svcs.tbl[idx]) == NULL) {
+		/* allocate a new service */
+		newsvc = kmem_zalloc(sizeof (ds_svc_t), KM_SLEEP);
+		ds_svcs.tbl[idx] = newsvc;
+	}
+
+	/* fill in the handle */
+	newsvc->hdl = DS_ALLOC_HDL(idx, DS_HDL2COUNT(newsvc->hdl));
+
+	return (newsvc);
+}
+
+static void
+ds_reset_svc(ds_svc_t *svc, ds_port_t *port)
+{
+	svc->state = DS_SVC_INACTIVE;
+	svc->ver_idx = 0;
+	svc->ver.major = 0;
+	svc->ver.minor = 0;
+	svc->port = NULL;
+	DS_PORTSET_DEL(svc->avail, port->id);
+}
+
+static ds_svc_t *
+ds_get_svc(ds_svc_hdl_t hdl)
+{
+	int		idx;
+	ds_svc_t	*svc;
+
+	ASSERT(RW_LOCK_HELD(&ds_svcs.rwlock));
+
+	if (hdl == DS_INVALID_HDL)
+		return (NULL);
+
+	idx = DS_HDL2IDX(hdl);
+
+	/* check if index is out of bounds */
+	if ((idx < 0) || (idx >= ds_svcs.maxsvcs))
+		return (NULL);
+
+	svc = ds_svcs.tbl[idx];
+
+	/* check for a valid service */
+	if (DS_SVC_ISFREE(svc))
+		return (NULL);
+
+	/* make sure the handle is an exact match */
+	if (svc->hdl != hdl)
+		return (NULL);
+
+	return (svc);
+}
+
+static int
+ds_port_add(md_t *mdp, mde_cookie_t port, mde_cookie_t chan)
+{
+	ds_port_t	*newport;
+	uint64_t	port_id;
+	uint64_t	ldc_id;
+
+	/* get the ID for this port */
+	if (md_get_prop_val(mdp, port, "id", &port_id) != 0) {
+		cmn_err(CE_NOTE, "ds_port_add: port 'id' property not found");
+		return (-1);
+	}
+
+	/* sanity check the port id */
+	if (port_id > DS_MAX_PORT_ID) {
+		cmn_err(CE_WARN, "ds_port_add: port ID %ld out of range",
+		    port_id);
+		return (-1);
+	}
+
+	DS_DBG("ds_port_add: adding port ds@%ld\n", port_id);
+
+	/* get the channel ID for this port */
+	if (md_get_prop_val(mdp, chan, "id", &ldc_id) != 0) {
+		cmn_err(CE_NOTE, "ds@%lx: add_port: no channel 'id' property",
+		    port_id);
+		return (-1);
+	}
+
+	/* get the port structure from the array of ports */
+	newport = &ds_ports[port_id];
+
+	/* check for a duplicate port in the MD */
+	if (newport->state != DS_PORT_FREE) {
+		cmn_err(CE_NOTE, "ds@%lx: add_port: port already exists",
+		    port_id);
+		return (-1);
+	}
+
+	/* initialize the port lock */
+	mutex_init(&newport->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* initialize the port */
+	newport->id = port_id;
+	newport->state = DS_PORT_INIT;
+	newport->ldc.id = ldc_id;
+
+	/* add the port to the set of all ports */
+	DS_PORTSET_ADD(ds_allports, port_id);
+
+	return (0);
+}
+
+static void
+ds_port_reset(ds_port_t *port)
+{
+	ASSERT(MUTEX_HELD(&port->lock));
+
+	/* connection went down, mark everything inactive */
+	rw_enter(&ds_svcs.rwlock, RW_WRITER);
+
+	(void) ds_walk_svcs(ds_svc_unregister, port);
+
+	rw_exit(&ds_svcs.rwlock);
+
+	port->ver_idx = 0;
+	port->ver.major = 0;
+	port->ver.minor = 0;
+	port->state = DS_PORT_LDC_INIT;
+}
+
+/*
+ * Verify that a version array is sorted as expected for the
+ * version negotiation to work correctly.
+ */
+static ds_vers_check_t
+ds_vers_isvalid(ds_ver_t *vers, int nvers)
+{
+	uint16_t	curr_major;
+	uint16_t	curr_minor;
+	int		idx;
+
+	curr_major = vers[0].major;
+	curr_minor = vers[0].minor;
+
+	/*
+	 * Walk the version array, verifying correct ordering.
+	 * The array must be sorted from highest supported
+	 * version to lowest supported version.
+	 */
+	for (idx = 0; idx < nvers; idx++) {
+		if (vers[idx].major > curr_major) {
+			DS_DBG("vers_isvalid: version array has increasing "
+			    "major versions\n");
+			return (DS_VERS_INCREASING_MAJOR_ERR);
+		}
+
+		if (vers[idx].major < curr_major) {
+			curr_major = vers[idx].major;
+			curr_minor = vers[idx].minor;
+			continue;
+		}
+
+		if (vers[idx].minor > curr_minor) {
+			DS_DBG("vers_isvalid: version array has increasing "
+			    "minor versions\n");
+			return (DS_VERS_INCREASING_MINOR_ERR);
+		}
+
+		curr_minor = vers[idx].minor;
+	}
+
+	return (DS_VERS_OK);
+}
+
+/*
+ * Logging Support
+ */
+static void
+ds_log_init(void)
+{
+	ds_log_entry_t	*new;
+
+	/* initialize global lock */
+	mutex_init(&ds_log.lock, NULL, MUTEX_DRIVER, NULL);
+
+	mutex_enter(&ds_log.lock);
+
+	/* initialize the log */
+	ds_log.head = NULL;
+	ds_log.size = 0;
+	ds_log.nentry = 0;
+
+	/* initialize the free list */
+	for (new = ds_log_entry_pool; new < DS_LOG_POOL_END; new++) {
+		new->next = ds_log.freelist;
+		ds_log.freelist = new;
+	}
+
+	mutex_exit(&ds_log.lock);
+
+	DS_DBG_LOG("ds_log initialized: size=%d bytes, limit=%d bytes, "
+	    "ninit=%ld\n", ds_log_sz, DS_LOG_LIMIT, DS_LOG_NPOOL);
+}
+
+static void
+ds_log_fini(void)
+{
+	ds_log_entry_t	*next;
+
+	mutex_enter(&ds_log.lock);
+
+	/* clear out the log */
+	while (ds_log.nentry > 0)
+		(void) ds_log_remove();
+
+	/*
+	 * Now all the entries are on the free list.
+	 * Clear out the free list, deallocating any
+	 * entry that was dynamically allocated.
+	 */
+	while (ds_log.freelist != NULL) {
+		next = ds_log.freelist->next;
+
+		if (!DS_IS_POOL_ENTRY(ds_log.freelist)) {
+			kmem_free(ds_log.freelist, sizeof (ds_log_entry_t));
+		}
+
+		ds_log.freelist = next;
+	}
+
+	mutex_exit(&ds_log.lock);
+
+	mutex_destroy(&ds_log.lock);
+}
+
+static ds_log_entry_t *
+ds_log_entry_alloc(void)
+{
+	ds_log_entry_t	*new = NULL;
+
+	ASSERT(MUTEX_HELD(&ds_log.lock));
+
+	if (ds_log.freelist != NULL) {
+		new = ds_log.freelist;
+		ds_log.freelist = ds_log.freelist->next;
+	}
+
+	if (new == NULL) {
+		/* free list was empty */
+		new = kmem_zalloc(sizeof (ds_log_entry_t), KM_SLEEP);
+	}
+
+	ASSERT(new);
+
+	return (new);
+}
+
+static void
+ds_log_entry_free(ds_log_entry_t *entry)
+{
+	ASSERT(MUTEX_HELD(&ds_log.lock));
+
+	if (entry == NULL)
+		return;
+
+	if (entry->data != NULL) {
+		kmem_free(entry->data, entry->datasz);
+		entry->data = NULL;
+	}
+
+	/* place entry on the free list */
+	entry->next = ds_log.freelist;
+	ds_log.freelist = entry;
+}
+
+/*
+ * Add a message to the end of the log
+ */
+static int
+ds_log_add(ds_log_entry_t *new)
+{
+	ASSERT(MUTEX_HELD(&ds_log.lock));
+
+	if (ds_log.head == NULL) {
+
+		new->prev = new;
+		new->next = new;
+
+		ds_log.head = new;
+	} else {
+		ds_log_entry_t	*head = ds_log.head;
+		ds_log_entry_t	*tail = ds_log.head->prev;
+
+		new->next = head;
+		new->prev = tail;
+		tail->next = new;
+		head->prev = new;
+	}
+
+	/* increase the log size, including the metadata size */
+	ds_log.size += DS_LOG_ENTRY_SZ(new);
+	ds_log.nentry++;
+
+	DS_DBG_LOG("ds_log: added %ld data bytes, %ld total bytes\n",
+	    new->datasz, DS_LOG_ENTRY_SZ(new));
+
+	return (0);
+}
+
+/*
+ * Remove an entry from the head of the log
+ */
+static int
+ds_log_remove(void)
+{
+	ds_log_entry_t	*head;
+
+	ASSERT(MUTEX_HELD(&ds_log.lock));
+
+	head = ds_log.head;
+
+	/* empty list */
+	if (head == NULL)
+		return (0);
+
+	if (head->next == ds_log.head) {
+		/* one element list */
+		ds_log.head = NULL;
+	} else {
+		head->next->prev = head->prev;
+		head->prev->next = head->next;
+		ds_log.head = head->next;
+	}
+
+	DS_DBG_LOG("ds_log: removed %ld data bytes, %ld total bytes\n",
+	    head->datasz, DS_LOG_ENTRY_SZ(head));
+
+	ds_log.size -= DS_LOG_ENTRY_SZ(head);
+	ds_log.nentry--;
+
+	ASSERT((ds_log.size >= 0) && (ds_log.nentry >= 0));
+
+	ds_log_entry_free(head);
+
+	return (0);
+}
+
+/*
+ * Replace the data in the entry at the front of the list with the
+ * new data. This has the effect of removing the oldest entry and
+ * adding the new entry.
+ */
+static int
+ds_log_replace(uint8_t *msg, size_t sz)
+{
+	ds_log_entry_t	*head;
+
+	ASSERT(MUTEX_HELD(&ds_log.lock));
+
+	head = ds_log.head;
+
+	DS_DBG_LOG("ds_log: replaced %ld data bytes (%ld total) with %ld data "
+	    "bytes (%ld total)\n", head->datasz, DS_LOG_ENTRY_SZ(head),
+	    sz, sz + sizeof (ds_log_entry_t));
+
+	ds_log.size -= DS_LOG_ENTRY_SZ(head);
+
+	ASSERT((ds_log.size >= 0) && (ds_log.nentry >= 0));
+
+	kmem_free(head->data, head->datasz);
+	head->data = msg;
+	head->datasz = sz;
+
+	ds_log.size += DS_LOG_ENTRY_SZ(head);
+
+	ds_log.head = head->next;
+
+	return (0);
+}
+
+static void
+ds_log_purge(void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	mutex_enter(&ds_log.lock);
+
+	DS_DBG_LOG("ds_log: purging oldest log entries\n");
+
+	while ((ds_log.nentry) && (ds_log.size >= ds_log_sz)) {
+		(void) ds_log_remove();
+	}
+
+	mutex_exit(&ds_log.lock);
+}
+
+static int
+ds_log_add_msg(int32_t dest, uint8_t *msg, size_t sz)
+{
+	int	rv = 0;
+
+	mutex_enter(&ds_log.lock);
+
+	/* check if the log is larger than the soft limit */
+	if ((ds_log.nentry) && ((ds_log.size + sz) >= ds_log_sz)) {
+		/*
+		 * The log is larger than the soft limit.
+		 * Swap the oldest entry for the newest.
+		 */
+		DS_DBG_LOG("ds_log: replacing oldest entry with new entry\n");
+		(void) ds_log_replace(msg, sz);
+	} else {
+		/*
+		 * Still have headroom under the soft limit.
+		 * Add the new entry to the log.
+		 */
+		ds_log_entry_t	*new;
+
+		new = ds_log_entry_alloc();
+
+		/* fill in message data */
+		new->data = msg;
+		new->datasz = sz;
+		new->timestamp = ddi_get_time();
+		new->dest = dest;
+
+		rv = ds_log_add(new);
+	}
+
+	/* check if the log is larger than the hard limit */
+	if ((ds_log.nentry > 1) && (ds_log.size >= DS_LOG_LIMIT)) {
+		/*
+		 * Wakeup the thread to remove entries
+		 * from the log until it is smaller than
+		 * the soft limit.
+		 */
+		DS_DBG_LOG("ds_log: log exceeded %d bytes, scheduling a "
+		    "purge...\n", DS_LOG_LIMIT);
+
+		if (DS_DISPATCH(ds_log_purge, (void *)msg) == NULL) {
+			cmn_err(CE_NOTE, "ds_log: purge thread failed to "
+			    "start");
+		}
+	}
+
+	mutex_exit(&ds_log.lock);
+
+	return (rv);
+}
+
+/*
+ * Client Interface
+ */
+
+int
+ds_cap_init(ds_capability_t *cap, ds_clnt_ops_t *ops)
+{
+	int		idx;
+	ds_vers_check_t	status;
+	ds_svc_t	*svc;
+
+	/* sanity check the args */
+	if ((cap == NULL) || (ops == NULL)) {
+		cmn_err(CE_NOTE, "ds_cap_init: invalid arguments");
+		return (EINVAL);
+	}
+
+	/* sanity check the capability specifier */
+	if ((cap->svc_id == NULL) || (cap->vers == NULL) || (cap->nvers == 0)) {
+		cmn_err(CE_NOTE, "ds_cap_init: invalid capability specifier");
+		return (EINVAL);
+	}
+
+	/* sanity check the version array */
+	if ((status = ds_vers_isvalid(cap->vers, cap->nvers)) != DS_VERS_OK) {
+		cmn_err(CE_NOTE, "ds_cap_init: invalid capability "
+		    "version array for %s service: %s", cap->svc_id,
+		    (status == DS_VERS_INCREASING_MAJOR_ERR) ?
+		    "increasing major versions" :
+		    "increasing minor versions");
+		return (EINVAL);
+	}
+
+	/* data and register callbacks are required */
+	if ((ops->ds_data_cb == NULL) || (ops->ds_reg_cb == NULL)) {
+		cmn_err(CE_NOTE, "ds_cap_init: invalid ops specifier for "
+		    "%s service", cap->svc_id);
+		return (EINVAL);
+	}
+
+	DS_DBG("ds_cap_init: svc_id='%s', data_cb=0x%lx, cb_arg=0x%lx\n",
+	    cap->svc_id, (uint64_t)ops->ds_data_cb, (uint64_t)ops->cb_arg);
+
+	rw_enter(&ds_svcs.rwlock, RW_WRITER);
+
+	/* check if the service is already registered */
+	idx = ds_walk_svcs(ds_svc_ismatch, cap->svc_id);
+	if (idx != ds_svcs.maxsvcs) {
+		/* already registered */
+		cmn_err(CE_NOTE, "service '%s' already registered",
+		    cap->svc_id);
+		rw_exit(&ds_svcs.rwlock);
+		return (EALREADY);
+	}
+
+	svc = ds_alloc_svc();
+
+	/* copy over all the client information */
+	bcopy(cap, &svc->cap, sizeof (ds_capability_t));
+
+	/* make a copy of the service name */
+	svc->cap.svc_id = kmem_zalloc(strlen(cap->svc_id) + 1, KM_SLEEP);
+	(void) strncpy(svc->cap.svc_id, cap->svc_id, strlen(cap->svc_id));
+
+	/* make a copy of the version array */
+	svc->cap.vers = kmem_zalloc(cap->nvers * sizeof (ds_ver_t), KM_SLEEP);
+	bcopy(cap->vers, svc->cap.vers, cap->nvers * sizeof (ds_ver_t));
+
+	/* copy the client ops vector */
+	bcopy(ops, &svc->ops, sizeof (ds_clnt_ops_t));
+
+	svc->state = DS_SVC_INACTIVE;
+	svc->ver_idx = 0;
+	DS_PORTSET_DUP(svc->avail, ds_allports);
+
+	ds_svcs.nsvcs++;
+
+	rw_exit(&ds_svcs.rwlock);
+
+	/* attempt to register the service */
+	(void) ds_svc_register(svc, NULL);
+
+	DS_DBG("ds_cap_init: service '%s' assigned handle 0x%lx\n",
+	    svc->cap.svc_id, svc->hdl);
+
+	return (0);
+}
+
+int
+ds_cap_fini(ds_capability_t *cap)
+{
+	int		idx;
+	ds_svc_t	*svc;
+	ds_svc_hdl_t	tmp_hdl;
+
+	rw_enter(&ds_svcs.rwlock, RW_WRITER);
+
+	/* make sure the service is registered */
+	idx = ds_walk_svcs(ds_svc_ismatch, cap->svc_id);
+	if (idx == ds_svcs.maxsvcs) {
+		/* service is not registered */
+		cmn_err(CE_NOTE, "ds_cap_fini: unknown service '%s'",
+		    cap->svc_id);
+		rw_exit(&ds_svcs.rwlock);
+		return (EINVAL);
+	}
+
+	svc = ds_svcs.tbl[idx];
+
+	DS_DBG("ds_cap_fini: svcid='%s', hdl=0x%lx\n", svc->cap.svc_id,
+	    svc->hdl);
+
+	/*
+	 * Attempt to send an unregister notification. Even
+	 * if sending the message fails, the local unregister
+	 * request must be honored, since this indicates that
+	 * the client will no longer handle incoming requests.
+	 */
+	(void) ds_send_unreg_req(svc);
+
+	/*
+	 * Clear out the structure, but do not deallocate the
+	 * memory. It can be reused for the next registration.
+	 */
+	kmem_free(svc->cap.svc_id, strlen(svc->cap.svc_id) + 1);
+	kmem_free(svc->cap.vers, svc->cap.nvers * sizeof (ds_ver_t));
+
+	/* save the handle to prevent reuse */
+	tmp_hdl = svc->hdl;
+	bzero(svc, sizeof (ds_svc_t));
+
+	/* initialize for next use */
+	svc->hdl = tmp_hdl;
+	svc->state = DS_SVC_FREE;
+
+	ds_svcs.nsvcs--;
+
+	rw_exit(&ds_svcs.rwlock);
+
+	return (0);
+}
+
+int
+ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t len)
+{
+	int		rv;
+	ds_hdr_t	*hdr;
+	caddr_t		msg;
+	size_t		msglen;
+	size_t		hdrlen;
+	caddr_t		payload;
+	ds_svc_t	*svc;
+	ds_port_t	*port;
+	ds_data_handle_t *data;
+
+	rw_enter(&ds_svcs.rwlock, RW_READER);
+
+	if ((hdl == DS_INVALID_HDL) || (svc = ds_get_svc(hdl)) == NULL) {
+		cmn_err(CE_NOTE, "ds_cap_send: invalid handle 0x%lx", hdl);
+		rw_exit(&ds_svcs.rwlock);
+		return (EINVAL);
+	}
+
+	if ((port = svc->port) == NULL) {
+		cmn_err(CE_NOTE, "ds_cap_send: service '%s' not associated "
+		    "with a port", svc->cap.svc_id);
+		rw_exit(&ds_svcs.rwlock);
+		return (ECONNRESET);
+	}
+
+	mutex_enter(&port->lock);
+
+	/* check that the LDC channel is ready */
+	if (port->ldc.state != LDC_UP) {
+		cmn_err(CE_NOTE, "ds_cap_send: LDC channel is not up");
+		mutex_exit(&port->lock);
+		rw_exit(&ds_svcs.rwlock);
+		return (ECONNRESET);
+	}
+
+
+	if (svc->state != DS_SVC_ACTIVE) {
+		/* channel is up, but svc is not registered */
+		cmn_err(CE_NOTE, "ds_cap_send: invalid service state 0x%x",
+		    svc->state);
+		mutex_exit(&port->lock);
+		rw_exit(&ds_svcs.rwlock);
+		return (EINVAL);
+	}
+
+	hdrlen = DS_HDR_SZ + sizeof (ds_data_handle_t);
+
+	msg = kmem_zalloc(len + hdrlen, KM_SLEEP);
+	hdr = (ds_hdr_t *)msg;
+	payload = msg + hdrlen;
+	msglen = len + hdrlen;
+
+	hdr->payload_len = len + sizeof (ds_data_handle_t);
+	hdr->msg_type = DS_DATA;
+
+	data = (ds_data_handle_t *)(msg + DS_HDR_SZ);
+	data->svc_handle = hdl;
+
+	if ((buf != NULL) && (len != 0)) {
+		bcopy(buf, payload, len);
+	}
+
+	DS_DBG("ds@%lx: data>: hdl=0x%lx, len=%ld, payload_len=%d\n",
+	    port->id, svc->hdl, msglen, hdr->payload_len);
+
+	if ((rv = ds_send_msg(port, msg, msglen)) != 0) {
+		rv = (rv == EIO) ? ECONNRESET : rv;
+	}
+
+	mutex_exit(&port->lock);
+	rw_exit(&ds_svcs.rwlock);
+
+	return (rv);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/fault_iso.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,453 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v Fault Isolation Services Module
+ */
+
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/machsystm.h>
+#include <sys/processor.h>
+#include <sys/mem.h>
+#include <vm/page.h>
+#include <sys/note.h>
+#include <sys/ds.h>
+#include <sys/fault_iso.h>
+
+/*
+ * Debugging routines
+ */
+#ifdef DEBUG
+uint_t fi_debug = 0x0;
+#define	FI_DBG	if (fi_debug) cmn_err
+#else /* DEBUG */
+#define	FI_DBG	_NOTE(CONSTCOND) if (0) cmn_err
+#endif /* DEBUG */
+
+/*
+ * Domains Services interaction
+ */
+static ds_svc_hdl_t	cpu_handle;
+static ds_svc_hdl_t	mem_handle;
+
+static ds_ver_t		fi_vers[] = { { 1, 0 } };
+#define	FI_NVERS	(sizeof (fi_vers) / sizeof (fi_vers[0]))
+
+static ds_capability_t cpu_cap = {
+	"fma-cpu-service",	/* svc_id */
+	fi_vers,		/* vers */
+	FI_NVERS		/* nvers */
+};
+
+static ds_capability_t mem_cap = {
+	"fma-mem-service",	/* svc_id */
+	fi_vers,		/* vers */
+	FI_NVERS		/* nvers */
+};
+
+static void fi_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl);
+static void fi_unreg_handler(ds_cb_arg_t arg);
+
+static void cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
+static void mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
+
+static ds_clnt_ops_t cpu_ops = {
+	fi_reg_handler,		/* ds_reg_cb */
+	fi_unreg_handler,	/* ds_unreg_cb */
+	cpu_data_handler,	/* ds_data_cb */
+	&cpu_handle		/* cb_arg */
+};
+
+static ds_clnt_ops_t mem_ops = {
+	fi_reg_handler,		/* ds_reg_cb */
+	fi_unreg_handler,	/* ds_unreg_cb */
+	mem_data_handler,	/* ds_data_cb */
+	&mem_handle		/* cb_arg */
+};
+
+static int fi_init(void);
+static void fi_fini(void);
+
+static struct modlmisc modlmisc = {
+	&mod_miscops,
+	"sun4v Fault Isolation Services %I%"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modlmisc,
+	NULL
+};
+
+int
+_init(void)
+{
+	int	rv;
+
+	if ((rv = fi_init()) != 0)
+		return (rv);
+
+	if ((rv = mod_install(&modlinkage)) != 0)
+		fi_fini();
+
+	return (rv);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int fi_allow_unload;
+
+int
+_fini(void)
+{
+	int	status;
+
+	if (fi_allow_unload == 0)
+		return (EBUSY);
+
+	if ((status = mod_remove(&modlinkage)) == 0)
+		fi_fini();
+
+	return (status);
+}
+
+static int
+fi_init(void)
+{
+	int	rv;
+
+	/* register CPU service with domain services framework */
+	rv = ds_cap_init(&cpu_cap, &cpu_ops);
+	if (rv != 0) {
+		FI_DBG(CE_CONT, "ds_cap_init failed: %d", rv);
+		return (rv);
+	}
+
+	/* register MEM service with domain services framework */
+	rv = ds_cap_init(&mem_cap, &mem_ops);
+	if (rv != 0) {
+		FI_DBG(CE_CONT, "ds_cap_init failed: %d", rv);
+		(void) ds_cap_fini(&cpu_cap);
+		return (rv);
+	}
+
+	return (rv);
+}
+
+static void
+fi_fini(void)
+{
+	/*
+	 * Stop incoming requests from Zeus
+	 */
+	(void) ds_cap_fini(&cpu_cap);
+	(void) ds_cap_fini(&mem_cap);
+}
+
+static void
+cpu_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	fma_cpu_service_req_t	*msg = buf;
+	fma_cpu_resp_t		resp_msg;
+	int			rv = 0;
+	int			cpu_status;
+	int			resp_back = 0;
+
+	/*
+	 * If the buffer is the wrong size for CPU calls or is NULL then
+	 * do not return any message. The call from the ldom mgr. will time out
+	 * and the response will be NULL.
+	 */
+	if (msg == NULL || buflen != sizeof (fma_cpu_service_req_t)) {
+		return;
+	}
+
+	FI_DBG(CE_CONT, "req_num = %ld, msg_type = %d, cpu_id = %d\n",
+	    msg->req_num, msg->msg_type, msg->cpu_id);
+
+	resp_msg.req_num = msg->req_num;
+
+	switch (msg->msg_type) {
+	case FMA_CPU_REQ_STATUS:
+		rv = p_online_internal(msg->cpu_id, P_STATUS,
+		    &cpu_status);
+		if (rv == EINVAL) {
+			FI_DBG(CE_CONT, "Failed p_online call failed."
+			    "Invalid CPU\n");
+			resp_msg.result = FMA_CPU_RESP_FAILURE;
+			resp_msg.status = FMA_CPU_STAT_ILLEGAL;
+			resp_back = 1;
+		}
+		break;
+	case FMA_CPU_REQ_OFFLINE:
+		rv = p_online_internal(msg->cpu_id, P_OFFLINE,
+		    &cpu_status);
+		if (rv == EINVAL) {
+			FI_DBG(CE_CONT, "Failed p_online call failed."
+			    "Invalid CPU\n");
+			resp_msg.result = FMA_CPU_RESP_FAILURE;
+			resp_msg.status = FMA_CPU_STAT_ILLEGAL;
+			resp_back = 1;
+		} else if (rv == EBUSY) {
+			FI_DBG(CE_CONT, "Failed p_online call failed."
+			    "Tried to offline while busy\n");
+			resp_msg.result = FMA_CPU_RESP_FAILURE;
+			resp_msg.status = FMA_CPU_STAT_ONLINE;
+			resp_back = 1;
+		}
+		break;
+	case FMA_CPU_REQ_ONLINE:
+		rv = p_online_internal(msg->cpu_id, P_ONLINE,
+		    &cpu_status);
+		if (rv == EINVAL) {
+			FI_DBG(CE_CONT, "Failed p_online call failed."
+			    "Invalid CPU\n");
+			resp_msg.result = FMA_CPU_RESP_FAILURE;
+			resp_msg.status = FMA_CPU_STAT_ILLEGAL;
+			resp_back = 1;
+		} else if (rv == ENOTSUP) {
+			FI_DBG(CE_CONT, "Failed p_online call failed."
+			    "Online not supported for single CPU\n");
+			resp_msg.result = FMA_CPU_RESP_FAILURE;
+			resp_msg.status = FMA_CPU_STAT_OFFLINE;
+			resp_back = 1;
+		}
+		break;
+	default:
+		/*
+		 * If the msg_type was of unknown type simply return and
+		 * have the ldom mgr. time out with a NULL response.
+		 */
+		return;
+	}
+
+	if (rv != 0) {
+		if (resp_back) {
+			if ((rv = ds_cap_send(cpu_handle, &resp_msg,
+				sizeof (resp_msg))) != 0) {
+				FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n",
+				    rv);
+			}
+			return;
+		}
+		ASSERT((rv == EINVAL) || ((rv == EBUSY) &&
+			(msg->msg_type == FMA_CPU_REQ_OFFLINE)) ||
+		    ((rv == ENOTSUP) &&
+			(msg->msg_type == FMA_CPU_REQ_ONLINE)));
+
+		cmn_err(CE_WARN, "p_online_internal error not handled "
+		    "rv = %d\n", rv);
+	}
+
+	resp_msg.req_num = msg->req_num;
+	resp_msg.result = FMA_CPU_RESP_OK;
+
+	switch (cpu_status) {
+	case P_OFFLINE:
+	case P_FAULTED:
+	case P_POWEROFF:
+	case P_SPARE:
+		resp_msg.status = FMA_CPU_STAT_OFFLINE;
+		break;
+	case P_ONLINE:
+	case P_NOINTR:
+		resp_msg.status = FMA_CPU_STAT_ONLINE;
+		break;
+	default:
+		resp_msg.status = FMA_CPU_STAT_ILLEGAL;
+	}
+
+	if ((rv = ds_cap_send(cpu_handle, &resp_msg,
+	    sizeof (resp_msg))) != 0) {
+		FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n", rv);
+	}
+}
+
+static void
+mem_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	fma_mem_service_req_t	*msg = buf;
+	fma_mem_resp_t		resp_msg;
+	int			rv = 0;
+
+	/*
+	 * If the buffer is the wrong size for Mem calls or is NULL then
+	 * do not return any message. The call from the ldom mgr. will time out
+	 * and the response will be NULL.
+	 */
+	if (msg == NULL || buflen != sizeof (fma_mem_service_req_t)) {
+		return;
+	}
+
+	FI_DBG(CE_CONT, "req_num = %ld, msg_type = %d, memory addr = 0x%lx"
+	"memory length = 0x%lx\n", msg->req_num, msg->msg_type,
+	    msg->real_addr, msg->length);
+
+	resp_msg.req_num = msg->req_num;
+	resp_msg.res_addr = msg->real_addr;
+	resp_msg.res_length = msg->length;
+
+	/*
+	 * Information about return values for page calls can be referenced
+	 * in usr/src/uts/common/vm/page_retire.c
+	 */
+	switch (msg->msg_type) {
+	case FMA_MEM_REQ_STATUS:
+		rv = page_retire_check(msg->real_addr, NULL);
+		switch (rv) {
+		/* Page is retired */
+		case 0:
+			resp_msg.result = FMA_MEM_RESP_OK;
+			resp_msg.status = FMA_MEM_STAT_RETIRED;
+			break;
+		/* Page is pending. Send back failure and not retired */
+		case EAGAIN:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_NOTRETIRED;
+			break;
+		/* Page is not retired. */
+		case EIO:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_NOTRETIRED;
+			break;
+		/* PA is not valid */
+		case EINVAL:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_ILLEGAL;
+			break;
+		default:
+			ASSERT((rv == 0) || (rv == EAGAIN) || (rv == EIO) ||
+			    (rv ==  EINVAL));
+			cmn_err(CE_WARN, "fault_iso: return value from "
+			    "page_retire_check invalid: %d\n", rv);
+		}
+		break;
+	case FMA_MEM_REQ_RETIRE:
+		rv = page_retire(msg->real_addr, PR_FMA);
+		switch (rv) {
+		/* Page retired successfully */
+		case 0:
+			resp_msg.result = FMA_MEM_RESP_OK;
+			resp_msg.status = FMA_MEM_STAT_RETIRED;
+			break;
+		/* Tried to retire and now Pending retirement */
+		case EAGAIN:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_NOTRETIRED;
+			break;
+		/* Did not try to retire. Page already retired */
+		case EIO:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_RETIRED;
+			break;
+		/* PA is not valid */
+		case EINVAL:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_ILLEGAL;
+			break;
+		default:
+			ASSERT((rv == 0) || (rv == EAGAIN) || (rv == EIO) ||
+			    (rv ==  EINVAL));
+			cmn_err(CE_WARN, "fault_iso: return value from "
+			    "page_retire invalid: %d\n", rv);
+		}
+		break;
+	case FMA_MEM_REQ_RESURRECT:
+		rv = page_unretire(msg->real_addr);
+		switch (rv) {
+		/* Page successfully unretired */
+		case 0:
+			resp_msg.result = FMA_MEM_RESP_OK;
+			resp_msg.status = FMA_MEM_STAT_NOTRETIRED;
+			break;
+		/* Page could not be locked. Still retired */
+		case EAGAIN:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_RETIRED;
+			break;
+		/* Page was not retired already */
+		case EIO:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_NOTRETIRED;
+			break;
+		/* PA is not valid */
+		case EINVAL:
+			resp_msg.result = FMA_MEM_RESP_FAILURE;
+			resp_msg.status = FMA_MEM_STAT_ILLEGAL;
+			break;
+		default:
+			ASSERT((rv == 0) || (rv == EAGAIN) || (rv == EIO) ||
+			    (rv ==  EINVAL));
+			cmn_err(CE_WARN, "fault_iso: return value from "
+			    "page_unretire invalid: %d\n", rv);
+		}
+		break;
+	default:
+		/*
+		 * If the msg_type was of unknown type simply return and
+		 * have the ldom mgr. time out with a NULL response.
+		 */
+		return;
+	}
+
+	if ((rv = ds_cap_send(mem_handle, &resp_msg, sizeof (resp_msg))) != 0) {
+		FI_DBG(CE_CONT, "ds_cap_send failed (%d)\n", rv);
+	}
+}
+
+static void
+fi_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
+{
+	FI_DBG(CE_CONT, "fi_reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n",
+	    arg, ver->major, ver->minor, hdl);
+
+	if ((ds_svc_hdl_t *)arg == &cpu_handle)
+		cpu_handle = hdl;
+	if ((ds_svc_hdl_t *)arg == &mem_handle)
+		mem_handle = hdl;
+}
+
+static void
+fi_unreg_handler(ds_cb_arg_t arg)
+{
+	FI_DBG(CE_CONT, "fi_unreg_handler: arg=0x%p\n", arg);
+
+	if ((ds_svc_hdl_t *)arg == &cpu_handle)
+		cpu_handle = DS_INVALID_HDL;
+	if ((ds_svc_hdl_t *)arg == &mem_handle)
+		mem_handle = DS_INVALID_HDL;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/ldc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,5609 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v LDC Transport Layer
+ */
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/ksynch.h>
+#include <sys/modctl.h>
+#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/cred.h>
+#include <sys/promif.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cyclic.h>
+#include <sys/machsystm.h>
+#include <sys/vm.h>
+#include <sys/cpu.h>
+#include <sys/intreg.h>
+#include <sys/machcpuvar.h>
+#include <sys/note.h>
+#include <sys/ivintr.h>
+#include <sys/hypervisor_api.h>
+#include <sys/ldc.h>
+#include <sys/ldc_impl.h>
+#include <sys/cnex.h>
+#include <sys/hsvc.h>
+
+/* Core internal functions */
+static int i_ldc_h2v_error(int h_error);
+static int i_ldc_txq_reconf(ldc_chan_t *ldcp);
+static int i_ldc_rxq_reconf(ldc_chan_t *ldcp);
+static void i_ldc_reset_state(ldc_chan_t *ldcp);
+static void i_ldc_reset(ldc_chan_t *ldcp);
+
+static int i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail);
+static int i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail);
+static int i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head);
+static int i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype,
+    uint8_t ctrlmsg);
+
+/* Interrupt handling functions */
+static uint_t i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2);
+static uint_t i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2);
+static void i_ldc_clear_intr(ldc_chan_t *ldcp, cnex_intrtype_t itype);
+
+/* Read method functions */
+static int i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep);
+static int i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp,
+	size_t *sizep);
+static int i_ldc_read_stream(ldc_chan_t *ldcp, caddr_t target_bufp,
+	size_t *sizep);
+
+/* Write method functions */
+static int i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t target_bufp,
+	size_t *sizep);
+static int i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t target_bufp,
+	size_t *sizep);
+static int i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t target_bufp,
+	size_t *sizep);
+
+/* Pkt processing internal functions */
+static int i_ldc_check_seqid(ldc_chan_t *ldcp, ldc_msg_t *ldcmsg);
+static int i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *ldcmsg);
+static int i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg);
+static int i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg);
+static int i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg);
+static int i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg);
+static int i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg);
+
+/* Memory synchronization internal functions */
+static int i_ldc_mem_acquire_release(ldc_mem_handle_t mhandle,
+    uint8_t direction, uint64_t offset, size_t size);
+static int i_ldc_dring_acquire_release(ldc_dring_handle_t dhandle,
+    uint8_t direction, uint64_t start, uint64_t end);
+
+/*
+ * LDC Version
+ * i_ldc_process_VER() walks this table front to back, so entries are
+ * presumably ordered highest-major first — TODO confirm if more
+ * versions are ever added.
+ */
+static ldc_ver_t ldc_versions[] = { {1, 0} };
+
+/* number of supported versions */
+#define	LDC_NUM_VERS	(sizeof (ldc_versions) / sizeof (ldc_versions[0]))
+
+/* Module State Pointer */
+static ldc_soft_state_t *ldcssp;
+
+/* Loadable-module linkage: this is a misc module (no dev ops) */
+static struct modldrv md = {
+	&mod_miscops,			/* This is a misc module */
+	"sun4v LDC module v%I%",	/* Name of the module */
+};
+
+static struct modlinkage ml = {
+	MODREV_1,
+	&md,
+	NULL
+};
+
+/* Hypervisor LDC services group negotiated in _init() */
+static uint64_t ldc_sup_minor;		/* Supported minor number */
+static hsvc_info_t ldc_hsvc = {
+	HSVC_REV_1, NULL, HSVC_GROUP_LDC, 1, 0, "ldc"
+};
+
+/* Hypervisor interrupt services group negotiated in _init() */
+static uint64_t intr_sup_minor;		/* Supported minor number */
+static hsvc_info_t intr_hsvc = {
+	HSVC_REV_1, NULL, HSVC_GROUP_INTR, 1, 0, "ldc"
+};
+
+#ifdef DEBUG
+
+/*
+ * Print debug messages
+ *
+ * set ldcdbg to 0x7 for enabling all msgs
+ * 0x4 - Warnings
+ * 0x2 - All debug messages
+ * 0x1 - Minimal debug messages
+ *
+ * set ldcdbgchan to the channel number you want to debug
+ * setting it to -1 prints debug messages for all channels
+ * NOTE: ldcdbgchan has no effect on error messages
+ */
+
+#define	DBG_ALL_LDCS -1
+
+int ldcdbg = 0x0;
+int64_t ldcdbgchan = DBG_ALL_LDCS;
+
+/*
+ * Emit one debug line via cmn_err(CE_CONT) for channel `id', subject
+ * to the ldcdbgchan filter above.  Callers reach this only through the
+ * D1/D2/DWARN macros, which pre-filter on the ldcdbg level bits.
+ */
+static void
+ldcdebug(int64_t id, const char *fmt, ...)
+{
+	char buf[512];
+	va_list ap;
+
+	/*
+	 * Do not return if,
+	 * caller wants to print it anyway - (id == DBG_ALL_LDCS)
+	 * debug channel is set to all LDCs - (ldcdbgchan == DBG_ALL_LDCS)
+	 * debug channel = caller specified channel
+	 */
+	if ((id != DBG_ALL_LDCS) &&
+	    (ldcdbgchan != DBG_ALL_LDCS) &&
+	    (ldcdbgchan != id)) {
+		return;
+	}
+
+	va_start(ap, fmt);
+	(void) vsprintf(buf, fmt, ap);
+	va_end(ap);
+
+	/* leading '?' restricts output to the system log by default */
+	cmn_err(CE_CONT, "?%s\n", buf);
+}
+
+/*
+ * D1/D2/DWARN expand to a dangling `if ... ldcdebug' so the call site
+ * supplies the argument list, e.g. D1(id, "fmt", ...).  Not brace-safe
+ * in unbraced if/else contexts — use with care.
+ */
+#define	D1		\
+if (ldcdbg & 0x01)	\
+	ldcdebug
+
+#define	D2		\
+if (ldcdbg & 0x02)	\
+	ldcdebug
+
+#define	DWARN		\
+if (ldcdbg & 0x04)	\
+	ldcdebug
+
+/* Hex-dump the first 64 bytes of a packet payload at D2 level */
+#define	DUMP_PAYLOAD(id, addr)						\
+{									\
+	char buf[65*3];							\
+	int i;								\
+	uint8_t *src = (uint8_t *)addr;					\
+	for (i = 0; i < 64; i++, src++)					\
+		(void) sprintf(&buf[i * 3], "|%02x", *src);		\
+	(void) sprintf(&buf[i * 3], "|\n");				\
+	D2((id), "payload: %s", buf);					\
+}
+
+/* Decode and print an LDC packet header (seqid only for non-RAW modes) */
+#define	DUMP_LDC_PKT(c, s, addr)					\
+{									\
+	ldc_msg_t *msg = (ldc_msg_t *)(addr);				\
+	uint32_t mid = ((c)->mode != LDC_MODE_RAW) ? msg->seqid : 0;	\
+	if (msg->type == LDC_DATA) {                                    \
+	    D2((c)->id, "%s: msg%d (/%x/%x/%x/,env[%c%c,sz=%d])",	\
+	    (s), mid, msg->type, msg->stype, msg->ctrl,			\
+	    (msg->env & LDC_FRAG_START) ? 'B' : ' ',                    \
+	    (msg->env & LDC_FRAG_STOP) ? 'E' : ' ',                     \
+	    (msg->env & LDC_LEN_MASK));					\
+	} else { 							\
+	    D2((c)->id, "%s: msg%d (/%x/%x/%x/,env=%x)", (s),		\
+	    mid, msg->type, msg->stype, msg->ctrl, msg->env);		\
+	} 								\
+}
+
+#else
+
+#define	DBG_ALL_LDCS -1
+
+#define	D1
+#define	D2
+#define	DWARN
+
+#define	DUMP_PAYLOAD(id, addr)
+#define	DUMP_LDC_PKT(c, s, addr)
+
+#endif
+
+/* Zero an ldc_msg_t-sized packet slot in place */
+#define	ZERO_PKT(p)			\
+	bzero((p), sizeof (ldc_msg_t));
+
+/* Build an export cookie from a page index, page-size code and shift */
+#define	IDX2COOKIE(idx, pg_szc, pg_shift)				\
+	(((pg_szc) << LDC_COOKIE_PGSZC_SHIFT) | ((idx) << (pg_shift)))
+
+
+/*
+ * Module load entry point.  Negotiates the hypervisor LDC and interrupt
+ * service groups, allocates and initializes the module soft state, and
+ * links the module into the system.  Returns 0 on success, -1 if either
+ * hypervisor negotiation fails, or the mod_install() error otherwise.
+ */
+int
+_init(void)
+{
+	int status;
+
+	status = hsvc_register(&ldc_hsvc, &ldc_sup_minor);
+	if (status != 0) {
+		cmn_err(CE_WARN, "%s: cannot negotiate hypervisor LDC services"
+		    " group: 0x%lx major: %ld minor: %ld errno: %d",
+		    ldc_hsvc.hsvc_modname, ldc_hsvc.hsvc_group,
+		    ldc_hsvc.hsvc_major, ldc_hsvc.hsvc_minor, status);
+		return (-1);
+	}
+
+	status = hsvc_register(&intr_hsvc, &intr_sup_minor);
+	if (status != 0) {
+		cmn_err(CE_WARN, "%s: cannot negotiate hypervisor interrupt "
+		    "services group: 0x%lx major: %ld minor: %ld errno: %d",
+		    intr_hsvc.hsvc_modname, intr_hsvc.hsvc_group,
+		    intr_hsvc.hsvc_major, intr_hsvc.hsvc_minor, status);
+		/* undo the earlier LDC services registration */
+		(void) hsvc_unregister(&ldc_hsvc);
+		return (-1);
+	}
+
+	/* allocate soft state structure */
+	ldcssp = kmem_zalloc(sizeof (ldc_soft_state_t), KM_SLEEP);
+
+	/* Link the module into the system */
+	status = mod_install(&ml);
+	if (status != 0) {
+		/*
+		 * NOTE(review): on this failure path neither hsvc
+		 * registration is unregistered — verify whether that
+		 * leaks the negotiated service groups.
+		 */
+		kmem_free(ldcssp, sizeof (ldc_soft_state_t));
+		return (status);
+	}
+
+	/* Initialize the LDC state structure */
+	mutex_init(&ldcssp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	mutex_enter(&ldcssp->lock);
+
+	ldcssp->channel_count = 0;
+	ldcssp->channels_open = 0;
+	ldcssp->chan_list = NULL;
+	ldcssp->dring_list = NULL;
+
+	mutex_exit(&ldcssp->lock);
+
+	return (0);
+}
+
+/* Module info entry point: report module status via mod_info() */
+int
+_info(struct modinfo *modinfop)
+{
+	/* Report status of the dynamically loadable driver module */
+	return (mod_info(&ml, modinfop));
+}
+
+/*
+ * Module unload entry point.  Unlinks the module, then tears down any
+ * remaining channels and descriptor rings, frees the soft state and
+ * unregisters the hypervisor service groups.  Returns EIO if
+ * mod_remove() fails (module still in use), else 0.
+ */
+int
+_fini(void)
+{
+	int 		rv, status;
+	ldc_chan_t 	*ldcp;
+	ldc_dring_t 	*dringp;
+	ldc_mem_info_t 	minfo;
+
+	/* Unlink the driver module from the system */
+	status = mod_remove(&ml);
+	if (status) {
+		DWARN(DBG_ALL_LDCS, "_fini: mod_remove failed\n");
+		return (EIO);
+	}
+
+	/* close and finalize channels */
+	ldcp = ldcssp->chan_list;
+	while (ldcp != NULL) {
+		(void) ldc_close((ldc_handle_t)ldcp);
+		(void) ldc_fini((ldc_handle_t)ldcp);
+
+		/*
+		 * NOTE(review): ldcp->next is read after ldc_fini(); if
+		 * ldc_fini() frees the channel structure this is a
+		 * use-after-free — confirm and capture `next' first.
+		 */
+		ldcp = ldcp->next;
+	}
+
+	/* Free descriptor rings */
+	dringp = ldcssp->dring_list;
+	while (dringp != NULL) {
+		/*
+		 * NOTE(review): the pointer is advanced before it is
+		 * used, so the list head is never examined and the
+		 * final iteration passes a NULL handle to
+		 * ldc_mem_dring_info() — verify intended traversal.
+		 */
+		dringp = dringp->next;
+
+		rv = ldc_mem_dring_info((ldc_dring_handle_t)dringp, &minfo);
+		if (rv == 0 && minfo.status != LDC_UNBOUND) {
+			if (minfo.status == LDC_BOUND) {
+				(void) ldc_mem_dring_unbind(
+						(ldc_dring_handle_t)dringp);
+			}
+			if (minfo.status == LDC_MAPPED) {
+				(void) ldc_mem_dring_unmap(
+						(ldc_dring_handle_t)dringp);
+			}
+		}
+
+		(void) ldc_mem_dring_destroy((ldc_dring_handle_t)dringp);
+	}
+	ldcssp->dring_list = NULL;
+
+	/*
+	 * We have successfully "removed" the driver.
+	 * Destroying soft states
+	 */
+	mutex_destroy(&ldcssp->lock);
+	kmem_free(ldcssp, sizeof (ldc_soft_state_t));
+
+	(void) hsvc_unregister(&ldc_hsvc);
+	(void) hsvc_unregister(&intr_hsvc);
+
+	return (status);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * LDC Transport Internal Functions
+ */
+
+/*
+ * Translate HV Errors to sun4v error codes
+ */
+/*
+ * Map a hypervisor (H_*) error code to the closest errno value.
+ * Unrecognized codes fall through to EIO.
+ */
+static int
+i_ldc_h2v_error(int h_error)
+{
+	switch (h_error) {
+
+	case	H_EOK:
+		return (0);
+
+	case	H_ENORADDR:
+		return (EFAULT);
+
+	case	H_EBADPGSZ:
+	case	H_EINVAL:
+		return (EINVAL);
+
+	case	H_EWOULDBLOCK:
+		return (EWOULDBLOCK);
+
+	case	H_ENOACCESS:
+	case	H_ENOMAP:
+		return (EACCES);
+
+	case	H_EIO:
+	case	H_ECPUERROR:
+		return (EIO);
+
+	case	H_ENOTSUPPORTED:
+		return (ENOTSUP);
+
+	case 	H_ETOOMANY:
+		return (ENOSPC);
+
+	case	H_ECHANNEL:
+		return (ECHRNG);
+	default:
+		break;
+	}
+
+	return (EIO);
+}
+
+/*
+ * Reconfigure the transmit queue
+ */
+/*
+ * Reconfigure the transmit queue with the hypervisor and refresh the
+ * cached head/tail/link-state.  Caller must hold the channel lock.
+ * Returns 0 on success, EIO on any HV failure.
+ */
+static int
+i_ldc_txq_reconf(ldc_chan_t *ldcp)
+{
+	int rv;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_tx_qconf: (0x%lx) cannot set qconf", ldcp->id);
+		return (EIO);
+	}
+	/* re-read queue pointers; qconf resets them in the HV */
+	rv = hv_ldc_tx_get_state(ldcp->id, &(ldcp->tx_head),
+	    &(ldcp->tx_tail), &(ldcp->link_state));
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_tx_get_state: (0x%lx) cannot get qptrs", ldcp->id);
+		return (EIO);
+	}
+	D1(ldcp->id, "ldc_tx_get_state: (0x%llx) h=0x%llx,t=0x%llx,"
+	    "s=0x%llx\n", ldcp->id, ldcp->tx_head, ldcp->tx_tail,
+	    ldcp->link_state);
+
+	return (0);
+}
+
+/*
+ * Reconfigure the receive queue
+ */
+/*
+ * Reconfigure the receive queue with the hypervisor, but only when the
+ * queue is non-empty or the channel has progressed past TS_READY (a
+ * reconfigure discards any pending packets).  Caller must hold the
+ * channel lock.  Returns 0 on success, EIO on any HV failure.
+ */
+static int
+i_ldc_rxq_reconf(ldc_chan_t *ldcp)
+{
+	int rv;
+	uint64_t rx_head, rx_tail;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
+	    &(ldcp->link_state));
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_rx_getstate: (0x%lx) cannot get state",
+		    ldcp->id);
+		return (EIO);
+	}
+
+	if (rx_head != rx_tail || ldcp->tstate > TS_READY) {
+		rv = hv_ldc_rx_qconf(ldcp->id, ldcp->rx_q_ra,
+			ldcp->rx_q_entries);
+		if (rv) {
+			cmn_err(CE_WARN,
+			    "ldc_rx_qconf: (0x%lx) cannot set qconf",
+			    ldcp->id);
+			return (EIO);
+		}
+		D1(ldcp->id, "ldc_rx_qconf: (0x%llx) completed qconf",
+		    ldcp->id);
+	}
+
+	return (0);
+}
+
+/*
+ * Reset LDC state structure and its contents
+ */
+/*
+ * Reset the channel's protocol state back to post-open defaults:
+ * sequence/ack counters, handshake state and status.  If the link is
+ * already UP (or in RESET) the channel is advanced straight to UP in
+ * RAW mode, or marked link-ready otherwise so the handshake can rerun.
+ * Caller must hold the channel lock.
+ */
+static void
+i_ldc_reset_state(ldc_chan_t *ldcp)
+{
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	ldcp->last_msg_snt = LDC_INIT_SEQID;
+	ldcp->last_ack_rcd = 0;
+	ldcp->last_msg_rcd = 0;
+	/* everything up to tx_head has implicitly been accepted */
+	ldcp->tx_ackd_head = ldcp->tx_head;
+	ldcp->next_vidx = 0;
+	ldcp->hstate = 0;
+	ldcp->tstate = TS_OPEN;
+	ldcp->status = LDC_OPEN;
+
+	if (ldcp->link_state == LDC_CHANNEL_UP ||
+	    ldcp->link_state == LDC_CHANNEL_RESET) {
+
+		if (ldcp->mode == LDC_MODE_RAW) {
+			/* RAW mode has no handshake — channel is usable now */
+			ldcp->status = LDC_UP;
+			ldcp->tstate = TS_UP;
+		} else {
+			ldcp->status = LDC_READY;
+			ldcp->tstate |= TS_LINK_READY;
+		}
+	}
+}
+
+/*
+ * Reset a LDC channel
+ */
+/*
+ * Full channel reset: reconfigure both queues with the hypervisor and
+ * reset the software protocol state.  Queue reconfigure errors are
+ * deliberately ignored — the state reset must proceed regardless.
+ */
+static void
+i_ldc_reset(ldc_chan_t *ldcp)
+{
+	D2(ldcp->id, "i_ldc_reset: (0x%llx) channel reset\n", ldcp->id);
+
+	(void) i_ldc_txq_reconf(ldcp);
+	(void) i_ldc_rxq_reconf(ldcp);
+	i_ldc_reset_state(ldcp);
+}
+
+/*
+ * Clear pending interrupts
+ */
+/*
+ * Clear a pending channel interrupt of the given type via the channel
+ * nexus, if the nexus is attached and an interrupt is actually pending.
+ * Caller must hold the channel lock.
+ */
+static void
+i_ldc_clear_intr(ldc_chan_t *ldcp, cnex_intrtype_t itype)
+{
+	ldc_cnex_t *cinfo = &ldcssp->cinfo;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	if (cinfo->dip && ldcp->intr_pending) {
+		ldcp->intr_pending = B_FALSE;
+		(void) cinfo->clr_intr(cinfo->dip, ldcp->id, itype);
+	}
+}
+
+/*
+ * Set the receive queue head
+ * Returns an error if it fails
+ */
+/*
+ * Set the receive queue head
+ * Returns an error if it fails
+ *
+ * H_EWOULDBLOCK from the hypervisor is tolerated (treated as success);
+ * any other failure resets the channel and returns ECONNRESET.
+ * Caller must hold the channel lock.
+ */
+static int
+i_ldc_set_rx_head(ldc_chan_t *ldcp, uint64_t head)
+{
+	int rv;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	rv = hv_ldc_rx_set_qhead(ldcp->id, head);
+	if (rv && rv != H_EWOULDBLOCK) {
+		cmn_err(CE_WARN,
+		    "ldc_rx_set_qhead: (0x%lx) cannot set qhead", ldcp->id);
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	return (0);
+}
+
+
+/*
+ * Returns the tx_tail to be used for transfer
+ * Re-reads the TX queue ptrs if and only if the
+ * the cached head and tail are equal (queue is full)
+ */
+/*
+ * Return (via *tail) the tx_tail at which the next packet may be
+ * written.  Refreshes the cached head/tail/link-state from the HV on
+ * every call.  Returns EIO on HV failure, ECONNRESET if the link is
+ * down, EWOULDBLOCK if the queue is full.  Caller must hold the
+ * channel lock.
+ */
+static int
+i_ldc_get_tx_tail(ldc_chan_t *ldcp, uint64_t *tail)
+{
+	int 		rv;
+	uint64_t 	current_head, new_tail;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	/* Read the head and tail ptrs from HV */
+	rv = hv_ldc_tx_get_state(ldcp->id,
+	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "i_ldc_get_tx_tail: (0x%lx) cannot read qptrs\n",
+		    ldcp->id);
+		return (EIO);
+	}
+	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_get_tx_tail: (0x%llx) channel not ready\n",
+		    ldcp->id);
+		return (ECONNRESET);
+	}
+
+	/* In reliable mode, check against last ACKd msg */
+	current_head = (ldcp->mode == LDC_MODE_RELIABLE ||
+		ldcp->mode == LDC_MODE_STREAM)
+		? ldcp->tx_ackd_head : ldcp->tx_head;
+
+	/* increment the tail (queue size is a power-of-two byte ring) */
+	new_tail = (ldcp->tx_tail + LDC_PACKET_SIZE) %
+		(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+	/* tail catching up to head means the ring has no free slot */
+	if (new_tail == current_head) {
+		DWARN(ldcp->id,
+		    "i_ldc_get_tx_tail: (0x%llx) TX queue is full\n",
+		    ldcp->id);
+		return (EWOULDBLOCK);
+	}
+
+	D2(ldcp->id, "i_ldc_get_tx_tail: (0x%llx) head=0x%llx, tail=0x%llx\n",
+	    ldcp->id, ldcp->tx_head, ldcp->tx_tail);
+
+	*tail = ldcp->tx_tail;
+	return (0);
+}
+
+/*
+ * Set the tail pointer. If HV returns EWOULDBLOCK, it will back off
+ * and retry LDC_CHK_CNT times before returning an error.
+ * Returns 0, EWOULDBLOCK or EIO
+ */
+/*
+ * Set the tail pointer. If HV returns EWOULDBLOCK, it will back off
+ * and retry LDC_CHK_CNT times before returning an error.
+ * Returns 0, EWOULDBLOCK or EIO
+ * Caller must hold the channel lock.
+ */
+static int
+i_ldc_set_tx_tail(ldc_chan_t *ldcp, uint64_t tail)
+{
+	int		rv, retval = EWOULDBLOCK;
+	int 		loop_cnt, chk_cnt;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	for (chk_cnt = 0; chk_cnt < LDC_CHK_CNT; chk_cnt++) {
+
+		if ((rv = hv_ldc_tx_set_qtail(ldcp->id, tail)) == 0) {
+			retval = 0;
+			break;
+		}
+		if (rv != H_EWOULDBLOCK) {
+			DWARN(ldcp->id, "i_ldc_set_tx_tail: (0x%llx) set "
+			    "qtail=0x%llx failed, rv=%d\n", ldcp->id, tail, rv);
+			retval = EIO;
+			break;
+		}
+
+		/*
+		 * spin LDC_LOOP_CNT and then try again
+		 * NOTE(review): empty delay loop with a non-volatile
+		 * counter — the compiler may eliminate it entirely;
+		 * confirm a real delay primitive isn't needed here.
+		 */
+		for (loop_cnt = 0; loop_cnt < LDC_LOOP_CNT; loop_cnt++);
+	}
+	return (retval);
+}
+
+/*
+ * Send a LDC message
+ */
+/*
+ * Build and send a single control/ack packet of the given type,
+ * subtype and ctrl code.  Assigns the next seqid (and echoes the last
+ * received seqid as ackid) for non-RAW modes, except for RTS/RTR which
+ * carry the initial seqid by protocol.  Returns 0 on success, or the
+ * error from i_ldc_get_tx_tail()/EIO.  Caller must hold the channel
+ * lock.
+ */
+static int
+i_ldc_send_pkt(ldc_chan_t *ldcp, uint8_t pkttype, uint8_t subtype,
+    uint8_t ctrlmsg)
+{
+	int		rv;
+	ldc_msg_t 	*pkt;
+	uint64_t	tx_tail;
+	uint32_t	curr_seqid = ldcp->last_msg_snt;
+
+	ASSERT(MUTEX_HELD(&ldcp->lock));
+	/* get the current tail for the message */
+	rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "i_ldc_send_pkt: (0x%llx) error sending pkt, "
+		    "type=0x%x,subtype=0x%x,ctrl=0x%x\n",
+		    ldcp->id, pkttype, subtype, ctrlmsg);
+		return (rv);
+	}
+
+	pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+	ZERO_PKT(pkt);
+
+	/* Initialize the packet */
+	pkt->type = pkttype;
+	pkt->stype = subtype;
+	pkt->ctrl = ctrlmsg;
+
+	/* Store ackid/seqid iff it is RELIABLE mode & not a RTS/RTR message */
+	if (((ctrlmsg & LDC_CTRL_MASK) != LDC_RTS) &&
+	    ((ctrlmsg & LDC_CTRL_MASK) != LDC_RTR)) {
+		curr_seqid++;
+		if (ldcp->mode != LDC_MODE_RAW) {
+			pkt->seqid = curr_seqid;
+			pkt->ackid = ldcp->last_msg_rcd;
+		}
+	}
+	DUMP_LDC_PKT(ldcp, "i_ldc_send_pkt", (uint64_t)pkt);
+
+	/* initiate the send by calling into HV and set the new tail */
+	tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+		(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "i_ldc_send_pkt:(0x%llx) error sending pkt, "
+		    "type=0x%x,stype=0x%x,ctrl=0x%x\n",
+		    ldcp->id, pkttype, subtype, ctrlmsg);
+		return (EIO);
+	}
+
+	/* commit the new seqid/tail only after the HV accepted the pkt */
+	ldcp->last_msg_snt = curr_seqid;
+	ldcp->tx_tail = tx_tail;
+
+	return (0);
+}
+
+/*
+ * Checks if packet was received in right order
+ * in the case of a reliable transport.
+ * Returns 0 if in order, else EIO
+ */
+/*
+ * Checks if packet was received in right order
+ * in the case of a reliable transport.
+ * Returns 0 if in order, else EIO
+ *
+ * RAW-mode packets carry no seqid, and VER/RTS/RTR control packets are
+ * exchanged before seqids are synchronized, so all are exempt.
+ */
+static int
+i_ldc_check_seqid(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	/* No seqid checking for RAW mode */
+	if (ldcp->mode == LDC_MODE_RAW)
+		return (0);
+
+	/* No seqid checking for version, RTS, RTR message */
+	if (msg->ctrl == LDC_VER ||
+	    msg->ctrl == LDC_RTS ||
+	    msg->ctrl == LDC_RTR)
+		return (0);
+
+	/* Initial seqid to use is sent in RTS/RTR and saved in last_msg_rcd */
+	if (msg->seqid != (ldcp->last_msg_rcd + 1)) {
+		DWARN(ldcp->id,
+		    "i_ldc_check_seqid: (0x%llx) out-of-order pkt, got 0x%x, "
+		    "expecting 0x%x\n", ldcp->id, msg->seqid,
+		    (ldcp->last_msg_rcd + 1));
+		return (EIO);
+	}
+
+	return (0);
+}
+
+
+/*
+ * Process an incoming version ctrl message
+ */
+/*
+ * Process an incoming version (VER) control message.
+ *
+ * LDC_INFO:  peer proposes a version — walk ldc_versions[] starting at
+ *            the saved next_vidx and reply with ACK (major match, minor
+ *            possibly lowered), NACK carrying our next lower version,
+ *            or NACK with version 0.0 when nothing matches.
+ * LDC_ACK:   version agreed — record it and initiate the RTS side of
+ *            the RTS/RTR/RDX handshake.
+ * LDC_NACK:  peer rejected our proposal — retry with the version it
+ *            suggested, or fail the handshake if none remain.
+ *
+ * Returns 0 on success, ECONNRESET if the channel had to be reset.
+ */
+static int
+i_ldc_process_VER(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int 		rv = 0, idx = ldcp->next_vidx;
+	ldc_msg_t 	*pkt;
+	uint64_t	tx_tail;
+	ldc_ver_t	*rcvd_ver;
+
+	/* get the received version */
+	rcvd_ver = (ldc_ver_t *)((uint64_t)msg + LDC_PAYLOAD_VER_OFF);
+
+	D2(ldcp->id, "i_ldc_process_VER: (0x%llx) received VER v%u.%u\n",
+	    ldcp->id, rcvd_ver->major, rcvd_ver->minor);
+
+	switch (msg->stype) {
+	case LDC_INFO:
+
+		/* get the current tail and pkt for the response */
+		rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+		if (rv != 0) {
+			DWARN(ldcp->id,
+			    "i_ldc_process_VER: (0x%llx) err sending "
+			    "version ACK/NACK\n", ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+		ZERO_PKT(pkt);
+
+		/* initialize the packet */
+		pkt->type = LDC_CTRL;
+		pkt->ctrl = LDC_VER;
+
+		/* find a major-version match, scanning from next_vidx */
+		for (;;) {
+
+			D1(ldcp->id, "i_ldc_process_VER: got %u.%u chk %u.%u\n",
+			    rcvd_ver->major, rcvd_ver->minor,
+			    ldc_versions[idx].major, ldc_versions[idx].minor);
+
+			if (rcvd_ver->major == ldc_versions[idx].major) {
+				/* major version match - ACK version */
+				pkt->stype = LDC_ACK;
+
+				/*
+				 * lower minor version to the one this endpt
+				 * supports, if necessary
+				 */
+				if (rcvd_ver->minor > ldc_versions[idx].minor)
+					rcvd_ver->minor =
+						ldc_versions[idx].minor;
+				bcopy(rcvd_ver, pkt->udata, sizeof (*rcvd_ver));
+
+				break;
+			}
+
+			if (rcvd_ver->major > ldc_versions[idx].major) {
+
+				D1(ldcp->id, "i_ldc_process_VER: using next"
+				    " lower idx=%d, v%u.%u\n", idx,
+				    ldc_versions[idx].major,
+				    ldc_versions[idx].minor);
+
+				/* nack with next lower version */
+				pkt->stype = LDC_NACK;
+				bcopy(&ldc_versions[idx], pkt->udata,
+				    sizeof (ldc_versions[idx]));
+				ldcp->next_vidx = idx;
+				break;
+			}
+
+			/* next major version */
+			idx++;
+
+			D1(ldcp->id, "i_ldc_process_VER: inc idx %x\n", idx);
+
+			if (idx == LDC_NUM_VERS) {
+				/* no version match - send NACK */
+				pkt->stype = LDC_NACK;
+				bzero(pkt->udata, sizeof (ldc_ver_t));
+				ldcp->next_vidx = 0;
+				break;
+			}
+		}
+
+		/* initiate the send by calling into HV and set the new tail */
+		tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+			(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+		rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+		if (rv == 0) {
+			ldcp->tx_tail = tx_tail;
+			if (pkt->stype == LDC_ACK) {
+				D2(ldcp->id, "i_ldc_process_VER: (0x%llx) sent"
+				    " version ACK\n", ldcp->id);
+				/* Save the ACK'd version */
+				ldcp->version.major = rcvd_ver->major;
+				ldcp->version.minor = rcvd_ver->minor;
+				ldcp->tstate |= TS_VER_DONE;
+				DWARN(DBG_ALL_LDCS,
+				    "(0x%llx) Agreed on version v%u.%u\n",
+				    ldcp->id, rcvd_ver->major, rcvd_ver->minor);
+			}
+		} else {
+			DWARN(ldcp->id,
+			    "i_ldc_process_VER: (0x%llx) error sending "
+			    "ACK/NACK\n", ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		break;
+
+	case LDC_ACK:
+		/* SUCCESS - we have agreed on a version */
+		ldcp->version.major = rcvd_ver->major;
+		ldcp->version.minor = rcvd_ver->minor;
+		ldcp->tstate |= TS_VER_DONE;
+
+		D1(DBG_ALL_LDCS, "(0x%llx) Agreed on version v%u.%u\n",
+		    ldcp->id, rcvd_ver->major, rcvd_ver->minor);
+
+		/* initiate RTS-RTR-RDX handshake */
+		rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+		if (rv) {
+			DWARN(ldcp->id,
+			    "i_ldc_process_VER: (0x%llx) cannot send RTS\n",
+			    ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+		ZERO_PKT(pkt);
+
+		pkt->type = LDC_CTRL;
+		pkt->stype = LDC_INFO;
+		pkt->ctrl = LDC_RTS;
+		/* RTS carries the transport mode for the peer to verify */
+		pkt->env = ldcp->mode;
+		if (ldcp->mode != LDC_MODE_RAW)
+			pkt->seqid = LDC_INIT_SEQID;
+
+		ldcp->last_msg_rcd = LDC_INIT_SEQID;
+
+		DUMP_LDC_PKT(ldcp, "i_ldc_process_VER snd rts", (uint64_t)pkt);
+
+		/* initiate the send by calling into HV and set the new tail */
+		tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+			(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+		rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+		if (rv) {
+			D2(ldcp->id,
+			    "i_ldc_process_VER: (0x%llx) no listener\n",
+			    ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		ldcp->last_msg_snt++;
+		ldcp->tx_tail = tx_tail;
+		ldcp->hstate |= TS_SENT_RTS;
+
+		break;
+
+	case LDC_NACK:
+		/* check if version in NACK is zero */
+		if (rcvd_ver->major == 0 && rcvd_ver->minor == 0) {
+			/* version handshake failure */
+			DWARN(DBG_ALL_LDCS,
+			    "i_ldc_process_VER: (0x%llx) no version match\n",
+			    ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		/* get the current tail and pkt for the response */
+		rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+		if (rv != 0) {
+			cmn_err(CE_NOTE,
+			    "i_ldc_process_VER: (0x%lx) err sending "
+			    "version ACK/NACK\n", ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+		ZERO_PKT(pkt);
+
+		/* initialize the packet */
+		pkt->type = LDC_CTRL;
+		pkt->ctrl = LDC_VER;
+		pkt->stype = LDC_INFO;
+
+		/* check ver in NACK msg has a match */
+		for (;;) {
+			if (rcvd_ver->major == ldc_versions[idx].major) {
+				/*
+				 * major version match - resubmit request
+				 * if lower minor version to the one this endpt
+				 * supports, if necessary
+				 */
+				if (rcvd_ver->minor > ldc_versions[idx].minor)
+					rcvd_ver->minor =
+						ldc_versions[idx].minor;
+				bcopy(rcvd_ver, pkt->udata, sizeof (*rcvd_ver));
+				break;
+
+			}
+
+			if (rcvd_ver->major > ldc_versions[idx].major) {
+
+				D1(ldcp->id, "i_ldc_process_VER: using next"
+				    " lower idx=%d, v%u.%u\n", idx,
+				    ldc_versions[idx].major,
+				    ldc_versions[idx].minor);
+
+				/* send next lower version */
+				bcopy(&ldc_versions[idx], pkt->udata,
+				    sizeof (ldc_versions[idx]));
+				ldcp->next_vidx = idx;
+				break;
+			}
+
+			/* next version */
+			idx++;
+
+			D1(ldcp->id, "i_ldc_process_VER: inc idx %x\n", idx);
+
+			if (idx == LDC_NUM_VERS) {
+				/* no version match - terminate */
+				ldcp->next_vidx = 0;
+				return (ECONNRESET);
+			}
+		}
+
+		/* initiate the send by calling into HV and set the new tail */
+		tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+			(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+		rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+		if (rv == 0) {
+			D2(ldcp->id, "i_ldc_process_VER: (0x%llx) sent version"
+			    "INFO v%u.%u\n", ldcp->id, ldc_versions[idx].major,
+			    ldc_versions[idx].minor);
+			ldcp->tx_tail = tx_tail;
+		} else {
+			cmn_err(CE_NOTE,
+			    "i_ldc_process_VER: (0x%lx) error sending version"
+			    "INFO\n", ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		break;
+	}
+
+	return (rv);
+}
+
+
+/*
+ * Process an incoming RTS ctrl message
+ */
+/*
+ * Process an incoming RTS (request-to-send) control message.
+ *
+ * LDC_INFO:  verify the peer's transport mode matches ours (NACK the
+ *            RTS on mismatch), then record the peer's initial seqid
+ *            and reply with RTR.
+ * LDC_NACK:  handshake cannot proceed — reset the channel.
+ * default:   an ACK is never expected for RTS — reset the channel.
+ *
+ * Returns 0 on success, ECONNRESET if the channel was reset.
+ */
+static int
+i_ldc_process_RTS(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int 		rv = 0;
+	ldc_msg_t 	*pkt;
+	uint64_t	tx_tail;
+	boolean_t	sent_NACK = B_FALSE;
+
+	D2(ldcp->id, "i_ldc_process_RTS: (0x%llx) received RTS\n", ldcp->id);
+
+	switch (msg->stype) {
+	case LDC_NACK:
+		DWARN(ldcp->id,
+		    "i_ldc_process_RTS: (0x%llx) RTS NACK received\n",
+		    ldcp->id);
+
+		/* Reset the channel -- as we cannot continue */
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+		break;
+
+	case LDC_INFO:
+
+		/* check mode */
+		if (ldcp->mode != (ldc_mode_t)msg->env) {
+			cmn_err(CE_NOTE,
+			    "i_ldc_process_RTS: (0x%lx) mode mismatch\n",
+			    ldcp->id);
+			/*
+			 * send NACK in response to MODE message
+			 * get the current tail for the response
+			 */
+			rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTS);
+			if (rv) {
+				/* if cannot send NACK - reset channel */
+				i_ldc_reset(ldcp);
+				rv = ECONNRESET;
+				break;
+			}
+			sent_NACK = B_TRUE;
+		}
+		break;
+	default:
+		DWARN(ldcp->id, "i_ldc_process_RTS: (0x%llx) unexp ACK\n",
+		    ldcp->id);
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+		break;
+	}
+
+	/*
+	 * If either the connection was reset (when rv != 0) or
+	 * a NACK was sent, we return. In the case of a NACK
+	 * we dont want to consume the packet that came in but
+	 * not record that we received the RTS
+	 */
+	if (rv || sent_NACK)
+		return (rv);
+
+	/* record RTS received */
+	ldcp->hstate |= TS_RCVD_RTS;
+
+	/* store initial SEQID info */
+	ldcp->last_msg_snt = msg->seqid;
+
+	/* get the current tail for the response */
+	rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+	if (rv != 0) {
+		cmn_err(CE_NOTE,
+		    "i_ldc_process_RTS: (0x%lx) err sending RTR\n",
+		    ldcp->id);
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+	ZERO_PKT(pkt);
+
+	/* initialize the packet */
+	pkt->type = LDC_CTRL;
+	pkt->stype = LDC_INFO;
+	pkt->ctrl = LDC_RTR;
+	/* RTR echoes our transport mode back to the peer */
+	pkt->env = ldcp->mode;
+	if (ldcp->mode != LDC_MODE_RAW)
+		pkt->seqid = LDC_INIT_SEQID;
+
+	ldcp->last_msg_rcd = msg->seqid;
+
+	/* initiate the send by calling into HV and set the new tail */
+	tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+		(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+	if (rv == 0) {
+		D2(ldcp->id,
+		    "i_ldc_process_RTS: (0x%llx) sent RTR\n", ldcp->id);
+		DUMP_LDC_PKT(ldcp, "i_ldc_process_RTS sent rtr", (uint64_t)pkt);
+
+		ldcp->tx_tail = tx_tail;
+		ldcp->hstate |= TS_SENT_RTR;
+
+	} else {
+		cmn_err(CE_NOTE,
+		    "i_ldc_process_RTS: (0x%lx) error sending RTR\n",
+		    ldcp->id);
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	return (0);
+}
+
+/*
+ * Process an incoming RTR ctrl message
+ */
+/*
+ * Process an incoming RTR (ready-to-receive) control message.
+ *
+ * LDC_INFO:  verify the transport mode (NACK on mismatch), record the
+ *            peer's seqid, send RDX, and mark the handshake complete —
+ *            the channel is UP after this.
+ * LDC_NACK:  handshake cannot proceed — reset the channel.
+ * default:   an ACK is never expected for RTR — reset the channel.
+ *
+ * Returns 0 on success, ECONNRESET if the channel was reset.
+ */
+static int
+i_ldc_process_RTR(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int 		rv = 0;
+	boolean_t	sent_NACK = B_FALSE;
+
+	D2(ldcp->id, "i_ldc_process_RTR: (0x%llx) received RTR\n", ldcp->id);
+
+	switch (msg->stype) {
+	case LDC_NACK:
+		/* RTR NACK received */
+		DWARN(ldcp->id,
+		    "i_ldc_process_RTR: (0x%llx) RTR NACK received\n",
+		    ldcp->id);
+
+		/* Reset the channel -- as we cannot continue */
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+
+		break;
+
+	case LDC_INFO:
+
+		/* check mode */
+		if (ldcp->mode != (ldc_mode_t)msg->env) {
+			DWARN(ldcp->id,
+			    "i_ldc_process_RTR: (0x%llx) mode mismatch\n",
+			    ldcp->id);
+			/*
+			 * send NACK in response to MODE message
+			 * get the current tail for the response
+			 */
+			rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK, LDC_RTR);
+			if (rv) {
+				/* if cannot send NACK - reset channel */
+				i_ldc_reset(ldcp);
+				rv = ECONNRESET;
+				break;
+			}
+			sent_NACK = B_TRUE;
+		}
+		break;
+
+	default:
+		DWARN(ldcp->id, "i_ldc_process_RTR: (0x%llx) unexp ACK\n",
+		    ldcp->id);
+
+		/* Reset the channel -- as we cannot continue */
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+		break;
+	}
+
+	/*
+	 * If either the connection was reset (when rv != 0) or
+	 * a NACK was sent, we return. In the case of a NACK
+	 * we dont want to consume the packet that came in but
+	 * not record that we received the RTR
+	 */
+	if (rv || sent_NACK)
+		return (rv);
+
+	ldcp->last_msg_snt = msg->seqid;
+	ldcp->hstate |= TS_RCVD_RTR;
+
+	/* final step of the three-way handshake: send RDX */
+	rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_INFO, LDC_RDX);
+	if (rv) {
+		cmn_err(CE_NOTE,
+		    "i_ldc_process_RTR: (0x%lx) cannot send RDX\n",
+		    ldcp->id);
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+	D2(ldcp->id,
+	    "i_ldc_process_RTR: (0x%llx) sent RDX\n", ldcp->id);
+
+	ldcp->hstate |= TS_SENT_RDX;
+	ldcp->tstate |= TS_HSHAKE_DONE;
+	ldcp->status = LDC_UP;
+
+	DWARN(DBG_ALL_LDCS, "(0x%llx) Handshake Complete\n", ldcp->id);
+
+	return (0);
+}
+
+
+/*
+ * Process an incoming RDX ctrl message
+ */
+/*
+ * Process an incoming RDX (ready-for-data) control message.
+ *
+ * LDC_INFO:  a duplicate RDX after the channel is already UP is a
+ *            protocol error (reset); otherwise mark the handshake
+ *            complete and the channel UP.
+ * LDC_NACK / default: cannot continue — reset the channel.
+ *
+ * Returns 0 on success, ECONNRESET if the channel was reset.
+ */
+static int
+i_ldc_process_RDX(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int	rv = 0;
+
+	D2(ldcp->id, "i_ldc_process_RDX: (0x%llx) received RDX\n", ldcp->id);
+
+	switch (msg->stype) {
+	case LDC_NACK:
+		/* RDX NACK received */
+		DWARN(ldcp->id,
+		    "i_ldc_process_RDX: (0x%llx) RDX NACK received\n",
+		    ldcp->id);
+
+		/* Reset the channel -- as we cannot continue */
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+
+		break;
+
+	case LDC_INFO:
+
+		/*
+		 * if channel is UP and a RDX received after data transmission
+		 * has commenced it is an error
+		 */
+		if ((ldcp->tstate == TS_UP) && (ldcp->hstate & TS_RCVD_RDX)) {
+			DWARN(DBG_ALL_LDCS,
+			    "i_ldc_process_RDX: (0x%llx) unexpected RDX"
+			    " - LDC reset\n", ldcp->id);
+			i_ldc_reset(ldcp);
+			return (ECONNRESET);
+		}
+
+		ldcp->hstate |= TS_RCVD_RDX;
+		ldcp->tstate |= TS_HSHAKE_DONE;
+		ldcp->status = LDC_UP;
+
+		D1(DBG_ALL_LDCS, "(0x%llx) Handshake Complete\n", ldcp->id);
+		break;
+
+	default:
+		DWARN(ldcp->id, "i_ldc_process_RDX: (0x%llx) unexp ACK\n",
+		    ldcp->id);
+
+		/* Reset the channel -- as we cannot continue */
+		i_ldc_reset(ldcp);
+		rv = ECONNRESET;
+		break;
+	}
+
+	return (rv);
+}
+
+/*
+ * Process an incoming ACK for a data packet
+ */
+/*
+ * Process an incoming ACK for a data packet: advance tx_ackd_head past
+ * the packet whose seqid matches the ACK, so the reliable-mode sender
+ * can reuse those queue slots.  Always returns 0; an unreadable queue
+ * state or an unmatched ackid is only logged.
+ */
+static int
+i_ldc_process_data_ACK(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int		rv;
+	uint64_t 	tx_head;
+	ldc_msg_t	*pkt;
+
+	/*
+	 * Read the current Tx head and tail
+	 */
+	rv = hv_ldc_tx_get_state(ldcp->id,
+	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "i_ldc_process_data_ACK: (0x%lx) cannot read qptrs\n",
+		    ldcp->id);
+		return (0);
+	}
+
+	/*
+	 * loop from where the previous ACK location was to the
+	 * current head location. This is how far the HV has
+	 * actually sent pkts. Pkts between head and tail are
+	 * yet to be sent by HV.
+	 */
+	tx_head = ldcp->tx_ackd_head;
+	for (;;) {
+		pkt = (ldc_msg_t *)(ldcp->tx_q_va + tx_head);
+		tx_head = (tx_head + LDC_PACKET_SIZE) %
+			(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+		if (pkt->seqid == msg->ackid) {
+			D2(ldcp->id,
+			    "i_ldc_process_data_ACK: (0x%llx) found packet\n",
+			    ldcp->id);
+			/* slots up to (and incl.) the ACKed pkt are free */
+			ldcp->last_ack_rcd = msg->ackid;
+			ldcp->tx_ackd_head = tx_head;
+			break;
+		}
+		if (tx_head == ldcp->tx_head) {
+			/* could not find packet */
+			DWARN(ldcp->id,
+			    "i_ldc_process_data_ACK: (0x%llx) invalid ACKid\n",
+			    ldcp->id);
+			break;
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Process incoming control message
+ * Return 0 - session can continue
+ *        EAGAIN - reprocess packet - state was changed
+ *	  ECONNRESET - channel was reset
+ * Note: a ctrl msg arriving in any tstate other than OPEN, READY,
+ * VREADY or UP falls through the switch and is ignored (returns 0).
+ */
+static int
+i_ldc_ctrlmsg(ldc_chan_t *ldcp, ldc_msg_t *msg)
+{
+	int 		rv = 0;
+
+	switch (ldcp->tstate) {
+
+	case TS_OPEN:
+	case TS_READY:
+
+		switch (msg->ctrl & LDC_CTRL_MASK) {
+		case LDC_VER:
+			/* process version message */
+			rv = i_ldc_process_VER(ldcp, msg);
+			break;
+		default:
+			DWARN(ldcp->id,
+			    "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x "
+			    "tstate=0x%x\n", ldcp->id,
+			    (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate);
+			break;
+		}
+
+		break;
+
+	case TS_VREADY:
+
+		switch (msg->ctrl & LDC_CTRL_MASK) {
+		case LDC_VER:
+			/* peer is redoing version negotiation */
+			(void) i_ldc_txq_reconf(ldcp);
+			i_ldc_reset_state(ldcp);
+			rv = EAGAIN;
+			break;
+		case LDC_RTS:
+			/* process RTS message */
+			rv = i_ldc_process_RTS(ldcp, msg);
+			break;
+		case LDC_RTR:
+			/* process RTR message */
+			rv = i_ldc_process_RTR(ldcp, msg);
+			break;
+		case LDC_RDX:
+			/* process RDX message */
+			rv = i_ldc_process_RDX(ldcp, msg);
+			break;
+		default:
+			DWARN(ldcp->id,
+			    "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x "
+			    "tstate=0x%x\n", ldcp->id,
+			    (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate);
+			break;
+		}
+
+		break;
+
+	case TS_UP:
+
+		switch (msg->ctrl & LDC_CTRL_MASK) {
+		case LDC_VER:
+			DWARN(ldcp->id,
+			    "i_ldc_ctrlmsg: (0x%llx) unexpected VER "
+			    "- LDC reset\n", ldcp->id);
+			/* peer is redoing version negotiation */
+			(void) i_ldc_txq_reconf(ldcp);
+			i_ldc_reset_state(ldcp);
+			rv = EAGAIN;
+			break;
+
+		case LDC_RDX:
+			/* process RDX message */
+			rv = i_ldc_process_RDX(ldcp, msg);
+			break;
+
+		default:
+			DWARN(ldcp->id,
+			    "i_ldc_ctrlmsg: (0x%llx) unexp ctrl 0x%x "
+			    "tstate=0x%x\n", ldcp->id,
+			    (msg->ctrl & LDC_CTRL_MASK), ldcp->tstate);
+			break;
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * Register channel with the channel nexus
+ * Registers the channel and adds its Tx and Rx interrupt handlers.
+ * On any failure the partial registration is rolled back before
+ * returning. Returns EAGAIN if the cnex has not registered itself
+ * with the LDC framework yet.
+ */
+static int
+i_ldc_register_channel(ldc_chan_t *ldcp)
+{
+	int		rv = 0;
+	ldc_cnex_t	*cinfo = &ldcssp->cinfo;
+
+	if (cinfo->dip == NULL) {
+		DWARN(ldcp->id,
+		    "i_ldc_register_channel: cnex has not registered\n");
+		return (EAGAIN);
+	}
+
+	rv = cinfo->reg_chan(cinfo->dip, ldcp->id, ldcp->devclass);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "i_ldc_register_channel: cannot register channel\n");
+		return (rv);
+	}
+
+	rv = cinfo->add_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR,
+	    i_ldc_tx_hdlr, ldcp, NULL);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "i_ldc_register_channel: cannot add Tx interrupt\n");
+		(void) cinfo->unreg_chan(cinfo->dip, ldcp->id);
+		return (rv);
+	}
+
+	rv = cinfo->add_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR,
+	    i_ldc_rx_hdlr, ldcp, NULL);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "i_ldc_register_channel: cannot add Rx interrupt\n");
+		/* undo the Tx interrupt and channel registration */
+		(void) cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR);
+		(void) cinfo->unreg_chan(cinfo->dip, ldcp->id);
+		return (rv);
+	}
+
+	ldcp->tstate |= TS_CNEX_RDY;
+
+	return (0);
+}
+
+/*
+ * Unregister a channel with the channel nexus
+ * Interrupt/channel removal errors from the cnex are logged but not
+ * propagated; the function returns 0 unless the cnex itself has not
+ * registered (EAGAIN).
+ */
+static int
+i_ldc_unregister_channel(ldc_chan_t *ldcp)
+{
+	int		rv = 0;
+	ldc_cnex_t	*cinfo = &ldcssp->cinfo;
+
+	if (cinfo->dip == NULL) {
+		DWARN(ldcp->id,
+		    "i_ldc_unregister_channel: cnex has not registered\n");
+		return (EAGAIN);
+	}
+
+	if (ldcp->tstate & TS_CNEX_RDY) {
+
+		rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_RX_INTR);
+		if (rv) {
+			DWARN(ldcp->id,
+			    "i_ldc_unregister_channel: err removing Rx intr\n");
+		}
+		rv = cinfo->rem_intr(cinfo->dip, ldcp->id, CNEX_TX_INTR);
+		if (rv) {
+			DWARN(ldcp->id,
+			    "i_ldc_unregister_channel: err removing Tx intr\n");
+		}
+		rv = cinfo->unreg_chan(ldcssp->cinfo.dip, ldcp->id);
+		if (rv) {
+			DWARN(ldcp->id,
+			    "i_ldc_unregister_channel: cannot unreg channel\n");
+		}
+
+		ldcp->tstate &= ~TS_CNEX_RDY;
+	}
+
+	return (0);
+}
+
+
+/*
+ * LDC transmit interrupt handler
+ *    triggered for channel up/down/reset events
+ *    and Tx queue content changes
+ * Always returns DDI_INTR_CLAIMED; the client callback (if enabled)
+ * is invoked with the channel lock dropped.
+ */
+static uint_t
+i_ldc_tx_hdlr(caddr_t arg1, caddr_t arg2)
+{
+	_NOTE(ARGUNUSED(arg2))
+
+	int 		rv;
+	ldc_chan_t 	*ldcp;
+	boolean_t 	notify_client = B_FALSE;
+	uint64_t	notify_event = 0;
+
+	/* Get the channel for which interrupt was received */
+	ASSERT(arg1 != NULL);
+	ldcp = (ldc_chan_t *)arg1;
+
+	D1(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) Received intr, ldcp=0x%p\n",
+	    ldcp->id, ldcp);
+
+	/* Lock channel */
+	mutex_enter(&ldcp->lock);
+
+	rv = hv_ldc_tx_get_state(ldcp->id, &ldcp->tx_head, &ldcp->tx_tail,
+	    &ldcp->link_state);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "i_ldc_tx_hdlr: (0x%lx) cannot read queue ptrs rv=0x%d\n",
+		    ldcp->id, rv);
+		mutex_exit(&ldcp->lock);
+		return (DDI_INTR_CLAIMED);
+	}
+
+	/*
+	 * reset the channel state if the channel went down
+	 * (other side unconfigured queue) or channel was reset
+	 * (other side reconfigured its queue)
+	 */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+		D1(ldcp->id, "i_ldc_tx_hdlr: channel link down\n", ldcp->id);
+		i_ldc_reset(ldcp);
+		notify_client = B_TRUE;
+		notify_event = LDC_EVT_DOWN;
+	}
+
+	if (ldcp->link_state == LDC_CHANNEL_RESET) {
+		D1(ldcp->id, "i_ldc_tx_hdlr: channel link reset\n", ldcp->id);
+		i_ldc_reset(ldcp);
+		notify_client = B_TRUE;
+		notify_event = LDC_EVT_RESET;
+	}
+
+	if (ldcp->tstate == TS_OPEN && ldcp->link_state == LDC_CHANNEL_UP) {
+		D1(ldcp->id, "i_ldc_tx_hdlr: channel link up\n", ldcp->id);
+		notify_client = B_TRUE;
+		notify_event = LDC_EVT_RESET;
+		ldcp->tstate |= TS_LINK_READY;
+		ldcp->status = LDC_READY;
+	}
+
+	/* if callbacks are disabled, do not notify */
+	if (!ldcp->cb_enabled)
+		notify_client = B_FALSE;
+
+	if (notify_client)
+		ldcp->cb_inprogress = B_TRUE;
+
+	/* Unlock channel */
+	mutex_exit(&ldcp->lock);
+
+	/* invoke the client callback without holding the channel lock */
+	if (notify_client) {
+		rv = ldcp->cb(notify_event, ldcp->cb_arg);
+		if (rv) {
+			DWARN(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) callback "
+			    "failure", ldcp->id);
+		}
+		mutex_enter(&ldcp->lock);
+		ldcp->cb_inprogress = B_FALSE;
+		mutex_exit(&ldcp->lock);
+	}
+
+	mutex_enter(&ldcp->lock);
+	i_ldc_clear_intr(ldcp, CNEX_TX_INTR);
+	mutex_exit(&ldcp->lock);
+
+	D1(ldcp->id, "i_ldc_tx_hdlr: (0x%llx) exiting handler", ldcp->id);
+
+	return (DDI_INTR_CLAIMED);
+}
+
+/*
+ * LDC receive interrupt handler
+ *    triggered for channel with data pending to read
+ *    i.e. Rx queue content changes
+ * Drains/inspects the Rx queue under the channel lock, then invokes
+ * the client callback (if enabled) with the lock dropped.
+ */
+static uint_t
+i_ldc_rx_hdlr(caddr_t arg1, caddr_t arg2)
+{
+	_NOTE(ARGUNUSED(arg2))
+
+	int		rv;
+	uint64_t 	rx_head, rx_tail;
+	ldc_msg_t 	*msg;
+	ldc_chan_t 	*ldcp;
+	boolean_t 	notify_client = B_FALSE;
+	uint64_t	notify_event = 0;
+
+	/* Get the channel for which interrupt was received */
+	if (arg1 == NULL) {
+		cmn_err(CE_WARN, "i_ldc_rx_hdlr: invalid arg\n");
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	ldcp = (ldc_chan_t *)arg1;
+
+	D1(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) Received intr, ldcp=0x%p\n",
+	    ldcp->id, ldcp);
+
+	/* Lock channel */
+	mutex_enter(&ldcp->lock);
+
+	/* mark interrupt as pending */
+	ldcp->intr_pending = B_TRUE;
+
+	/*
+	 * Read packet(s) from the queue
+	 */
+	for (;;) {
+
+		rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
+		    &ldcp->link_state);
+		if (rv) {
+			cmn_err(CE_WARN,
+			    "i_ldc_rx_hdlr: (0x%lx) cannot read "
+			    "queue ptrs, rv=0x%d\n", ldcp->id, rv);
+			i_ldc_clear_intr(ldcp, CNEX_RX_INTR);
+			mutex_exit(&ldcp->lock);
+			return (DDI_INTR_CLAIMED);
+		}
+
+		/*
+		 * reset the channel state if the channel went down
+		 * (other side unconfigured queue) or channel was reset
+		 * (other side reconfigured its queue)
+		 */
+		if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+			D1(ldcp->id, "i_ldc_rx_hdlr: channel link down\n",
+			    ldcp->id);
+			i_ldc_reset(ldcp);
+			notify_client = B_TRUE;
+			notify_event = LDC_EVT_DOWN;
+			break;
+		}
+		if (ldcp->link_state == LDC_CHANNEL_RESET) {
+			D1(ldcp->id, "i_ldc_rx_hdlr: channel link reset\n",
+			    ldcp->id);
+			i_ldc_reset(ldcp);
+			notify_client = B_TRUE;
+			notify_event = LDC_EVT_RESET;
+		}
+
+		if (ldcp->tstate == TS_OPEN &&
+		    ldcp->link_state == LDC_CHANNEL_UP) {
+			D1(ldcp->id, "i_ldc_rx_hdlr: channel link up\n",
+			    ldcp->id);
+			notify_client = B_TRUE;
+			notify_event = LDC_EVT_RESET;
+			ldcp->tstate |= TS_LINK_READY;
+			ldcp->status = LDC_READY;
+		}
+
+		if (rx_head == rx_tail) {
+			D2(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) No packets\n",
+			    ldcp->id);
+			break;
+		}
+		D2(ldcp->id, "i_ldc_rx_hdlr: head=0x%llx, tail=0x%llx\n",
+		    rx_head, rx_tail);
+		DUMP_LDC_PKT(ldcp, "i_ldc_rx_hdlr rcd",
+		    ldcp->rx_q_va + rx_head);
+
+		/* get the message */
+		msg = (ldc_msg_t *)(ldcp->rx_q_va + rx_head);
+
+		/* if channel is in RAW mode or data pkt, notify and return */
+		if (ldcp->mode == LDC_MODE_RAW) {
+			notify_client = B_TRUE;
+			notify_event |= LDC_EVT_READ;
+			break;
+		}
+
+		if ((msg->type & LDC_DATA) && (msg->stype & LDC_INFO)) {
+
+			/* discard packet if channel is not up */
+			if (ldcp->tstate != TS_UP) {
+
+				/* move the head one position */
+				rx_head = (rx_head + LDC_PACKET_SIZE) %
+				(ldcp->rx_q_entries << LDC_PACKET_SHIFT);
+
+				if (rv = i_ldc_set_rx_head(ldcp, rx_head))
+					break;
+
+				continue;
+			} else {
+				notify_client = B_TRUE;
+				notify_event |= LDC_EVT_READ;
+				break;
+			}
+		}
+
+		/* Check the sequence ID for the message received */
+		if ((rv = i_ldc_check_seqid(ldcp, msg)) != 0) {
+
+			DWARN(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) seqid error, "
+			    "q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail);
+
+			/* Reset last_msg_rcd to start of message */
+			if (ldcp->first_fragment != 0) {
+				ldcp->last_msg_rcd =
+					ldcp->first_fragment - 1;
+				ldcp->first_fragment = 0;
+			}
+			/*
+			 * Send a NACK due to seqid mismatch
+			 */
+			rv = i_ldc_send_pkt(ldcp, LDC_CTRL, LDC_NACK,
+			    (msg->ctrl & LDC_CTRL_MASK));
+
+			if (rv) {
+				cmn_err(CE_NOTE,
+				    "i_ldc_rx_hdlr: (0x%lx) err sending "
+				    "CTRL/NACK msg\n", ldcp->id);
+			}
+
+			/* purge receive queue */
+			(void) i_ldc_set_rx_head(ldcp, rx_tail);
+			break;
+		}
+
+		/* record the message ID */
+		ldcp->last_msg_rcd = msg->seqid;
+
+		/* process control messages */
+		if (msg->type & LDC_CTRL) {
+			/* save current internal state */
+			uint64_t tstate = ldcp->tstate;
+
+			rv = i_ldc_ctrlmsg(ldcp, msg);
+			if (rv == EAGAIN) {
+				/* re-process pkt - state was adjusted */
+				continue;
+			}
+			if (rv == ECONNRESET) {
+				notify_client = B_TRUE;
+				notify_event = LDC_EVT_RESET;
+				break;
+			}
+
+			/*
+			 * control message processing was successful
+			 * channel transitioned to ready for communication
+			 */
+			if (rv == 0 && ldcp->tstate == TS_UP &&
+			    tstate != ldcp->tstate) {
+				notify_client = B_TRUE;
+				notify_event = LDC_EVT_UP;
+			}
+		}
+
+		/* process data ACKs */
+		if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
+			(void) i_ldc_process_data_ACK(ldcp, msg);
+		}
+
+		/* move the head one position */
+		rx_head = (rx_head + LDC_PACKET_SIZE) %
+			(ldcp->rx_q_entries << LDC_PACKET_SHIFT);
+		if (rv = i_ldc_set_rx_head(ldcp, rx_head))
+			break;
+
+	} /* for */
+
+	/* if callbacks are disabled, do not notify */
+	if (!ldcp->cb_enabled)
+		notify_client = B_FALSE;
+
+	if (notify_client)
+		ldcp->cb_inprogress = B_TRUE;
+
+	/* Unlock channel */
+	mutex_exit(&ldcp->lock);
+
+	if (notify_client) {
+		rv = ldcp->cb(notify_event, ldcp->cb_arg);
+		if (rv) {
+			DWARN(ldcp->id,
+			    "i_ldc_rx_hdlr: (0x%llx) callback failure",
+			    ldcp->id);
+		}
+		mutex_enter(&ldcp->lock);
+		ldcp->cb_inprogress = B_FALSE;
+		mutex_exit(&ldcp->lock);
+	}
+
+	mutex_enter(&ldcp->lock);
+
+	/*
+	 * If there are data packets in the queue, the ldc_read will
+	 * clear interrupts after draining the queue, else clear interrupts
+	 */
+	if ((notify_event & LDC_EVT_READ) == 0) {
+		i_ldc_clear_intr(ldcp, CNEX_RX_INTR);
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	D1(ldcp->id, "i_ldc_rx_hdlr: (0x%llx) exiting handler", ldcp->id);
+	return (DDI_INTR_CLAIMED);
+}
+
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * LDC API functions
+ */
+
+/*
+ * Initialize the channel. Allocate internal structure and memory for
+ * TX/RX queues, and initialize locks.
+ * On failure, all partially allocated resources are freed before an
+ * error code is returned.
+ */
+int
+ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle)
+{
+	ldc_chan_t 	*ldcp;
+	int		rv, exit_val;
+	uint64_t	ra_base, nentries;
+
+	exit_val = EINVAL;	/* guarantee an error if exit on failure */
+
+	if (attr == NULL) {
+		DWARN(id, "ldc_init: (0x%llx) invalid attr\n", id);
+		return (EINVAL);
+	}
+	if (handle == NULL) {
+		DWARN(id, "ldc_init: (0x%llx) invalid handle\n", id);
+		return (EINVAL);
+	}
+
+	/* check if channel is valid */
+	rv = hv_ldc_tx_qinfo(id, &ra_base, &nentries);
+	if (rv == H_ECHANNEL) {
+		DWARN(id, "ldc_init: (0x%llx) invalid channel id\n", id);
+		return (EINVAL);
+	}
+
+	/* check if the channel has already been initialized */
+	mutex_enter(&ldcssp->lock);
+	ldcp = ldcssp->chan_list;
+	while (ldcp != NULL) {
+		if (ldcp->id == id) {
+			DWARN(id, "ldc_init: (0x%llx) already initialized\n",
+			    id);
+			mutex_exit(&ldcssp->lock);
+			return (EADDRINUSE);
+		}
+		ldcp = ldcp->next;
+	}
+	mutex_exit(&ldcssp->lock);
+
+	ASSERT(ldcp == NULL);
+
+	*handle = 0;
+
+	/* Allocate an ldcp structure */
+	ldcp = kmem_zalloc(sizeof (ldc_chan_t), KM_SLEEP);
+
+	/* Initialize the channel lock */
+	mutex_init(&ldcp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Channel specific processing */
+	mutex_enter(&ldcp->lock);
+
+	/* Initialize the channel */
+	ldcp->id = id;
+	ldcp->cb = NULL;
+	ldcp->cb_arg = NULL;
+	ldcp->cb_inprogress = B_FALSE;
+	ldcp->cb_enabled = B_FALSE;
+	ldcp->next = NULL;
+
+	/* Read attributes */
+	ldcp->mode = attr->mode;
+	ldcp->devclass = attr->devclass;
+	ldcp->devinst = attr->instance;
+
+	/* queue length defaults to LDC_QUEUE_ENTRIES if not specified */
+	ldcp->rx_q_entries =
+		(attr->qlen > 0) ? attr->qlen : LDC_QUEUE_ENTRIES;
+	ldcp->tx_q_entries = ldcp->rx_q_entries;
+
+	D1(ldcp->id,
+	    "ldc_init: (0x%llx) channel attributes, class=0x%x, "
+	    "instance=0x%llx,mode=%d, qlen=%d\n",
+	    ldcp->id, ldcp->devclass, ldcp->devinst,
+	    ldcp->mode, ldcp->rx_q_entries);
+
+	ldcp->next_vidx = 0;
+	ldcp->tstate = 0;
+	ldcp->hstate = 0;
+	ldcp->last_msg_snt = LDC_INIT_SEQID;
+	ldcp->last_ack_rcd = 0;
+	ldcp->last_msg_rcd = 0;
+
+	ldcp->stream_bufferp = NULL;
+	ldcp->exp_dring_list = NULL;
+	ldcp->imp_dring_list = NULL;
+	ldcp->mhdl_list = NULL;
+
+	/* Initialize payload size depending on whether channel is reliable */
+	switch (ldcp->mode) {
+	case LDC_MODE_RAW:
+		ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RAW;
+		ldcp->read_p = i_ldc_read_raw;
+		ldcp->write_p = i_ldc_write_raw;
+		ldcp->mtu = 0;
+		break;
+	case LDC_MODE_UNRELIABLE:
+		ldcp->pkt_payload = LDC_PAYLOAD_SIZE_UNRELIABLE;
+		ldcp->read_p = i_ldc_read_packet;
+		ldcp->write_p = i_ldc_write_packet;
+		ldcp->mtu = 0;
+		break;
+	case LDC_MODE_RELIABLE:
+		ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RELIABLE;
+		ldcp->read_p = i_ldc_read_packet;
+		ldcp->write_p = i_ldc_write_packet;
+		ldcp->mtu = 0;
+		break;
+	case LDC_MODE_STREAM:
+		ldcp->pkt_payload = LDC_PAYLOAD_SIZE_RELIABLE;
+
+		/* stream mode buffers partial reads in stream_bufferp */
+		ldcp->stream_remains = 0;
+		ldcp->stream_offset = 0;
+		ldcp->mtu = LDC_STREAM_MTU;
+		ldcp->stream_bufferp = kmem_alloc(ldcp->mtu, KM_SLEEP);
+		ldcp->read_p = i_ldc_read_stream;
+		ldcp->write_p = i_ldc_write_stream;
+		break;
+	default:
+		exit_val = EINVAL;
+		goto cleanup_on_exit;
+	}
+
+	/* Create a transmit queue */
+	ldcp->tx_q_va = (uint64_t)
+		contig_mem_alloc(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+	if (ldcp->tx_q_va == NULL) {
+		cmn_err(CE_WARN,
+		    "ldc_init: (0x%lx) TX queue allocation failed\n",
+		    ldcp->id);
+		exit_val = ENOMEM;
+		goto cleanup_on_exit;
+	}
+	ldcp->tx_q_ra = va_to_pa((caddr_t)ldcp->tx_q_va);
+
+	D2(ldcp->id, "ldc_init: txq_va=0x%llx, txq_ra=0x%llx, entries=0x%llx\n",
+	    ldcp->tx_q_va, ldcp->tx_q_ra, ldcp->tx_q_entries);
+
+	ldcp->tstate |= TS_TXQ_RDY;
+
+	/* Create a receive queue */
+	ldcp->rx_q_va = (uint64_t)
+		contig_mem_alloc(ldcp->rx_q_entries << LDC_PACKET_SHIFT);
+	if (ldcp->rx_q_va == NULL) {
+		cmn_err(CE_WARN,
+		    "ldc_init: (0x%lx) RX queue allocation failed\n",
+		    ldcp->id);
+		exit_val = ENOMEM;
+		goto cleanup_on_exit;
+	}
+	ldcp->rx_q_ra = va_to_pa((caddr_t)ldcp->rx_q_va);
+
+	D2(ldcp->id, "ldc_init: rxq_va=0x%llx, rxq_ra=0x%llx, entries=0x%llx\n",
+	    ldcp->rx_q_va, ldcp->rx_q_ra, ldcp->rx_q_entries);
+
+	ldcp->tstate |= TS_RXQ_RDY;
+
+	/* Init descriptor ring and memory handle list lock */
+	mutex_init(&ldcp->exp_dlist_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->imp_dlist_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->mlist_lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* mark status as INITialized */
+	ldcp->status = LDC_INIT;
+
+	mutex_exit(&ldcp->lock);
+
+	/* Add to channel list */
+	mutex_enter(&ldcssp->lock);
+	ldcp->next = ldcssp->chan_list;
+	ldcssp->chan_list = ldcp;
+	ldcssp->channel_count++;
+	mutex_exit(&ldcssp->lock);
+
+	/* set the handle */
+	*handle = (ldc_handle_t)ldcp;
+
+	D1(ldcp->id, "ldc_init: (0x%llx) channel initialized\n", ldcp->id);
+
+	return (0);
+
+cleanup_on_exit:
+
+	/* free resources in reverse order of allocation */
+	if (ldcp->mode == LDC_MODE_STREAM && ldcp->stream_bufferp)
+		kmem_free(ldcp->stream_bufferp, ldcp->mtu);
+
+	if (ldcp->tstate & TS_TXQ_RDY)
+		contig_mem_free((caddr_t)ldcp->tx_q_va,
+		    (ldcp->tx_q_entries << LDC_PACKET_SHIFT));
+
+	if (ldcp->tstate & TS_RXQ_RDY)
+		contig_mem_free((caddr_t)ldcp->rx_q_va,
+		    (ldcp->rx_q_entries << LDC_PACKET_SHIFT));
+
+	mutex_exit(&ldcp->lock);
+	mutex_destroy(&ldcp->lock);
+
+	if (ldcp)
+		kmem_free(ldcp, sizeof (ldc_chan_t));
+
+	return (exit_val);
+}
+
+/*
+ * Finalizes the LDC connection. It will return EBUSY if the
+ * channel is open. A ldc_close() has to be done prior to
+ * a ldc_fini operation. It frees TX/RX queues, associated
+ * with the channel, plus the map table and stream buffer
+ * if they were allocated.
+ */
+int
+ldc_fini(ldc_handle_t handle)
+{
+	ldc_chan_t 	*ldcp;
+	ldc_chan_t 	*tmp_ldcp;
+	uint64_t 	id;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_fini: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+	id = ldcp->id;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate > TS_INIT) {
+		DWARN(ldcp->id, "ldc_fini: (0x%llx) channel is open\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EBUSY);
+	}
+
+	/* Remove from the channel list */
+	mutex_enter(&ldcssp->lock);
+	tmp_ldcp = ldcssp->chan_list;
+	if (tmp_ldcp == ldcp) {
+		ldcssp->chan_list = ldcp->next;
+		ldcp->next = NULL;
+	} else {
+		while (tmp_ldcp != NULL) {
+			if (tmp_ldcp->next == ldcp) {
+				tmp_ldcp->next = ldcp->next;
+				ldcp->next = NULL;
+				break;
+			}
+			tmp_ldcp = tmp_ldcp->next;
+		}
+		if (tmp_ldcp == NULL) {
+			DWARN(DBG_ALL_LDCS, "ldc_fini: invalid channel hdl\n");
+			mutex_exit(&ldcssp->lock);
+			mutex_exit(&ldcp->lock);
+			return (EINVAL);
+		}
+	}
+
+	ldcssp->channel_count--;
+
+	mutex_exit(&ldcssp->lock);
+
+	/* Free the map table for this channel */
+	if (ldcp->mtbl) {
+		/* unmap the table from the HV before freeing it */
+		(void) hv_ldc_set_map_table(ldcp->id, NULL, NULL);
+		contig_mem_free(ldcp->mtbl->table, ldcp->mtbl->size);
+		mutex_destroy(&ldcp->mtbl->lock);
+		kmem_free(ldcp->mtbl, sizeof (ldc_mtbl_t));
+	}
+
+	/* Destroy descriptor ring and memory handle list lock */
+	mutex_destroy(&ldcp->exp_dlist_lock);
+	mutex_destroy(&ldcp->imp_dlist_lock);
+	mutex_destroy(&ldcp->mlist_lock);
+
+	/* Free the stream buffer for STREAM_MODE */
+	if (ldcp->mode == LDC_MODE_STREAM && ldcp->stream_bufferp)
+		kmem_free(ldcp->stream_bufferp, ldcp->mtu);
+
+	/* Free the RX queue */
+	contig_mem_free((caddr_t)ldcp->rx_q_va,
+	    (ldcp->rx_q_entries << LDC_PACKET_SHIFT));
+	ldcp->tstate &= ~TS_RXQ_RDY;
+
+	/* Free the TX queue */
+	contig_mem_free((caddr_t)ldcp->tx_q_va,
+	    (ldcp->tx_q_entries << LDC_PACKET_SHIFT));
+	ldcp->tstate &= ~TS_TXQ_RDY;
+
+
+	mutex_exit(&ldcp->lock);
+
+	/* Destroy mutex */
+	mutex_destroy(&ldcp->lock);
+
+	/* free channel structure */
+	kmem_free(ldcp, sizeof (ldc_chan_t));
+
+	D1(id, "ldc_fini: (0x%llx) channel finalized\n", id);
+
+	return (0);
+}
+
+/*
+ * Open the LDC channel for use. It registers the TX/RX queues
+ * with the Hypervisor. It also specifies the interrupt number
+ * and target CPU for this channel
+ */
+int
+ldc_open(ldc_handle_t handle)
+{
+	ldc_chan_t 	*ldcp;
+	int 		rv;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_open: invalid channel handle\n");
+		return (EINVAL);
+	}
+
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate < TS_INIT) {
+		DWARN(ldcp->id,
+		    "ldc_open: (0x%llx) channel not initialized\n", ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EFAULT);
+	}
+	if (ldcp->tstate >= TS_OPEN) {
+		DWARN(ldcp->id,
+		    "ldc_open: (0x%llx) channel is already open\n", ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EFAULT);
+	}
+
+	/*
+	 * Unregister/Register the tx queue with the hypervisor
+	 */
+	rv = hv_ldc_tx_qconf(ldcp->id, NULL, NULL);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) channel tx queue unconf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	rv = hv_ldc_tx_qconf(ldcp->id, ldcp->tx_q_ra, ldcp->tx_q_entries);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) channel tx queue conf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	D2(ldcp->id, "ldc_open: (0x%llx) registered tx queue with LDC\n",
+	    ldcp->id);
+
+	/*
+	 * Unregister/Register the rx queue with the hypervisor
+	 */
+	rv = hv_ldc_rx_qconf(ldcp->id, NULL, NULL);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) channel rx queue unconf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	rv = hv_ldc_rx_qconf(ldcp->id, ldcp->rx_q_ra, ldcp->rx_q_entries);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) channel rx queue conf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	D2(ldcp->id, "ldc_open: (0x%llx) registered rx queue with LDC\n",
+	    ldcp->id);
+
+	ldcp->tstate |= TS_QCONF_RDY;
+
+	/* Register the channel with the channel nexus */
+	rv = i_ldc_register_channel(ldcp);
+	if (rv && rv != EAGAIN) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) channel register failed\n", ldcp->id);
+		(void) hv_ldc_tx_qconf(ldcp->id, NULL, NULL);
+		(void) hv_ldc_rx_qconf(ldcp->id, NULL, NULL);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	/* mark channel in OPEN state */
+	ldcp->status = LDC_OPEN;
+
+	/* Read channel state */
+	rv = hv_ldc_tx_get_state(ldcp->id,
+	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_open: (0x%lx) cannot read channel state\n",
+		    ldcp->id);
+		(void) i_ldc_unregister_channel(ldcp);
+		(void) hv_ldc_tx_qconf(ldcp->id, NULL, NULL);
+		(void) hv_ldc_rx_qconf(ldcp->id, NULL, NULL);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	/*
+	 * set the ACKd head to current head location for reliable &
+	 * streaming mode
+	 */
+	ldcp->tx_ackd_head = ldcp->tx_head;
+
+	/* mark channel ready if HV reports link is UP (peer alloc'd Rx queue) */
+	if (ldcp->link_state == LDC_CHANNEL_UP ||
+	    ldcp->link_state == LDC_CHANNEL_RESET) {
+		ldcp->tstate |= TS_LINK_READY;
+		ldcp->status = LDC_READY;
+	}
+
+	/*
+	 * if channel is being opened in RAW mode - no handshake is needed
+	 * switch the channel READY and UP state
+	 */
+	if (ldcp->mode == LDC_MODE_RAW) {
+		ldcp->tstate = TS_UP;	/* set bits associated with LDC UP */
+		ldcp->status = LDC_UP;
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	/*
+	 * Increment number of open channels
+	 */
+	mutex_enter(&ldcssp->lock);
+	ldcssp->channels_open++;
+	mutex_exit(&ldcssp->lock);
+
+	D1(ldcp->id,
+	    "ldc_open: (0x%llx) channel (0x%p) open for use (tstate=0x%x)\n",
+	    ldcp->id, ldcp, ldcp->tstate);
+
+	return (0);
+}
+
+/*
+ * Close the LDC connection. It will return EBUSY if there
+ * are memory segments or descriptor rings either bound to or
+ * mapped over the channel
+ * Waits briefly (a single 1ms retry) for pending transmits to
+ * drain before tearing down the queues.
+ */
+int
+ldc_close(ldc_handle_t handle)
+{
+	ldc_chan_t 	*ldcp;
+	int		rv = 0;
+	boolean_t	chk_done = B_FALSE;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_close: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	/* return error if channel is not open */
+	if (ldcp->tstate < TS_OPEN) {
+		DWARN(ldcp->id,
+		    "ldc_close: (0x%llx) channel is not open\n", ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EFAULT);
+	}
+
+	/* if any memory handles, drings, are bound or mapped cannot close */
+	if (ldcp->mhdl_list != NULL) {
+		DWARN(ldcp->id,
+		    "ldc_close: (0x%llx) channel has bound memory handles\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EBUSY);
+	}
+	if (ldcp->exp_dring_list != NULL) {
+		DWARN(ldcp->id,
+		    "ldc_close: (0x%llx) channel has bound descriptor rings\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EBUSY);
+	}
+	if (ldcp->imp_dring_list != NULL) {
+		DWARN(ldcp->id,
+		    "ldc_close: (0x%llx) channel has mapped descriptor rings\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EBUSY);
+	}
+
+	/*
+	 * Wait for pending transmits to complete i.e Tx queue to drain
+	 * if there are pending pkts - wait 1 ms and retry again
+	 */
+	for (;;) {
+
+		rv = hv_ldc_tx_get_state(ldcp->id,
+		    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+		if (rv) {
+			cmn_err(CE_WARN,
+			    "ldc_close: (0x%lx) cannot read qptrs\n", ldcp->id);
+			mutex_exit(&ldcp->lock);
+			return (EIO);
+		}
+
+		if (ldcp->tx_head == ldcp->tx_tail ||
+		    ldcp->link_state != LDC_CHANNEL_UP) {
+			break;
+		}
+
+		/* chk_done limits the wait to a single retry */
+		if (chk_done) {
+			DWARN(ldcp->id,
+			    "ldc_close: (0x%llx) Tx queue drain timeout\n",
+			    ldcp->id);
+			break;
+		}
+
+		/* wait for one ms and try again */
+		delay(drv_usectohz(1000));
+		chk_done = B_TRUE;
+	}
+
+	/*
+	 * Unregister the channel with the nexus
+	 */
+	rv = i_ldc_unregister_channel(ldcp);
+	if (rv && rv != EAGAIN) {
+		cmn_err(CE_WARN,
+		    "ldc_close: (0x%lx) channel unregister failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (rv);
+	}
+
+	/*
+	 * Unregister queues
+	 */
+	rv = hv_ldc_tx_qconf(ldcp->id, NULL, NULL);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_close: (0x%lx) channel TX queue unconf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+	rv = hv_ldc_rx_qconf(ldcp->id, NULL, NULL);
+	if (rv) {
+		cmn_err(CE_WARN,
+		    "ldc_close: (0x%lx) channel RX queue unconf failed\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+
+	ldcp->tstate &= ~TS_QCONF_RDY;
+
+	/* Reset channel state information */
+	i_ldc_reset_state(ldcp);
+
+	/* Mark channel as down and in initialized state */
+	ldcp->tx_ackd_head = 0;
+	ldcp->tx_head = 0;
+	ldcp->tstate = TS_INIT;
+	ldcp->status = LDC_INIT;
+
+	mutex_exit(&ldcp->lock);
+
+	/* Decrement number of open channels */
+	mutex_enter(&ldcssp->lock);
+	ldcssp->channels_open--;
+	mutex_exit(&ldcssp->lock);
+
+	D1(ldcp->id, "ldc_close: (0x%llx) channel closed\n", ldcp->id);
+
+	return (0);
+}
+
+/*
+ * Register channel callback
+ * Fails with EIO if a callback is already registered and with
+ * EWOULDBLOCK if a callback is currently executing.
+ */
+int
+ldc_reg_callback(ldc_handle_t handle,
+    uint_t(*cb)(uint64_t event, caddr_t arg), caddr_t arg)
+{
+	ldc_chan_t *ldcp;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_reg_callback: invalid channel handle\n");
+		return (EINVAL);
+	}
+	/* callback must be a kernel-space address */
+	if (((uint64_t)cb) < KERNELBASE) {
+		DWARN(DBG_ALL_LDCS, "ldc_reg_callback: invalid callback\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->cb) {
+		DWARN(ldcp->id, "ldc_reg_callback: (0x%llx) callback exists\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+	if (ldcp->cb_inprogress) {
+		DWARN(ldcp->id, "ldc_reg_callback: (0x%llx) callback active\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EWOULDBLOCK);
+	}
+
+	ldcp->cb = cb;
+	ldcp->cb_arg = arg;
+	ldcp->cb_enabled = B_TRUE;
+
+	D1(ldcp->id,
+	    "ldc_reg_callback: (0x%llx) registered callback for channel\n",
+	    ldcp->id);
+
+	mutex_exit(&ldcp->lock);
+
+	return (0);
+}
+
+/*
+ * Unregister channel callback
+ * Fails with EIO if no callback is registered and with
+ * EWOULDBLOCK if the callback is currently executing.
+ */
+int
+ldc_unreg_callback(ldc_handle_t handle)
+{
+	ldc_chan_t *ldcp;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_unreg_callback: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->cb == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_unreg_callback: (0x%llx) no callback exists\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+	if (ldcp->cb_inprogress) {
+		DWARN(ldcp->id,
+		    "ldc_unreg_callback: (0x%llx) callback active\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EWOULDBLOCK);
+	}
+
+	ldcp->cb = NULL;
+	ldcp->cb_arg = NULL;
+	ldcp->cb_enabled = B_FALSE;
+
+	D1(ldcp->id,
+	    "ldc_unreg_callback: (0x%llx) unregistered callback for channel\n",
+	    ldcp->id);
+
+	mutex_exit(&ldcp->lock);
+
+	return (0);
+}
+
+
+/*
+ * Bring a channel up by initiating a handshake with the peer
+ * This call is asynchronous. It will complete at a later point
+ * in time when the peer responds back with an RTR.
+ * Returns 0 immediately if the channel is already UP, is RAW mode
+ * and READY, or a handshake is already in progress.
+ */
+int
+ldc_up(ldc_handle_t handle)
+{
+	int 		rv;
+	ldc_chan_t 	*ldcp;
+	ldc_msg_t 	*ldcmsg;
+	uint64_t 	tx_tail;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_up: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate == TS_UP) {
+		D2(ldcp->id,
+		    "ldc_up: (0x%llx) channel is already in UP state\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (0);
+	}
+
+	/* if the channel is in RAW mode - mark it as UP, if READY */
+	if (ldcp->mode == LDC_MODE_RAW && ldcp->tstate >= TS_READY) {
+		ldcp->tstate = TS_UP;
+		mutex_exit(&ldcp->lock);
+		return (0);
+	}
+
+	/* Don't start another handshake if there is one in progress */
+	if (ldcp->hstate) {
+		D2(ldcp->id,
+		    "ldc_up: (0x%llx) channel handshake in progress\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (0);
+	}
+
+	/* get the current tail for the LDC msg */
+	rv = i_ldc_get_tx_tail(ldcp, &tx_tail);
+	if (rv) {
+		DWARN(ldcp->id, "ldc_up: (0x%llx) cannot initiate handshake\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (ECONNREFUSED);
+	}
+
+	/* build the initial VER (version negotiation) packet */
+	ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+	ZERO_PKT(ldcmsg);
+
+	ldcmsg->type = LDC_CTRL;
+	ldcmsg->stype = LDC_INFO;
+	ldcmsg->ctrl = LDC_VER;
+	ldcp->next_vidx = 0;
+	bcopy(&ldc_versions[0], ldcmsg->udata, sizeof (ldc_versions[0]));
+
+	DUMP_LDC_PKT(ldcp, "ldc_up snd ver", (uint64_t)ldcmsg);
+
+	/* initiate the send by calling into HV and set the new tail */
+	tx_tail = (tx_tail + LDC_PACKET_SIZE) %
+		(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+	if (rv) {
+		DWARN(ldcp->id,
+		    "ldc_up: (0x%llx) cannot initiate handshake rv=%d\n",
+		    ldcp->id, rv);
+		mutex_exit(&ldcp->lock);
+		return (rv);
+	}
+
+	ldcp->tx_tail = tx_tail;
+	D1(ldcp->id, "ldc_up: (0x%llx) channel up initiated\n", ldcp->id);
+
+	mutex_exit(&ldcp->lock);
+
+	return (rv);
+}
+
+
+/*
+ * Reset a channel by re-registering the Rx queues
+ * Always returns 0 for a valid handle.
+ */
+int
+ldc_reset(ldc_handle_t handle)
+{
+	ldc_chan_t 	*ldcp;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_reset: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+	i_ldc_reset(ldcp);
+	mutex_exit(&ldcp->lock);
+
+	return (0);
+}
+
+/*
+ * Get the current channel status
+ * Returns the last recorded status of the channel; no HV call
+ * is made and the channel lock is not taken.
+ */
+int
+ldc_status(ldc_handle_t handle, ldc_status_t *status)
+{
+	ldc_chan_t *ldcp;
+
+	if (handle == NULL || status == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_status: invalid argument\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	*status = ((ldc_chan_t *)handle)->status;
+
+	D1(ldcp->id,
+	    "ldc_status: (0x%llx) returned status %d\n", ldcp->id, *status);
+	return (0);
+}
+
+
+/*
+ * Set the channel's callback mode - enable/disable callbacks
+ *
+ * Returns 0 on success (including redundant enable/disable requests,
+ * which are only logged), EINVAL for a NULL handle.
+ */
+int
+ldc_set_cb_mode(ldc_handle_t handle, ldc_cb_mode_t cmode)
+{
+	ldc_chan_t 	*ldcp;
+
+	if (handle == NULL) {
+		/*
+		 * Fixed: message previously said "ldc_set_intr_mode",
+		 * which does not match this function's name.
+		 */
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_set_cb_mode: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	/*
+	 * Record whether callbacks should be invoked; the flag is
+	 * updated under the channel lock.
+	 */
+	mutex_enter(&ldcp->lock);
+
+	switch (cmode) {
+	case LDC_CB_DISABLE:
+		if (!ldcp->cb_enabled) {
+			/* already disabled - nothing to do */
+			DWARN(ldcp->id,
+			    "ldc_set_cb_mode: (0x%llx) callbacks disabled\n",
+			    ldcp->id);
+			break;
+		}
+		ldcp->cb_enabled = B_FALSE;
+
+		D1(ldcp->id, "ldc_set_cb_mode: (0x%llx) disabled callbacks\n",
+		    ldcp->id);
+		break;
+
+	case LDC_CB_ENABLE:
+		if (ldcp->cb_enabled) {
+			/* already enabled - nothing to do */
+			DWARN(ldcp->id,
+			    "ldc_set_cb_mode: (0x%llx) callbacks enabled\n",
+			    ldcp->id);
+			break;
+		}
+		ldcp->cb_enabled = B_TRUE;
+
+		D1(ldcp->id, "ldc_set_cb_mode: (0x%llx) enabled callbacks\n",
+		    ldcp->id);
+		break;
+
+	default:
+		/*
+		 * Unknown mode: leave the callback state untouched but
+		 * diagnose it (previously silently ignored).
+		 */
+		DWARN(ldcp->id,
+		    "ldc_set_cb_mode: (0x%llx) invalid callback mode %d\n",
+		    ldcp->id, cmode);
+		break;
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	return (0);
+}
+
+/*
+ * Check to see if there are packets on the incoming queue
+ * Will return isempty = B_FALSE if there are packets
+ *
+ * Returns EINVAL for bad arguments, ECONNRESET if the channel is
+ * not up or went down, EIO if the HV queue state cannot be read.
+ */
+int
+ldc_chkq(ldc_handle_t handle, boolean_t *isempty)
+{
+	int 		rv;
+	uint64_t 	rx_head, rx_tail;
+	ldc_chan_t 	*ldcp;
+
+	/*
+	 * Fixed: validate the output pointer as well as the handle
+	 * (as ldc_status does) before *isempty is dereferenced below.
+	 */
+	if (handle == NULL || isempty == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_chkq: invalid argument\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	/* default to "empty" until proven otherwise */
+	*isempty = B_TRUE;
+
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate != TS_UP) {
+		D1(ldcp->id,
+		    "ldc_chkq: (0x%llx) channel is not up\n", ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (ECONNRESET);
+	}
+
+	/* fetch the current Rx queue pointers from the hypervisor */
+	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
+	    &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "ldc_chkq: (0x%lx) unable to read queue ptrs", ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EIO);
+	}
+	/* reset the channel state if the channel went down */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN ||
+	    ldcp->link_state == LDC_CHANNEL_RESET) {
+		i_ldc_reset(ldcp);
+		mutex_exit(&ldcp->lock);
+		return (ECONNRESET);
+	}
+
+	/* head != tail means at least one packet is queued */
+	if (rx_head != rx_tail) {
+		D1(ldcp->id, "ldc_chkq: (0x%llx) queue has pkt(s)\n", ldcp->id);
+		*isempty = B_FALSE;
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	return (0);
+}
+
+
+/*
+ * Read 'size' amount of bytes or less. If incoming buffer
+ * is more than 'size', ENOBUFS is returned.
+ *
+ * On return, size contains the number of bytes read.
+ * Returns EINVAL for a NULL handle and ECONNRESET if the
+ * channel is not in the UP state; otherwise the return value
+ * comes from the channel's mode-specific read routine.
+ */
+int
+ldc_read(ldc_handle_t handle, caddr_t bufp, size_t *sizep)
+{
+	ldc_chan_t 	*ldcp;
+	uint64_t 	rx_head = 0, rx_tail = 0;
+	int		rv = 0, exit_val;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_read: invalid channel handle\n");
+		return (EINVAL);
+	}
+
+	ldcp = (ldc_chan_t *)handle;
+
+	/* channel lock */
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate != TS_UP) {
+		DWARN(ldcp->id,
+		    "ldc_read: (0x%llx) channel is not in UP state\n",
+		    ldcp->id);
+		exit_val = ECONNRESET;
+	} else {
+		/*
+		 * dispatch to the per-mode read routine (read_p is a
+		 * function pointer; presumably set at channel init to
+		 * one of i_ldc_read_raw/packet/stream - confirm there)
+		 */
+		exit_val = ldcp->read_p(ldcp, bufp, sizep);
+	}
+
+	/*
+	 * if queue has been drained - clear interrupt.
+	 * Note the HV state is re-read unconditionally (even after a
+	 * failed read) but the interrupt is only cleared when the read
+	 * succeeded AND the Rx queue is now empty (head == tail).
+	 */
+	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
+	    &ldcp->link_state);
+	if (exit_val == 0 && rv == 0 && rx_head == rx_tail) {
+		i_ldc_clear_intr(ldcp, CNEX_RX_INTR);
+	}
+
+	mutex_exit(&ldcp->lock);
+	return (exit_val);
+}
+
+/*
+ * Basic raw mondo read -
+ * no interpretation of mondo contents at all.
+ *
+ * Copies exactly one LDC_PAYLOAD_SIZE_RAW-byte packet into
+ * target_bufp; *sizep must be at least that large on entry and is
+ * set to LDC_PAYLOAD_SIZE_RAW (or 0 if the queue was empty) on exit.
+ *
+ * Enter and exit with ldcp->lock held by caller
+ */
+static int
+i_ldc_read_raw(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
+{
+	uint64_t 	q_size_mask;
+	ldc_msg_t 	*msgp;
+	uint8_t		*msgbufp;
+	int		rv = 0, space;
+	uint64_t 	rx_head, rx_tail;
+
+	space = *sizep;
+
+	/* caller's buffer must hold a full raw payload */
+	if (space < LDC_PAYLOAD_SIZE_RAW)
+		return (ENOBUFS);
+
+	ASSERT(mutex_owned(&ldcp->lock));
+
+	/* compute mask for increment (queue size is a power of two) */
+	q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT;
+
+	/*
+	 * Read packet(s) from the queue
+	 */
+	rv = hv_ldc_rx_get_state(ldcp->id, &rx_head, &rx_tail,
+	    &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "ldc_read_raw: (0x%lx) unable to read queue ptrs",
+		    ldcp->id);
+		return (EIO);
+	}
+	D1(ldcp->id, "ldc_read_raw: (0x%llx) rxh=0x%llx,"
+		" rxt=0x%llx, st=0x%llx\n",
+		ldcp->id, rx_head, rx_tail, ldcp->link_state);
+
+	/* reset the channel state if the channel went down */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	/*
+	 * Check for empty queue
+	 */
+	if (rx_head == rx_tail) {
+		*sizep = 0;
+		return (0);
+	}
+
+	/* get the message at the current head */
+	msgp = (ldc_msg_t *)(ldcp->rx_q_va + rx_head);
+
+	/* if channel is in RAW mode, copy data and return */
+	msgbufp = (uint8_t *)&(msgp->raw[0]);
+
+	bcopy(msgbufp, target_bufp, LDC_PAYLOAD_SIZE_RAW);
+
+	DUMP_PAYLOAD(ldcp->id, msgbufp);
+
+	*sizep = LDC_PAYLOAD_SIZE_RAW;
+
+	/* consume the packet: advance head (modulo queue size) in the HV */
+	rx_head = (rx_head + LDC_PACKET_SIZE) & q_size_mask;
+	(void) i_ldc_set_rx_head(ldcp, rx_head);
+
+	return (rv);
+}
+
+/*
+ * Process LDC mondos to build larger packets
+ * with either un-reliable or reliable delivery.
+ *
+ * Reassembles a (possibly fragmented) message from the Rx queue into
+ * target_bufp.  On entry *sizep is the buffer capacity; on exit it is
+ * the number of bytes assembled (0 if the queue was empty or a NACK
+ * was sent).  Returns 0, EIO (HV state read failed), ECONNRESET
+ * (link down or fatal ctrl message), or ETIMEDOUT (spin limit hit
+ * mid-fragment).
+ *
+ * Enter and exit with ldcp->lock held by caller
+ */
+static int
+i_ldc_read_packet(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
+{
+	int		rv = 0;
+	uint64_t 	rx_head = 0, rx_tail = 0;
+	uint64_t 	curr_head = 0;
+	ldc_msg_t 	*msg;
+	caddr_t 	target;
+	size_t 		len = 0, bytes_read = 0;
+	int 		loop_cnt = 0, chk_cnt = 0;
+	uint64_t 	q_size_mask;
+
+	target = target_bufp;
+
+	ASSERT(mutex_owned(&ldcp->lock));
+
+	/* reset first frag to 0 */
+	ldcp->first_fragment = 0;
+
+	/* compute mask for increment (queue size is a power of two) */
+	q_size_mask = (ldcp->rx_q_entries-1)<<LDC_PACKET_SHIFT;
+
+	/*
+	 * Read packet(s) from the queue
+	 */
+	rv = hv_ldc_rx_get_state(ldcp->id, &curr_head, &rx_tail,
+	    &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "ldc_read: (0x%lx) unable to read queue ptrs",
+		    ldcp->id);
+		return (EIO);
+	}
+	D1(ldcp->id, "ldc_read: (0x%llx) chd=0x%llx, tl=0x%llx, st=0x%llx\n",
+	    ldcp->id, curr_head, rx_tail, ldcp->link_state);
+
+	/* reset the channel state if the channel went down */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	/*
+	 * Consume packets until a FRAG_STOP completes the message, the
+	 * queue drains, or an error/NACK aborts the read.
+	 */
+	for (;;) {
+
+		/* caught up to the tail - re-poll the HV for new packets */
+		if (curr_head == rx_tail) {
+			rv = hv_ldc_rx_get_state(ldcp->id,
+			    &rx_head, &rx_tail, &ldcp->link_state);
+			if (rv != 0) {
+				cmn_err(CE_WARN,
+				    "ldc_read: (0x%lx) cannot read queue ptrs",
+				    ldcp->id);
+				return (EIO);
+			}
+			/* reset the channel state if the channel went down */
+			if (ldcp->link_state == LDC_CHANNEL_DOWN) {
+				i_ldc_reset(ldcp);
+				return (ECONNRESET);
+			}
+		}
+
+		if (curr_head == rx_tail) {
+
+			/*
+			 * If in the middle of a fragmented xfer, spin for
+			 * the rest of the message.  loop_cnt/chk_cnt bound
+			 * the spin to LDC_LOOP_CNT * LDC_CHK_CNT iterations
+			 * before giving up with ETIMEDOUT; last_msg_rcd is
+			 * rewound to just before the first fragment so the
+			 * peer's message can be re-received.
+			 */
+			if (ldcp->first_fragment != 0) {
+				if (++loop_cnt > LDC_LOOP_CNT) {
+					loop_cnt = 0;
+					++chk_cnt;
+				}
+				if (chk_cnt < LDC_CHK_CNT) {
+					continue;
+				} else {
+					*sizep = 0;
+					ldcp->last_msg_rcd =
+						ldcp->first_fragment - 1;
+					DWARN(DBG_ALL_LDCS,
+					    "ldc_read: (0x%llx) read timeout",
+					    ldcp->id);
+					return (ETIMEDOUT);
+				}
+			}
+			/* queue empty and no partial message - done */
+			*sizep = 0;
+			break;
+		}
+		loop_cnt = 0;
+		chk_cnt = 0;
+
+		D2(ldcp->id,
+		    "ldc_read: (0x%llx) chd=0x%llx, rxhd=0x%llx, rxtl=0x%llx\n",
+		    ldcp->id, curr_head, rx_head, rx_tail);
+
+		/* get the message */
+		msg = (ldc_msg_t *)(ldcp->rx_q_va + curr_head);
+
+		DUMP_LDC_PKT(ldcp, "ldc_read received pkt",
+		    ldcp->rx_q_va + curr_head);
+
+		/* Check the message ID for the message received */
+		if ((rv = i_ldc_check_seqid(ldcp, msg)) != 0) {
+
+			DWARN(ldcp->id, "ldc_read: (0x%llx) seqid error, "
+			    "q_ptrs=0x%lx,0x%lx", ldcp->id, rx_head, rx_tail);
+
+			/* Reset last_msg_rcd to start of message */
+			if (ldcp->first_fragment != 0) {
+				ldcp->last_msg_rcd =
+					ldcp->first_fragment - 1;
+				ldcp->first_fragment = 0;
+			}
+			/*
+			 * Send a NACK -- invalid seqid
+			 * get the current tail for the response
+			 */
+			rv = i_ldc_send_pkt(ldcp, msg->type, LDC_NACK,
+			    (msg->ctrl & LDC_CTRL_MASK));
+			if (rv) {
+				cmn_err(CE_NOTE,
+				    "ldc_read: (0x%lx) err sending "
+				    "NACK msg\n", ldcp->id);
+			}
+
+			/* purge receive queue */
+			(void) i_ldc_set_rx_head(ldcp, rx_tail);
+
+			break;
+		}
+
+		/*
+		 * Process any messages of type CTRL messages
+		 * Future implementations should try to pass these to
+		 * LDC transport by resetting the intr state.
+		 *
+		 * NOTE: not done as a switch() as type can be both ctrl+data
+		 */
+		if (msg->type & LDC_CTRL) {
+			if (rv = i_ldc_ctrlmsg(ldcp, msg)) {
+				if (rv == EAGAIN)
+					continue;
+				/* fatal ctrl msg: purge queue, abort read */
+				(void) i_ldc_set_rx_head(ldcp, rx_tail);
+				*sizep = 0;
+				bytes_read = 0;
+				rv = ECONNRESET;
+				break;
+			}
+		}
+
+		/* process data ACKs */
+		if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
+			(void) i_ldc_process_data_ACK(ldcp, msg);
+		}
+
+		/* process data messages */
+		if ((msg->type & LDC_DATA) && (msg->stype & LDC_INFO)) {
+
+			/* reliable/stream modes carry payload in rdata */
+			uint8_t *msgbuf = (uint8_t *)(
+				(ldcp->mode == LDC_MODE_RELIABLE ||
+				ldcp->mode == LDC_MODE_STREAM)
+				? msg->rdata : msg->udata);
+
+			D2(ldcp->id,
+			    "ldc_read: (0x%llx) received data msg\n", ldcp->id);
+
+			/* get the packet length */
+			len = (msg->env & LDC_LEN_MASK);
+
+			/*
+			 * FUTURE OPTIMIZATION:
+			 * don't need to set q head for every
+			 * packet we read just need to do this when
+			 * we are done or need to wait for more
+			 * mondos to make a full packet - this is
+			 * currently expensive.
+			 */
+
+			if (ldcp->first_fragment == 0) {
+
+				/*
+				 * first packets should always have the start
+				 * bit set (even for a single packet). If not
+				 * throw away the packet
+				 */
+				if (!(msg->env & LDC_FRAG_START)) {
+
+					DWARN(DBG_ALL_LDCS,
+					    "ldc_read: (0x%llx) not start - "
+					    "frag=%x\n", ldcp->id,
+					    (msg->env) & LDC_FRAG_MASK);
+
+					/* toss pkt, inc head, cont reading */
+					bytes_read = 0;
+					target = target_bufp;
+					curr_head =
+						(curr_head + LDC_PACKET_SIZE)
+						& q_size_mask;
+					if (rv = i_ldc_set_rx_head(ldcp,
+						curr_head))
+						break;
+
+					continue;
+				}
+
+				/* remember seqid of the first fragment */
+				ldcp->first_fragment = msg->seqid;
+			} else {
+				/* check to see if this is a pkt w/ START bit */
+				if (msg->env & LDC_FRAG_START) {
+					DWARN(DBG_ALL_LDCS,
+					    "ldc_read:(0x%llx) unexpected pkt"
+					    " env=0x%x discarding %d bytes,"
+					    " lastmsg=%d, currentmsg=%d\n",
+					    ldcp->id, msg->env&LDC_FRAG_MASK,
+					    bytes_read, ldcp->last_msg_rcd,
+					    msg->seqid);
+
+					/* throw data we have read so far */
+					bytes_read = 0;
+					target = target_bufp;
+					ldcp->first_fragment = msg->seqid;
+
+					if (rv = i_ldc_set_rx_head(ldcp,
+						curr_head))
+						break;
+				}
+			}
+
+			/* copy (next) pkt into buffer */
+			if (len <= (*sizep - bytes_read)) {
+				bcopy(msgbuf, target, len);
+				target += len;
+				bytes_read += len;
+			} else {
+				/*
+				 * there is not enough space in the buffer to
+				 * read this pkt. throw message away & continue
+				 * reading data from queue
+				 */
+				DWARN(DBG_ALL_LDCS,
+				    "ldc_read: (0x%llx) buffer too small, "
+				    "head=0x%lx, expect=%d, got=%d\n", ldcp->id,
+				    curr_head, *sizep, bytes_read+len);
+
+				ldcp->first_fragment = 0;
+				target = target_bufp;
+				bytes_read = 0;
+
+				/* throw away everything received so far */
+				if (rv = i_ldc_set_rx_head(ldcp, curr_head))
+					break;
+
+				/* continue reading remaining pkts */
+				continue;
+			}
+		}
+
+		/* set the message id */
+		ldcp->last_msg_rcd = msg->seqid;
+
+		/* move the head one position */
+		curr_head = (curr_head + LDC_PACKET_SIZE) & q_size_mask;
+
+		if (msg->env & LDC_FRAG_STOP) {
+
+			/*
+			 * All pkts that are part of this fragmented transfer
+			 * have been read or this was a single pkt read
+			 * or there was an error
+			 */
+
+			/* set the queue head */
+			if (rv = i_ldc_set_rx_head(ldcp, curr_head))
+				bytes_read = 0;
+
+			*sizep = bytes_read;
+
+			break;
+		}
+
+		/* advance head if it is a DATA ACK */
+		if ((msg->type & LDC_DATA) && (msg->stype & LDC_ACK)) {
+
+			/* set the queue head */
+			if (rv = i_ldc_set_rx_head(ldcp, curr_head)) {
+				bytes_read = 0;
+				break;
+			}
+
+			D2(ldcp->id, "ldc_read: (0x%llx) set ACK qhead 0x%llx",
+			    ldcp->id, curr_head);
+		}
+
+	} /* for (;;) */
+
+
+	/*
+	 * If useful data was read - Send msg ACK
+	 * OPTIMIZE: do not send ACK for all msgs - use some frequency
+	 */
+	if ((bytes_read > 0) && (ldcp->mode == LDC_MODE_RELIABLE ||
+		ldcp->mode == LDC_MODE_STREAM)) {
+
+		rv = i_ldc_send_pkt(ldcp, LDC_DATA, LDC_ACK, 0);
+		if (rv != 0) {
+			/* NOTE(review): ACK-send failure is reported as
+			 * success (return 0) - the data was delivered;
+			 * confirm this is intentional */
+			cmn_err(CE_NOTE,
+			    "ldc_read: (0x%lx) cannot send ACK\n", ldcp->id);
+			return (0);
+		}
+	}
+
+	D2(ldcp->id, "ldc_read: (0x%llx) end size=%d", ldcp->id, *sizep);
+
+	return (rv);
+}
+
+/*
+ * Use underlying reliable packet mechanism to fetch
+ * and buffer incoming packets so we can hand them back as
+ * a basic byte stream.
+ *
+ * Enter and exit with ldcp->lock held by caller
+ */
+static int
+i_ldc_read_stream(ldc_chan_t *ldcp, caddr_t target_bufp, size_t *sizep)
+{
+	int	rv;
+	size_t	nbytes;
+
+	ASSERT(mutex_owned(&ldcp->lock));
+
+	D2(ldcp->id, "i_ldc_read_stream: (0x%llx) buffer size=%d",
+		ldcp->id, *sizep);
+
+	/* refill the stream buffer once it has been fully consumed */
+	if (ldcp->stream_remains == 0) {
+		nbytes = ldcp->mtu;
+		rv = i_ldc_read_packet(ldcp,
+			(caddr_t)ldcp->stream_bufferp, &nbytes);
+		D2(ldcp->id, "i_ldc_read_stream: read packet (0x%llx) size=%d",
+			ldcp->id, nbytes);
+
+		if (rv != 0)
+			return (rv);
+
+		ldcp->stream_remains = nbytes;
+		ldcp->stream_offset = 0;
+	}
+
+	/* hand back no more than the caller asked for */
+	nbytes = MIN(ldcp->stream_remains, *sizep);
+
+	bcopy(ldcp->stream_bufferp + ldcp->stream_offset, target_bufp, nbytes);
+	ldcp->stream_offset += nbytes;
+	ldcp->stream_remains -= nbytes;
+
+	D2(ldcp->id, "i_ldc_read_stream: (0x%llx) fill from buffer size=%d",
+		ldcp->id, nbytes);
+
+	*sizep = nbytes;
+	return (0);
+}
+
+/*
+ * Write specified amount of bytes to the channel
+ * in multiple pkts of pkt_payload size. Each
+ * packet is tagged with an unique packet ID in
+ * the case of a reliable transport.
+ *
+ * On return, size contains the number of bytes written.
+ */
+int
+ldc_write(ldc_handle_t handle, caddr_t buf, size_t *sizep)
+{
+	ldc_chan_t	*ldcp;
+	int		rv = 0;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_write: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	/* validate the request, then hand it to the per-mode writer */
+	if (buf == NULL || sizep == NULL) {
+		DWARN(ldcp->id, "ldc_write: (0x%llx) invalid data write\n",
+		    ldcp->id);
+		rv = EINVAL;
+	} else if (*sizep == 0) {
+		/* zero-length writes succeed trivially */
+		DWARN(ldcp->id, "ldc_write: (0x%llx) write size of zero\n",
+		    ldcp->id);
+		rv = 0;
+	} else if (ldcp->tstate != TS_UP) {
+		/* data exchange is only allowed on an UP channel */
+		DWARN(ldcp->id,
+		    "ldc_write: (0x%llx) channel is not in UP state\n",
+		    ldcp->id);
+		*sizep = 0;
+		rv = ECONNRESET;
+	} else {
+		rv = ldcp->write_p(ldcp, buf, sizep);
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	return (rv);
+}
+
+/*
+ * Write a raw packet to the channel
+ * On return, size contains the number of bytes written.
+ *
+ * The whole buffer is sent as a single LDC packet; it must fit in
+ * one raw payload (pkt_payload), else EMSGSIZE.  On any failure
+ * *sizep is set to 0.
+ */
+static int
+i_ldc_write_raw(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
+{
+	ldc_msg_t 	*ldcmsg;
+	uint64_t 	tx_head, tx_tail, new_tail;
+	int		rv = 0;
+	size_t		size;
+
+	ASSERT(mutex_owned(&ldcp->lock));
+	ASSERT(ldcp->mode == LDC_MODE_RAW);
+
+	size = *sizep;
+
+	/*
+	 * Check to see if the packet size is less than or
+	 * equal to packet size support in raw mode
+	 */
+	if (size > ldcp->pkt_payload) {
+		DWARN(ldcp->id,
+		    "ldc_write: (0x%llx) invalid size (0x%llx) for RAW mode\n",
+		    ldcp->id, *sizep);
+		*sizep = 0;
+		return (EMSGSIZE);
+	}
+
+	/* get the qptrs for the tx queue */
+	rv = hv_ldc_tx_get_state(ldcp->id,
+	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "ldc_write: (0x%lx) cannot read queue ptrs\n", ldcp->id);
+		*sizep = 0;
+		return (EIO);
+	}
+
+	/* reset the channel if the link went down or was reset */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN ||
+	    ldcp->link_state == LDC_CHANNEL_RESET) {
+		DWARN(ldcp->id,
+		    "ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
+		i_ldc_reset(ldcp);
+		*sizep = 0;
+		return (ECONNRESET);
+	}
+
+	tx_tail = ldcp->tx_tail;
+	tx_head = ldcp->tx_head;
+	/* advance tail by one packet, modulo the (power-of-two) queue size */
+	new_tail = (tx_tail + LDC_PACKET_SIZE) &
+		((ldcp->tx_q_entries-1) << LDC_PACKET_SHIFT);
+
+	/* queue full: advancing the tail would collide with the head */
+	if (new_tail == tx_head) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_write: (0x%llx) TX queue is full\n", ldcp->id);
+		*sizep = 0;
+		return (EWOULDBLOCK);
+	}
+
+	D2(ldcp->id, "ldc_write: (0x%llx) start xfer size=%d",
+	    ldcp->id, size);
+
+	/* Send the data now */
+	ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+
+	/* copy the data into pkt (raw mode: no header fields are set) */
+	bcopy((uint8_t *)buf, ldcmsg, size);
+
+	/* increment tail */
+	tx_tail = new_tail;
+
+	/*
+	 * All packets have been copied into the TX queue
+	 * update the tail ptr in the HV
+	 */
+	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+	if (rv) {
+		if (rv == EWOULDBLOCK) {
+			DWARN(ldcp->id, "ldc_write: (0x%llx) write timed out\n",
+			    ldcp->id);
+			*sizep = 0;
+			return (EWOULDBLOCK);
+		}
+
+		/* cannot write data - reset channel */
+		i_ldc_reset(ldcp);
+		*sizep = 0;
+		return (ECONNRESET);
+	}
+
+	ldcp->tx_tail = tx_tail;
+	*sizep = size;
+
+	D2(ldcp->id, "ldc_write: (0x%llx) end xfer size=%d", ldcp->id, size);
+
+	return (rv);
+}
+
+
+/*
+ * Write specified amount of bytes to the channel
+ * in multiple pkts of pkt_payload size. Each
+ * packet is tagged with an unique packet ID in
+ * the case of a reliable transport.
+ *
+ * On return, size contains the number of bytes written.
+ * This function needs to ensure that the write size is < MTU size
+ */
+static int
+i_ldc_write_packet(ldc_chan_t *ldcp, caddr_t buf, size_t *size)
+{
+	ldc_msg_t 	*ldcmsg;
+	uint64_t 	tx_head, tx_tail, new_tail, start;
+	uint64_t	txq_size_mask, numavail;
+	uint8_t 	*msgbuf, *source = (uint8_t *)buf;
+	size_t 		len, bytes_written = 0, remaining;
+	int		rv;
+	uint32_t	curr_seqid;
+
+	ASSERT(mutex_owned(&ldcp->lock));
+
+	ASSERT(ldcp->mode == LDC_MODE_RELIABLE ||
+		ldcp->mode == LDC_MODE_UNRELIABLE ||
+		ldcp->mode == LDC_MODE_STREAM);
+
+	/* compute mask for increment (queue size is a power of two) */
+	txq_size_mask = (ldcp->tx_q_entries - 1) << LDC_PACKET_SHIFT;
+
+	/* get the qptrs for the tx queue */
+	rv = hv_ldc_tx_get_state(ldcp->id,
+	    &ldcp->tx_head, &ldcp->tx_tail, &ldcp->link_state);
+	if (rv != 0) {
+		cmn_err(CE_WARN,
+		    "ldc_write: (0x%lx) cannot read queue ptrs\n", ldcp->id);
+		*size = 0;
+		return (EIO);
+	}
+
+	/* reset the channel if the link went down or was reset */
+	if (ldcp->link_state == LDC_CHANNEL_DOWN ||
+	    ldcp->link_state == LDC_CHANNEL_RESET) {
+		DWARN(ldcp->id,
+		    "ldc_write: (0x%llx) channel down/reset\n", ldcp->id);
+		*size = 0;
+		i_ldc_reset(ldcp);
+		return (ECONNRESET);
+	}
+
+	tx_tail = ldcp->tx_tail;
+	new_tail = (tx_tail + LDC_PACKET_SIZE) %
+		(ldcp->tx_q_entries << LDC_PACKET_SHIFT);
+
+	/*
+	 * Transport mode determines whether we use HV Tx head or the
+	 * private protocol head (corresponding to last ACKd pkt) for
+	 * determining how much we can write
+	 */
+	tx_head = (ldcp->mode == LDC_MODE_RELIABLE ||
+		ldcp->mode == LDC_MODE_STREAM)
+		? ldcp->tx_ackd_head : ldcp->tx_head;
+	if (new_tail == tx_head) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_write: (0x%llx) TX queue is full\n", ldcp->id);
+		*size = 0;
+		return (EWOULDBLOCK);
+	}
+
+	/*
+	 * Make sure that the LDC Tx queue has enough space
+	 * (free slots between tail and head, modulo queue size)
+	 */
+	numavail = (tx_head >> LDC_PACKET_SHIFT) - (tx_tail >> LDC_PACKET_SHIFT)
+		+ ldcp->tx_q_entries - 1;
+	numavail %= ldcp->tx_q_entries;
+
+	/*
+	 * NOTE(review): unlike the other EWOULDBLOCK returns in this
+	 * function, *size is not zeroed here - confirm callers tolerate it
+	 */
+	if (*size > (numavail * ldcp->pkt_payload)) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_write: (0x%llx) TX queue has no space\n", ldcp->id);
+		return (EWOULDBLOCK);
+	}
+
+	D2(ldcp->id, "ldc_write: (0x%llx) start xfer size=%d",
+	    ldcp->id, *size);
+
+	/* Send the data now */
+	bytes_written = 0;
+	curr_seqid = ldcp->last_msg_snt;
+	start = tx_tail;
+
+	/* fill queue slots, one pkt_payload-sized chunk per packet */
+	while (*size > bytes_written) {
+
+		ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + tx_tail);
+
+		/* reliable/stream modes carry payload in rdata */
+		msgbuf = (uint8_t *)((ldcp->mode == LDC_MODE_RELIABLE ||
+			ldcp->mode == LDC_MODE_STREAM)
+			? ldcmsg->rdata : ldcmsg->udata);
+
+		ldcmsg->type = LDC_DATA;
+		ldcmsg->stype = LDC_INFO;
+		ldcmsg->ctrl = 0;
+
+		remaining = *size - bytes_written;
+		len = min(ldcp->pkt_payload, remaining);
+		ldcmsg->env = (uint8_t)len;
+
+		/* each packet carries the next sequence number */
+		curr_seqid++;
+		ldcmsg->seqid = curr_seqid;
+
+		DUMP_LDC_PKT(ldcp, "ldc_write snd data", (uint64_t)ldcmsg);
+
+		/* copy the data into pkt */
+		bcopy(source, msgbuf, len);
+
+		source += len;
+		bytes_written += len;
+
+		/* increment tail */
+		tx_tail = (tx_tail + LDC_PACKET_SIZE) & txq_size_mask;
+
+		ASSERT(tx_tail != tx_head);
+	}
+
+	/*
+	 * Set the start and stop bits: ldcmsg still points at the last
+	 * packet written (STOP), then is re-pointed at the first (START).
+	 */
+	ldcmsg->env |= LDC_FRAG_STOP;
+	ldcmsg = (ldc_msg_t *)(ldcp->tx_q_va + start);
+	ldcmsg->env |= LDC_FRAG_START;
+
+	/*
+	 * All packets have been copied into the TX queue
+	 * update the tail ptr in the HV
+	 */
+	rv = i_ldc_set_tx_tail(ldcp, tx_tail);
+	if (rv == 0) {
+		ldcp->tx_tail = tx_tail;
+		ldcp->last_msg_snt = curr_seqid;
+		*size = bytes_written;
+	} else {
+		int rv2;
+
+		if (rv != EWOULDBLOCK) {
+			/* cannot write data - reset channel */
+			i_ldc_reset(ldcp);
+			*size = 0;
+			return (ECONNRESET);
+		}
+
+		/* EWOULDBLOCK: dump queue state for diagnosis, report 0 sent */
+		DWARN(ldcp->id, "hv_tx_set_tail returns 0x%x (head 0x%x, "
+			"old tail 0x%x, new tail 0x%x, qsize=0x%x)\n",
+			rv, ldcp->tx_head, ldcp->tx_tail, tx_tail,
+			(ldcp->tx_q_entries << LDC_PACKET_SHIFT));
+
+		rv2 = hv_ldc_tx_get_state(ldcp->id,
+		    &tx_head, &tx_tail, &ldcp->link_state);
+
+		DWARN(ldcp->id, "hv_ldc_tx_get_state returns 0x%x "
+			"(head 0x%x, tail 0x%x state 0x%x)\n",
+			rv2, tx_head, tx_tail, ldcp->link_state);
+
+		*size = 0;
+	}
+
+	D2(ldcp->id, "ldc_write: (0x%llx) end xfer size=%d", ldcp->id, *size);
+
+	return (rv);
+}
+
+/*
+ * Stream-mode write: cap the request at one MTU, then let the
+ * packet writer fragment it into individual LDC packets.
+ *
+ * On return, size contains the number of bytes written.
+ */
+static int
+i_ldc_write_stream(ldc_chan_t *ldcp, caddr_t buf, size_t *sizep)
+{
+	ASSERT(mutex_owned(&ldcp->lock));
+	ASSERT(ldcp->mode == LDC_MODE_STREAM);
+
+	/* never push more than one MTU's worth in a single call */
+	if (*sizep > ldcp->mtu)
+		*sizep = ldcp->mtu;
+
+	return (i_ldc_write_packet(ldcp, buf, sizep));
+}
+
+
+/*
+ * Interfaces for channel nexus to register/unregister with LDC module
+ * The nexus will register functions to be used to register individual
+ * channels with the nexus and enable interrupts for the channels
+ */
+int
+ldc_register(ldc_cnex_t *cinfo)
+{
+	ldc_chan_t	*ldcp;
+
+	/* every nexus callback must be supplied */
+	if (cinfo == NULL || cinfo->dip == NULL ||
+	    cinfo->reg_chan == NULL || cinfo->unreg_chan == NULL ||
+	    cinfo->add_intr == NULL || cinfo->rem_intr == NULL ||
+	    cinfo->clr_intr == NULL) {
+
+		DWARN(DBG_ALL_LDCS, "ldc_register: invalid nexus info\n");
+		return (EINVAL);
+	}
+
+	mutex_enter(&ldcssp->lock);
+
+	/* record the nexus dip and callbacks */
+	ldcssp->cinfo.dip = cinfo->dip;
+	ldcssp->cinfo.reg_chan = cinfo->reg_chan;
+	ldcssp->cinfo.unreg_chan = cinfo->unreg_chan;
+	ldcssp->cinfo.add_intr = cinfo->add_intr;
+	ldcssp->cinfo.rem_intr = cinfo->rem_intr;
+	ldcssp->cinfo.clr_intr = cinfo->clr_intr;
+
+	/*
+	 * Register any channels that were initialized before the nexus
+	 * arrived: queue-configured (TS_QCONF_RDY) but not yet known to
+	 * the nexus (TS_CNEX_RDY clear).
+	 */
+	for (ldcp = ldcssp->chan_list; ldcp != NULL; ldcp = ldcp->next) {
+		if ((ldcp->tstate & TS_QCONF_RDY) &&
+		    (ldcp->tstate & TS_CNEX_RDY) == 0)
+			(void) i_ldc_register_channel(ldcp);
+	}
+
+	mutex_exit(&ldcssp->lock);
+
+	return (0);
+}
+
+int
+ldc_unregister(ldc_cnex_t *cinfo)
+{
+	if (cinfo == NULL || cinfo->dip == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_unregister: invalid nexus info\n");
+		return (EINVAL);
+	}
+
+	mutex_enter(&ldcssp->lock);
+
+	/* only the currently registered nexus may unregister */
+	if (ldcssp->cinfo.dip != cinfo->dip) {
+		DWARN(DBG_ALL_LDCS, "ldc_unregister: invalid dip\n");
+		mutex_exit(&ldcssp->lock);
+		return (EINVAL);
+	}
+
+	/* drop every recorded nexus callback along with the dip */
+	ldcssp->cinfo.clr_intr = NULL;
+	ldcssp->cinfo.rem_intr = NULL;
+	ldcssp->cinfo.add_intr = NULL;
+	ldcssp->cinfo.unreg_chan = NULL;
+	ldcssp->cinfo.reg_chan = NULL;
+	ldcssp->cinfo.dip = NULL;
+
+	mutex_exit(&ldcssp->lock);
+
+	return (0);
+}
+
+
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Allocate a memory handle for the channel and link it into the list
+ * Also choose which memory table to use if this is the first handle
+ * being assigned to this channel
+ *
+ * Returns EINVAL if the channel is not yet initialized, ENOMEM/EIO
+ * if the first-use map table cannot be allocated/registered, else 0
+ * with *mhandle set to the new (unbound) handle.
+ */
+int
+ldc_mem_alloc_handle(ldc_handle_t handle, ldc_mem_handle_t *mhandle)
+{
+	ldc_chan_t 	*ldcp;
+	ldc_mhdl_t	*mhdl;
+	int 		rv;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_alloc_handle: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	/* check to see if channel is initalized */
+	if (ldcp->tstate < TS_INIT) {
+		DWARN(ldcp->id,
+		    "ldc_mem_alloc_handle: (0x%llx) channel not initialized\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/*
+	 * If this channel is allocating a mem handle for the
+	 * first time allocate it a memory map table and initialize it
+	 */
+	if (ldcp->mtbl == NULL) {
+
+		ldc_mtbl_t *mtbl;
+
+		/* Allocate and initialize the map table structure */
+		mtbl = kmem_zalloc(sizeof (ldc_mtbl_t), KM_SLEEP);
+		mtbl->size = MTBL_MAX_SIZE;
+		mtbl->num_entries = mtbl->num_avail =
+			(MTBL_MAX_SIZE/sizeof (ldc_mte_slot_t));
+		mtbl->next_entry = NULL;
+
+		/*
+		 * Allocate the table itself - physically contiguous,
+		 * page aligned, as required for the HV map table
+		 */
+		mtbl->table = (ldc_mte_slot_t *)
+			contig_mem_alloc_align(mtbl->size, MMU_PAGESIZE);
+		if (mtbl->table == NULL) {
+			cmn_err(CE_WARN,
+			    "ldc_mem_alloc_handle: (0x%lx) error allocating "
+			    "table memory", ldcp->id);
+			/* undo the struct allocation before bailing */
+			kmem_free(mtbl, sizeof (ldc_mtbl_t));
+			mutex_exit(&ldcp->lock);
+			return (ENOMEM);
+		}
+
+		/* zero out the memory */
+		bzero(mtbl->table, mtbl->size);
+
+		/* initialize the lock */
+		mutex_init(&mtbl->lock, NULL, MUTEX_DRIVER, NULL);
+
+		/* register table for this channel */
+		rv = hv_ldc_set_map_table(ldcp->id,
+		    va_to_pa(mtbl->table), mtbl->num_entries);
+		if (rv != 0) {
+			cmn_err(CE_WARN,
+			    "ldc_mem_alloc_handle: (0x%lx) err %d mapping tbl",
+			    ldcp->id, rv);
+			/* unwind: free table, destroy lock, free struct */
+			contig_mem_free(mtbl->table, mtbl->size);
+			mutex_destroy(&mtbl->lock);
+			kmem_free(mtbl, sizeof (ldc_mtbl_t));
+			mutex_exit(&ldcp->lock);
+			return (EIO);
+		}
+
+		ldcp->mtbl = mtbl;
+
+		D1(ldcp->id,
+		    "ldc_mem_alloc_handle: (0x%llx) alloc'd map table 0x%llx\n",
+		    ldcp->id, ldcp->mtbl->table);
+	}
+
+	/* allocate handle for channel */
+	mhdl = kmem_zalloc(sizeof (ldc_mhdl_t), KM_SLEEP);
+
+	/* initialize the lock */
+	mutex_init(&mhdl->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* new handles start out unbound */
+	mhdl->status = LDC_UNBOUND;
+	mhdl->ldcp = ldcp;
+
+	/* insert memory handle (@ head) into list */
+	if (ldcp->mhdl_list == NULL) {
+		ldcp->mhdl_list = mhdl;
+		mhdl->next = NULL;
+	} else {
+		/* insert @ head */
+		mhdl->next = ldcp->mhdl_list;
+		ldcp->mhdl_list = mhdl;
+	}
+
+	/* return the handle */
+	*mhandle = (ldc_mem_handle_t)mhdl;
+
+	mutex_exit(&ldcp->lock);
+
+	D1(ldcp->id, "ldc_mem_alloc_handle: (0x%llx) allocated handle 0x%llx\n",
+	    ldcp->id, mhdl);
+
+	return (0);
+}
+
+/*
+ * Free memory handle for the channel and unlink it from the list
+ *
+ * Fails with EINVAL if the handle is NULL, still bound/mapped, or not
+ * found on the channel's handle list.
+ */
+int
+ldc_mem_free_handle(ldc_mem_handle_t mhandle)
+{
+	ldc_mhdl_t 	*mhdl, *phdl;
+	ldc_chan_t 	*ldcp;
+
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_free_handle: invalid memory handle\n");
+		return (EINVAL);
+	}
+	mhdl = (ldc_mhdl_t *)mhandle;
+
+	mutex_enter(&mhdl->lock);
+
+	ldcp = mhdl->ldcp;
+
+	/* a handle that is still bound or mapped cannot be freed */
+	if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED) {
+		DWARN(ldcp->id,
+		    "ldc_mem_free_handle: cannot free, 0x%llx hdl bound\n",
+		    mhdl);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+	/*
+	 * NOTE(review): the handle lock is dropped before taking
+	 * mlist_lock below - confirm no one can re-bind in between
+	 */
+	mutex_exit(&mhdl->lock);
+
+	mutex_enter(&ldcp->mlist_lock);
+
+	phdl = ldcp->mhdl_list;
+
+	/* first handle */
+	if (phdl == mhdl) {
+		ldcp->mhdl_list = mhdl->next;
+		mutex_destroy(&mhdl->lock);
+		kmem_free(mhdl, sizeof (ldc_mhdl_t));
+		D1(ldcp->id,
+		    "ldc_mem_free_handle: (0x%llx) freed handle 0x%llx\n",
+		    ldcp->id, mhdl);
+	} else {
+		/* walk the list - unlink and free */
+		while (phdl != NULL) {
+			if (phdl->next == mhdl) {
+				phdl->next = mhdl->next;
+				mutex_destroy(&mhdl->lock);
+				kmem_free(mhdl, sizeof (ldc_mhdl_t));
+				D1(ldcp->id,
+				    "ldc_mem_free_handle: (0x%llx) freed "
+				    "handle 0x%llx\n", ldcp->id, mhdl);
+				break;
+			}
+			phdl = phdl->next;
+		}
+	}
+
+	/* phdl is NULL only if the walk above ran off the end: not found */
+	if (phdl == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_free_handle: invalid handle 0x%llx\n", mhdl);
+		mutex_exit(&ldcp->mlist_lock);
+		return (EINVAL);
+	}
+
+	mutex_exit(&ldcp->mlist_lock);
+
+	return (0);
+}
+
+/*
+ * Bind a memory handle to a virtual address.
+ * The virtual address is converted to the corresponding real addresses.
+ * Returns pointer to the first ldc_mem_cookie and the total number
+ * of cookies for this virtual address. Other cookies can be obtained
+ * using the ldc_mem_nextcookie() call. If the pages are stored in
+ * consecutive locations in the table, a single cookie corresponding to
+ * the first location is returned. The cookie size spans all the entries.
+ *
+ * If the VA corresponds to a page that is already being exported, reuse
+ * the page and do not export it again. Bump the page's use count.
+ */
+int
+ldc_mem_bind_handle(ldc_mem_handle_t mhandle, caddr_t vaddr, size_t len,
+    uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount)
+{
+	ldc_mhdl_t	*mhdl;
+	ldc_chan_t 	*ldcp;
+	ldc_mtbl_t	*mtbl;
+	ldc_memseg_t	*memseg;
+	ldc_mte_t	tmp_mte;
+	uint64_t	index, prev_index = 0;
+	int64_t		cookie_idx;
+	uintptr_t	raddr, ra_aligned;
+	uint64_t	psize, poffset, v_offset;
+	uint64_t	pg_shift, pg_size, pg_size_code, pg_mask;
+	pgcnt_t		npages;
+	caddr_t		v_align, addr;
+	int 		i, j;
+
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_bind_handle: invalid memory handle\n");
+		return (EINVAL);
+	}
+
+	/* cookie and ccount are output args - reject NULL before writing */
+	if (cookie == NULL || ccount == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_bind_handle: invalid cookie/ccount arg\n");
+		return (EINVAL);
+	}
+
+	mhdl = (ldc_mhdl_t *)mhandle;
+	ldcp = mhdl->ldcp;
+	mtbl = ldcp->mtbl;
+
+	/* clear count */
+	*ccount = 0;
+
+	mutex_enter(&mhdl->lock);
+
+	/* a handle can only be bound once */
+	if (mhdl->status == LDC_BOUND || mhdl->memseg != NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_bind_handle: (0x%x) handle already bound\n",
+		    mhandle);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* Force address and size to be 8-byte aligned */
+	if ((((uintptr_t)vaddr | len) & 0x7) != 0) {
+		DWARN(ldcp->id,
+		    "ldc_mem_bind_handle: addr/size is not 8-byte aligned\n");
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* FUTURE: get the page size, pgsz code, and shift */
+	pg_size = MMU_PAGESIZE;
+	pg_size_code = page_szc(pg_size);
+	pg_shift = page_get_shift(pg_size_code);
+	pg_mask = ~(pg_size - 1);
+
+	D1(ldcp->id, "ldc_mem_bind_handle: (0x%llx) binding "
+	    "va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n",
+	    ldcp->id, vaddr, pg_size, pg_size_code, pg_shift);
+
+	/* aligned VA and its offset */
+	v_align = (caddr_t)(((uintptr_t)vaddr) & ~(pg_size - 1));
+	v_offset = ((uintptr_t)vaddr) & (pg_size - 1);
+
+	/* number of pages spanned by [vaddr, vaddr+len), rounded up */
+	npages = (len+v_offset)/pg_size;
+	npages = ((len+v_offset)%pg_size == 0) ? npages : npages+1;
+
+	D1(ldcp->id, "ldc_mem_bind_handle: binding "
+	    "(0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n",
+	    ldcp->id, vaddr, v_align, v_offset, npages);
+
+	/* lock the memory table - exclusive access to channel */
+	mutex_enter(&mtbl->lock);
+
+	if (npages > mtbl->num_avail) {
+		DWARN(ldcp->id,
+		    "ldc_mem_bind_handle: (0x%llx) no table entries\n",
+		    ldcp->id);
+		mutex_exit(&mtbl->lock);
+		mutex_exit(&mhdl->lock);
+		return (ENOMEM);
+	}
+
+	/* Allocate a memseg structure */
+	memseg = mhdl->memseg = kmem_zalloc(sizeof (ldc_memseg_t), KM_SLEEP);
+
+	/* Allocate memory to store all pages and cookies */
+	memseg->pages = kmem_zalloc((sizeof (ldc_page_t) * npages), KM_SLEEP);
+	memseg->cookies =
+		kmem_zalloc((sizeof (ldc_mem_cookie_t) * npages), KM_SLEEP);
+
+	D2(ldcp->id, "ldc_mem_bind_handle: (0x%llx) processing 0x%llx pages\n",
+	    ldcp->id, npages);
+
+	addr = v_align;
+
+	/*
+	 * Table slots are used in a round-robin manner. The algorithm permits
+	 * inserting duplicate entries. Slots allocated earlier will typically
+	 * get freed before we get back to reusing the slot. Inserting
+	 * duplicate entries should be OK as we only lookup entries using the
+	 * cookie addr i.e. tbl index, during export, unexport and copy
+	 * operation.
+	 *
+	 * One implementation that was tried was to search for a duplicate
+	 * page entry first and reuse it. The search overhead is very high and
+	 * in the vnet case dropped the perf by almost half, 50 to 24 mbps.
+	 * So it does make sense to avoid searching for duplicates.
+	 *
+	 * But during the process of searching for a free slot, if we find a
+	 * duplicate entry we will go ahead and use it, and bump its use count.
+	 */
+
+	/* index to start searching from */
+	index = mtbl->next_entry;
+	cookie_idx = -1;
+
+	tmp_mte.ll = 0;	/* initialise fields to 0 */
+
+	if (mtype & LDC_DIRECT_MAP) {
+		tmp_mte.mte_r = (perm & LDC_MEM_R) ? 1 : 0;
+		tmp_mte.mte_w = (perm & LDC_MEM_W) ? 1 : 0;
+		tmp_mte.mte_x = (perm & LDC_MEM_X) ? 1 : 0;
+	}
+
+	if (mtype & LDC_SHADOW_MAP) {
+		tmp_mte.mte_cr = (perm & LDC_MEM_R) ? 1 : 0;
+		tmp_mte.mte_cw = (perm & LDC_MEM_W) ? 1 : 0;
+	}
+
+	if (mtype & LDC_IO_MAP) {
+		tmp_mte.mte_ir = (perm & LDC_MEM_R) ? 1 : 0;
+		tmp_mte.mte_iw = (perm & LDC_MEM_W) ? 1 : 0;
+	}
+
+	D1(ldcp->id, "ldc_mem_bind_handle mte=0x%llx\n", tmp_mte.ll);
+
+	tmp_mte.mte_pgszc = pg_size_code;
+
+	/* initialize each mem table entry */
+	for (i = 0; i < npages; i++) {
+
+		/* check if slot is available in the table */
+		while (mtbl->table[index].entry.ll != 0) {
+
+			index = (index + 1) % mtbl->num_entries;
+
+			if (index == mtbl->next_entry) {
+				/* we have looped around */
+				DWARN(DBG_ALL_LDCS,
+				    "ldc_mem_bind_handle: (0x%llx) cannot find "
+				    "entry\n", ldcp->id);
+				*ccount = 0;
+
+				/*
+				 * This shouldn't happen as num_avail was
+				 * checked above. Undo the entries exported
+				 * for the pages processed so far and return
+				 * their slots to the table, then free the
+				 * allocated memseg structures so nothing is
+				 * leaked on this error path.
+				 */
+				for (j = 0; j < i; j++) {
+					memseg->pages[j].mte->entry.ll = 0;
+					mtbl->num_avail++;
+				}
+				mutex_exit(&mtbl->lock);
+
+				kmem_free(memseg->cookies,
+				    (sizeof (ldc_mem_cookie_t) * npages));
+				kmem_free(memseg->pages,
+				    (sizeof (ldc_page_t) * npages));
+				kmem_free(memseg, sizeof (ldc_memseg_t));
+				mhdl->memseg = NULL;
+
+				mutex_exit(&mhdl->lock);
+				return (ENOMEM);
+			}
+		}
+
+		/* get the real address */
+		raddr = va_to_pa((void *)addr);
+		ra_aligned = ((uintptr_t)raddr & pg_mask);
+
+		/* build the mte */
+		tmp_mte.mte_rpfn = ra_aligned >> pg_shift;
+
+		D1(ldcp->id, "ldc_mem_bind_handle mte=0x%llx\n", tmp_mte.ll);
+
+		/* update entry in table */
+		mtbl->table[index].entry = tmp_mte;
+
+		D2(ldcp->id, "ldc_mem_bind_handle: (0x%llx) stored MTE 0x%llx"
+		    " into loc 0x%llx\n", ldcp->id, tmp_mte.ll, index);
+
+		/* calculate the size and offset for this export range */
+		if (i == 0) {
+			/* first page */
+			psize = min((pg_size - v_offset), len);
+			poffset = v_offset;
+
+		} else if (i == (npages - 1)) {
+			/* last page */
+			psize =	(((uintptr_t)(vaddr + len)) &
+				    ((uint64_t)(pg_size-1)));
+			if (psize == 0)
+				psize = pg_size;
+			poffset = 0;
+
+		} else {
+			/* middle pages */
+			psize = pg_size;
+			poffset = 0;
+		}
+
+		/* store entry for this page */
+		memseg->pages[i].index = index;
+		memseg->pages[i].raddr = raddr;
+		memseg->pages[i].offset = poffset;
+		memseg->pages[i].size = psize;
+		memseg->pages[i].mte = &(mtbl->table[index]);
+
+		/*
+		 * create the cookie - pages whose table slots are contiguous
+		 * share a single cookie; a new cookie starts whenever the
+		 * slot index is discontiguous with the previous page's slot
+		 */
+		if (i == 0 || (index != prev_index + 1)) {
+			cookie_idx++;
+			memseg->cookies[cookie_idx].addr =
+				IDX2COOKIE(index, pg_size_code, pg_shift);
+			memseg->cookies[cookie_idx].addr |= poffset;
+			memseg->cookies[cookie_idx].size = psize;
+
+		} else {
+			memseg->cookies[cookie_idx].size += psize;
+		}
+
+		D1(ldcp->id, "ldc_mem_bind_handle: bound "
+		    "(0x%llx) va=0x%llx, idx=0x%llx, "
+		    "ra=0x%llx(sz=0x%x,off=0x%x)\n",
+		    ldcp->id, addr, index, raddr, psize, poffset);
+
+		/* decrement number of available entries */
+		mtbl->num_avail--;
+
+		/* increment va by page size */
+		addr += pg_size;
+
+		/* increment index */
+		prev_index = index;
+		index = (index + 1) % mtbl->num_entries;
+
+		/* save the next slot */
+		mtbl->next_entry = index;
+	}
+
+	mutex_exit(&mtbl->lock);
+
+	/* memory handle = bound */
+	mhdl->mtype = mtype;
+	mhdl->perm = perm;
+	mhdl->status = LDC_BOUND;
+
+	/* update memseg_t */
+	memseg->vaddr = vaddr;
+	memseg->raddr = memseg->pages[0].raddr;
+	memseg->size = len;
+	memseg->npages = npages;
+	memseg->ncookies = cookie_idx + 1;
+	memseg->next_cookie = (memseg->ncookies > 1) ? 1 : 0;
+
+	/* return count and first cookie */
+	*ccount = memseg->ncookies;
+	cookie->addr = memseg->cookies[0].addr;
+	cookie->size = memseg->cookies[0].size;
+
+	D1(ldcp->id,
+	    "ldc_mem_bind_handle: (0x%llx) bound 0x%llx, va=0x%llx, "
+	    "pgs=0x%llx cookies=0x%llx\n",
+	    ldcp->id, mhdl, vaddr, npages, memseg->ncookies);
+
+	mutex_exit(&mhdl->lock);
+	return (0);
+}
+
+/*
+ * Return the next cookie associated with the specified memory handle
+ */
+int
+ldc_mem_nextcookie(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie)
+{
+	ldc_mhdl_t	*mhdl;
+	ldc_chan_t 	*ldcp;
+	ldc_memseg_t	*memseg;
+
+	/* a NULL handle cannot be dereferenced for its lock */
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_nextcookie: invalid memory handle\n");
+		return (EINVAL);
+	}
+	mhdl = (ldc_mhdl_t *)mhandle;
+
+	mutex_enter(&mhdl->lock);
+
+	ldcp = mhdl->ldcp;
+	memseg = mhdl->memseg;
+
+	if (cookie == 0) {
+		DWARN(ldcp->id,
+		    "ldc_mem_nextcookie:(0x%llx) invalid cookie arg\n",
+		    ldcp->id);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* next_cookie of 0 indicates the cookie list has been exhausted */
+	if (memseg->next_cookie == 0) {
+		DWARN(ldcp->id,
+		    "ldc_mem_nextcookie:(0x%llx) no more cookies\n", ldcp->id);
+		cookie->addr = 0;
+		cookie->size = 0;
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* hand back the current cookie, then advance (wrapping to 0) */
+	cookie->addr = memseg->cookies[memseg->next_cookie].addr;
+	cookie->size = memseg->cookies[memseg->next_cookie].size;
+	memseg->next_cookie++;
+	if (memseg->next_cookie == memseg->ncookies)
+		memseg->next_cookie = 0;
+
+	D1(ldcp->id,
+	    "ldc_mem_nextcookie: (0x%llx) cookie addr=0x%llx,sz=0x%llx\n",
+	    ldcp->id, cookie->addr, cookie->size);
+
+	mutex_exit(&mhdl->lock);
+	return (0);
+}
+
+/*
+ * Unbind the virtual memory region associated with the specified
+ * memory handle. All associated cookies are freed and the corresponding
+ * RA space is no longer exported.
+ */
+int
+ldc_mem_unbind_handle(ldc_mem_handle_t mhandle)
+{
+	ldc_mhdl_t	*mhdl;
+	ldc_chan_t 	*ldcp;
+	ldc_mtbl_t	*mtbl;
+	ldc_memseg_t	*memseg;
+	int		i;
+
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_unbind_handle: invalid memory handle\n");
+		return (EINVAL);
+	}
+	mhdl = (ldc_mhdl_t *)mhandle;
+
+	mutex_enter(&mhdl->lock);
+
+	/* only a handle previously bound via ldc_mem_bind_handle can unbind */
+	if (mhdl->status == LDC_UNBOUND) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_unbind_handle: (0x%x) handle is not bound\n",
+		    mhandle);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	ldcp = mhdl->ldcp;
+	mtbl = ldcp->mtbl;
+
+	memseg = mhdl->memseg;
+
+	/* lock the memory table - exclusive access to channel */
+	mutex_enter(&mtbl->lock);
+
+	/*
+	 * undo the pages exported: clearing entry.ll releases the map-table
+	 * slot that ldc_mem_bind_handle claimed for each page
+	 */
+	for (i = 0; i < memseg->npages; i++) {
+
+		/* FUTURE: check for mapped pages */
+		if (memseg->pages[i].mte->cookie) {
+			_NOTE(EMPTY)
+		}
+
+		/* clear the entry from the table */
+		memseg->pages[i].mte->entry.ll = 0;
+		mtbl->num_avail++;
+	}
+	mutex_exit(&mtbl->lock);
+
+	/* free the allocated memseg and page structures */
+	kmem_free(memseg->pages, (sizeof (ldc_page_t) * memseg->npages));
+	kmem_free(memseg->cookies,
+	    (sizeof (ldc_mem_cookie_t) * memseg->npages));
+	kmem_free(memseg, sizeof (ldc_memseg_t));
+
+	/* uninitialize the memory handle so it may be bound again */
+	mhdl->memseg = NULL;
+	mhdl->status = LDC_UNBOUND;
+
+	D1(ldcp->id, "ldc_mem_unbind_handle: (0x%llx) unbound handle 0x%llx\n",
+	    ldcp->id, mhdl);
+
+	mutex_exit(&mhdl->lock);
+	return (0);
+}
+
+/*
+ * Get information about the memory handle. The base VA/RA of the bound
+ * or mapped segment along with the map type and permission are returned.
+ */
+int
+ldc_mem_info(ldc_mem_handle_t mhandle, ldc_mem_info_t *minfo)
+{
+	ldc_mhdl_t	*mhdl = (ldc_mhdl_t *)mhandle;
+
+	/* both the handle and the output struct must be supplied */
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_info: invalid memory handle\n");
+		return (EINVAL);
+	}
+	if (minfo == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_info: invalid args\n");
+		return (EINVAL);
+	}
+
+	mutex_enter(&mhdl->lock);
+
+	minfo->status = mhdl->status;
+
+	/* addresses and attributes are only valid while bound or mapped */
+	if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED) {
+		minfo->vaddr = mhdl->memseg->vaddr;
+		minfo->raddr = mhdl->memseg->raddr;
+		minfo->mtype = mhdl->mtype;
+		minfo->perm = mhdl->perm;
+	}
+
+	mutex_exit(&mhdl->lock);
+
+	return (0);
+}
+
+/*
+ * Copy data either from or to the client specified virtual address
+ * space to or from the exported memory associated with the cookies.
+ * The direction argument determines whether the data is read from or
+ * written to exported memory.
+ */
+int
+ldc_mem_copy(ldc_handle_t handle, caddr_t vaddr, uint64_t off, size_t *size,
+    ldc_mem_cookie_t *cookies, uint32_t ccount, uint8_t direction)
+{
+	ldc_chan_t 	*ldcp;
+	uint64_t	local_voff, local_valign;
+	uint64_t	cookie_addr, cookie_size;
+	uint64_t	pg_shift, pg_size, pg_size_code;
+	uint64_t 	export_caddr, export_poff, export_psize, export_size;
+	uint64_t	local_ra, local_poff, local_psize;
+	uint64_t	copy_size, copied_len = 0, total_bal = 0, idx = 0;
+	pgcnt_t		npages;
+	size_t		len = *size;
+	int 		i, rv = 0;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_copy: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	/* check to see if channel is UP */
+	if (ldcp->tstate != TS_UP) {
+		DWARN(ldcp->id, "ldc_mem_copy: (0x%llx) channel is not UP\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/* Force address and size to be 8-byte aligned */
+	if ((((uintptr_t)vaddr | len) & 0x7) != 0) {
+		DWARN(ldcp->id,
+		    "ldc_mem_copy: addr/sz is not 8-byte aligned\n");
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/* Find the size of the exported memory */
+	export_size = 0;
+	for (i = 0; i < ccount; i++)
+		export_size += cookies[i].size;
+
+	/* check to see if offset is valid */
+	if (off > export_size) {
+		DWARN(ldcp->id,
+		    "ldc_mem_copy: (0x%llx) start offset > export mem size\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/*
+	 * Check to see if the export size is smaller than the size we
+	 * are requesting to copy - if so flag an error
+	 */
+	if ((export_size - off) < *size) {
+		DWARN(ldcp->id,
+		    "ldc_mem_copy: (0x%llx) copy size > export mem size\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/* total_bal tracks the number of bytes still to be copied */
+	total_bal = min(export_size, *size);
+
+	/* FUTURE: get the page size, pgsz code, and shift */
+	pg_size = MMU_PAGESIZE;
+	pg_size_code = page_szc(pg_size);
+	pg_shift = page_get_shift(pg_size_code);
+
+	D1(ldcp->id, "ldc_mem_copy: copying data "
+	    "(0x%llx) va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n",
+	    ldcp->id, vaddr, pg_size, pg_size_code, pg_shift);
+
+	/* aligned VA and its offset */
+	local_valign = (((uintptr_t)vaddr) & ~(pg_size - 1));
+	local_voff = ((uintptr_t)vaddr) & (pg_size - 1);
+
+	npages = (len+local_voff)/pg_size;
+	npages = ((len+local_voff)%pg_size == 0) ? npages : npages+1;
+
+	D1(ldcp->id,
+	    "ldc_mem_copy: (0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n",
+	    ldcp->id, vaddr, local_valign, local_voff, npages);
+
+	/* initialize the local-side cursor (RA, intra-page off and size) */
+	local_ra = va_to_pa((void *)local_valign);
+	local_poff = local_voff;
+	local_psize = min(len, (pg_size - local_voff));
+
+	len -= local_psize;
+
+	/*
+	 * find the first cookie in the list of cookies
+	 * if the offset passed in is not zero
+	 */
+	for (idx = 0; idx < ccount; idx++) {
+		cookie_size = cookies[idx].size;
+		if (off < cookie_size)
+			break;
+		off -= cookie_size;
+	}
+
+	/* initialize the export-side cursor within the selected cookie */
+	cookie_addr = cookies[idx].addr + off;
+	cookie_size = cookies[idx].size - off;
+
+	export_caddr = cookie_addr & ~(pg_size - 1);
+	export_poff = cookie_addr & (pg_size - 1);
+	export_psize = min(cookie_size, (pg_size - export_poff));
+
+	/*
+	 * Copy loop: each iteration transfers the largest chunk that fits
+	 * in both the current export page and the current local page, then
+	 * advances whichever cursor(s) ran out.
+	 */
+	for (;;) {
+
+		copy_size = min(export_psize, local_psize);
+
+		D1(ldcp->id,
+		    "ldc_mem_copy:(0x%llx) dir=0x%x, caddr=0x%llx,"
+		    " loc_ra=0x%llx, exp_poff=0x%llx, loc_poff=0x%llx,"
+		    " exp_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
+		    " total_bal=0x%llx\n",
+		    ldcp->id, direction, export_caddr, local_ra, export_poff,
+		    local_poff, export_psize, local_psize, copy_size,
+		    total_bal);
+
+		/* direction (LDC_COPY_IN/OUT) is passed straight to the HV */
+		rv = hv_ldc_copy(ldcp->id, direction,
+		    (export_caddr + export_poff), (local_ra + local_poff),
+		    copy_size, &copied_len);
+
+		if (rv != 0) {
+			cmn_err(CE_WARN,
+			    "ldc_mem_copy: (0x%lx) err %d during copy\n",
+			    ldcp->id, rv);
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_copy: (0x%llx) dir=0x%x, caddr=0x%llx, "
+			    "loc_ra=0x%llx, exp_poff=0x%llx, loc_poff=0x%llx,"
+			    " exp_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
+			    " copied_len=0x%llx, total_bal=0x%llx\n",
+			    ldcp->id, direction, export_caddr, local_ra,
+			    export_poff, local_poff, export_psize, local_psize,
+			    copy_size, copied_len, total_bal);
+
+			/* report the number of bytes actually copied */
+			*size = *size - total_bal;
+			mutex_exit(&ldcp->lock);
+			return (EIO);
+		}
+
+		ASSERT(copied_len <= copy_size);
+
+		D2(ldcp->id, "ldc_mem_copy: copied=0x%llx\n", copied_len);
+		export_poff += copied_len;
+		local_poff += copied_len;
+		export_psize -= copied_len;
+		local_psize -= copied_len;
+		cookie_size -= copied_len;
+
+		total_bal -= copied_len;
+
+		/* HV may copy less than requested; retry the remainder */
+		if (copy_size != copied_len)
+			continue;
+
+		/* export page exhausted - step to next page or next cookie */
+		if (export_psize == 0 && total_bal != 0) {
+
+			if (cookie_size == 0) {
+				idx++;
+				cookie_addr = cookies[idx].addr;
+				cookie_size = cookies[idx].size;
+
+				export_caddr = cookie_addr & ~(pg_size - 1);
+				export_poff = cookie_addr & (pg_size - 1);
+				export_psize =
+					min(cookie_size, (pg_size-export_poff));
+			} else {
+				export_caddr += pg_size;
+				export_poff = 0;
+				export_psize = min(cookie_size, pg_size);
+			}
+		}
+
+		/* local page exhausted - move to the next local page */
+		if (local_psize == 0 && total_bal != 0) {
+			local_valign += pg_size;
+			local_ra = va_to_pa((void *)local_valign);
+			local_poff = 0;
+			local_psize = min(pg_size, len);
+			len -= local_psize;
+		}
+
+		/* check if we are all done */
+		if (total_bal == 0)
+			break;
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	D1(ldcp->id,
+	    "ldc_mem_copy: (0x%llx) done copying sz=0x%llx\n",
+	    ldcp->id, *size);
+
+	return (0);
+}
+
+/*
+ * Copy data either from or to the client specified virtual address
+ * space to or from HV physical memory.
+ *
+ * The direction argument determines whether the data is read from or
+ * written to HV memory. direction values are LDC_COPY_IN/OUT similar
+ * to the ldc_mem_copy interface
+ */
+int
+ldc_mem_rdwr_pa(ldc_handle_t handle, caddr_t vaddr, size_t *size,
+    caddr_t paddr, uint8_t direction)
+{
+	ldc_chan_t 	*ldcp;
+	uint64_t	local_voff, local_valign;
+	uint64_t	pg_shift, pg_size, pg_size_code;
+	uint64_t 	target_pa, target_poff, target_psize, target_size;
+	uint64_t	local_ra, local_poff, local_psize;
+	uint64_t	copy_size, copied_len = 0;
+	pgcnt_t		npages;
+	size_t		len = *size;
+	int 		rv = 0;
+
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_rdwr_pa: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	mutex_enter(&ldcp->lock);
+
+	/* check to see if channel is UP */
+	if (ldcp->tstate != TS_UP) {
+		DWARN(ldcp->id,
+		    "ldc_mem_rdwr_pa: (0x%llx) channel is not UP\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/* Force address and size to be 8-byte aligned */
+	if ((((uintptr_t)vaddr | len) & 0x7) != 0) {
+		DWARN(ldcp->id,
+		    "ldc_mem_rdwr_pa: addr/size is not 8-byte aligned\n");
+		mutex_exit(&ldcp->lock);
+		return (EINVAL);
+	}
+
+	/* target_size tracks the number of bytes still to be copied */
+	target_size = *size;
+
+	/* FUTURE: get the page size, pgsz code, and shift */
+	pg_size = MMU_PAGESIZE;
+	pg_size_code = page_szc(pg_size);
+	pg_shift = page_get_shift(pg_size_code);
+
+	D1(ldcp->id, "ldc_mem_rdwr_pa: copying data "
+	    "(0x%llx) va 0x%llx pgsz=0x%llx, pgszc=0x%llx, pg_shift=0x%llx\n",
+	    ldcp->id, vaddr, pg_size, pg_size_code, pg_shift);
+
+	/* aligned VA and its offset */
+	local_valign = ((uintptr_t)vaddr) & ~(pg_size - 1);
+	local_voff = ((uintptr_t)vaddr) & (pg_size - 1);
+
+	/* npages is only used for the debug message below */
+	npages = (len + local_voff) / pg_size;
+	npages = ((len + local_voff) % pg_size == 0) ? npages : npages+1;
+
+	D1(ldcp->id,
+	    "ldc_mem_rdwr_pa: (0x%llx) v=0x%llx,val=0x%llx,off=0x%x,pgs=0x%x\n",
+	    ldcp->id, vaddr, local_valign, local_voff, npages);
+
+	/* initialize the local-side cursor (RA, intra-page off and size) */
+	local_ra = va_to_pa((void *)local_valign);
+	local_poff = local_voff;
+	local_psize = min(len, (pg_size - local_voff));
+
+	len -= local_psize;
+
+	/* initialize the target-side cursor from the given PA */
+	target_pa = ((uintptr_t)paddr) & ~(pg_size - 1);
+	target_poff = ((uintptr_t)paddr) & (pg_size - 1);
+	target_psize = pg_size - target_poff;
+
+	/*
+	 * Copy loop: each iteration transfers the largest chunk that fits
+	 * in both the current target page and the current local page, then
+	 * advances whichever cursor(s) ran out.
+	 */
+	for (;;) {
+
+		copy_size = min(target_psize, local_psize);
+
+		D1(ldcp->id,
+		    "ldc_mem_rdwr_pa: (0x%llx) dir=0x%x, tar_pa=0x%llx,"
+		    " loc_ra=0x%llx, tar_poff=0x%llx, loc_poff=0x%llx,"
+		    " tar_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
+		    " total_bal=0x%llx\n",
+		    ldcp->id, direction, target_pa, local_ra, target_poff,
+		    local_poff, target_psize, local_psize, copy_size,
+		    target_size);
+
+		rv = hv_ldc_copy(ldcp->id, direction,
+		    (target_pa + target_poff), (local_ra + local_poff),
+		    copy_size, &copied_len);
+
+		if (rv != 0) {
+			cmn_err(CE_WARN,
+			    "ldc_mem_rdwr_pa: (0x%lx) err %d during copy\n",
+			    ldcp->id, rv);
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_rdwr_pa: (0x%llx) dir=%lld,tar_pa=0x%llx, "
+			    "loc_ra=0x%llx, tar_poff=0x%llx, loc_poff=0x%llx,"
+			    " tar_psz=0x%llx, loc_psz=0x%llx, copy_sz=0x%llx,"
+			    " total_bal=0x%llx\n",
+			    ldcp->id, direction, target_pa, local_ra,
+			    target_poff, local_poff, target_psize, local_psize,
+			    copy_size, target_size);
+
+			/* report the number of bytes actually copied */
+			*size = *size - target_size;
+			mutex_exit(&ldcp->lock);
+			return (i_ldc_h2v_error(rv));
+		}
+
+		D2(ldcp->id, "ldc_mem_rdwr_pa: copied=0x%llx\n", copied_len);
+		target_poff += copied_len;
+		local_poff += copied_len;
+		target_psize -= copied_len;
+		local_psize -= copied_len;
+
+		target_size -= copied_len;
+
+		/* HV may copy less than requested; retry the remainder */
+		if (copy_size != copied_len)
+			continue;
+
+		/* target page exhausted - advance to the next target page */
+		if (target_psize == 0 && target_size != 0) {
+			target_pa += pg_size;
+			target_poff = 0;
+			target_psize = min(pg_size, target_size);
+		}
+
+		/* local page exhausted - move to the next local page */
+		if (local_psize == 0 && target_size != 0) {
+			local_valign += pg_size;
+			local_ra = va_to_pa((void *)local_valign);
+			local_poff = 0;
+			local_psize = min(pg_size, len);
+			len -= local_psize;
+		}
+
+		/* check if we are all done */
+		if (target_size == 0)
+			break;
+	}
+
+	mutex_exit(&ldcp->lock);
+
+	D1(ldcp->id, "ldc_mem_rdwr_pa: (0x%llx) done copying sz=0x%llx\n",
+	    ldcp->id, *size);
+
+	return (0);
+}
+
+/*
+ * Map an exported memory segment into the local address space. If the
+ * memory range was exported for direct map access, a HV call is made
+ * to allocate a RA range. If the map is done via a shadow copy, local
+ * shadow memory is allocated and the base VA is returned in 'vaddr'. If
+ * the mapping is a direct map then the RA is returned in 'raddr'.
+ */
+int
+ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie, uint32_t ccount,
+    uint8_t mtype, caddr_t *vaddr, caddr_t *raddr)
+{
+	int		i, idx;
+	ldc_chan_t 	*ldcp;
+	ldc_mhdl_t	*mhdl;
+	ldc_memseg_t	*memseg;
+	caddr_t		shadow_base = NULL, tmpaddr;
+	uint64_t	pg_size, pg_shift, pg_size_code;
+	uint64_t	exp_size = 0, npages;
+
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_map: invalid memory handle\n");
+		return (EINVAL);
+	}
+	mhdl = (ldc_mhdl_t *)mhandle;
+
+	mutex_enter(&mhdl->lock);
+
+	/* handle must be free - neither bound nor already mapped */
+	if (mhdl->status == LDC_BOUND || mhdl->status == LDC_MAPPED ||
+	    mhdl->memseg != NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_map: (0x%llx) handle bound/mapped\n", mhandle);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	ldcp = mhdl->ldcp;
+
+	/* lock order: mhdl->lock is taken before ldcp->lock */
+	mutex_enter(&ldcp->lock);
+
+	if (ldcp->tstate != TS_UP) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_map: (0x%llx) channel is not UP\n",
+		    ldcp->id);
+		mutex_exit(&ldcp->lock);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* at least one of the supported map types must be requested */
+	if ((mtype & (LDC_SHADOW_MAP|LDC_DIRECT_MAP|LDC_IO_MAP)) == 0) {
+		DWARN(ldcp->id, "ldc_mem_map: invalid map type\n");
+		mutex_exit(&ldcp->lock);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* shadow maps return (or accept) the shadow buffer via *vaddr */
+	if (mtype == LDC_SHADOW_MAP && vaddr == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_map: invalid vaddr arg0x%llx\n", vaddr);
+		mutex_exit(&ldcp->lock);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* a caller-supplied shadow buffer must be page aligned */
+	if (mtype == LDC_SHADOW_MAP &&
+	    (vaddr) && ((uintptr_t)(*vaddr) & MMU_PAGEOFFSET)) {
+		DWARN(ldcp->id,
+		    "ldc_mem_map: vaddr not page aligned, 0x%llx\n", *vaddr);
+		mutex_exit(&ldcp->lock);
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	D1(ldcp->id, "ldc_mem_map: (0x%llx) cookie = 0x%llx,0x%llx\n",
+	    mhandle, cookie->addr, cookie->size);
+
+	/* FUTURE: get the page size, pgsz code, and shift */
+	pg_size = MMU_PAGESIZE;
+	pg_size_code = page_szc(pg_size);
+	pg_shift = page_get_shift(pg_size_code);
+
+	/* calculate the number of pages in the exported cookie */
+	for (idx = 0; idx < ccount; idx++) {
+		if (cookie[idx].addr & MMU_PAGEOFFSET ||
+			cookie[idx].size & MMU_PAGEOFFSET) {
+			DWARN(ldcp->id,
+			    "ldc_mem_map: cookie addr/size not page aligned, "
+			    "0x%llx\n", cookie[idx].addr);
+			mutex_exit(&ldcp->lock);
+			mutex_exit(&mhdl->lock);
+			return (EINVAL);
+		}
+		exp_size += cookie[idx].size;
+	}
+	npages = (exp_size >> pg_shift);
+
+	/* Allocate memseg structure */
+	memseg = mhdl->memseg =	kmem_zalloc(sizeof (ldc_memseg_t), KM_SLEEP);
+
+	/* Allocate memory to store all pages and cookies */
+	memseg->pages =	kmem_zalloc((sizeof (ldc_page_t) * npages), KM_SLEEP);
+	memseg->cookies =
+		kmem_zalloc((sizeof (ldc_mem_cookie_t) * ccount), KM_SLEEP);
+
+	D2(ldcp->id, "ldc_mem_map: (0x%llx) processing 0x%llx pages\n",
+	    ldcp->id, npages);
+
+	/* Check to see if the client is requesting direct or shadow map */
+	if (mtype == LDC_SHADOW_MAP) {
+		if (*vaddr == NULL) {
+			/* no buffer supplied - allocate the shadow here */
+			shadow_base =
+				contig_mem_alloc_align(exp_size, PAGESIZE);
+			if (shadow_base == NULL) {
+				cmn_err(CE_WARN, "ldc_mem_map: shadow memory "
+				    "allocation failed\n");
+				kmem_free(memseg->cookies,
+				    (sizeof (ldc_mem_cookie_t) * ccount));
+				kmem_free(memseg->pages,
+				    (sizeof (ldc_page_t) * npages));
+				kmem_free(memseg, sizeof (ldc_memseg_t));
+				mutex_exit(&ldcp->lock);
+				mutex_exit(&mhdl->lock);
+				return (ENOMEM);
+			}
+
+			bzero(shadow_base, exp_size);
+			/* myshadow marks the buffer for free on unmap */
+			mhdl->myshadow = B_TRUE;
+
+			D1(ldcp->id, "ldc_mem_map: (0x%llx) allocated "
+			    "shadow page va=0x%llx\n", ldcp->id, shadow_base);
+		} else {
+			/*
+			 * Use client supplied memory for shadow_base
+			 * WARNING: assuming that client mem is >= exp_size
+			 */
+			shadow_base = *vaddr;
+		}
+	} else if (mtype == LDC_DIRECT_MAP) {
+		/* FUTURE: Do a direct map by calling into HV */
+		_NOTE(EMPTY)
+	}
+
+	/* Save all page and cookie information */
+	for (i = 0, tmpaddr = shadow_base; i < npages; i++) {
+		memseg->pages[i].raddr = va_to_pa(tmpaddr);
+		memseg->pages[i].size = pg_size;
+		memseg->pages[i].index = 0;
+		memseg->pages[i].offset = 0;
+		memseg->pages[i].mte = NULL;
+		tmpaddr += pg_size;
+	}
+	for (i = 0; i < ccount; i++) {
+		memseg->cookies[i].addr = cookie[i].addr;
+		memseg->cookies[i].size = cookie[i].size;
+	}
+
+	/* update memseg_t */
+	memseg->vaddr = shadow_base;
+	memseg->raddr = memseg->pages[0].raddr;
+	memseg->size = exp_size;
+	memseg->npages = npages;
+	memseg->ncookies = ccount;
+	memseg->next_cookie = 0;
+
+	/* memory handle = mapped */
+	mhdl->mtype = mtype;
+	mhdl->perm = 0;
+	mhdl->status = LDC_MAPPED;
+
+	D1(ldcp->id, "ldc_mem_map: (0x%llx) mapped 0x%llx, ra=0x%llx, "
+	    "va=0x%llx, pgs=0x%llx cookies=0x%llx\n",
+	    ldcp->id, mhdl, memseg->raddr, memseg->vaddr,
+	    memseg->npages, memseg->ncookies);
+
+	if (raddr)
+		*raddr = (caddr_t)memseg->raddr;
+	if (vaddr)
+		*vaddr = memseg->vaddr;
+
+	mutex_exit(&ldcp->lock);
+	mutex_exit(&mhdl->lock);
+	return (0);
+}
+
+/*
+ * Unmap a memory segment. Free shadow memory (if any).
+ */
+int
+ldc_mem_unmap(ldc_mem_handle_t mhandle)
+{
+	ldc_mhdl_t	*mhdl = (ldc_mhdl_t *)mhandle;
+	ldc_chan_t 	*ldcp;
+	ldc_memseg_t	*memseg;
+
+	/* only handles in the MAPPED state can be unmapped */
+	if (mhdl == NULL || mhdl->status != LDC_MAPPED) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_unmap: (0x%llx) handle is not mapped\n",
+		    mhandle);
+		return (EINVAL);
+	}
+
+	mutex_enter(&mhdl->lock);
+
+	ldcp = mhdl->ldcp;
+	memseg = mhdl->memseg;
+
+	D1(ldcp->id, "ldc_mem_unmap: (0x%llx) unmapping handle 0x%llx\n",
+	    ldcp->id, mhdl);
+
+	/* shadow memory that ldc_mem_map allocated itself is freed here */
+	if (mhdl->mtype == LDC_SHADOW_MAP && mhdl->myshadow)
+		contig_mem_free(memseg->vaddr, memseg->size);
+
+	/* release the page list, the cookie list and the memseg itself */
+	kmem_free(memseg->pages, (sizeof (ldc_page_t) * memseg->npages));
+	kmem_free(memseg->cookies,
+	    (sizeof (ldc_mem_cookie_t) * memseg->ncookies));
+	kmem_free(memseg, sizeof (ldc_memseg_t));
+
+	/* reset the handle so it can be reused */
+	mhdl->memseg = NULL;
+	mhdl->status = LDC_UNBOUND;
+
+	D1(ldcp->id, "ldc_mem_unmap: (0x%llx) unmapped handle 0x%llx\n",
+	    ldcp->id, mhdl);
+
+	mutex_exit(&mhdl->lock);
+	return (0);
+}
+
+/*
+ * Internal entry point for LDC mapped memory entry consistency
+ * semantics. Acquire copies the contents of the remote memory
+ * into the local shadow copy. The release operation copies the local
+ * contents into the remote memory. The offset and size specify the
+ * bounds for the memory range being synchronized.
+ */
+static int
+i_ldc_mem_acquire_release(ldc_mem_handle_t mhandle, uint8_t direction,
+    uint64_t offset, size_t size)
+{
+	int 		err;
+	ldc_mhdl_t	*mhdl;
+	ldc_chan_t	*ldcp;
+	ldc_memseg_t	*memseg;
+	caddr_t		local_vaddr;
+	size_t		copy_size;
+
+	if (mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_mem_acquire_release: invalid memory handle\n");
+		return (EINVAL);
+	}
+	mhdl = (ldc_mhdl_t *)mhandle;
+
+	mutex_enter(&mhdl->lock);
+
+	/* consistency semantics only apply to mapped handles */
+	if (mhdl->status != LDC_MAPPED || mhdl->ldcp == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_mem_acquire_release: not mapped memory\n");
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* the sync range must fall entirely within the mapped segment */
+	if (offset >= mhdl->memseg->size ||
+	    (offset + size) > mhdl->memseg->size) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_mem_acquire_release: memory out of range\n");
+		mutex_exit(&mhdl->lock);
+		return (EINVAL);
+	}
+
+	/* get the channel handle and memory segment */
+	ldcp = mhdl->ldcp;
+	memseg = mhdl->memseg;
+
+	/* only shadow maps need an explicit copy; others are a no-op here */
+	if (mhdl->mtype == LDC_SHADOW_MAP) {
+
+		local_vaddr = memseg->vaddr + offset;
+		copy_size = size;
+
+		/* copy to/from remote from/to local memory */
+		err = ldc_mem_copy((ldc_handle_t)ldcp, local_vaddr, offset,
+		    &copy_size, memseg->cookies, memseg->ncookies,
+		    direction);
+		if (err || copy_size != size) {
+			cmn_err(CE_WARN,
+			    "i_ldc_mem_acquire_release: copy failed\n");
+			mutex_exit(&mhdl->lock);
+			return (err);
+		}
+	}
+
+	mutex_exit(&mhdl->lock);
+
+	return (0);
+}
+
+/*
+ * Ensure that the contents in the remote memory seg are consistent
+ * with the contents of the local segment
+ */
+int
+ldc_mem_acquire(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size)
+{
+	/* LDC_COPY_IN: pull the remote contents into the local shadow copy */
+	return (i_ldc_mem_acquire_release(mhandle, LDC_COPY_IN, offset, size));
+}
+
+
+/*
+ * Ensure that the contents in the local memory seg are consistent
+ * with the contents of the remote segment
+ */
+int
+ldc_mem_release(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size)
+{
+	/* LDC_COPY_OUT: push the local shadow contents to the remote end */
+	return (i_ldc_mem_acquire_release(mhandle, LDC_COPY_OUT, offset, size));
+}
+
+/*
+ * Allocate a descriptor ring. The size of each descriptor
+ * must be 8-byte aligned and the entire ring should be a multiple
+ * of MMU_PAGESIZE.
+ */
+int
+ldc_mem_dring_create(uint32_t len, uint32_t dsize, ldc_dring_handle_t *dhandle)
+{
+	ldc_dring_t *dringp;
+	size_t size;
+
+	D1(DBG_ALL_LDCS, "ldc_mem_dring_create: len=0x%x, size=0x%x\n",
+	    len, dsize);
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid dhandle\n");
+		return (EINVAL);
+	}
+
+	if (len == 0) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid length\n");
+		return (EINVAL);
+	}
+
+	/* descriptor size should be 8-byte aligned */
+	if (dsize == 0 || (dsize & 0x7)) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_dring_create: invalid size\n");
+		return (EINVAL);
+	}
+
+	/*
+	 * Compute the raw ring size in size_t so the multiply is done in
+	 * 64-bit arithmetic; len and dsize are both uint32_t and the
+	 * product could otherwise wrap before being widened.
+	 */
+	size = (size_t)dsize * (size_t)len;
+
+	*dhandle = 0;
+
+	/* Allocate a desc ring structure */
+	dringp = kmem_zalloc(sizeof (ldc_dring_t), KM_SLEEP);
+
+	/* Initialize dring */
+	dringp->length = len;
+	dringp->dsize = dsize;
+
+	/* round off to multiple of pagesize */
+	dringp->size = (size & MMU_PAGEMASK);
+	if (size & MMU_PAGEOFFSET)
+		dringp->size += MMU_PAGESIZE;
+
+	dringp->status = LDC_UNBOUND;
+
+	/* allocate descriptor ring memory */
+	dringp->base = contig_mem_alloc_align(dringp->size, PAGESIZE);
+	if (dringp->base == NULL) {
+		cmn_err(CE_WARN,
+		    "ldc_mem_dring_create: unable to alloc desc\n");
+		kmem_free(dringp, sizeof (ldc_dring_t));
+		return (ENOMEM);
+	}
+
+	bzero(dringp->base, dringp->size);
+
+	/* initialize the desc ring lock */
+	mutex_init(&dringp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Add descriptor ring to the head of global list */
+	mutex_enter(&ldcssp->lock);
+	dringp->next = ldcssp->dring_list;
+	ldcssp->dring_list = dringp;
+	mutex_exit(&ldcssp->lock);
+
+	*dhandle = (ldc_dring_handle_t)dringp;
+
+	D1(DBG_ALL_LDCS, "ldc_mem_dring_create: dring allocated\n");
+
+	return (0);
+}
+
+
+/*
+ * Destroy a descriptor ring.
+ */
+int
+ldc_mem_dring_destroy(ldc_dring_handle_t dhandle)
+{
+	ldc_dring_t *dringp;
+	ldc_dring_t *tmp_dringp;
+
+	D1(DBG_ALL_LDCS, "ldc_mem_dring_destroy: entered\n");
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_destroy: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+
+	/* a bound ring must be unbound before it can be destroyed */
+	if (dringp->status == LDC_BOUND) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_destroy: desc ring is bound\n");
+		return (EACCES);
+	}
+
+	mutex_enter(&dringp->lock);
+	mutex_enter(&ldcssp->lock);
+
+	/*
+	 * remove from linked list - if not bound; head of the list is a
+	 * special case, otherwise walk looking for the predecessor node
+	 */
+	tmp_dringp = ldcssp->dring_list;
+	if (tmp_dringp == dringp) {
+		ldcssp->dring_list = dringp->next;
+		dringp->next = NULL;
+
+	} else {
+		while (tmp_dringp != NULL) {
+			if (tmp_dringp->next == dringp) {
+				tmp_dringp->next = dringp->next;
+				dringp->next = NULL;
+				break;
+			}
+			tmp_dringp = tmp_dringp->next;
+		}
+		/* handle was not on the global list - reject it */
+		if (tmp_dringp == NULL) {
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_dring_destroy: invalid descriptor\n");
+			mutex_exit(&ldcssp->lock);
+			mutex_exit(&dringp->lock);
+			return (EINVAL);
+		}
+	}
+
+	mutex_exit(&ldcssp->lock);
+
+	/* free the descriptor ring */
+	contig_mem_free((caddr_t)dringp->base, dringp->size);
+
+	mutex_exit(&dringp->lock);
+
+	/* destroy dring lock */
+	mutex_destroy(&dringp->lock);
+
+	/* free desc ring object */
+	kmem_free(dringp, sizeof (ldc_dring_t));
+
+	return (0);
+}
+
+/*
+ * Bind a previously allocated dring to a channel. The channel should
+ * be OPEN in order to bind the ring to the channel. Returns back a
+ * descriptor ring cookie. The descriptor ring is exported for remote
+ * access by the client at the other end of the channel. An entry for
+ * dring pages is stored in map table (via call to ldc_mem_bind_handle).
+ */
+int
+ldc_mem_dring_bind(ldc_handle_t handle, ldc_dring_handle_t dhandle,
+    uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount)
+{
+	int		err;
+	ldc_chan_t 	*ldcp;
+	ldc_dring_t	*dringp;
+	ldc_mem_handle_t mhandle;
+
+	/* check to see if channel is initialized */
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_bind: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_bind: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+
+	if (cookie == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_bind: invalid cookie arg\n");
+		return (EINVAL);
+	}
+
+	mutex_enter(&dringp->lock);
+
+	if (dringp->status == LDC_BOUND) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_bind: (0x%llx) descriptor ring is bound\n",
+		    ldcp->id);
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	if ((perm & LDC_MEM_RW) == 0) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_bind: invalid permissions\n");
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	if ((mtype & (LDC_SHADOW_MAP|LDC_DIRECT_MAP|LDC_IO_MAP)) == 0) {
+		DWARN(DBG_ALL_LDCS, "ldc_mem_dring_bind: invalid type\n");
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	dringp->ldcp = ldcp;
+
+	/* create a memory handle */
+	err = ldc_mem_alloc_handle(handle, &mhandle);
+	if (err || mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_bind: (0x%llx) error allocating mhandle\n",
+		    ldcp->id);
+		mutex_exit(&dringp->lock);
+		return (err);
+	}
+	dringp->mhdl = mhandle;
+
+	/* bind the descriptor ring to channel */
+	err = ldc_mem_bind_handle(mhandle, dringp->base, dringp->size,
+	    mtype, perm, cookie, ccount);
+	if (err) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_bind: (0x%llx) error binding mhandle\n",
+		    ldcp->id);
+		mutex_exit(&dringp->lock);
+		return (err);
+	}
+
+	/*
+	 * For now return error if we get more than one cookie
+	 * FUTURE: Return multiple cookies ..
+	 */
+	if (*ccount > 1) {
+		(void) ldc_mem_unbind_handle(mhandle);
+		(void) ldc_mem_free_handle(mhandle);
+
+		dringp->ldcp = NULL;
+		dringp->mhdl = NULL;
+		*ccount = 0;
+
+		mutex_exit(&dringp->lock);
+		return (EAGAIN);
+	}
+
+	/* Add descriptor ring to channel's exported dring list */
+	mutex_enter(&ldcp->exp_dlist_lock);
+	dringp->ch_next = ldcp->exp_dring_list;
+	ldcp->exp_dring_list = dringp;
+	mutex_exit(&ldcp->exp_dlist_lock);
+
+	dringp->status = LDC_BOUND;
+
+	mutex_exit(&dringp->lock);
+
+	return (0);
+}
+
+/*
+ * Return the next cookie associated with the specified dring handle
+ */
+int
+ldc_mem_dring_nextcookie(ldc_dring_handle_t dhandle, ldc_mem_cookie_t *cookie)
+{
+	int		rv = 0;
+	ldc_dring_t 	*dringp;
+	ldc_chan_t	*ldcp;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_nextcookie: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+	mutex_enter(&dringp->lock);
+
+	if (dringp->status != LDC_BOUND) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_nextcookie: descriptor ring 0x%llx "
+		    "is not bound\n", dringp);
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	ldcp = dringp->ldcp;
+
+	if (cookie == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_nextcookie:(0x%llx) invalid cookie arg\n",
+		    ldcp->id);
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	rv = ldc_mem_nextcookie((ldc_mem_handle_t)dringp->mhdl, cookie);
+	mutex_exit(&dringp->lock);
+
+	return (rv);
+}
+/*
+ * Unbind a previously bound dring from a channel.
+ */
+int
+ldc_mem_dring_unbind(ldc_dring_handle_t dhandle)
+{
+	ldc_dring_t 	*dringp;
+	ldc_dring_t	*tmp_dringp;
+	ldc_chan_t	*ldcp;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_unbind: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+
+	mutex_enter(&dringp->lock);
+
+	if (dringp->status == LDC_UNBOUND) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_unbind: descriptor ring 0x%llx is unbound\n",
+		    dringp);
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+	ldcp = dringp->ldcp;
+
+	mutex_enter(&ldcp->exp_dlist_lock);
+
+	tmp_dringp = ldcp->exp_dring_list;
+	if (tmp_dringp == dringp) {
+		ldcp->exp_dring_list = dringp->ch_next;
+		dringp->ch_next = NULL;
+
+	} else {
+		while (tmp_dringp != NULL) {
+			if (tmp_dringp->ch_next == dringp) {
+				tmp_dringp->ch_next = dringp->ch_next;
+				dringp->ch_next = NULL;
+				break;
+			}
+			tmp_dringp = tmp_dringp->ch_next;
+		}
+		if (tmp_dringp == NULL) {
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_dring_unbind: invalid descriptor\n");
+			mutex_exit(&ldcp->exp_dlist_lock);
+			mutex_exit(&dringp->lock);
+			return (EINVAL);
+		}
+	}
+
+	mutex_exit(&ldcp->exp_dlist_lock);
+
+	(void) ldc_mem_unbind_handle((ldc_mem_handle_t)dringp->mhdl);
+	(void) ldc_mem_free_handle((ldc_mem_handle_t)dringp->mhdl);
+
+	dringp->ldcp = NULL;
+	dringp->mhdl = NULL;
+	dringp->status = LDC_UNBOUND;
+
+	mutex_exit(&dringp->lock);
+
+	return (0);
+}
+
+/*
+ * Get information about the dring. The base address of the descriptor
+ * ring along with the type and permission are returned back.
+ */
+int
+ldc_mem_dring_info(ldc_dring_handle_t dhandle, ldc_mem_info_t *minfo)
+{
+	ldc_dring_t	*dringp;
+	int		rv;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_info: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+
+	mutex_enter(&dringp->lock);
+
+	if (dringp->mhdl) {
+		rv = ldc_mem_info(dringp->mhdl, minfo);
+		if (rv) {
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_dring_info: error reading mem info\n");
+			mutex_exit(&dringp->lock);
+			return (rv);
+		}
+	} else {
+		minfo->vaddr = dringp->base;
+		minfo->raddr = NULL;
+		minfo->status = dringp->status;
+	}
+
+	mutex_exit(&dringp->lock);
+
+	return (0);
+}
+
+/*
+ * Map an exported descriptor ring into the local address space. If the
+ * descriptor ring was exported for direct map access, a HV call is made
+ * to allocate a RA range. If the map is done via a shadow copy, local
+ * shadow memory is allocated.
+ */
+int
+ldc_mem_dring_map(ldc_handle_t handle, ldc_mem_cookie_t *cookie,
+    uint32_t ccount, uint32_t len, uint32_t dsize, uint8_t mtype,
+    ldc_dring_handle_t *dhandle)
+{
+	int		err;
+	ldc_chan_t 	*ldcp = (ldc_chan_t *)handle;
+	ldc_mem_handle_t mhandle;
+	ldc_dring_t	*dringp;
+	size_t		dring_size;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_map: invalid dhandle\n");
+		return (EINVAL);
+	}
+
+	/* check to see if channel is initialized */
+	if (handle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_map: invalid channel handle\n");
+		return (EINVAL);
+	}
+	ldcp = (ldc_chan_t *)handle;
+
+	if (cookie == NULL) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_map: (0x%llx) invalid cookie\n",
+		    ldcp->id);
+		return (EINVAL);
+	}
+
+	/* FUTURE: For now we support only one cookie per dring */
+	ASSERT(ccount == 1);
+
+	if (cookie->size < (dsize * len)) {
+		DWARN(ldcp->id,
+		    "ldc_mem_dring_map: (0x%llx) invalid dsize/len\n",
+		    ldcp->id);
+		return (EINVAL);
+	}
+
+	*dhandle = 0;
+
+	/* Allocate an dring structure */
+	dringp = kmem_zalloc(sizeof (ldc_dring_t), KM_SLEEP);
+
+	D1(ldcp->id,
+	    "ldc_mem_dring_map: 0x%x,0x%x,0x%x,0x%llx,0x%llx\n",
+	    mtype, len, dsize, cookie->addr, cookie->size);
+
+	/* Initialize dring */
+	dringp->length = len;
+	dringp->dsize = dsize;
+
+	/* round off to a multiple of page size */
+	dring_size = len * dsize;
+	dringp->size = (dring_size & MMU_PAGEMASK);
+	if (dring_size & MMU_PAGEOFFSET)
+		dringp->size += MMU_PAGESIZE;
+
+	dringp->ldcp = ldcp;
+
+	/* create a memory handle */
+	err = ldc_mem_alloc_handle(handle, &mhandle);
+	if (err || mhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_map: cannot alloc hdl err=%d\n",
+		    err);
+		kmem_free(dringp, sizeof (ldc_dring_t));
+		return (ENOMEM);
+	}
+
+	dringp->mhdl = mhandle;
+	dringp->base = NULL;
+
+	/* map the dring into local memory */
+	err = ldc_mem_map(mhandle, cookie, ccount, mtype,
+	    &(dringp->base), NULL);
+	if (err || dringp->base == NULL) {
+		cmn_err(CE_WARN,
+		    "ldc_mem_dring_map: cannot map desc ring err=%d\n", err);
+		(void) ldc_mem_free_handle(mhandle);
+		kmem_free(dringp, sizeof (ldc_dring_t));
+		return (ENOMEM);
+	}
+
+	/* initialize the desc ring lock */
+	mutex_init(&dringp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* Add descriptor ring to channel's imported dring list */
+	mutex_enter(&ldcp->imp_dlist_lock);
+	dringp->ch_next = ldcp->imp_dring_list;
+	ldcp->imp_dring_list = dringp;
+	mutex_exit(&ldcp->imp_dlist_lock);
+
+	dringp->status = LDC_MAPPED;
+
+	*dhandle = (ldc_dring_handle_t)dringp;
+
+	return (0);
+}
+
+/*
+ * Unmap a descriptor ring. Free shadow memory (if any).
+ */
+int
+ldc_mem_dring_unmap(ldc_dring_handle_t dhandle)
+{
+	ldc_dring_t 	*dringp;
+	ldc_dring_t	*tmp_dringp;
+	ldc_chan_t	*ldcp;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_unmap: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+
+	if (dringp->status != LDC_MAPPED) {
+		DWARN(DBG_ALL_LDCS,
+		    "ldc_mem_dring_unmap: not a mapped desc ring\n");
+		return (EINVAL);
+	}
+
+	mutex_enter(&dringp->lock);
+
+	ldcp = dringp->ldcp;
+
+	mutex_enter(&ldcp->imp_dlist_lock);
+
+	/* find and unlink the desc ring from channel import list */
+	tmp_dringp = ldcp->imp_dring_list;
+	if (tmp_dringp == dringp) {
+		ldcp->imp_dring_list = dringp->ch_next;
+		dringp->ch_next = NULL;
+
+	} else {
+		while (tmp_dringp != NULL) {
+			if (tmp_dringp->ch_next == dringp) {
+				tmp_dringp->ch_next = dringp->ch_next;
+				dringp->ch_next = NULL;
+				break;
+			}
+			tmp_dringp = tmp_dringp->ch_next;
+		}
+		if (tmp_dringp == NULL) {
+			DWARN(DBG_ALL_LDCS,
+			    "ldc_mem_dring_unmap: invalid descriptor\n");
+			mutex_exit(&ldcp->imp_dlist_lock);
+			mutex_exit(&dringp->lock);
+			return (EINVAL);
+		}
+	}
+
+	mutex_exit(&ldcp->imp_dlist_lock);
+
+	/* do a LDC memory handle unmap and free */
+	(void) ldc_mem_unmap(dringp->mhdl);
+	(void) ldc_mem_free_handle((ldc_mem_handle_t)dringp->mhdl);
+
+	dringp->status = 0;
+	dringp->ldcp = NULL;
+
+	mutex_exit(&dringp->lock);
+
+	/* destroy dring lock */
+	mutex_destroy(&dringp->lock);
+
+	/* free desc ring object */
+	kmem_free(dringp, sizeof (ldc_dring_t));
+
+	return (0);
+}
+
+/*
+ * Internal entry point for descriptor ring access entry consistency
+ * semantics. Acquire copies the contents of the remote descriptor ring
+ * into the local shadow copy. The release operation copies the local
+ * contents into the remote dring. The start and end locations specify
+ * bounds for the entries being synchronized.
+ */
+static int
+i_ldc_dring_acquire_release(ldc_dring_handle_t dhandle,
+    uint8_t direction, uint64_t start, uint64_t end)
+{
+	int 			err;
+	ldc_dring_t		*dringp;
+	ldc_chan_t		*ldcp;
+	uint64_t		soff;
+	size_t			copy_size;
+
+	if (dhandle == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_dring_acquire_release: invalid desc ring handle\n");
+		return (EINVAL);
+	}
+	dringp = (ldc_dring_t *)dhandle;
+	mutex_enter(&dringp->lock);
+
+	if (dringp->status != LDC_MAPPED || dringp->ldcp == NULL) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_dring_acquire_release: not a mapped desc ring\n");
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	if (start >= dringp->length || end >= dringp->length) {
+		DWARN(DBG_ALL_LDCS,
+		    "i_ldc_dring_acquire_release: index out of range\n");
+		mutex_exit(&dringp->lock);
+		return (EINVAL);
+	}
+
+	/* get the channel handle */
+	ldcp = dringp->ldcp;
+
+	copy_size = (start <= end) ? (((end - start) + 1) * dringp->dsize) :
+		((dringp->length - start) * dringp->dsize);
+
+	/* Calculate the relative offset for the first desc */
+	soff = (start * dringp->dsize);
+
+	/* copy to/from remote from/to local memory */
+	D1(ldcp->id, "i_ldc_dring_acquire_release: c1 off=0x%llx sz=0x%llx\n",
+	    soff, copy_size);
+	err = i_ldc_mem_acquire_release((ldc_mem_handle_t)dringp->mhdl,
+	    direction, soff, copy_size);
+	if (err) {
+		DWARN(ldcp->id,
+		    "i_ldc_dring_acquire_release: copy failed\n");
+		mutex_exit(&dringp->lock);
+		return (err);
+	}
+
+	/* do the balance */
+	if (start > end) {
+		copy_size = ((end + 1) * dringp->dsize);
+		soff = 0;
+
+		/* copy to/from remote from/to local memory */
+		D1(ldcp->id, "i_ldc_dring_acquire_release: c2 "
+		    "off=0x%llx sz=0x%llx\n", soff, copy_size);
+		err = i_ldc_mem_acquire_release((ldc_mem_handle_t)dringp->mhdl,
+		    direction, soff, copy_size);
+		if (err) {
+			DWARN(ldcp->id,
+			    "i_ldc_dring_acquire_release: copy failed\n");
+			mutex_exit(&dringp->lock);
+			return (err);
+		}
+	}
+
+	mutex_exit(&dringp->lock);
+
+	return (0);
+}
+
+/*
+ * Ensure that the contents in the local dring are consistent
+ * with the contents of the remote dring
+ */
+int
+ldc_mem_dring_acquire(ldc_dring_handle_t dhandle, uint64_t start, uint64_t end)
+{
+	return (i_ldc_dring_acquire_release(dhandle, LDC_COPY_IN, start, end));
+}
+
+/*
+ * Ensure that the contents in the remote dring are consistent
+ * with the contents of the local dring
+ */
+int
+ldc_mem_dring_release(ldc_dring_handle_t dhandle, uint64_t start, uint64_t end)
+{
+	return (i_ldc_dring_acquire_release(dhandle, LDC_COPY_OUT, start, end));
+}
+
+
+/* ------------------------------------------------------------------------- */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/mdeg.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,914 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * MD Event Generator (MDEG) Module
+ */
+
+#include <sys/machsystm.h>
+#include <sys/taskq.h>
+#include <sys/disp.h>
+#include <sys/cmn_err.h>
+#include <sys/note.h>
+
+#include <sys/mdeg.h>
+#include <sys/mach_descrip.h>
+#include <sys/mdesc.h>
+
+/*
+ * A single client registration
+ */
+typedef struct mdeg_clnt {
+	boolean_t		valid;		/* structure is in active use */
+	mdeg_node_match_t	*nmatch;	/* node match filter */
+	mdeg_node_spec_t	*pspec;		/* parent match filter */
+	mdeg_cb_t		cb;		/* the client callback */
+	caddr_t			cb_arg;		/* argument to the callback */
+	uint64_t		magic;		/* sanity checking magic */
+	mdeg_handle_t		hdl;		/* handle assigned by MDEG */
+} mdeg_clnt_t;
+
+/*
+ * Global MDEG data
+ *
+ * Locking Strategy:
+ *
+ *   mdeg.lock - lock used to synchronize system wide MD updates. An
+ *	MD update must be treated as an atomic event. The lock is
+ *	taken when notification that a new MD is available and held
+ *	until all clients have been notified.
+ *
+ *   mdeg.rwlock - lock used to synchronize access to the table of
+ *	registered clients. The reader lock must be held when looking
+ *	up client information in the table. The writer lock must be
+ *	held when modifying any client information.
+ */
+static struct mdeg {
+	taskq_t 	*taskq;		/* for internal processing */
+	boolean_t	enabled;	/* enable/disable taskq processing */
+	kmutex_t	lock;		/* synchronize MD updates */
+	md_t		*md_prev;	/* previous MD */
+	md_t		*md_curr;	/* current MD */
+	mdeg_clnt_t	*tbl;		/* table of registered clients */
+	krwlock_t	rwlock;		/* client table lock */
+	uint_t		maxclnts;	/* client table size */
+	uint_t		nclnts;		/* current number of clients */
+} mdeg;
+
+/*
+ * Debugging routines
+ */
+#ifdef DEBUG
+uint_t mdeg_debug = 0x0;
+
+static void mdeg_dump_clnt(mdeg_clnt_t *clnt);
+static void mdeg_dump_table(void);
+
+#define	MDEG_DBG		if (mdeg_debug) printf
+#define	MDEG_DUMP_CLNT		mdeg_dump_clnt
+#define	MDEG_DUMP_TABLE		mdeg_dump_table
+
+#else /* DEBUG */
+
+#define	MDEG_DBG		_NOTE(CONSTCOND) if (0) printf
+#define	MDEG_DUMP_CLNT
+#define	MDEG_DUMP_TABLE()
+
+#endif /* DEBUG */
+
+/*
+ * Global constants
+ */
+#define	MDEG_MAX_TASKQ_THR	512	/* maximum number of taskq threads */
+#define	MDEG_MAX_CLNTS_INIT	64	/* initial client table size */
+
+#define	MDEG_MAGIC		0x4D4445475F48444Cull	/* 'MDEG_HDL' */
+
+/*
+ * A client handle is a 64 bit value with two pieces of
+ * information encoded in it. The upper 32 bits are the
+ * index into the table of a particular client structure.
+ * The lower 32 bits are a counter that is incremented
+ * each time a client structure is reused.
+ */
+#define	MDEG_IDX_SHIFT			32
+#define	MDEG_COUNT_MASK			0xfffffffful
+
+#define	MDEG_ALLOC_HDL(_idx, _count)	(((uint64_t)_idx << MDEG_IDX_SHIFT) | \
+					((uint64_t)(_count + 1) &	      \
+					MDEG_COUNT_MASK))
+#define	MDEG_HDL2IDX(hdl)		(hdl >> MDEG_IDX_SHIFT)
+#define	MDEG_HDL2COUNT(hdl)		(hdl & MDEG_COUNT_MASK)
+
+static const char trunc_str[] = " ... }";
+
+/*
+ * Utility routines
+ */
+static mdeg_clnt_t *mdeg_alloc_clnt(void);
+static void mdeg_notify_client(void *);
+static mde_cookie_t mdeg_find_start_node(md_t *, mdeg_node_spec_t *);
+static boolean_t mdeg_node_spec_match(md_t *, mde_cookie_t, mdeg_node_spec_t *);
+static void mdeg_get_diff_results(md_diff_cookie_t, mdeg_result_t *);
+
+int
+mdeg_init(void)
+{
+	int	tblsz;
+
+	/*
+	 * Grab the current MD
+	 */
+	if ((mdeg.md_curr = md_get_handle()) == NULL) {
+		cmn_err(CE_WARN, "unable to cache snapshot of MD");
+		return (-1);
+	}
+
+	/*
+	 * Initialize table of registered clients
+	 */
+	mdeg.maxclnts = MDEG_MAX_CLNTS_INIT;
+
+	tblsz = mdeg.maxclnts * sizeof (mdeg_clnt_t);
+	mdeg.tbl = kmem_zalloc(tblsz, KM_SLEEP);
+
+	rw_init(&mdeg.rwlock, NULL, RW_DRIVER, NULL);
+
+	mdeg.nclnts = 0;
+
+	/*
+	 * Initialize global lock
+	 */
+	mutex_init(&mdeg.lock, NULL, MUTEX_DRIVER, NULL);
+
+	/*
+	 * Initialize the task queue
+	 */
+	mdeg.taskq = taskq_create("mdeg_taskq", 1, minclsyspri, 1,
+	    MDEG_MAX_TASKQ_THR, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+	/* ready to begin handling clients */
+	mdeg.enabled = B_TRUE;
+
+	return (0);
+}
+
+void
+mdeg_fini(void)
+{
+	/*
+	 * Flip the enabled switch off to make sure that
+	 * no events get dispatched while things are being
+	 * torn down.
+	 */
+	mdeg.enabled = B_FALSE;
+
+	/* destroy the task queue */
+	taskq_destroy(mdeg.taskq);
+
+	/*
+	 * Deallocate the table of registered clients
+	 */
+	kmem_free(mdeg.tbl, mdeg.maxclnts * sizeof (mdeg_clnt_t));
+	rw_destroy(&mdeg.rwlock);
+
+	/*
+	 * Free up the cached MDs.
+	 */
+	if (mdeg.md_curr)
+		(void) md_fini_handle(mdeg.md_curr);
+
+	if (mdeg.md_prev)
+		(void) md_fini_handle(mdeg.md_prev);
+
+	mutex_destroy(&mdeg.lock);
+}
+
+static mdeg_clnt_t *
+mdeg_alloc_clnt(void)
+{
+	mdeg_clnt_t	*clnt;
+	int		idx;
+	mdeg_clnt_t	*newtbl;
+	uint_t		newmaxclnts;
+	uint_t		newtblsz;
+	uint_t		oldtblsz;
+
+	ASSERT(RW_WRITE_HELD(&mdeg.rwlock));
+
+	/* search for an unused slot in the table */
+	for (idx = 0; idx < mdeg.maxclnts; idx++) {
+		clnt = &mdeg.tbl[idx];
+		if (!clnt->valid) {
+			break;
+		}
+	}
+
+	/* found an empty slot */
+	if (idx != mdeg.maxclnts) {
+		goto found;
+	}
+
+	/*
+	 * There was no free space in the table. Grow
+	 * the table to double its current size.
+	 */
+
+	MDEG_DBG("client table full:\n");
+	MDEG_DUMP_TABLE();
+
+	newmaxclnts = mdeg.maxclnts * 2;
+	newtblsz = newmaxclnts * sizeof (mdeg_clnt_t);
+
+	newtbl = kmem_zalloc(newtblsz, KM_SLEEP);
+
+	/* copy old table data to the new table */
+	oldtblsz = mdeg.maxclnts * sizeof (mdeg_clnt_t);
+	bcopy(mdeg.tbl, newtbl, oldtblsz);
+
+	/*
+	 * Since the old table was full, the first free entry
+	 * will be just past the end of the old table. Point
+	 * into the new table; the old table is about to be freed.
+	 */
+	clnt = &newtbl[mdeg.maxclnts];
+
+	/* clean up the old table */
+	kmem_free(mdeg.tbl, oldtblsz);
+	mdeg.tbl = newtbl;
+	mdeg.maxclnts = newmaxclnts;
+
+found:
+	ASSERT(clnt->valid == 0);
+
+	clnt->hdl = MDEG_ALLOC_HDL(idx, MDEG_HDL2COUNT(clnt->hdl));
+
+	return (clnt);
+}
+
+static mdeg_clnt_t *
+mdeg_get_client(mdeg_handle_t hdl)
+{
+	int		idx;
+	mdeg_clnt_t	*clnt;
+
+	idx = MDEG_HDL2IDX(hdl);
+
+	/* check if index is out of bounds */
+	if ((idx < 0) || (idx >= mdeg.maxclnts)) {
+		MDEG_DBG("mdeg_get_client: index out of bounds\n");
+		return (NULL);
+	}
+
+	clnt = &mdeg.tbl[idx];
+
+	/* check for a valid client */
+	if (!clnt->valid) {
+		MDEG_DBG("mdeg_get_client: client is not valid\n");
+		return (NULL);
+	}
+
+	/* make sure the handle is an exact match */
+	if (clnt->hdl != hdl) {
+		MDEG_DBG("mdeg_get_client: bad handle\n");
+		return (NULL);
+	}
+
+	if (clnt->magic != MDEG_MAGIC) {
+		MDEG_DBG("mdeg_get_client: bad magic\n");
+		return (NULL);
+	}
+
+	return (clnt);
+}
+
+/*
+ * Send a notification to a client immediately after it registers.
+ * The result_t is a list of all the nodes that match their specified
+ * nodes of interest, all returned on the added list. This serves
+ * as a base of reference to the client. All future MD updates are
+ * relative to this list.
+ */
+static int
+mdeg_notify_client_reg(mdeg_clnt_t *clnt)
+{
+	md_t			*mdp = NULL;
+	mde_str_cookie_t	nname;
+	mde_str_cookie_t	aname;
+	mde_cookie_t		startnode;
+	int			nnodes;
+	int			nodechk;
+	mde_cookie_t		*listp = NULL;
+	mdeg_result_t		*mdeg_res = NULL;
+	int			rv = MDEG_SUCCESS;
+
+	mutex_enter(&mdeg.lock);
+
+	/*
+	 * Handle the special case where the node specification
+	 * is NULL. In this case, call the client callback without
+	 * any results. All processing is left to the client.
+	 */
+	if (clnt->pspec == NULL) {
+		/* call the client callback */
+		(*clnt->cb)(clnt->cb_arg, NULL);
+		goto done;
+	}
+
+	if ((mdp = md_get_handle()) == NULL) {
+		cmn_err(CE_WARN, "unable to retrieve current MD");
+		rv = MDEG_FAILURE;
+		goto done;
+	}
+
+	startnode = mdeg_find_start_node(mdp, clnt->pspec);
+	if (startnode == MDE_INVAL_ELEM_COOKIE) {
+		/* not much we can do */
+		cmn_err(CE_WARN, "unable to match node specifier");
+		rv = MDEG_FAILURE;
+		goto done;
+	}
+
+	/*
+	 * Use zalloc to provide correct default values for the
+	 * unused removed, match_prev, and match_curr lists.
+	 */
+	mdeg_res = kmem_zalloc(sizeof (mdeg_result_t), KM_SLEEP);
+
+	nname = md_find_name(mdp, clnt->nmatch->namep);
+	aname = md_find_name(mdp, "fwd");
+
+	nnodes = md_scan_dag(mdp, startnode, nname, aname, NULL);
+
+	if (nnodes == 0) {
+		MDEG_DBG("mdeg_notify_client_reg: no nodes of interest\n");
+		rv = MDEG_SUCCESS;
+		goto done;
+	} else if (nnodes == -1) {
+		MDEG_DBG("error scanning DAG\n");
+		rv = MDEG_FAILURE;
+		goto done;
+	}
+
+	MDEG_DBG("mdeg_notify_client_reg: %d node%s of interest\n",
+	    nnodes, (nnodes == 1) ? "" : "s");
+
+	/* get the list of nodes of interest */
+	listp = kmem_alloc(sizeof (mde_cookie_t) * nnodes, KM_SLEEP);
+	nodechk = md_scan_dag(mdp, startnode, nname, aname, listp);
+
+	ASSERT(nodechk == nnodes);
+
+	mdeg_res->added.mdp = mdp;
+	mdeg_res->added.mdep = listp;
+	mdeg_res->added.nelem = nnodes;
+
+	/* call the client callback */
+	(*clnt->cb)(clnt->cb_arg, mdeg_res);
+
+done:
+	mutex_exit(&mdeg.lock);
+
+	if (mdp)
+		(void) md_fini_handle(mdp);
+
+	if (listp)
+		kmem_free(listp, sizeof (mde_cookie_t) * nnodes);
+
+	if (mdeg_res)
+		kmem_free(mdeg_res, sizeof (mdeg_result_t));
+
+	return (rv);
+}
+
+/*
+ * Register to receive an event notification when the system
+ * machine description is updated.
+ *
+ * Passing NULL for the node specification parameter is valid
+ * as long as the match specification is also NULL. In this
+ * case, the client will receive a notification when the MD
+ * has been updated, but the callback will not include any
+ * information. The client is then responsible for obtaining
+ * its own copy of the system MD and performing any processing
+ * manually.
+ */
+int
+mdeg_register(mdeg_node_spec_t *pspecp, mdeg_node_match_t *nmatchp,
+    mdeg_cb_t cb, void *cb_arg, mdeg_handle_t *hdlp)
+{
+	mdeg_clnt_t	*clnt;
+
+	/*
+	 * If the RW lock is held, a client is calling
+	 * register from its own callback.
+	 */
+	if (RW_LOCK_HELD(&mdeg.rwlock)) {
+		MDEG_DBG("mdeg_register: rwlock already held\n");
+		return (MDEG_FAILURE);
+	}
+
+	/* node spec and node match must both be valid, or both NULL */
+	if (((pspecp != NULL) && (nmatchp == NULL)) ||
+	    ((pspecp == NULL) && (nmatchp != NULL))) {
+		MDEG_DBG("mdeg_register: invalid parameters\n");
+		return (MDEG_FAILURE);
+	}
+
+	rw_enter(&mdeg.rwlock, RW_WRITER);
+
+	clnt = mdeg_alloc_clnt();
+
+	ASSERT(clnt);
+
+	/*
+	 * Fill in the rest of the data
+	 */
+	clnt->nmatch = nmatchp;
+	clnt->pspec = pspecp;
+	clnt->cb = cb;
+	clnt->cb_arg = cb_arg;
+	clnt->magic = MDEG_MAGIC;
+
+	/* do this last */
+	clnt->valid = B_TRUE;
+
+	MDEG_DBG("client registered (0x%lx):\n", clnt->hdl);
+	MDEG_DUMP_CLNT(clnt);
+
+	mdeg.nclnts++;
+
+	if (mdeg_notify_client_reg(clnt) != MDEG_SUCCESS) {
+		bzero(clnt, sizeof (mdeg_clnt_t));
+		rw_exit(&mdeg.rwlock);
+		return (MDEG_FAILURE);
+	}
+
+	rw_exit(&mdeg.rwlock);
+
+	*hdlp = clnt->hdl;
+
+	return (MDEG_SUCCESS);
+}
+
+int
+mdeg_unregister(mdeg_handle_t hdl)
+{
+	mdeg_clnt_t	*clnt;
+	mdeg_handle_t	mdh;
+
+	/*
+	 * If the RW lock is held, a client is calling
+	 * unregister from its own callback.
+	 */
+	if (RW_LOCK_HELD(&mdeg.rwlock)) {
+		MDEG_DBG("mdeg_unregister: rwlock already held\n");
+		return (MDEG_FAILURE);
+	}
+
+	/* lookup the client */
+	if ((clnt = mdeg_get_client(hdl)) == NULL) {
+		return (MDEG_FAILURE);
+	}
+
+	rw_enter(&mdeg.rwlock, RW_WRITER);
+
+	MDEG_DBG("client unregistered (0x%lx):\n", hdl);
+	MDEG_DUMP_CLNT(clnt);
+
+	/* save the handle to prevent reuse */
+	mdh = clnt->hdl;
+	bzero(clnt, sizeof (mdeg_clnt_t));
+
+	clnt->hdl = mdh;
+
+	mdeg.nclnts--;
+
+	rw_exit(&mdeg.rwlock);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Simple algorithm for now, grab the global lock and let all
+ * the clients update themselves in parallel. There is a lot of
+ * room for improvement here. We could eliminate some scans of
+ * the DAG by incrementally scanning at lower levels of the DAG
+ * rather than having each client start its own scan from the root.
+ */
+void
+mdeg_notify_clients(void)
+{
+	md_t		*md_new;
+	mdeg_clnt_t	*clnt;
+	int		idx;
+	int		nclnt;
+
+	rw_enter(&mdeg.rwlock, RW_READER);
+	mutex_enter(&mdeg.lock);
+
+	/*
+	 * Rotate the MDs
+	 */
+	if ((md_new = md_get_handle()) == NULL) {
+		cmn_err(CE_WARN, "unable to retrieve new MD");
+		goto done;
+	}
+
+	if (mdeg.md_prev) {
+		(void) md_fini_handle(mdeg.md_prev);
+	}
+
+	mdeg.md_prev = mdeg.md_curr;
+	mdeg.md_curr = md_new;
+
+	if (mdeg.nclnts == 0) {
+		MDEG_DBG("mdeg_notify_clients: no clients registered\n");
+		goto done;
+	}
+
+	/* dispatch the update notification to all clients */
+	for (idx = 0, nclnt = 0; idx < mdeg.maxclnts; idx++) {
+		clnt = &mdeg.tbl[idx];
+
+		if (!clnt->valid)
+			continue;
+
+		MDEG_DBG("notifying client 0x%lx (%d/%d)\n", clnt->hdl,
+		    ++nclnt, mdeg.nclnts);
+
+		(void) taskq_dispatch(mdeg.taskq, mdeg_notify_client,
+		    (void *)clnt, TQ_SLEEP);
+	}
+
+	taskq_wait(mdeg.taskq);
+
+done:
+	mutex_exit(&mdeg.lock);
+	rw_exit(&mdeg.rwlock);
+}
+
+static void
+mdeg_notify_client(void *arg)
+{
+	mdeg_clnt_t		*clnt = (mdeg_clnt_t *)arg;
+	md_diff_cookie_t	mdd = MD_INVAL_DIFF_COOKIE;
+	mdeg_result_t		mdeg_res;
+	mde_cookie_t		md_prev_start;
+	mde_cookie_t		md_curr_start;
+
+	rw_enter(&mdeg.rwlock, RW_READER);
+
+	if (!mdeg.enabled) {
+		/* trying to shutdown */
+		MDEG_DBG("mdeg_notify_client: mdeg disabled, aborting\n");
+		goto cleanup;
+	}
+
+	/*
+	 * Handle the special case where the node specification
+	 * is NULL. In this case, call the client callback without
+	 * any results. All processing is left to the client.
+	 */
+	if (clnt->pspec == NULL) {
+		/* call the client callback */
+		(*clnt->cb)(clnt->cb_arg, NULL);
+
+		MDEG_DBG("MDEG client callback done\n");
+		goto cleanup;
+	}
+
+	/* find our start nodes */
+	md_prev_start = mdeg_find_start_node(mdeg.md_prev, clnt->pspec);
+	if (md_prev_start == MDE_INVAL_ELEM_COOKIE) {
+		goto cleanup;
+	}
+
+	md_curr_start = mdeg_find_start_node(mdeg.md_curr, clnt->pspec);
+	if (md_curr_start == MDE_INVAL_ELEM_COOKIE) {
+		goto cleanup;
+	}
+
+	/* diff the MDs */
+	mdd = md_diff_init(mdeg.md_prev, md_prev_start, mdeg.md_curr,
+	    md_curr_start, clnt->nmatch->namep, clnt->nmatch->matchp);
+
+	if (mdd == MD_INVAL_DIFF_COOKIE) {
+		MDEG_DBG("unable to diff MDs\n");
+		goto cleanup;
+	}
+
+	/*
+	 * Cache the results of the diff
+	 */
+	mdeg_get_diff_results(mdd, &mdeg_res);
+
+	/* call the client callback */
+	(*clnt->cb)(clnt->cb_arg, &mdeg_res);
+
+	MDEG_DBG("MDEG client callback done\n");
+
+cleanup:
+	rw_exit(&mdeg.rwlock);
+
+	if (mdd != MD_INVAL_DIFF_COOKIE)
+		(void) md_diff_fini(mdd);
+}
+
+static mde_cookie_t
+mdeg_find_start_node(md_t *md, mdeg_node_spec_t *nspec)
+{
+	mde_cookie_t		*nodesp;
+	mde_str_cookie_t	nname;
+	mde_str_cookie_t	aname;
+	int			nnodes;
+	int			idx;
+
+	if ((md == NULL) || (nspec == NULL))
+		return (MDE_INVAL_ELEM_COOKIE);
+
+	nname = md_find_name(md, nspec->namep);
+	aname = md_find_name(md, "fwd");
+
+	nnodes = md_scan_dag(md, NULL, nname, aname, NULL);
+	if (nnodes == 0)
+		return (MDE_INVAL_ELEM_COOKIE);
+
+	nodesp = kmem_alloc(sizeof (mde_cookie_t) * nnodes, KM_SLEEP);
+
+	(void) md_scan_dag(md, NULL, nname, aname, nodesp);
+
+	for (idx = 0; idx < nnodes; idx++) {
+
+		if (mdeg_node_spec_match(md, nodesp[idx], nspec)) {
+			mde_cookie_t res = nodesp[idx];
+
+			kmem_free(nodesp, sizeof (mde_cookie_t) * nnodes);
+			return (res);
+		}
+	}
+
+	kmem_free(nodesp, sizeof (mde_cookie_t) * nnodes);
+	return (MDE_INVAL_ELEM_COOKIE);
+}
+
+static boolean_t
+mdeg_node_spec_match(md_t *md, mde_cookie_t node, mdeg_node_spec_t *nspec)
+{
+	mdeg_prop_spec_t	*prop;
+
+	ASSERT(md && nspec);
+	ASSERT(node != MDE_INVAL_ELEM_COOKIE);
+
+	prop = nspec->specp;
+
+	while (prop->type != MDET_LIST_END) {
+
+		switch (prop->type) {
+		case MDET_PROP_VAL: {
+			uint64_t val;
+
+			if (md_get_prop_val(md, node, prop->namep, &val) != 0)
+				return (B_FALSE);
+
+			if (prop->ps_val != val)
+				return (B_FALSE);
+
+			break;
+		}
+		case MDET_PROP_STR: {
+			char	*str;
+
+			if (md_get_prop_str(md, node, prop->namep, &str) != 0)
+				return (B_FALSE);
+
+			if (strcmp(prop->ps_str, str) != 0)
+				return (B_FALSE);
+
+			break;
+		}
+
+		default:
+			return (B_FALSE);
+		}
+
+		prop++;
+	}
+
+	return (B_TRUE);
+}
+
+static void
+mdeg_get_diff_results(md_diff_cookie_t mdd, mdeg_result_t *res)
+{
+	/*
+	 * Cache added nodes.
+	 */
+	res->added.mdp = mdeg.md_curr;
+	res->added.nelem = md_diff_added(mdd, &(res->added.mdep));
+
+	if (res->added.nelem == -1) {
+		bzero(&(res->added), sizeof (mdeg_diff_t));
+	}
+
+	/*
+	 * Cache removed nodes.
+	 */
+	res->removed.mdp = mdeg.md_prev;
+	res->removed.nelem = md_diff_removed(mdd, &(res->removed.mdep));
+
+	if (res->removed.nelem == -1) {
+		bzero(&(res->removed), sizeof (mdeg_diff_t));
+	}
+
+	/*
+	 * Cache matching node pairs.
+	 */
+	res->match_curr.mdp = mdeg.md_curr;
+	res->match_prev.mdp = mdeg.md_prev;
+	res->match_curr.nelem = md_diff_matched(mdd, &(res->match_prev.mdep),
+	    &(res->match_curr.mdep));
+	res->match_prev.nelem = res->match_curr.nelem;
+
+	if (res->match_prev.nelem == -1) {
+		bzero(&(res->match_prev), sizeof (mdeg_diff_t));
+		bzero(&(res->match_curr), sizeof (mdeg_diff_t));
+	}
+}
+
+#ifdef DEBUG
+/*
+ * Generate a string that represents the node specifier
+ * structure. Clamp the string length if the specifier
+ * structure contains too much information.
+ *
+ *	General form:
+ *
+ *		<nodename>:{<propname>=<propval>,...}
+ *	e.g.
+ *		vdevice:{name=vsw,reg=0x0}
+ */
+static void
+mdeg_spec_str(mdeg_node_spec_t *spec, char *buf, int len)
+{
+	mdeg_prop_spec_t	*prop;
+	int			offset;
+	boolean_t		first = B_TRUE;
+	char			*end = buf + len;
+
+	offset = snprintf(buf, len, "%s:{", spec->namep);
+
+	buf += offset;
+	len -= offset;
+	if (len <= 0)
+		goto trunc;
+
+	prop = spec->specp;
+
+	while (prop->type != MDET_LIST_END) {
+
+		switch (prop->type) {
+		case MDET_PROP_VAL:
+			offset = snprintf(buf, len, "%s%s=0x%lx",
+			    (first) ? "" : ",", prop->namep, prop->ps_val);
+			buf += offset;
+			len -= offset;
+			if (len <= 0)
+				goto trunc;
+			break;
+
+		case MDET_PROP_STR:
+			offset = snprintf(buf, len, "%s%s=%s",
+			    (first) ? "" : ",", prop->namep, prop->ps_str);
+			buf += offset;
+			len -= offset;
+			if (len <= 0)
+				goto trunc;
+			break;
+
+		default:
+			(void) snprintf(buf, len, "}");
+			return;
+		}
+
+		if (first)
+			first = B_FALSE;
+		prop++;
+	}
+
+	(void) snprintf(buf, len, "}");
+	return;
+
+trunc:
+	/* string too long, truncate it */
+	buf = end - (strlen(trunc_str) + 1);
+	(void) sprintf(buf, trunc_str);
+}
+
+/*
+ * Generate a string that represents the match structure.
+ * Clamp the string length if the match structure contains
+ * too much information.
+ *
+ *	General form:
+ *
+ *		<nodename>:{<propname>,...}
+ *	e.g.
+ *		nmatch=vport:{reg}
+ */
+static void
+mdeg_match_str(mdeg_node_match_t *match, char *buf, int len)
+{
+	md_prop_match_t	*prop;
+	int		offset;
+	boolean_t	first = B_TRUE;
+	char		*end = buf + len;
+
+	offset = snprintf(buf, len, "%s:{", match->namep);
+
+	buf += offset;
+	len -= offset;
+	if (len <= 0)
+		goto trunc;
+
+	prop = match->matchp;
+
+	while (prop->type != MDET_LIST_END) {
+		offset = snprintf(buf, len, "%s%s", (first) ? "" : ",",
+		    prop->namep);
+		buf += offset;
+		len -= offset;
+		if (len <= 0)
+			goto trunc;
+
+		if (first)
+			first = B_FALSE;
+		prop++;
+	}
+
+	(void) snprintf(buf, len, "}");
+	return;
+
+trunc:
+	/* string too long, truncate it */
+	buf = end - (strlen(trunc_str) + 1);
+	(void) sprintf(buf, trunc_str);
+}
+
+#define	MAX_FIELD_STR	80
+
+static void
+mdeg_dump_clnt(mdeg_clnt_t *clnt)
+{
+	char	str[MAX_FIELD_STR];
+
+	if (!clnt->valid) {
+		MDEG_DBG("  valid=B_FALSE\n");
+		return;
+	}
+
+	mdeg_spec_str(clnt->pspec, str, MAX_FIELD_STR);
+	MDEG_DBG("  pspecp=%s\n", str);
+
+	mdeg_match_str(clnt->nmatch, str, MAX_FIELD_STR);
+	MDEG_DBG("  nmatch=%s\n", str);
+}
+
+static void
+mdeg_dump_table(void)
+{
+	int		idx;
+	mdeg_clnt_t	*clnt;
+
+	for (idx = 0; idx < mdeg.maxclnts; idx++) {
+		clnt = &(mdeg.tbl[idx]);
+
+		MDEG_DBG("client %d (0x%lx):\n", idx, clnt->hdl);
+		mdeg_dump_clnt(clnt);
+	}
+}
+#endif /* DEBUG */
--- a/usr/src/uts/sun4v/io/mdesc.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/io/mdesc.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -55,23 +54,29 @@
  * Operational state flags
  */
 
-#define	MDESC_DIDMINOR	0x2		/* Created minors */
-#define	MDESC_DIDMUTEX	0x8		/* Created mutex */
-#define	MDESC_DIDCV	0x10		/* Created cv */
-#define	MDESC_BUSY	0x20		/* Device is busy */
+#define	MDESC_GOT_HANDLE	0x10		/* Got mdesc handle */
+#define	MDESC_BUSY		0x20		/* Device is busy */
 
-static void *mdesc_state_head;
+static void		*mdesc_state_head;
+static vmem_t		*mdesc_minor;
+static uint16_t 	mdesc_max_opens = 256;
+static uint16_t		mdesc_opens = 0;
+static int		mdesc_attached = 0;
+static dev_info_t	*mdesc_devi;
+static kmutex_t		mdesc_lock;
 
 struct mdesc_state {
 	int		instance;
-	dev_info_t	*devi;
+	dev_t		dev;
 	kmutex_t	lock;
 	kcondvar_t	cv;
 	size_t		mdesc_len;
-	uint8_t		*mdesc;
+	md_t		*mdesc;
 	int		flags;
 };
 
+typedef struct mdesc_state mdesc_state_t;
+
 static int mdesc_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
 static int mdesc_attach(dev_info_t *, ddi_attach_cmd_t);
 static int mdesc_detach(dev_info_t *, ddi_detach_cmd_t);
@@ -129,19 +134,13 @@
 };
 
 
-
-
-
-
-
-
 int
 _init(void)
 {
 	int retval;
 
 	if ((retval = ddi_soft_state_init(&mdesc_state_head,
-	    sizeof (struct mdesc_state), 1)) != 0)
+	    sizeof (struct mdesc_state), mdesc_max_opens)) != 0)
 		return (retval);
 	if ((retval = mod_install(&modlinkage)) != 0) {
 		ddi_soft_state_fini(&mdesc_state_head);
@@ -189,9 +188,10 @@
 
 	switch (cmd) {
 	case DDI_INFO_DEVT2DEVINFO:
-		if ((mdsp = ddi_get_soft_state(mdesc_state_head,
-		    getminor((dev_t)arg))) != NULL) {
-			*resultp = mdsp->devi;
+		mdsp = ddi_get_soft_state(mdesc_state_head,
+		    getminor((dev_t)arg));
+		if (mdsp != NULL) {
+			*resultp = mdesc_devi;
 			retval = DDI_SUCCESS;
 		} else
 			*resultp = NULL;
@@ -212,47 +212,23 @@
 mdesc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
 {
 	int instance = ddi_get_instance(dip);
-	struct mdesc_state *mdsp;
 
 	switch (cmd) {
 	case DDI_ATTACH:
-		if (ddi_soft_state_zalloc(mdesc_state_head, instance) !=
-		    DDI_SUCCESS) {
-			cmn_err(CE_WARN, "%s@%d: Unable to allocate state",
-			    MDESC_NAME, instance);
-			return (DDI_FAILURE);
-		}
-		if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) ==
-		    NULL) {
-			cmn_err(CE_WARN, "%s@%d: Unable to obtain state",
-			    MDESC_NAME, instance);
-			ddi_soft_state_free(dip, instance);
-			return (DDI_FAILURE);
-		}
+
 		if (ddi_create_minor_node(dip, MDESC_NAME, S_IFCHR, instance,
 		    DDI_PSEUDO, 0) != DDI_SUCCESS) {
 			cmn_err(CE_WARN, "%s@%d: Unable to create minor node",
 			    MDESC_NAME, instance);
-			(void) mdesc_detach(dip, DDI_DETACH);
 			return (DDI_FAILURE);
 		}
-		mdsp->flags |= MDESC_DIDMINOR;
-
-		mdsp->instance = instance;
-		mdsp->devi = dip;
-
-		mutex_init(&mdsp->lock, NULL, MUTEX_DRIVER, NULL);
-		mdsp->flags |= MDESC_DIDMUTEX;
-
-		cv_init(&mdsp->cv, NULL, CV_DRIVER, NULL);
-		mdsp->flags |= MDESC_DIDCV;
-
-			/* point the driver at the kernel's copy of the data */
-		mdsp->mdesc = (uint8_t *)machine_descrip.va;
-		mdsp->mdesc_len = (machine_descrip.va != NULL) ?
-		    machine_descrip.size : 0;
-
 		ddi_report_dev(dip);
+		mdesc_devi = dip;
+		mdesc_minor = vmem_create("mdesc_minor", (void *) 1,
+		    mdesc_max_opens, 1, NULL, NULL, NULL, 0,
+		    VM_SLEEP | VMC_IDENTIFIER);
+		mutex_init(&mdesc_lock, NULL, MUTEX_DRIVER, NULL);
+		mdesc_attached = 1;
 		return (DDI_SUCCESS);
 	case DDI_RESUME:
 		return (DDI_SUCCESS);
@@ -261,27 +237,16 @@
 	}
 }
 
-
-
+/*ARGSUSED*/
 static int
 mdesc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
 {
-	int instance = ddi_get_instance(dip);
-	struct mdesc_state *mdsp;
-
 	switch (cmd) {
 	case DDI_DETACH:
-		mdsp = ddi_get_soft_state(mdesc_state_head, instance);
-		if (mdsp != NULL) {
-			ASSERT(!(mdsp->flags & MDESC_BUSY));
-			if (mdsp->flags & MDESC_DIDCV)
-				cv_destroy(&mdsp->cv);
-			if (mdsp->flags & MDESC_DIDMUTEX)
-				mutex_destroy(&mdsp->lock);
-			if (mdsp->flags & MDESC_DIDMINOR)
-				ddi_remove_minor_node(dip, NULL);
-		}
-		ddi_soft_state_free(mdesc_state_head, instance);
+		mutex_destroy(&mdesc_lock);
+		vmem_destroy(mdesc_minor);
+		ddi_remove_minor_node(mdesc_devi, NULL);
+		mdesc_attached = 0;
 		return (DDI_SUCCESS);
 
 	case DDI_SUSPEND:
@@ -292,29 +257,108 @@
 	}
 }
 
+static void
+mdesc_destroy_state(mdesc_state_t *mdsp)
+{
+	minor_t minor = getminor(mdsp->dev);
+
+	if (mdsp->flags & MDESC_GOT_HANDLE)
+		(void) md_fini_handle(mdsp->mdesc);
+
+	cv_destroy(&mdsp->cv);
+	mutex_destroy(&mdsp->lock);
+	ddi_soft_state_free(mdesc_state_head, minor);
+	vmem_free(mdesc_minor, (void *)(uintptr_t)minor, 1);
+}
+
+static mdesc_state_t *
+mdesc_create_state(dev_t *devp)
+{
+	major_t	major;
+	minor_t	minor;
+	mdesc_state_t *mdsp;
+
+	minor = (minor_t)(uintptr_t)vmem_alloc(mdesc_minor, 1,
+	    VM_BESTFIT | VM_SLEEP);
+
+	if (ddi_soft_state_zalloc(mdesc_state_head, minor) !=
+	    DDI_SUCCESS) {
+		cmn_err(CE_WARN, "%s@%d: Unable to allocate state",
+		    MDESC_NAME, minor);
+		vmem_free(mdesc_minor, (void *)(uintptr_t)minor, 1);
+		return (NULL);
+	}
+
+	mdsp = ddi_get_soft_state(mdesc_state_head, minor);
+
+	if (devp != NULL) {
+		major = getemajor(*devp);
+	} else {
+		major = ddi_driver_major(mdesc_devi);
+	}
+
+	mdsp->dev = makedevice(major, minor);
+
+	if (devp != NULL)
+		*devp = mdsp->dev;
+
+	mdsp->instance = minor;
+
+	mutex_init(&mdsp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	cv_init(&mdsp->cv, NULL, CV_DRIVER, NULL);
+
+	mdsp->mdesc = md_get_handle();
+
+	if (mdsp->mdesc == NULL) {
+		mdesc_destroy_state(mdsp);
+		return (NULL);
+	}
+	mdsp->flags |= MDESC_GOT_HANDLE;
+
+	mdsp->mdesc_len = md_get_bin_size(mdsp->mdesc);
+
+	if (mdsp->mdesc_len == 0) {
+		mdesc_destroy_state(mdsp);
+		mdsp = NULL;
+	}
+
+	return (mdsp);
+}
 
 
 /*ARGSUSED*/
 static int
 mdesc_open(dev_t *devp, int flag, int otyp, cred_t *credp)
 {
-	int instance = getminor(*devp);
 	struct mdesc_state *mdsp;
 
-	if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) == NULL)
-		return (ENXIO);
-
-	ASSERT(mdsp->instance == instance);
-
 	if (otyp != OTYP_CHR)
 		return (EINVAL);
+	if (!mdesc_attached)
+		return (ENXIO);
+
+	mutex_enter(&mdesc_lock);
+
+	if (mdesc_opens >= mdesc_max_opens) {
+		mutex_exit(&mdesc_lock);
+		return (ENXIO);
+	}
+
+	mdsp = mdesc_create_state(devp);
+
+	if (mdsp == NULL) {
+		mutex_exit(&mdesc_lock);
+		return (ENXIO);
+	}
+
+	mdesc_opens++;
+
+	mutex_exit(&mdesc_lock);
 
 	return (0);
 }
 
-
-
-
 /*ARGSUSED*/
 static int
 mdesc_close(dev_t dev, int flag, int otyp, cred_t *credp)
@@ -322,13 +366,25 @@
 	struct mdesc_state *mdsp;
 	int instance = getminor(dev);
 
+	if (otyp != OTYP_CHR)
+		return (EINVAL);
+
+	mutex_enter(&mdesc_lock);
+	if (mdesc_opens == 0) {
+		mutex_exit(&mdesc_lock);
+		return (0);
+	}
+	mutex_exit(&mdesc_lock);
+
 	if ((mdsp = ddi_get_soft_state(mdesc_state_head, instance)) == NULL)
 		return (ENXIO);
 
 	ASSERT(mdsp->instance == instance);
 
-	if (otyp != OTYP_CHR)
-		return (EINVAL);
+	mdesc_destroy_state(mdsp);
+	mutex_enter(&mdesc_lock);
+	mdesc_opens--;
+	mutex_exit(&mdesc_lock);
 
 	return (0);
 }
@@ -363,6 +419,7 @@
 	int instance = getminor(dev);
 	size_t len;
 	int retval;
+	caddr_t buf;
 
 	len = uiop->uio_resid;
 
@@ -400,7 +457,11 @@
 	mdsp->flags |= MDESC_BUSY;
 	mutex_exit(&mdsp->lock);
 
-	retval = uiomove((void *)(mdsp->mdesc + uiop->uio_offset),
+	buf = md_get_md_raw(mdsp->mdesc);
+	if (buf == NULL)
+		return (ENXIO);
+
+	retval = uiomove((void *)(buf + uiop->uio_offset),
 		len, rw, uiop);
 
 	mutex_enter(&mdsp->lock);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/platsvc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,371 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v Platform Services Module
+ */
+
+#include <sys/modctl.h>
+#include <sys/cmn_err.h>
+#include <sys/machsystm.h>
+#include <sys/note.h>
+#include <sys/uadmin.h>
+#include <sys/ds.h>
+#include <sys/platsvc.h>
+
+/*
+ * Debugging routines
+ */
+#ifdef DEBUG
+uint_t ps_debug = 0x0;
+#define	DBG	if (ps_debug) printf
+#else /* DEBUG */
+#define	DBG	_NOTE(CONSTCOND) if (0) printf
+#endif /* DEBUG */
+
+/*
+ * Time resolution conversions.
+ */
+#define	MS2NANO(x)	((x) * MICROSEC)
+#define	MS2SEC(x)	((x) / MILLISEC)
+#define	MS2MIN(x)	(MS2SEC(x) / 60)
+
+/*
+ * Domain Services interaction
+ */
+static ds_svc_hdl_t	ds_md_handle;
+static ds_svc_hdl_t	ds_shutdown_handle;
+static ds_svc_hdl_t	ds_panic_handle;
+
+static ds_ver_t		ps_vers[] = {{ 1, 0 }};
+#define	PS_NVERS	(sizeof (ps_vers) / sizeof (ps_vers[0]))
+
+static ds_capability_t ps_md_cap = {
+	"md-update",		/* svc_id */
+	ps_vers,		/* vers */
+	PS_NVERS		/* nvers */
+};
+
+static ds_capability_t ps_shutdown_cap = {
+	"domain-shutdown",	/* svc_id */
+	ps_vers,		/* vers */
+	PS_NVERS		/* nvers */
+};
+
+static ds_capability_t ps_panic_cap = {
+	"domain-panic",		/* svc_id */
+	ps_vers,		/* vers */
+	PS_NVERS		/* nvers */
+};
+
+static void ps_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl);
+static void ps_unreg_handler(ds_cb_arg_t arg);
+
+static void ps_md_data_handler(ds_cb_arg_t arg, void * buf, size_t buflen);
+static void ps_shutdown_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen);
+static void ps_panic_data_handler(ds_cb_arg_t arg, void * buf, size_t buflen);
+
+static ds_clnt_ops_t ps_md_ops = {
+	ps_reg_handler,			/* ds_reg_cb */
+	ps_unreg_handler,		/* ds_unreg_cb */
+	ps_md_data_handler,		/* ds_data_cb */
+	&ds_md_handle			/* cb_arg */
+};
+
+static ds_clnt_ops_t ps_shutdown_ops = {
+	ps_reg_handler,			/* ds_reg_cb */
+	ps_unreg_handler,		/* ds_unreg_cb */
+	ps_shutdown_data_handler,	/* ds_data_cb */
+	&ds_shutdown_handle		/* cb_arg */
+};
+
+static ds_clnt_ops_t ps_panic_ops = {
+	ps_reg_handler,			/* ds_reg_cb */
+	ps_unreg_handler,		/* ds_unreg_cb */
+	ps_panic_data_handler,		/* ds_data_cb */
+	&ds_panic_handle		/* cb_arg */
+};
+
+static int ps_init(void);
+static void ps_fini(void);
+
+/*
+ * Powerdown timeout; nominally 5 minutes (NOTE(review): verify 1200 matches the units the consumer expects).
+ */
+#define	PLATSVC_POWERDOWN_DELAY		1200
+
+static struct modlmisc modlmisc = {
+	&mod_miscops,
+	"sun4v Platform Services %I%"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	(void *)&modlmisc,
+	NULL
+};
+
+int
+_init(void)
+{
+	int	rv;
+
+	if ((rv = ps_init()) != 0)
+		return (rv);
+
+	if ((rv = mod_install(&modlinkage)) != 0)
+		ps_fini();
+
+	return (rv);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int platsvc_allow_unload;
+
+int
+_fini(void)
+{
+	int	status;
+
+	if (platsvc_allow_unload == 0)
+		return (EBUSY);
+
+	if ((status = mod_remove(&modlinkage)) == 0)
+		ps_fini();
+
+	return (status);
+}
+
+static int
+ps_init(void)
+{
+	int	rv;
+	extern int mdeg_init(void);
+
+	/* register with domain services framework */
+	rv = ds_cap_init(&ps_md_cap, &ps_md_ops);
+	if (rv != 0) {
+		cmn_err(CE_WARN, "ds_cap_init md-update failed: %d", rv);
+		return (rv);
+	}
+
+	rv = ds_cap_init(&ps_shutdown_cap, &ps_shutdown_ops);
+	if (rv != 0) {
+		cmn_err(CE_WARN, "ds_cap_init domain-shutdown failed: %d", rv);
+		(void) ds_cap_fini(&ps_md_cap);
+		return (rv);
+	}
+
+	rv = ds_cap_init(&ps_panic_cap, &ps_panic_ops);
+	if (rv != 0) {
+		cmn_err(CE_WARN, "ds_cap_init domain-panic failed: %d", rv);
+		(void) ds_cap_fini(&ps_md_cap);
+		(void) ds_cap_fini(&ps_shutdown_cap);
+		return (rv);
+	}
+
+	rv = mdeg_init();
+
+	return (rv);
+}
+
+static void
+ps_fini(void)
+{
+	extern void mdeg_fini(void);
+
+	/*
+	 * Stop incoming requests from Zeus
+	 */
+	(void) ds_cap_fini(&ps_md_cap);
+	(void) ds_cap_fini(&ps_shutdown_cap);
+	(void) ds_cap_fini(&ps_panic_cap);
+
+	mdeg_fini();
+}
+
+static void
+ps_md_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	extern int mach_descrip_update(void);
+	extern void mdeg_notify_clients(void);
+
+	ds_svc_hdl_t		 ds_handle;
+	platsvc_md_update_req_t	 *msg = buf;
+	platsvc_md_update_resp_t resp_msg;
+	uint_t			 rv;
+
+	if (arg == NULL)
+		return;
+
+	ds_handle = ds_md_handle;
+
+	if (msg == NULL || buflen != sizeof (platsvc_md_update_req_t)) {
+		resp_msg.req_num = 0;
+		resp_msg.result = MD_UPDATE_INVALID_MSG;
+		if ((rv = ds_cap_send(ds_handle, &resp_msg,
+		    sizeof (resp_msg))) != 0) {
+			cmn_err(CE_NOTE, "md ds_cap_send failed (%d)", rv);
+		}
+		return;
+	}
+
+	DBG("MD Reload...\n");
+	if (mach_descrip_update()) {
+		cmn_err(CE_WARN, "MD reload failed\n");
+		return;
+	}
+
+	/*
+	 * notify registered clients that MD has
+	 * been updated
+	 */
+	mdeg_notify_clients();
+
+	resp_msg.req_num = msg->req_num;
+	resp_msg.result = MD_UPDATE_SUCCESS;
+	if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) {
+		cmn_err(CE_NOTE, "md ds_cap_send resp failed (%d)", rv);
+	}
+}
+
+static void
+ps_shutdown_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	ds_svc_hdl_t		ds_handle;
+	platsvc_shutdown_req_t	*msg = buf;
+	platsvc_shutdown_resp_t	resp_msg;
+	uint_t			rv;
+	hrtime_t		start;
+
+	if (arg == NULL)
+		return;
+
+	ds_handle = ds_shutdown_handle;
+
+	if (msg == NULL || buflen != sizeof (platsvc_shutdown_req_t)) {
+		resp_msg.req_num = 0;
+		resp_msg.result = DOMAIN_SHUTDOWN_INVALID_MSG;
+		resp_msg.reason[0] = '\0';
+		if ((rv = ds_cap_send(ds_handle, &resp_msg,
+		    sizeof (resp_msg))) != 0) {
+			cmn_err(CE_NOTE, "shutdown ds_cap_send failed (%d)",
+			    rv);
+		}
+		return;
+	}
+
+	resp_msg.req_num = msg->req_num;
+	resp_msg.result = DOMAIN_SHUTDOWN_SUCCESS;
+	resp_msg.reason[0] = '\0';
+
+	if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) {
+		cmn_err(CE_NOTE, "shutdown ds_cap_send resp failed (%d)", rv);
+	}
+
+	/*
+	 * Honor the ldoms manager's shutdown delay requirement.
+	 */
+	cmn_err(CE_NOTE, "shutdown requested by ldom manager, "
+	    "system shutdown in %d minutes", MS2MIN(msg->delay));
+
+	start = gethrtime();
+	while (gethrtime() - start < MS2NANO(msg->delay))
+		;
+
+	(void) kadmin(A_SHUTDOWN, AD_POWEROFF, NULL, kcred);
+}
+
+
+static void
+ps_panic_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	ds_svc_hdl_t		ds_handle;
+	platsvc_panic_req_t	*msg = buf;
+	platsvc_panic_resp_t	resp_msg;
+	uint_t			rv;
+
+	if (arg == NULL)
+		return;
+
+	ds_handle = ds_panic_handle;
+
+	if (msg == NULL || buflen != sizeof (platsvc_panic_req_t)) {
+		resp_msg.req_num = 0;
+		resp_msg.result = DOMAIN_PANIC_INVALID_MSG;
+		resp_msg.reason[0] = '\0';
+		if ((rv = ds_cap_send(ds_handle, &resp_msg,
+		    sizeof (resp_msg))) != 0) {
+			cmn_err(CE_NOTE, "panic ds_cap_send resp failed (%d)",
+			    rv);
+		}
+		return;
+	}
+
+	resp_msg.req_num = msg->req_num;
+	resp_msg.result = DOMAIN_PANIC_SUCCESS;
+	resp_msg.reason[0] = '\0';
+	if ((rv = ds_cap_send(ds_handle, &resp_msg, sizeof (resp_msg))) != 0) {
+		cmn_err(CE_NOTE, "panic ds_cap_send resp failed (%d)", rv);
+	}
+
+	cmn_err(CE_PANIC, "Panic forced by ldom manager");
+	_NOTE(NOTREACHED)
+}
+
+static void
+ps_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
+{
+	DBG("ps_reg_handler: arg=0x%p, ver=%d.%d, hdl=0x%lx\n",
+	    arg, ver->major, ver->minor, hdl);
+
+	if ((ds_svc_hdl_t *)arg == &ds_md_handle)
+		ds_md_handle = hdl;
+	if ((ds_svc_hdl_t *)arg == &ds_shutdown_handle)
+		ds_shutdown_handle = hdl;
+	if ((ds_svc_hdl_t *)arg == &ds_panic_handle)
+		ds_panic_handle = hdl;
+}
+
+static void
+ps_unreg_handler(ds_cb_arg_t arg)
+{
+	DBG("ps_unreg_handler: arg=0x%p\n", arg);
+
+	if ((ds_svc_hdl_t *)arg == &ds_md_handle)
+		ds_md_handle = DS_INVALID_HDL;
+	if ((ds_svc_hdl_t *)arg == &ds_shutdown_handle)
+		ds_shutdown_handle = DS_INVALID_HDL;
+	if ((ds_svc_hdl_t *)arg == &ds_panic_handle)
+		ds_panic_handle = DS_INVALID_HDL;
+}
--- a/usr/src/uts/sun4v/io/qcn.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/io/qcn.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -87,7 +87,8 @@
 };
 static cyclic_id_t qcn_poll_cycid = CYCLIC_NONE;
 static uint64_t	qcn_poll_interval = 5;  /* milli sec */
-static uint64_t	sb_interval = 0;
+static uint64_t sb_interval = 0;
+uint_t qcn_force_polling = 0;
 #endif
 
 #define	QCN_MI_IDNUM		0xABCE
@@ -338,7 +339,8 @@
 	 * the console to work on older firmware releases.
 	 */
 	binding_name = ddi_binding_name(qcn_state->qcn_dip);
-	if (strcmp(binding_name, "qcn") == 0)
+	if ((strcmp(binding_name, "qcn") == 0) ||
+	    (qcn_force_polling))
 		qcn_state->qcn_polling = 1;
 
 	if (qcn_state->qcn_polling) {
@@ -802,7 +804,7 @@
 		buf = (caddr_t)bp->b_rptr;
 
 		for (i = 0; i < len; i++) {
-			if (hv_cnputchar(buf[i]) == -1)
+			if (hv_cnputchar(buf[i]) == H_EWOULDBLOCK)
 				break;
 		}
 		if (i != len) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vcc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,2406 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/ksynch.h>
+#include <sys/modctl.h>
+#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
+#include <sys/debug.h>
+#include <sys/promif.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cyclic.h>
+#include <sys/termio.h>
+#include <sys/intr.h>
+#include <sys/ivintr.h>
+#include <sys/note.h>
+#include <sys/stat.h>
+#include <sys/fcntl.h>
+#include <sys/sysmacros.h>
+
+#include <sys/ldc.h>
+#include <sys/mdeg.h>
+#include <sys/vcc_impl.h>
+
+/*
+ * Function prototypes.
+ */
+
+/* DDI entrypoints */
+static int	vcc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int	vcc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int	vcc_open(dev_t *devp, int flag, int otyp, cred_t *cred);
+static int	vcc_close(dev_t dev, int flag, int otyp, cred_t *cred);
+static int	vcc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
+			cred_t *credp, int *rvalp);
+static int	vcc_read(dev_t dev, struct uio *uiop, cred_t *credp);
+static int	vcc_write(dev_t dev, struct uio *uiop, cred_t *credp);
+static int	vcc_chpoll(dev_t dev, short events, int anyyet,
+			short *reventsp, struct pollhead **phpp);
+static int	vcc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
+			void *arg, void **resultp);
+
+/* callback functions */
+static uint_t	vcc_ldc_cb(uint64_t event, caddr_t arg);
+static int	vcc_mdeg_cb(void *cb_argp, mdeg_result_t *resp);
+
+/* Internal functions */
+static int	i_vcc_ldc_init(vcc_t *vccp, vcc_port_t *vport);
+static int	i_vcc_add_port(vcc_t *vccp, char *group_name, uint64_t tcp_port,
+			uint_t portno, char *domain_name);
+static int	i_vcc_config_port(vcc_t *vccp, uint_t portno, uint64_t ldc_id);
+static int	i_vcc_reset_events(vcc_t *vccp);
+static int	i_vcc_cons_tbl(vcc_t *vccp, uint_t num_ports,
+			caddr_t buf, int mode);
+static int	i_vcc_del_cons_ok(vcc_t *vccp, caddr_t buf, int mode);
+static int	i_vcc_close_port(vcc_port_t *vport);
+static int	i_vcc_write_ldc(vcc_port_t *vport, vcc_msg_t *buf);
+
+static void *vcc_ssp;
+
+static struct cb_ops vcc_cb_ops = {
+	vcc_open,	    /* open */
+	vcc_close,	    /* close */
+	nodev,		    /* strategy */
+	nodev,		    /* print */
+	nodev,		    /* dump */
+	vcc_read,	    /* read */
+	vcc_write,	    /* write */
+	vcc_ioctl,	    /* ioctl */
+	nodev,		    /* devmap */
+	nodev,		    /* mmap */
+	ddi_segmap,	    /* segmap */
+	vcc_chpoll,	    /* chpoll */
+	ddi_prop_op,	    /* prop_op */
+	NULL,		    /* stream */
+	D_NEW | D_MP	    /* flags */
+};
+
+
+static struct dev_ops vcc_ops = {
+	DEVO_REV,		/* rev */
+	0,			/* ref count */
+	vcc_getinfo,		/* getinfo */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	vcc_attach,		/* attach */
+	vcc_detach,		/* detach */
+	nodev,			/* reset */
+	&vcc_cb_ops,		/* cb_ops */
+	(struct bus_ops *)NULL	/* bus_ops */
+};
+
+extern struct mod_ops mod_driverops;
+
+#define	    VCC_CHANNEL_ENDPOINT	"channel-endpoint"
+#define	    VCC_ID_PROP		"id"
+
+/*
+ * This is the string displayed by modinfo(1m).
+ */
+static char vcc_ident[] = "sun4v Virtual Console Concentrator Driver v%I%";
+
+static struct modldrv md = {
+	&mod_driverops, 	/* Type - it is a driver */
+	vcc_ident,		/* Name of the module */
+	&vcc_ops,		/* driver specfic opts */
+};
+
+static struct modlinkage ml = {
+	MODREV_1,
+	&md,
+	NULL
+};
+
+/*
+ * Matching criteria passed to the MDEG to register interest
+ * in changes to 'virtual-device-port' nodes identified by their
+ * 'id' property.
+ */
+static md_prop_match_t vcc_port_prop_match[] = {
+	{ MDET_PROP_VAL,	    "id"   },
+	{ MDET_LIST_END,	    NULL    }
+};
+
+static mdeg_node_match_t vcc_port_match = {"virtual-device-port",
+					vcc_port_prop_match};
+
+/*
+ * Specification of an MD node passed to the MDEG to filter any
+ * 'virtual-device-port' nodes that do not belong to the specified node.
+ * This template is copied for each vldc instance and filled in with
+ * the appropriate 'cfg-handle' value before being passed to the MDEG.
+ */
+static mdeg_prop_spec_t vcc_prop_template[] = {
+	{ MDET_PROP_STR,    "name",	"virtual-console-concentrator"	},
+	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
+	{ MDET_LIST_END,    NULL,		NULL	}
+};
+
+#define	VCC_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
+
+
+#ifdef DEBUG
+
+/*
+ * Print debug messages
+ *
+ * set vldcdbg to 0xf to enable all messages
+ *
+ * 0x8 - Errors
+ * 0x4 - Warnings
+ * 0x2 - All debug messages (most verbose)
+ * 0x1 - Minimal debug messages
+ */
+
+int vccdbg = 0x8;
+
+static void
+vccdebug(const char *fmt, ...)
+{
+	char buf[512];
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vsprintf(buf, fmt, ap);
+	va_end(ap);
+
+	cmn_err(CE_CONT, "%s\n", buf);
+}
+
+#define	D1		\
+if (vccdbg & 0x01)	\
+	vccdebug
+
+#define	D2		\
+if (vccdbg & 0x02)	\
+	vccdebug
+
+#define	DWARN		\
+if (vccdbg & 0x04)	\
+	vccdebug
+
+#else
+
+#define	D1
+#define	D2
+#define	DWARN
+
+#endif
+
+/* _init(9E): initialize the loadable module */
+int
+_init(void)
+{
+	int error;
+
+	/* init the soft state structure */
+	error = ddi_soft_state_init(&vcc_ssp, sizeof (vcc_t), 1);
+	if (error != 0) {
+		return (error);
+	}
+
+	/* Link the driver into the system */
+	error = mod_install(&ml);
+
+	return (error);
+
+}
+
+/* _info(9E): return information about the loadable module */
+int
+_info(struct modinfo *modinfop)
+{
+	/* Report status of the dynamically loadable driver module */
+	return (mod_info(&ml, modinfop));
+}
+
+/* _fini(9E): prepare the module for unloading. */
+int
+_fini(void)
+{
+	int error;
+
+	/* Unlink the driver module from the system */
+	if ((error = mod_remove(&ml)) == 0) {
+		/*
+		 * We have successfully "removed" the driver.
+		 * destroy soft state
+		 */
+		ddi_soft_state_fini(&vcc_ssp);
+	}
+
+	return (error);
+}
+
+/* getinfo(9E) */
+static int
+vcc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,  void *arg, void **resultp)
+{
+	_NOTE(ARGUNUSED(dip))
+
+	int	instance = VCCINST(getminor((dev_t)arg));
+	vcc_t	*vccp = NULL;
+
+	switch (cmd) {
+
+	case DDI_INFO_DEVT2DEVINFO:
+		if ((vccp = ddi_get_soft_state(vcc_ssp, instance)) == NULL) {
+			*resultp = NULL;
+			return (DDI_FAILURE);
+		}
+		*resultp = vccp->dip;
+		return (DDI_SUCCESS);
+
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)(uintptr_t)instance;
+		return (DDI_SUCCESS);
+
+	default:
+		*resultp = NULL;
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * There are two cases that need special blocking. One of them is to block
+ * a minor node without a port and another is to block application other
+ * than vntsd.
+ *
+ * A minor node can exist in the file system without associated with a port
+ * because when a port is deleted, ddi_remove_minor does not unlink it.
+ * Clients might try to open a minor node even after the corresponding port
+ * node has been removed.  To identify and block these calls,
+ * we need to validate the association between a port and its minor node.
+ *
+ * An application other than vntsd can access a console port as long
+ * as vntsd is not using the port. A port opened by an application other
+ * than vntsd will be closed when vntsd wants to use the port.
+ * However, another application could use the same file descriptor to
+ * access vcc cb_ops. So we need to identify and block callers other
+ * than vntsd, when vntsd is using the port.
+ */
+static int
+i_vcc_can_use_port(vcc_minor_t *minorp, vcc_port_t *vport)
+{
+	if (vport->minorp != minorp) {
+		/* port config changed */
+		return (ENXIO);
+	}
+
+	if (vport->valid_pid == VCC_NO_PID_BLOCKING) {
+		/* no blocking needed */
+		return (0);
+	}
+
+	if (vport->valid_pid != ddi_get_pid()) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+
+/* Synchronization between threads using cv_wait */
+static int
+i_vcc_wait_port_status(vcc_port_t *vport, kcondvar_t *cv, uint32_t status)
+{
+
+	int	    rv;
+
+	ASSERT(mutex_owned(&vport->lock));
+
+	for (; ; ) {
+
+		if ((vport->status & VCC_PORT_AVAIL) == 0) {
+			/* port has been deleted */
+			D1("i_vcc_wait_port_status: port%d deleted\n",
+			    vport->number);
+			return (EIO);
+		}
+
+		if ((vport->status & VCC_PORT_OPEN) == 0) {
+			D1("i_vcc_wait_port_status: port%d is closed \n",
+			    vport->number);
+			return (EIO);
+		}
+
+		if (vport->status & VCC_PORT_LDC_LINK_DOWN) {
+			return (EIO);
+		}
+
+		if ((vport->valid_pid != VCC_NO_PID_BLOCKING) &&
+		    (vport->valid_pid != ddi_get_pid())) {
+			return (EIO);
+		}
+
+		if ((vport->status & status) == status) {
+			return (0);
+		}
+
+		if (!ddi_can_receive_sig()) {
+			return (EIO);
+		}
+
+		rv = cv_wait_sig(cv, &vport->lock);
+		if (rv == 0) {
+			D1("i_vcc_wait_port_status: port%d get intr \n",
+			    vport->number);
+			/* got signal */
+			return (EINTR);
+		}
+	}
+
+}
+
+/* Synchronization between threads, signal state change */
+static void
+i_vcc_set_port_status(vcc_port_t *vport, kcondvar_t *cv, uint32_t status)
+{
+
+	mutex_enter(&vport->lock);
+	vport->status |= status;
+	cv_broadcast(cv);
+	mutex_exit(&vport->lock);
+}
+
+/* initialize a ldc channel */
+static int
+i_vcc_ldc_init(vcc_t *vccp, vcc_port_t *vport)
+{
+	ldc_attr_t 	attr;
+	int		rv = EIO;
+
+	ASSERT(mutex_owned(&vport->lock));
+	ASSERT(vport->ldc_id != VCC_INVALID_CHANNEL);
+
+	/* initialize the channel */
+	attr.devclass = LDC_DEV_SERIAL;
+	attr.instance = ddi_get_instance(vccp->dip);
+	attr.qlen = VCC_QUEUE_LEN;
+	attr.mode = LDC_MODE_RAW;
+
+	if ((rv = ldc_init(vport->ldc_id, &attr, &(vport->ldc_handle))) != 0) {
+		cmn_err(CE_CONT, "i_vcc_ldc_init: port %d inv channel 0x%lx\n",
+		    vport->number, vport->ldc_id);
+		vport->ldc_id = VCC_INVALID_CHANNEL;
+		return (rv);
+	}
+
+	/* register it */
+	if ((rv = ldc_reg_callback(vport->ldc_handle, vcc_ldc_cb,
+		(caddr_t)vport)) != 0) {
+		cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d ldc_register_cb"
+			"failed\n", vport->number);
+		(void) ldc_fini(vport->ldc_handle);
+		vport->ldc_id = VCC_INVALID_CHANNEL;
+		return (rv);
+	}
+
+	/* open and bring channel up */
+	if ((rv = ldc_open(vport->ldc_handle)) != 0) {
+		cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d inv channel 0x%lx\n",
+		    vport->number, vport->ldc_id);
+		(void) ldc_unreg_callback(vport->ldc_handle);
+		(void) ldc_fini(vport->ldc_handle);
+		vport->ldc_id = VCC_INVALID_CHANNEL;
+		return (rv);
+	}
+
+	/* init the channel status */
+	if ((rv = ldc_status(vport->ldc_handle, &vport->ldc_status)) != 0) {
+		cmn_err(CE_CONT, "i_vcc_ldc_init: port@%d ldc_status failed\n",
+		    vport->number);
+		(void) ldc_close(vport->ldc_handle);
+		(void) ldc_unreg_callback(vport->ldc_handle);
+		(void) ldc_fini(vport->ldc_handle);
+		vport->ldc_id = VCC_INVALID_CHANNEL;
+		return (rv);
+	}
+
+	return (0);
+}
+
+/*
+ * Release the LDC channel associated with a vcc port.
+ *
+ * Sends a HUP control message to the peer (best effort), then disables
+ * the callback and closes/unregisters/finalizes the channel.  Caller
+ * must hold vport->lock; the lock is dropped and re-acquired around the
+ * write-side handoff.  Returns 0 on success or an LDC error.
+ */
+static int
+i_vcc_ldc_fini(vcc_port_t *vport)
+{
+	int 		rv = EIO;
+	vcc_msg_t	buf;
+
+	/*
+	 * Fixed format string: the original "ldc_id%%llx" printed a
+	 * literal "%llx" and never consumed the ldc_id argument.
+	 */
+	D1("i_vcc_ldc_fini: port@%d, ldc_id=0x%lx\n", vport->number,
+	    vport->ldc_id);
+
+	ASSERT(mutex_owned(&vport->lock));
+
+	/* wait for exclusive use of the write side of the channel */
+	rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+	    VCC_PORT_USE_WRITE_LDC);
+	if (rv) {
+		return (rv);
+	}
+	vport->status &= ~VCC_PORT_USE_WRITE_LDC;
+	/* send a HUP message */
+	buf.type = LDC_CONSOLE_CTRL;
+	buf.ctrl_msg = LDC_CONSOLE_HUP;
+	buf.size = 0;
+
+	/* in case of error, we still want to clean up ldc channel */
+	(void) i_vcc_write_ldc(vport, &buf);
+
+	/* release the write side; set_port_status re-takes the lock */
+	mutex_exit(&vport->lock);
+	i_vcc_set_port_status(vport, &vport->write_cv, VCC_PORT_USE_WRITE_LDC);
+	mutex_enter(&vport->lock);
+
+	(void) ldc_set_cb_mode(vport->ldc_handle, LDC_CB_DISABLE);
+	if ((rv = ldc_close(vport->ldc_handle)) != 0) {
+		cmn_err(CE_CONT, "i_vcc_ldc_fini: cannot close channel %ld\n",
+		    vport->ldc_id);
+		return (rv);
+	}
+
+	if ((rv = ldc_unreg_callback(vport->ldc_handle)) != 0) {
+		/* added missing space between the concatenated strings */
+		cmn_err(CE_CONT, "i_vcc_ldc_fini: port@%d ldc_unreg_callback "
+			"failed\n", vport->number);
+		return (rv);
+	}
+
+	if ((rv = ldc_fini(vport->ldc_handle)) != 0) {
+		/* fixed "finilize" typo and missing space before "%ld" */
+		cmn_err(CE_CONT, "i_vcc_ldc_fini: cannot finalize channel "
+		    "%ld\n", vport->ldc_id);
+		return (rv);
+	}
+
+	return (0);
+}
+
+/*
+ * Read console data from the LDC channel into data_buf.
+ *
+ * On entry *sz holds the capacity of data_buf (must be >= VCC_MTU_SZ);
+ * on return it holds the number of bytes copied.  Caller must already
+ * own the read side of the channel (VCC_PORT_USE_READ_LDC cleared).
+ * Returns 0 on success (possibly with partial data), EAGAIN when no
+ * data is available, EIO on a short or non-data packet, or the error
+ * from ldc_read().
+ */
+
+static int
+i_vcc_read_ldc(vcc_port_t *vport, char *data_buf, size_t *sz)
+{
+
+	int		rv;
+	size_t		size;
+	size_t		space_left = *sz;
+	vcc_msg_t  	buf;
+	int 		i;
+
+
+
+
+	/* make sure holding read lock */
+	ASSERT((vport->status & VCC_PORT_USE_READ_LDC) == 0);
+	ASSERT(space_left >= VCC_MTU_SZ);
+
+	*sz = 0;
+	/* keep draining packets while a full packet still fits */
+	while (space_left >= VCC_MTU_SZ)  {
+		size = sizeof (buf);
+
+		rv = ldc_read(vport->ldc_handle, (caddr_t)&buf, &size);
+
+		if (rv) {
+			return (rv);
+		}
+
+
+		/*
+		 * FIXME: ldc_read should not return 0 with
+		 * either size == 0, buf.size == 0 or size < VCC_HDR_SZ
+		 */
+		if (size == 0) {
+			/* channel empty: succeed if we already have data */
+			if (*sz > 0) {
+				return (0);
+			}
+			return (EAGAIN);
+		}
+
+		if (size < VCC_HDR_SZ) {
+			/* not even a complete packet header */
+			return (EIO);
+		}
+
+		/*
+		 * only data is expected from console - otherwise
+		 * return error
+		 */
+		if (buf.type != LDC_CONSOLE_DATA) {
+			return (EIO);
+		}
+
+		if (buf.size == 0) {
+			/* empty data packet: succeed if we have data */
+			if (*sz > 0) {
+				return (0);
+			}
+			return (EAGAIN);
+		}
+
+		/* copy  data */
+		for (i = 0; i < buf.size; i++, (*sz)++) {
+			data_buf[*sz] = buf.data[i];
+		}
+
+		space_left -= buf.size;
+	}
+
+	return (0);
+}
+
+/*
+ * Callback from LDC: translate channel events into port status bits and
+ * wake the corresponding reader/writer.
+ *
+ * NOTE(review): LDC may deliver multiple event bits in one callback;
+ * this handler returns after servicing the first matching bit (WRITE
+ * before READ before DOWN) -- confirm the framework redelivers, or that
+ * these events are never combined.
+ */
+static uint_t
+vcc_ldc_cb(uint64_t event, caddr_t arg)
+{
+
+	vcc_port_t  *vport = (vcc_port_t *)arg;
+	boolean_t   isempty;
+
+	/*
+	 * do not need to hold lock because if ldc calls back, the
+	 * ldc_handle must be valid.
+	 */
+	D2("vcc_ldc_cb: callback invoked port=%d events=%llx\n",
+	    vport->number, event);
+
+	/* check event from ldc */
+	if (event & LDC_EVT_WRITE) {
+		/* channel has space for write */
+
+		i_vcc_set_port_status(vport, &vport->write_cv,
+			VCC_PORT_LDC_WRITE_READY);
+		return (LDC_SUCCESS);
+	}
+
+	if (event & LDC_EVT_READ) {
+
+		/* channel has data for read */
+		(void) ldc_chkq(vport->ldc_handle, &isempty);
+		if (isempty) {
+			/* data already read */
+			return (LDC_SUCCESS);
+		}
+
+		i_vcc_set_port_status(vport, &vport->read_cv,
+			VCC_PORT_LDC_DATA_READY);
+		return (LDC_SUCCESS);
+	}
+
+	if (event & LDC_EVT_DOWN) {
+		/* channel is down; wake readers too so they see it */
+		i_vcc_set_port_status(vport, &vport->write_cv,
+					VCC_PORT_LDC_LINK_DOWN);
+		cv_broadcast(&vport->read_cv);
+
+	}
+
+	return (LDC_SUCCESS);
+
+}
+
+
+/*
+ * Configure a vcc port with an LDC channel id delivered by MDEG.
+ *
+ * Validates the port number, records ldc_id and, if the port is already
+ * open, brings the channel up immediately and wakes blocked threads.
+ * Returns 0, EINVAL on a bad/unconfigured port, or the i_vcc_ldc_init()
+ * error.
+ */
+static int
+i_vcc_config_port(vcc_t *vccp, uint_t portno, uint64_t ldc_id)
+{
+	int 		rv = EIO;
+	vcc_port_t 	*vport;
+
+	if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) {
+		cmn_err(CE_CONT, "i_vcc_config_port: invalid port number %d\n",
+		    portno);
+		return (EINVAL);
+	}
+
+	vport = &(vccp->port[portno]);
+	if ((vport->status & VCC_PORT_AVAIL) == 0) {
+		cmn_err(CE_CONT, "i_vcc_config_port: port@%d does not exist\n",
+		    portno);
+		return (EINVAL);
+	}
+
+
+	/*
+	 * NOTE(review): ldc_id is checked here without vport->lock but
+	 * assigned below under the lock -- confirm MDEG callbacks are
+	 * serialized so this cannot race.
+	 */
+	if (vport->ldc_id != VCC_INVALID_CHANNEL) {
+		cmn_err(CE_CONT, "i_vcc_config_port: port@%d channel already"
+		    "configured\n", portno);
+		return (EINVAL);
+	}
+
+	mutex_enter(&vport->lock);
+
+	/* store the ldc ID */
+	vport->ldc_id = ldc_id;
+	/* check if someone has already opened this port */
+	if (vport->status & VCC_PORT_OPEN) {
+
+		if ((rv = i_vcc_ldc_init(vccp, vport)) != 0) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		/* mark port as ready */
+		vport->status |= VCC_PORT_LDC_CHANNEL_READY;
+		cv_broadcast(&vport->read_cv);
+		cv_broadcast(&vport->write_cv);
+	}
+
+	mutex_exit(&vport->lock);
+
+	/* debug only; %d for a uint64_t ldc_id truncates on ILP32 prints */
+	D1("i_vcc_config_port: port@%d ldc=%d, domain=%s",
+	    vport->number, vport->ldc_id, vport->minorp->domain_name);
+
+	return (0);
+}
+
+/*
+ * Add a vcc console port described by a machine-description node.
+ *
+ * Records the group/domain/tcp-port properties, assigns (or reuses) a
+ * minor-table slot keyed by domain name and creates the corresponding
+ * minor node.  Returns MDEG_SUCCESS or MDEG_FAILURE.
+ */
+static int
+i_vcc_add_port(vcc_t *vccp, char *group_name, uint64_t tcp_port,
+    uint_t portno, char *domain_name)
+{
+	int 		instance;
+	int		rv = MDEG_FAILURE;
+	minor_t 	minor;
+	vcc_port_t 	*vport;
+	uint_t		minor_idx;
+	boolean_t	new_minor = B_FALSE;	/* slot assigned in this call? */
+	char		name[MAXPATHLEN];
+
+	if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) {
+		DWARN("i_vcc_add_port: invalid port number %d\n", portno);
+		return (MDEG_FAILURE);
+	}
+
+	vport = &(vccp->port[portno]);
+	if (vport->status & VCC_PORT_AVAIL) {
+		/* this port already exists */
+		cmn_err(CE_CONT, "i_vcc_add_port: invalid port - port@%d "
+			"exists\n", portno);
+		return (MDEG_FAILURE);
+	}
+
+	vport->number = portno;
+	vport->ldc_id = VCC_INVALID_CHANNEL;
+
+	if (domain_name == NULL) {
+		cmn_err(CE_CONT, "i_vcc_add_port: invalid domain name\n");
+		return (MDEG_FAILURE);
+	}
+
+	if (group_name == NULL) {
+		cmn_err(CE_CONT, "i_vcc_add_port: invalid group name\n");
+		return (MDEG_FAILURE);
+	}
+
+	/* look up minor number previously assigned to this domain */
+	for (minor_idx = 0; minor_idx < vccp->minors_assigned; minor_idx++) {
+		if (strcmp(vccp->minor_tbl[minor_idx].domain_name,
+			    domain_name) == 0) {
+			/* found previous assigned minor number */
+			break;
+		}
+	}
+
+	if (minor_idx == vccp->minors_assigned) {
+		/* end of lookup - assign new minor number */
+		if (minor_idx == VCC_MAX_PORTS) {
+			cmn_err(CE_CONT, "i_vcc_add_port:"
+			    "too many minornodes (%d)\n",
+			    minor_idx);
+			return (MDEG_FAILURE);
+		}
+
+		(void) strlcpy(vccp->minor_tbl[minor_idx].domain_name,
+		    domain_name, MAXPATHLEN);
+
+		vccp->minors_assigned++;
+		new_minor = B_TRUE;
+	}
+
+	vport->minorp = &vccp->minor_tbl[minor_idx];
+	vccp->minor_tbl[minor_idx].portno = portno;
+
+	(void) strlcpy(vport->group_name, group_name, MAXPATHLEN);
+
+	vport->tcp_port = tcp_port;
+	D1("i_vcc_add_port:@%d domain=%s, group=%s, tcp=%lld",
+	    vport->number, vport->minorp->domain_name,
+	    vport->group_name, vport->tcp_port);
+
+
+	/*
+	 * Create a minor node. The minor number is
+	 * (instance << VCC_INST_SHIFT) | minor_idx
+	 */
+	instance = ddi_get_instance(vccp->dip);
+
+	minor = (instance << VCC_INST_SHIFT) | (minor_idx);
+
+	(void) snprintf(name, MAXPATHLEN - 1, "%s%s", VCC_MINOR_NAME_PREFIX,
+	    domain_name);
+
+	rv = ddi_create_minor_node(vccp->dip, name, S_IFCHR, minor,
+	    DDI_NT_SERIAL, 0);
+
+	if (rv != DDI_SUCCESS) {
+		/*
+		 * Only roll back the minor assignment when it was made in
+		 * this call; unconditionally decrementing here corrupted
+		 * the table when a pre-existing slot was being reused.
+		 */
+		if (new_minor) {
+			vccp->minors_assigned--;
+		}
+		return (MDEG_FAILURE);
+	}
+
+	mutex_enter(&vport->lock);
+	vport->status = VCC_PORT_AVAIL | VCC_PORT_ADDED;
+	mutex_exit(&vport->lock);
+
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Delete a console port: close it if open, remove its minor node,
+ * wake blocked threads and clear its status.  Caller must hold
+ * vport->lock.  Returns 0, or the error from i_vcc_close_port().
+ */
+static int
+i_vcc_delete_port(vcc_t *vccp, vcc_port_t *vport)
+{
+
+	char	name[MAXPATHLEN];
+	int	rv = 0;	/* was read uninitialized when the port wasn't open */
+
+
+	ASSERT(mutex_owned(&vport->lock));
+
+	if ((vport->status & VCC_PORT_AVAIL) == 0) {
+		D1("vcc_del_port port already deleted \n");
+		return (0);
+	}
+
+	if (vport->status & VCC_PORT_OPEN) {
+		/* do not block mdeg callback */
+		vport->valid_pid = VCC_NO_PID_BLOCKING;
+		rv = i_vcc_close_port(vport);
+	}
+
+	/* remove minor node */
+	(void) snprintf(name, MAXPATHLEN-1, "%s%s", VCC_MINOR_NAME_PREFIX,
+	    vport->minorp->domain_name);
+
+	ddi_remove_minor_node(vccp->dip, name);
+
+	/* let read and write thread know */
+	cv_broadcast(&vport->read_cv);
+	cv_broadcast(&vport->write_cv);
+	vport->status = 0;
+	return (rv);
+
+
+}
+
+/*
+ * Register a callback with the MD event generator (MDEG) so this vcc
+ * instance learns about console port add/remove events.
+ *
+ * On success stores the registration handle and the node spec (needed
+ * for unregister/free in detach) in vccp and returns 0; on failure
+ * frees the allocations and returns DDI_FAILURE.
+ *
+ * NOTE(review): callers compare the return against MDEG_SUCCESS --
+ * confirm MDEG_SUCCESS == 0 so the 0/DDI_FAILURE returns match.
+ */
+static int
+i_vcc_mdeg_register(vcc_t *vccp, int instance)
+{
+	mdeg_prop_spec_t	*pspecp;
+	mdeg_node_spec_t	*ispecp;
+	mdeg_handle_t		mdeg_hdl;
+	int			sz;
+	int			rv;
+
+	/*
+	 * Allocate and initialize a per-instance copy
+	 * of the global property spec array that will
+	 * uniquely identify this vcc instance.
+	 */
+	sz = sizeof (vcc_prop_template);
+	pspecp = kmem_alloc(sz, KM_SLEEP);
+
+	bcopy(vcc_prop_template, pspecp, sz);
+
+	VCC_SET_MDEG_PROP_INST(pspecp, instance);
+
+	/* initialize the complete prop spec structure */
+	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
+	ispecp->namep = "virtual-device";
+	ispecp->specp = pspecp;
+
+	/* perform the registration */
+	rv = mdeg_register(ispecp, &vcc_port_match, vcc_mdeg_cb,
+	    vccp, &mdeg_hdl);
+
+	if (rv != MDEG_SUCCESS) {
+		cmn_err(CE_CONT, "i_vcc_mdeg_register:"
+		    "mdeg_register failed (%d)\n", rv);
+		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
+		kmem_free(pspecp, sz);
+		return (DDI_FAILURE);
+	}
+
+	/* save off data that will be needed later (freed in detach) */
+	vccp->md_ispecp = (void *)ispecp;
+	vccp->mdeg_hdl = mdeg_hdl;
+
+	return (0);
+}
+
+/*
+ * Tear down the synchronization objects (lock and both condition
+ * variables) of every entry in the instance's port table.
+ */
+static void
+i_vcc_cleanup_port_table(vcc_t *vccp)
+{
+	int		idx;
+	vcc_port_t	*portp;
+
+	for (idx = 0; idx < VCC_MAX_PORTS; idx++) {
+		portp = &vccp->port[idx];
+		cv_destroy(&portp->read_cv);
+		cv_destroy(&portp->write_cv);
+		mutex_destroy(&portp->lock);
+	}
+}
+
+/*
+ * attach(9E): attach a device to the system.
+ * called once for each instance of the device on the system.
+ *
+ * Allocates the per-instance soft state, initializes the port table,
+ * creates the "ctl" minor node and registers with MDEG.
+ */
+static int
+vcc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int 		i, instance, inst;
+	int 		rv = DDI_FAILURE;
+	vcc_t		*vccp;
+	minor_t 	minor;
+	vcc_port_t	*vport;
+
+	switch (cmd) {
+
+	case DDI_ATTACH:
+
+		instance = ddi_get_instance(dip);
+		if (ddi_soft_state_zalloc(vcc_ssp, instance) != DDI_SUCCESS)
+			return (DDI_FAILURE);
+
+		vccp = ddi_get_soft_state(vcc_ssp, instance);
+		if (vccp == NULL) {
+			/*
+			 * ddi_soft_state_free takes the state handle, not
+			 * the (here NULL) instance pointer.
+			 */
+			ddi_soft_state_free(vcc_ssp, instance);
+			return (ENXIO);
+		}
+
+		D1("vcc_attach: DDI_ATTACH instance=%d\n", instance);
+
+		/* initialize the mutex */
+		mutex_init(&vccp->lock, NULL, MUTEX_DRIVER, NULL);
+
+		mutex_enter(&vccp->lock);
+
+		vccp->dip = dip;
+
+		/* set up per-port locks, cvs and pid gating */
+		for (i = 0; i < VCC_MAX_PORTS; i++) {
+			vport = &(vccp->port[i]);
+			mutex_init(&vport->lock, NULL, MUTEX_DRIVER, NULL);
+			cv_init(&vport->read_cv, NULL, CV_DRIVER, NULL);
+			cv_init(&vport->write_cv, NULL, CV_DRIVER, NULL);
+			vport->valid_pid = VCC_NO_PID_BLOCKING;
+		}
+
+		vport = &vccp->port[VCC_CONTROL_PORT];
+		mutex_enter(&vport->lock);
+
+		vport->minorp = &vccp->minor_tbl[VCC_CONTROL_MINOR_IDX];
+		vport->status |= VCC_PORT_AVAIL;
+
+		/* create a minor node for vcc control */
+		minor = (instance << VCC_INST_SHIFT) | VCC_CONTROL_MINOR_IDX;
+
+		/*
+		 * NOTE(review): the table is indexed by VCC_CONTROL_PORT
+		 * but assigned VCC_CONTROL_MINOR_IDX -- this looks
+		 * transposed; confirm the two constants coincide or that
+		 * this is intentional.
+		 */
+		vccp->minor_tbl[VCC_CONTROL_PORT].portno =
+		    VCC_CONTROL_MINOR_IDX;
+
+
+		rv = ddi_create_minor_node(vccp->dip, "ctl", S_IFCHR, minor,
+		    DDI_NT_SERIAL, 0);
+
+		mutex_exit(&vport->lock);
+
+		if (rv != DDI_SUCCESS) {
+			/* added missing space between "error"/"creating" */
+			cmn_err(CE_CONT, "vcc_attach: error "
+			    "creating control minor node\n");
+
+			i_vcc_cleanup_port_table(vccp);
+
+			mutex_exit(&vccp->lock);
+
+			/* clean up soft state (free by handle, not vccp) */
+			mutex_destroy(&vccp->lock);
+			ddi_soft_state_free(vcc_ssp, instance);
+
+			return (DDI_FAILURE);
+		}
+
+		/* get the instance number by reading 'reg' property */
+		inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+		    "reg", -1);
+		if (inst == -1) {
+			cmn_err(CE_CONT, "vcc_attach: vcc%d has no "
+				"'reg' property\n",
+			    ddi_get_instance(dip));
+
+			i_vcc_cleanup_port_table(vccp);
+
+			/* remove minor */
+			ddi_remove_minor_node(vccp->dip, NULL);
+
+			/* clean up soft state (free by handle, not vccp) */
+			mutex_exit(&vccp->lock);
+			mutex_destroy(&vccp->lock);
+			ddi_soft_state_free(vcc_ssp, instance);
+
+			return (DDI_FAILURE);
+		}
+
+		/*
+		 * Mdeg might invoke callback in the same call sequence
+		 * if there is a domain port at the time of registration.
+		 * Since the callback also grabs vcc->lock mutex, to avoid
+		 * mutex reentry error, release the lock before registration
+		 */
+		mutex_exit(&vccp->lock);
+
+		/* register for notifications from Zeus */
+		rv = i_vcc_mdeg_register(vccp, inst);
+		if (rv != MDEG_SUCCESS) {
+			cmn_err(CE_CONT, "vcc_attach: error register to MD\n");
+
+			i_vcc_cleanup_port_table(vccp);
+
+			/* remove minor */
+			ddi_remove_minor_node(vccp->dip, NULL);
+
+			/* clean up soft state (free by handle, not vccp) */
+			mutex_destroy(&vccp->lock);
+			ddi_soft_state_free(vcc_ssp, instance);
+
+			return (DDI_FAILURE);
+		}
+
+		return (DDI_SUCCESS);
+
+	case DDI_RESUME:
+
+		return (DDI_SUCCESS);
+
+	default:
+
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * detach(9E): detach a device from the system.
+ *
+ * Unregisters from MDEG, frees the node spec, removes all minor nodes,
+ * closes/deletes every port and releases the soft state.
+ *
+ * NOTE(review): returns ENXIO (not DDI_FAILURE) when the soft state is
+ * missing -- nonzero, so the framework treats it as failure, but the
+ * convention elsewhere in the DDI is DDI_FAILURE; confirm intended.
+ */
+static int
+vcc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int		    i, instance;
+	vcc_t		    *vccp;
+	mdeg_node_spec_t    *ispecp;
+	vcc_port_t	    *vport;
+
+	switch (cmd) {
+
+	case DDI_DETACH:
+
+		instance = ddi_get_instance(dip);
+		vccp = ddi_get_soft_state(vcc_ssp, instance);
+		if (vccp == NULL)
+			return (ENXIO);
+
+		D1("vcc_detach: DDI_DETACH instance=%d\n", instance);
+
+		mutex_enter(&vccp->lock);
+
+		/* unregister from MD event generator */
+
+		ASSERT(vccp->mdeg_hdl);
+		(void) mdeg_unregister(vccp->mdeg_hdl);
+
+		ispecp = (mdeg_node_spec_t *)vccp->md_ispecp;
+		ASSERT(ispecp);
+
+		/* free the spec allocated in i_vcc_mdeg_register() */
+		kmem_free(ispecp->specp, sizeof (vcc_prop_template));
+		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
+
+		/* remove minor nodes */
+		ddi_remove_minor_node(vccp->dip, NULL);
+		mutex_exit(&vccp->lock);
+
+		/* close/delete every port, then destroy its sync objects */
+		for (i = 0; i < VCC_MAX_PORTS; i++) {
+
+			vport = &vccp->port[i];
+			mutex_enter(&vport->lock);
+			if (i == VCC_CONTROL_PORT) {
+				/* control port is closed, never deleted */
+				if (vport->status & VCC_PORT_OPEN) {
+					(void) i_vcc_close_port(vport);
+				}
+			}
+
+			if ((vccp->port[i].status & VCC_PORT_AVAIL) &&
+			    (i != VCC_CONTROL_PORT)) {
+				D1("vcc_detach: removing port port@%d\n", i);
+				(void) i_vcc_delete_port(vccp, vport);
+			}
+			mutex_exit(&vport->lock);
+			cv_destroy(&vport->read_cv);
+			cv_destroy(&vport->write_cv);
+			mutex_destroy(&vport->lock);
+		}
+
+
+
+		/* destroy mutex and free the soft state */
+		mutex_destroy(&vccp->lock);
+		ddi_soft_state_free(vcc_ssp, instance);
+
+		return (DDI_SUCCESS);
+
+	case DDI_SUSPEND:
+
+		return (DDI_SUCCESS);
+
+	default:
+
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * cb_open: open a vcc console (or the control) port.
+ *
+ * Enforces single-open per port and the vntsd pid gate, brings the LDC
+ * channel up on first open of a data port, and records nonblocking
+ * mode.  Returns 0, EAGAIN if already open, ENXIO/EIO on bad state,
+ * or the pid-gate error.
+ */
+static int
+vcc_open(dev_t *devp, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(otyp, cred))
+
+	int	    instance;
+	int	    rv = EIO;
+	minor_t	    minor;
+	uint_t	    portno;
+	vcc_t	    *vccp;
+	vcc_port_t  *vport;
+
+	minor = getminor(*devp);
+	instance = VCCINST(minor);
+
+	vccp = ddi_get_soft_state(vcc_ssp, instance);
+	if (vccp == NULL) {
+		return (ENXIO);
+	}
+
+	portno = VCCPORT(vccp, minor);
+
+	vport = &(vccp->port[portno]);
+
+	mutex_enter(&vport->lock);
+
+	if (vport->status & VCC_PORT_OPEN) {
+		/* only one open per port */
+		cmn_err(CE_CONT, "vcc_open: virtual-console-concentrator@%d:%d "
+		    "is already open\n", instance, portno);
+		mutex_exit(&vport->lock);
+		return (EAGAIN);
+	}
+
+	/* check minor no and pid */
+	if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+			    vport)) != 0) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+	if (portno == VCC_CONTROL_PORT) {
+		/* control port has no ldc channel; just mark it open */
+		vport->status |= VCC_PORT_OPEN;
+		mutex_exit(&vport->lock);
+		return (0);
+	}
+
+
+	/* check if channel has been initialized */
+	if ((vport->status & VCC_PORT_LDC_CHANNEL_READY) == 0) {
+		rv = i_vcc_ldc_init(vccp, vport);
+		if (rv) {
+			mutex_exit(&vport->lock);
+			return (EIO);
+		}
+
+		/* mark port as ready */
+		vport->status |= VCC_PORT_LDC_CHANNEL_READY;
+	}
+
+	/* grant rd/wr use of the channel and mark the port open */
+	vport->status |= VCC_PORT_USE_READ_LDC | VCC_PORT_USE_WRITE_LDC|
+	    VCC_PORT_TERM_RD|VCC_PORT_TERM_WR|VCC_PORT_OPEN;
+
+	if ((flag & O_NONBLOCK) || (flag & O_NDELAY)) {
+		vport->status |= VCC_PORT_NONBLOCK;
+	}
+
+	mutex_exit(&vport->lock);
+
+	return (0);
+}
+
+/*
+ * Tear down an open port: release its LDC channel (if one was brought
+ * up), reset the terminal/blocking flags and wake any blocked reader
+ * or writer.  Caller must hold vport->lock.  Returns 0, or the error
+ * from i_vcc_ldc_fini().
+ */
+static int
+i_vcc_close_port(vcc_port_t *vport)
+{
+	int	err = EIO;
+
+	if ((vport->status & VCC_PORT_OPEN) == 0) {
+		/* nothing to do - port was never opened */
+		return (0);
+	}
+
+	ASSERT(mutex_owned(&vport->lock));
+
+	if (vport->status & VCC_PORT_LDC_CHANNEL_READY) {
+		/* release the underlying ldc channel first */
+		err = i_vcc_ldc_fini(vport);
+		if (err != 0) {
+			return (err);
+		}
+		vport->status &= ~VCC_PORT_LDC_CHANNEL_READY;
+	}
+
+	/* restore rd/wr to un-suspended state; drop open/nonblock */
+	vport->status |= (VCC_PORT_TERM_RD | VCC_PORT_TERM_WR);
+	vport->status &= ~(VCC_PORT_NONBLOCK | VCC_PORT_OPEN);
+	vport->valid_pid = VCC_NO_PID_BLOCKING;
+
+	/* signal any blocked read and write thread */
+	cv_broadcast(&vport->read_cv);
+	cv_broadcast(&vport->write_cv);
+
+	return (0);
+}
+
+/*
+ * cb_close: close a console or the control port.
+ *
+ * The control port only resets pending events (vntsd is exiting); data
+ * ports are pid-gated and torn down via i_vcc_close_port().
+ */
+static int
+vcc_close(dev_t dev, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(flag, otyp, cred))
+
+	int	    instance;
+	minor_t	    minor;
+	int	    rv = EIO;
+	uint_t	    portno;
+	vcc_t	    *vccp;
+	vcc_port_t  *vport;
+
+	minor = getminor(dev);
+
+	instance = VCCINST(minor);
+	vccp = ddi_get_soft_state(vcc_ssp, instance);
+	if (vccp == NULL) {
+		return (ENXIO);
+	}
+
+	portno = VCCPORT(vccp, minor);
+
+	D1("vcc_close: closing virtual-console-concentrator@%d:%d\n",
+	    instance, portno);
+
+	vport = &(vccp->port[portno]);
+
+
+	/* NOTE(review): VCC_PORT_OPEN is tested without vport->lock here */
+	if ((vport->status & VCC_PORT_OPEN) == 0) {
+		return (0);
+	}
+
+	if (portno == VCC_CONTROL_PORT) {
+		/*
+		 * vntsd closes control port before it exits. There
+		 * could be events still pending for vntsd.
+		 *
+		 * NOTE(review): rv is assigned but 0 is returned
+		 * unconditionally; i_vcc_reset_events() currently always
+		 * returns 0, so behavior matches -- confirm.
+		 */
+		rv = i_vcc_reset_events(vccp);
+		return (0);
+	}
+
+	mutex_enter(&vport->lock);
+
+	/* check minor no and pid */
+	if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+			    vport)) != 0) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+	rv = i_vcc_close_port(vport);
+	mutex_exit(&vport->lock);
+
+	return (rv);
+}
+
+/*
+ * ioctl VCC_CONS_TBL - vntsd allocates a buffer according to the return
+ * of VCC_NUM_PORTS. However, when vntsd requests the console table,
+ * console ports could have been deleted or added. num_ports is the
+ * number of structures vntsd allocated for the table. If there are more
+ * ports than num_ports, set up to wake vntsd so it adds the ports.
+ * If there are fewer ports than num_ports, fill (-1) in cons_no to tell
+ * vntsd.
+ */
+static int
+i_vcc_cons_tbl(vcc_t *vccp, uint_t num_ports, caddr_t buf, int mode)
+{
+	vcc_console_t	cons;
+	int		i;
+	vcc_port_t	*vport;
+	boolean_t	notify_vntsd = B_FALSE;
+	char pathname[MAXPATHLEN];
+
+
+	(void) ddi_pathname(vccp->dip, pathname);
+	for (i = 0; i < VCC_MAX_PORTS; i++) {
+
+		vport = &vccp->port[i];
+
+		if (i == VCC_CONTROL_PORT) {
+			continue;
+		}
+
+		if ((vport->status & VCC_PORT_AVAIL) == 0) {
+			continue;
+		}
+
+		/* a port exists before vntsd becomes online */
+		mutex_enter(&vport->lock);
+
+		if (num_ports == 0) {
+			/* more ports than vntsd's buffer can hold */
+			vport->status |= VCC_PORT_ADDED;
+			notify_vntsd = B_TRUE;
+			mutex_exit(&vport->lock);
+			continue;
+		}
+
+		bzero(&cons, sizeof (vcc_console_t));
+
+		/* construct console buffer */
+		cons.cons_no = vport->number;
+		cons.tcp_port = vport->tcp_port;
+		(void) memcpy(cons.domain_name,
+		    vport->minorp->domain_name, MAXPATHLEN);
+
+		(void) memcpy(cons.group_name, vport->group_name,
+		    MAXPATHLEN);
+		vport->status &= ~VCC_PORT_ADDED;
+		mutex_exit(&vport->lock);
+
+		(void) snprintf(cons.dev_name, MAXPATHLEN-1, "%s:%s%s",
+		    pathname, VCC_MINOR_NAME_PREFIX, cons.domain_name);
+
+		/*
+		 * copy out data; the port lock was already dropped above,
+		 * so failing here must NOT unlock again (the old
+		 * mutex_exit() was a double-unlock).
+		 */
+		if (ddi_copyout(&cons, (void *)buf,
+			    sizeof (vcc_console_t), mode)) {
+			return (EFAULT);
+		}
+		buf += sizeof (vcc_console_t);
+
+		num_ports--;
+
+	}
+
+	if (num_ports == 0) {
+		/* vntsd's buffer is full */
+
+		if (notify_vntsd) {
+			/* more ports need to notify vntsd */
+			vport = &vccp->port[VCC_CONTROL_PORT];
+			mutex_enter(&vport->lock);
+			vport->pollevent |= VCC_POLL_ADD_PORT;
+			mutex_exit(&vport->lock);
+		}
+
+		return (0);
+	}
+
+	/* fewer ports than vntsd expected */
+	bzero(&cons, sizeof (vcc_console_t));
+	cons.cons_no = -1;
+
+	while (num_ports > 0) {
+		/* fill vntsd buffer with no-console entries */
+		if (ddi_copyout(&cons, (void *)buf,
+			    sizeof (vcc_console_t), mode) != 0) {
+			/* no lock is held here; just report the fault */
+			return (EFAULT);
+		}
+		D1("i_vcc_cons_tbl: a port is  deleted\n");
+		/*
+		 * advance by the same stride used for the real entries
+		 * above; the original "+ MAXPATHLEN" walked past the
+		 * consecutive vcc_console_t slots vntsd allocated.
+		 */
+		buf += sizeof (vcc_console_t);
+		num_ports--;
+	}
+
+	return (0);
+}
+
+
+/*
+ * Clear `event' from the control port's poll events, but only once no
+ * remaining available port still carries `port_status'.
+ */
+static void
+i_vcc_turn_off_event(vcc_t *vccp, uint32_t port_status, uint32_t event)
+{
+	vcc_port_t	*portp;
+	int		idx;
+
+	/* scan all ports; bail out if any still has the status bit set */
+	for (idx = 0; idx < VCC_MAX_PORTS; idx++) {
+
+		portp = &(vccp->port[idx]);
+
+		if ((portp->status & VCC_PORT_AVAIL) == 0)
+			continue;
+
+		if (portp->status & port_status) {
+			/* another port is still pending - keep the event */
+			return;
+		}
+	}
+
+	/* no more changed ports - turn off the poll event */
+	portp = &vccp->port[VCC_CONTROL_PORT];
+
+	mutex_enter(&portp->lock);
+	portp->pollevent &= ~event;
+	mutex_exit(&portp->lock);
+}
+
+/*
+ * ioctl VCC_CONS_INFO: return the configuration (domain, group, tcp
+ * port, device path) of one console port to vntsd.
+ */
+static int
+i_vcc_cons_info(vcc_t *vccp, caddr_t buf, int mode)
+{
+	vcc_console_t	cons;
+	uint_t		portno;
+	vcc_port_t	*vport;
+	char pathname[MAXPATHLEN];
+
+	/* read in portno */
+	if (ddi_copyin((void*)buf, &portno, sizeof (uint_t), mode)) {
+		return (EFAULT);
+	}
+
+	D1("i_vcc_cons_info@%d:\n", portno);
+
+	if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) {
+		return (EINVAL);
+	}
+
+	vport = &vccp->port[portno];
+
+	if ((vport->status & VCC_PORT_AVAIL) == 0) {
+		return (EINVAL);
+	}
+
+	mutex_enter(&vport->lock);
+	vport->status &= ~VCC_PORT_ADDED;
+
+	/* construct configuration data  */
+	bzero(&cons, sizeof (vcc_console_t));
+
+	cons.cons_no = vport->number;
+	cons.tcp_port = vport->tcp_port;
+
+	(void) memcpy(cons.domain_name, vport->minorp->domain_name, MAXPATHLEN);
+
+	(void) memcpy(cons.group_name, vport->group_name, MAXPATHLEN);
+
+	mutex_exit(&vport->lock);
+
+	/* was terminated by a comma, fusing it with the snprintf below */
+	(void) ddi_pathname(vccp->dip, pathname);
+
+	/* copy device name */
+	(void) snprintf(cons.dev_name, MAXPATHLEN-1, "%s:%s%s",
+	    pathname, VCC_MINOR_NAME_PREFIX, cons.domain_name);
+
+	/*
+	 * copy data out; the port lock was dropped above, so no unlock
+	 * on failure (the old mutex_exit() here was a double-unlock).
+	 */
+	if (ddi_copyout(&cons, (void *)buf,
+		    sizeof (vcc_console_t), mode) != 0) {
+		return (EFAULT);
+	}
+
+	D1("i_vcc_cons_info@%d:domain:%s serv:%s tcp@%lld %s\n",
+	    cons.cons_no, cons.domain_name,
+	    cons.group_name, cons.tcp_port, cons.dev_name);
+
+	i_vcc_turn_off_event(vccp, VCC_PORT_ADDED, VCC_POLL_ADD_PORT);
+
+	return (0);
+}
+
+
+/*
+ * Respond to a vntsd inquiry ioctl: report the first available port
+ * that was recently added.  Returns 0 with a vcc_response_t copied out,
+ * EINVAL if nothing is pending, EFAULT on copyout failure.
+ */
+static int
+i_vcc_inquiry(vcc_t *vccp, caddr_t buf, int mode)
+{
+	vcc_port_t	*vport;
+	uint_t		i;
+	vcc_response_t	msg;
+
+	vport = &(vccp->port[VCC_CONTROL_PORT]);
+
+	/*
+	 * NOTE(review): pollevent is read without the control port's
+	 * lock -- confirm a stale read is benign here.
+	 */
+	if ((vport->pollevent & VCC_POLL_ADD_PORT) == 0) {
+		return (EINVAL);
+	}
+
+		/* an added port */
+
+	D1("i_vcc_inquiry\n");
+
+	for (i = 0; i < VCC_MAX_PORTS; i++) {
+		if ((vccp->port[i].status & VCC_PORT_AVAIL) == 0) {
+			continue;
+		}
+
+		if (vccp->port[i].status & VCC_PORT_ADDED) {
+			/* port added */
+			msg.reason = VCC_CONS_ADDED;
+			msg.cons_no = i;
+
+			if (ddi_copyout((void *)&msg, (void *)buf,
+				    sizeof (msg), mode) == -1) {
+				/*
+				 * message now names this function; the old
+				 * text named i_vcc_find_changed_port and
+				 * fused words together.
+				 */
+				cmn_err(CE_CONT, "i_vcc_inquiry: "
+				    "ddi_copyout failed\n");
+				return (EFAULT);
+			}
+			return (0);
+		}
+	}
+
+	return (EINVAL);
+}
+
+/*
+ * Clean up events after vntsd exits: mark the control port closed,
+ * clear its poll state and drop any pending "port added" flags.
+ * Always returns 0.
+ */
+static int
+i_vcc_reset_events(vcc_t *vccp)
+{
+	uint_t	    i;
+	vcc_port_t  *vport;
+
+	for (i = 0; i < VCC_MAX_PORTS; i++) {
+		vport = &(vccp->port[i]);
+
+		if ((vport->status & VCC_PORT_AVAIL) == 0) {
+			continue;
+		}
+
+		ASSERT(!mutex_owned(&vport->lock));
+
+		if (i == VCC_CONTROL_PORT) {
+			/* close control port */
+			mutex_enter(&vport->lock);
+			vport->status &= ~VCC_PORT_OPEN;
+
+			/* clean up poll events */
+			vport->pollevent = 0;
+			vport->pollflag = 0;
+			mutex_exit(&vport->lock);
+			continue;
+		}
+		if (vport->status & VCC_PORT_ADDED) {
+			/* pending added-port event to vntsd */
+			mutex_enter(&vport->lock);
+			vport->status &= ~VCC_PORT_ADDED;
+			mutex_exit(&vport->lock);
+		}
+
+	}
+
+	/* dead re-assignment of vport before return removed */
+	return (0);
+}
+
+/*
+ * ioctl VCC_FORCE_CLOSE: close a console port on vntsd's behalf and
+ * restrict further use of the port to the calling process.
+ */
+static int
+i_vcc_force_close(vcc_t *vccp, caddr_t buf, int mode)
+{
+	vcc_port_t	*portp;
+	uint_t		portno;
+	int		err;
+
+	/* fetch the target port number from userland */
+	if (ddi_copyin((void*)buf, &portno, sizeof (uint_t), mode) != 0) {
+		return (EFAULT);
+	}
+
+	D1("i_vcc_force_close@%d:\n", portno);
+
+	/* the control port may never be force-closed */
+	if ((portno >= VCC_MAX_PORTS) || (portno == VCC_CONTROL_PORT)) {
+		return (EINVAL);
+	}
+
+	portp = &vccp->port[portno];
+
+	if ((portp->status & VCC_PORT_AVAIL) == 0) {
+		return (EINVAL);
+	}
+
+	mutex_enter(&portp->lock);
+
+	err = i_vcc_close_port(portp);
+
+	/* block callers other than vntsd */
+	portp->valid_pid = ddi_get_pid();
+
+	mutex_exit(&portp->lock);
+
+	return (err);
+}
+
+/*
+ * ioctl VCC_CONS_STATUS: validate vntsd's view of one console against
+ * the driver's state.  The caller's vcc_console_t is copied in; if the
+ * port is gone or any of domain/group/tcp-port no longer match,
+ * cons_no is set to -1 before the structure is copied back out.
+ */
+static int
+i_vcc_cons_status(vcc_t *vccp, caddr_t buf, int mode)
+{
+	vcc_console_t	console;
+	vcc_port_t	*vport;
+
+	/* read in portno */
+	if (ddi_copyin((void*)buf, &console, sizeof (console), mode)) {
+		return (EFAULT);
+	}
+
+	D1("i_vcc_cons_status@%d:\n", console.cons_no);
+
+	if ((console.cons_no >= VCC_MAX_PORTS) ||
+		(console.cons_no == VCC_CONTROL_PORT)) {
+		return (EINVAL);
+	}
+
+
+	/* NOTE(review): port fields are compared without vport->lock */
+	vport = &vccp->port[console.cons_no];
+	if ((vport->status & VCC_PORT_AVAIL) == 0) {
+		console.cons_no = -1;
+	} else  if (strncmp(console.domain_name, vport->minorp->domain_name,
+		    MAXPATHLEN)) {
+		console.cons_no = -1;
+	} else if (strncmp(console.group_name, vport->group_name,
+		    MAXPATHLEN)) {
+		console.cons_no = -1;
+	} else if (console.tcp_port != vport->tcp_port) {
+		console.cons_no = -1;
+	}
+
+	D1("i_vcc_cons_status@%d: %s %s %llx\n", console.cons_no,
+	    console.group_name, console.domain_name, console.tcp_port);
+	if (ddi_copyout(&console, (void *)buf, sizeof (console), mode) == -1) {
+		cmn_err(CE_CONT, "i_vcc_cons_status ddi_copyout failed\n");
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * cb_ioctl handler for the vcc control port: dispatch the vntsd
+ * management commands.
+ */
+static int
+i_vcc_ctrl_ioctl(vcc_t *vccp, int cmd, void* arg, int mode)
+{
+
+	/*
+	 * NOTE(review): num_ports is static so the count returned by
+	 * VCC_NUM_CONSOLE is remembered for the following VCC_CONS_TBL
+	 * call.  It is shared across all instances and callers --
+	 * confirm vntsd is the only consumer and serializes these.
+	 */
+	static uint_t	num_ports;
+
+
+	switch (cmd) {
+
+	case VCC_NUM_CONSOLE:
+
+		mutex_enter(&vccp->lock);
+		num_ports = vccp->num_ports;
+		mutex_exit(&vccp->lock);
+		/* number of consoles */
+
+		/*
+		 * map ddi_copyout()'s -1 to EFAULT rather than returning
+		 * it raw; every other handler here returns an errno.
+		 */
+		if (ddi_copyout((void *)&num_ports, arg,
+			    sizeof (int), mode) != 0) {
+			return (EFAULT);
+		}
+		return (0);
+
+	case VCC_CONS_TBL:
+
+		/* console config table */
+		return (i_vcc_cons_tbl(vccp, num_ports, (caddr_t)arg, mode));
+
+	case VCC_INQUIRY:
+
+		/* reason for wakeup */
+		return (i_vcc_inquiry(vccp, (caddr_t)arg, mode));
+
+	case VCC_CONS_INFO:
+		/* a console config */
+		return (i_vcc_cons_info(vccp, (caddr_t)arg, mode));
+
+	case VCC_FORCE_CLOSE:
+		/* force to close a console */
+		return (i_vcc_force_close(vccp, (caddr_t)arg, mode));
+
+	case VCC_CONS_STATUS:
+		/* console status */
+		return (i_vcc_cons_status(vccp, (caddr_t)arg, mode));
+
+	default:
+
+		/* unknown command */
+		return (ENODEV);
+	}
+
+
+}
+
+/*
+ * Write one message to the LDC channel; may block (via the write cv)
+ * until the channel has space unless the port is in nonblocking mode.
+ *
+ * Caller must hold vport->lock and own the write side of the channel
+ * (VCC_PORT_USE_WRITE_LDC cleared).  Returns 0 on success, EAGAIN for
+ * a nonblocking port that would block, EIO on other LDC errors, or the
+ * error from the status wait.
+ */
+static int
+i_vcc_write_ldc(vcc_port_t *vport, vcc_msg_t *buf)
+{
+	int	rv = EIO;
+	size_t	size;
+
+	ASSERT(mutex_owned(&vport->lock));
+	ASSERT((vport->status & VCC_PORT_USE_WRITE_LDC) == 0);
+
+	for (; ; ) {
+
+		size = VCC_HDR_SZ + buf->size;
+		rv = ldc_write(vport->ldc_handle, (caddr_t)buf, &size);
+
+		D1("i_vcc_write_ldc: port@%d: err=%d %d bytes\n",
+		    vport->number, rv, size);
+
+		if (rv == 0) {
+			return (rv);
+		}
+
+		if (rv != EWOULDBLOCK) {
+			return (EIO);
+		}
+
+		if (vport->status & VCC_PORT_NONBLOCK) {
+			return (EAGAIN);
+		}
+
+		/*  block until ldc has more space */
+
+		rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+		    VCC_PORT_LDC_WRITE_READY);
+
+		if (rv) {
+			return (rv);
+		}
+
+		/* consume the ready bit before retrying the write */
+		vport->status &= ~VCC_PORT_LDC_WRITE_READY;
+
+	}
+
+}
+
+
+
+/*
+ * cb_ioctl handler for a data (console) port: minimal terminal
+ * emulation.  TCGETA/TCGETS and TCSET* read/write the cached termios;
+ * TCSBRK sends an LDC break message; TCXONC suspends/resumes the read
+ * or write side; TCFLSH is a no-op.
+ */
+static int
+i_vcc_port_ioctl(vcc_t *vccp, minor_t minor, int portno, int cmd, void *arg,
+    int mode)
+{
+
+	vcc_port_t	*vport;
+	struct termios	term;
+	vcc_msg_t	buf;
+	int		rv;
+
+	D1("i_vcc_port_ioctl@%d cmd %d\n", portno, cmd);
+
+	vport = &(vccp->port[portno]);
+
+	if ((vport->status & VCC_PORT_AVAIL) == 0) {
+		return (EIO);
+	}
+
+
+	switch (cmd) {
+
+	/* terminal support */
+	case TCGETA:
+	case TCGETS:
+
+		mutex_enter(&vport->lock);
+
+		/* check minor no and pid */
+		if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+				    vport)) != 0) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		/* snapshot the cached termios under the lock */
+		(void) memcpy(&term, &vport->term, sizeof (term));
+		mutex_exit(&vport->lock);
+
+		/* NOTE(review): returns ddi_copyout()'s -1, not EFAULT */
+		return (ddi_copyout(&term, arg, sizeof (term), mode));
+
+	case TCSETS:
+	case TCSETA:
+	case TCSETAW:
+	case TCSETAF:
+
+		if (ddi_copyin(arg, &term, sizeof (term), mode) != 0) {
+			return (EFAULT);
+		}
+
+		mutex_enter(&vport->lock);
+
+		/* check minor no and pid */
+		if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+				    vport)) != 0) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		/* cache the new settings; nothing is sent to the peer */
+		(void) memcpy(&vport->term, &term, sizeof (term));
+		mutex_exit(&vport->lock);
+		return (0);
+
+
+	case TCSBRK:
+
+		/* send break to console */
+		mutex_enter(&vport->lock);
+
+		/* check minor no and pid */
+		if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+				    vport)) != 0) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		/* wait for write available */
+		rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+		    VCC_PORT_LDC_CHANNEL_READY| VCC_PORT_USE_WRITE_LDC);
+		if (rv) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		vport->status &= ~VCC_PORT_USE_WRITE_LDC;
+
+		buf.type = LDC_CONSOLE_CTRL;
+		buf.ctrl_msg = LDC_CONSOLE_BREAK;
+		buf.size = 0;
+
+		/* NOTE(review): a write failure here is silently dropped */
+		rv = i_vcc_write_ldc(vport, &buf);
+
+		mutex_exit(&vport->lock);
+
+		i_vcc_set_port_status(vport, &vport->write_cv,
+			VCC_PORT_USE_WRITE_LDC);
+		return (0);
+
+	case TCXONC:
+		/* suspend read or write */
+		if (ddi_copyin(arg, &cmd, sizeof (int), mode) != 0) {
+			return (EFAULT);
+		}
+
+		mutex_enter(&vport->lock);
+
+		/* check minor no and pid */
+		if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+				    vport)) != 0) {
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+
+		switch (cmd) {
+
+		case 0:
+			/* suspend output */
+			vport->status |= VCC_PORT_TERM_WR;
+			cv_broadcast(&vport->write_cv);
+			break;
+		case 1:
+			/* restart output: get write lock first */
+			rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+			    VCC_PORT_USE_WRITE_LDC);
+			if (rv) {
+				mutex_exit(&vport->lock);
+				return (rv);
+			}
+			vport->status &= ~VCC_PORT_TERM_WR;
+			cv_broadcast(&vport->write_cv);
+			break;
+		case 2:
+			/* suspend input */
+			vport->status |= VCC_PORT_TERM_RD;
+			cv_broadcast(&vport->read_cv);
+			break;
+		case 3:
+			/*
+			 * restart input: get read lock.
+			 * NOTE(review): this waits on write_cv while
+			 * claiming the READ flag -- likely should be
+			 * read_cv; confirm against i_vcc_set_port_status.
+			 */
+			rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+			    VCC_PORT_USE_READ_LDC);
+			if (rv) {
+				mutex_exit(&vport->lock);
+				return (rv);
+			}
+			vport->status &= ~VCC_PORT_TERM_RD;
+			cv_broadcast(&vport->read_cv);
+			break;
+
+		default:
+			break;
+		}
+
+		mutex_exit(&vport->lock);
+		return (0);
+
+	case TCFLSH:
+		return (0);
+
+	default:
+		return (ENODEV);
+	}
+
+}
+
+/* cb_ioctl: route commands to the control-port or data-port handler */
+static int
+vcc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
+    cred_t *credp, int *rvalp)
+{
+	_NOTE(ARGUNUSED(credp, rvalp))
+
+	vcc_t	*vccp;
+	minor_t	minor;
+	int	instance;
+	int	portno;
+
+	minor = getminor(dev);
+	instance = VCCINST(minor);
+
+	if ((vccp = ddi_get_soft_state(vcc_ssp, instance)) == NULL)
+		return (ENXIO);
+
+	portno = VCCPORT(vccp, minor);
+
+	D1("vcc_ioctl: virtual-console-concentrator@%d:%d\n", instance, portno);
+
+	if (portno >= VCC_MAX_PORTS) {
+		cmn_err(CE_CONT, "vcc_ioctl:virtual-console-concentrator@%d"
+			" invalid portno\n", portno);
+		return (EINVAL);
+	}
+
+	D1("vcc_ioctl: virtual-console-concentrator@%d:%d ioctl cmd=%d\n",
+	    instance, portno, cmd);
+
+	/* the control port has its own command set */
+	if (portno == VCC_CONTROL_PORT)
+		return (i_vcc_ctrl_ioctl(vccp, cmd, (void *)arg, mode));
+
+	/* data port ioctl */
+	return (i_vcc_port_ioctl(vccp, minor, portno, cmd, (void *)arg, mode));
+}
+
+/*
+ * cb_read: read console data for a port.
+ *
+ * Requires a buffer of at least VCC_MTU_SZ.  Takes exclusive use of
+ * the read side of the LDC channel, drains data into a temporary
+ * kernel buffer (blocking on the read cv unless the port is
+ * nonblocking), then uiomove()s it to the caller.
+ */
+static int
+vcc_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	_NOTE(ARGUNUSED(credp))
+
+	int	    instance;
+	minor_t	    minor;
+	uint_t	    portno;
+	vcc_t	    *vccp;
+	vcc_port_t  *vport;
+	int	    rv = EIO;	/* by default fail ! */
+	char 		*buf;
+	size_t		uio_size;
+	size_t		size;
+
+	minor = getminor(dev);
+
+	instance = VCCINST(minor);
+
+	vccp = ddi_get_soft_state(vcc_ssp, instance);
+	if (vccp == NULL) {
+		return (ENXIO);
+	}
+
+	portno = VCCPORT(vccp, minor);
+
+	/* no read for control port */
+	if (portno == VCC_CONTROL_PORT) {
+		return (EIO);
+	}
+
+	/* temp buf to hold ldc data */
+	uio_size = uiop->uio_resid;
+
+	if (uio_size < VCC_MTU_SZ) {
+		return (EINVAL);
+	}
+
+	vport = &(vccp->port[portno]);
+
+	mutex_enter(&vport->lock);
+
+	/* check minor no and pid */
+	if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+			    vport)) != 0) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+
+	/* wait until reads are resumed, channel is up and we own the
+	 * read side */
+	rv = i_vcc_wait_port_status(vport, &vport->read_cv,
+		    VCC_PORT_TERM_RD|VCC_PORT_LDC_CHANNEL_READY|
+		    VCC_PORT_USE_READ_LDC);
+	if (rv) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+	buf = kmem_alloc(uio_size, KM_SLEEP);
+
+	/* claim the read side until the data has been moved out */
+	vport->status &= ~VCC_PORT_USE_READ_LDC;
+
+	for (; ; ) {
+
+		size = uio_size;
+		rv = i_vcc_read_ldc(vport, buf, &size);
+
+
+		if (rv == EAGAIN) {
+			/* should block? */
+			if (vport->status & VCC_PORT_NONBLOCK) {
+				break;
+			}
+
+		} else if (rv) {
+			/* error */
+			break;
+		}
+
+		if (size > 0) {
+			/* got data */
+			break;
+		}
+
+		/* wait for data from ldc */
+		vport->status &= ~VCC_PORT_LDC_DATA_READY;
+		rv = i_vcc_wait_port_status(vport, &vport->read_cv,
+			    VCC_PORT_LDC_DATA_READY);
+		if (rv) {
+			break;
+		}
+	}
+
+	mutex_exit(&vport->lock);
+
+	if ((rv == 0) && (size > 0)) {
+		/* data is in buf */
+		rv = uiomove(buf, size, UIO_READ, uiop);
+	}
+
+	kmem_free(buf, uio_size);
+	/* release the read side and wake other waiting readers */
+	i_vcc_set_port_status(vport, &vport->read_cv, VCC_PORT_USE_READ_LDC);
+
+	return (rv);
+}
+
+
+/*
+ * cb_write entry point.
+ *
+ * Write user data to a data port in chunks of at most VCC_MTU_SZ
+ * bytes, wrapping each chunk in an LDC_CONSOLE_DATA message sent via
+ * i_vcc_write_ldc().  Writing to the control port fails with EIO.
+ */
+static int
+vcc_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	_NOTE(ARGUNUSED(credp))
+
+	int	    instance;
+	minor_t	    minor;
+	size_t	    size;
+	size_t	    bytes;
+	uint_t	    portno;
+	vcc_t	    *vccp;
+
+	vcc_port_t  *vport;
+	int	    rv = EIO;
+
+	vcc_msg_t	buf;
+
+	minor = getminor(dev);
+
+	instance = VCCINST(minor);
+
+	vccp = ddi_get_soft_state(vcc_ssp, instance);
+	if (vccp == NULL) {
+		return (ENXIO);
+	}
+
+	portno = VCCPORT(vccp, minor);
+
+	/* no write for control port */
+	if (portno == VCC_CONTROL_PORT) {
+		return (EIO);
+	}
+	vport = &(vccp->port[portno]);
+
+	/*
+	 * Check that the channel has been configured and that write has
+	 * not been suspended, then grab the write side of the channel.
+	 */
+	mutex_enter(&vport->lock);
+
+	/* check minor no and pid */
+	if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+			    vport)) != 0) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+	rv = i_vcc_wait_port_status(vport, &vport->write_cv,
+		VCC_PORT_TERM_WR|VCC_PORT_LDC_CHANNEL_READY|
+		VCC_PORT_USE_WRITE_LDC);
+	if (rv) {
+		mutex_exit(&vport->lock);
+		return (rv);
+	}
+
+	/* claim the write side; restored by i_vcc_set_port_status() below */
+	vport->status &= ~VCC_PORT_USE_WRITE_LDC;
+	mutex_exit(&vport->lock);
+	size = uiop->uio_resid;
+
+	D2("vcc_write: virtual-console-concentrator@%d:%d writing %d bytes\n",
+	    instance, portno, size);
+
+
+
+	buf.type = LDC_CONSOLE_DATA;
+
+	while (size) {
+
+		bytes = MIN(size, VCC_MTU_SZ);
+		/* move data */
+		rv = uiomove(&(buf.data), bytes, UIO_WRITE, uiop);
+
+		if (rv) {
+			break;
+		}
+
+		/* write to ldc */
+		buf.size = bytes;
+
+		mutex_enter(&vport->lock);
+
+		/* check minor no and pid */
+		if ((rv = i_vcc_can_use_port(VCCMINORP(vccp, minor),
+			    vport)) != 0) {
+			/*
+			 * NOTE(review): this return path does not restore
+			 * VCC_PORT_USE_WRITE_LDC (unlike the normal exit
+			 * below) -- confirm later writers cannot block
+			 * forever waiting for the write side.
+			 */
+			mutex_exit(&vport->lock);
+			return (rv);
+		}
+
+		rv = i_vcc_write_ldc(vport, &buf);
+
+		mutex_exit(&vport->lock);
+
+		if (rv) {
+			break;
+		}
+
+		size -= bytes;
+
+	}
+
+	i_vcc_set_port_status(vport, &vport->write_cv, VCC_PORT_USE_WRITE_LDC);
+	return (rv);
+}
+
+/*
+ * mdeg callback helper for a removed port.
+ *
+ * Look up the port's "id" property in the MD node, validate it (the
+ * control port may not be removed), delete the port and decrement the
+ * instance port count.  Returns MDEG_SUCCESS or MDEG_FAILURE.
+ */
+static int
+i_vcc_md_remove_port(md_t *mdp, mde_cookie_t mdep, vcc_t *vccp)
+{
+	uint64_t  portno;	/* md requires 64bit for port number */
+	int rv = MDEG_FAILURE;
+	vcc_port_t *vport;
+
+	if (md_get_prop_val(mdp, mdep, "id", &portno)) {
+		cmn_err(CE_CONT, "vcc_mdeg_cb: port has no 'id' property\n");
+		return (MDEG_FAILURE);
+	}
+
+	/* NOTE(review): portno is unsigned, so the < 0 test is always false */
+	if ((portno >= VCC_MAX_PORTS) || (portno < 0)) {
+		cmn_err(CE_CONT, "i_vcc_md_remove_port@%ld invalid port no\n",
+			portno);
+		return (MDEG_FAILURE);
+	}
+
+	if (portno == VCC_CONTROL_PORT) {
+		cmn_err(CE_CONT, "i_vcc_md_remove_port@%ld can not remove"
+			"control port\n",
+		    portno);
+		return (MDEG_FAILURE);
+	}
+
+	vport = &(vccp->port[portno]);
+
+	/* delete the port */
+	mutex_enter(&vport->lock);
+	rv = i_vcc_delete_port(vccp, vport);
+	mutex_exit(&vport->lock);
+
+	mutex_enter(&vccp->lock);
+	vccp->num_ports--;
+	mutex_exit(&vccp->lock);
+
+	return (rv ? MDEG_FAILURE : MDEG_SUCCESS);
+}
+
+/*
+ * Extract the LDC channel id for a vcc port.
+ *
+ * Scan the MD subtree under 'mdep' for "channel-endpoint" nodes and
+ * return the "id" property of the first one found through 'ldc_id'.
+ * Additional endpoints are ignored with a warning.  Returns 0 on
+ * success, -1 on failure.
+ */
+static int
+i_vcc_get_ldc_id(md_t *md, mde_cookie_t mdep, uint64_t *ldc_id)
+{
+	int		num_nodes;
+	size_t		size;
+	mde_cookie_t	*channel;
+	int		num_channels;
+
+
+	if ((num_nodes = md_node_count(md)) <= 0) {
+		cmn_err(CE_CONT, "i_vcc_get_ldc_channel_id:"
+		    "  Invalid node count in Machine Description subtree");
+		return (-1);
+	}
+	/* worst case: every MD node is a channel endpoint */
+	size = num_nodes*(sizeof (*channel));
+	channel = kmem_zalloc(size, KM_SLEEP);
+	ASSERT(channel != NULL);	/* because KM_SLEEP */
+
+
+	/* Look for channel endpoint child(ren) of the vdisk MD node */
+	if ((num_channels = md_scan_dag(md, mdep,
+		    md_find_name(md, "channel-endpoint"),
+		    md_find_name(md, "fwd"), channel)) <= 0) {
+		cmn_err(CE_CONT, "i_vcc_get_ldc_id:  No 'channel-endpoint'"
+		    " found for vcc");
+		kmem_free(channel, size);
+		return (-1);
+	}
+
+	/* Get the "id" value for the first channel endpoint node */
+	if (md_get_prop_val(md, channel[0], "id", ldc_id) != 0) {
+		cmn_err(CE_CONT, "i_vcc_get_ldc:  No id property found "
+		    "for channel-endpoint of vcc");
+		kmem_free(channel, size);
+		return (-1);
+	}
+
+	if (num_channels > 1) {
+		cmn_err(CE_CONT, "i_vcc_get_ldc:  Warning:  Using ID of first"
+		    " of multiple channels for this vcc");
+	}
+
+	kmem_free(channel, size);
+	return (0);
+}
+/*
+ * mdeg callback helper for an added port.
+ *
+ * Read the port's "id", "vcc-domain-name", "vcc-group-name" and
+ * "vcc-tcp-port" properties from the MD node, create and configure
+ * the port with its LDC channel, and wake up vntsd (polling the
+ * control port) so it can pick up the new port.  Any failure after
+ * i_vcc_add_port() deletes the partially created port again.
+ * Returns MDEG_SUCCESS or MDEG_FAILURE.
+ */
+static int
+i_vcc_md_add_port(md_t *mdp, mde_cookie_t mdep, vcc_t *vccp)
+{
+	uint64_t	portno;		/* md requires 64 bit */
+	char		*domain_name;
+	char		*group_name;
+	uint64_t	ldc_id;
+	uint64_t	tcp_port;
+	vcc_port_t	*vport;
+
+	/* read in the port's "id" property */
+	if (md_get_prop_val(mdp, mdep, "id", &portno)) {
+		cmn_err(CE_CONT, "i_vcc_md_add_port_: port has no 'id' "
+			"property\n");
+		return (MDEG_FAILURE);
+	}
+
+	/* read in the port's "vcc-domain-name" property */
+	if (md_get_prop_str(mdp, mdep, "vcc-domain-name", &domain_name)) {
+		cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has "
+			"no 'vcc-domain-name' property\n", portno);
+		return (MDEG_FAILURE);
+	}
+
+
+	/* read in the port's "vcc-group-name" property */
+	if (md_get_prop_str(mdp, mdep, "vcc-group-name", &group_name)) {
+		cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has no "
+			"'vcc-group-name'property\n", portno);
+		return (MDEG_FAILURE);
+	}
+
+
+	/* read in the port's "vcc-tcp-port" property */
+	if (md_get_prop_val(mdp, mdep, "vcc-tcp-port", &tcp_port)) {
+		cmn_err(CE_CONT, "i_vcc_md_add_port: port%ld has no"
+			"'vcc-tcp-port' property\n", portno);
+		return (MDEG_FAILURE);
+	}
+
+	D1("i_vcc_md_add_port: port@%d domain-name=%s group-name=%s"
+	    " tcp-port=%lld\n", portno, domain_name, group_name, tcp_port);
+
+	/* add the port */
+	if (i_vcc_add_port(vccp, group_name, tcp_port, portno, domain_name)) {
+		return (MDEG_FAILURE);
+	}
+
+	vport = &vccp->port[portno];
+	/* find the LDC channel id; undo the add on failure */
+	if (i_vcc_get_ldc_id(mdp, mdep, &ldc_id)) {
+		mutex_enter(&vport->lock);
+		(void) i_vcc_delete_port(vccp, vport);
+		mutex_exit(&vport->lock);
+		return (MDEG_FAILURE);
+	}
+
+	/* configure the port */
+	if (i_vcc_config_port(vccp, portno, ldc_id)) {
+		mutex_enter(&vport->lock);
+		(void) i_vcc_delete_port(vccp, vport);
+		mutex_exit(&vport->lock);
+		return (MDEG_FAILURE);
+	}
+
+	mutex_enter(&vccp->lock);
+	vccp->num_ports++;
+	mutex_exit(&vccp->lock);
+
+	vport = &vccp->port[VCC_CONTROL_PORT];
+
+	if (vport->pollflag & VCC_POLL_CONFIG) {
+		/* wakeup vntsd */
+		mutex_enter(&vport->lock);
+		vport->pollevent |= VCC_POLL_ADD_PORT;
+		mutex_exit(&vport->lock);
+		pollwakeup(&vport->poll, POLLIN);
+	}
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * MD event generator (mdeg) callback.
+ *
+ * Walk the lists of added and removed port nodes in the mdeg result
+ * and add/remove the corresponding vcc ports.  Stops and returns
+ * failure on the first port that cannot be processed.
+ */
+static int
+vcc_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
+{
+	int	idx;
+	vcc_t 	*vccp;
+	int	rv;
+
+	vccp = (vcc_t *)cb_argp;
+	ASSERT(vccp);
+
+	if (resp == NULL) {
+		return (MDEG_FAILURE);
+	}
+
+	/* added port */
+	D1("vcc_mdeg_cb: added %d port(s)\n", resp->added.nelem);
+
+	for (idx = 0; idx < resp->added.nelem; idx++) {
+		rv = i_vcc_md_add_port(resp->added.mdp,
+		    resp->added.mdep[idx], vccp);
+
+		if (rv !=  MDEG_SUCCESS) {
+			return (rv);
+		}
+	}
+
+	/* removed port */
+	D1("vcc_mdeg_cb: removed %d port(s)\n", resp->removed.nelem);
+
+	for (idx = 0; idx < resp->removed.nelem; idx++) {
+		rv = i_vcc_md_remove_port(resp->removed.mdp,
+		    resp->removed.mdep[idx], vccp);
+
+		if (rv !=  MDEG_SUCCESS) {
+			return (rv);
+		}
+	}
+
+	/*
+	 * XXX - Currently no support for updating already active
+	 * ports. So, ignore the match_curr and match_prev arrays
+	 * for now.
+	 */
+
+
+	return (MDEG_SUCCESS);
+}
+
+
+/*
+ * cb_chpoll entry point.
+ *
+ * Only the control port supports polling: vntsd polls it with POLLIN
+ * to be notified of port configuration changes (see the pollwakeup in
+ * i_vcc_md_add_port).  Any other port returns ENXIO, as does polling
+ * the control port for anything other than POLLIN.
+ */
+static int
+vcc_chpoll(dev_t dev, short events, int anyyet,  short *reventsp,
+    struct pollhead **phpp)
+{
+	int	    instance;
+	minor_t	    minor;
+	uint_t	    portno;
+	vcc_t	    *vccp;
+	vcc_port_t  *vport;
+
+	minor = getminor(dev);
+
+	instance = VCCINST(minor);
+
+	vccp = ddi_get_soft_state(vcc_ssp, instance);
+	if (vccp == NULL) {
+		return (ENXIO);
+	}
+
+	portno = VCCPORT(vccp, minor);
+
+	vport = &(vccp->port[portno]);
+
+	D1("vcc_chpoll: virtual-console-concentrator@%d events 0x%x\n",
+	    portno, events);
+
+	*reventsp = 0;
+
+	if (portno != VCC_CONTROL_PORT) {
+		return (ENXIO);
+	}
+
+	/* poll for config change */
+	if (vport->pollevent) {
+		*reventsp |= (events & POLLIN);
+	}
+
+	/* no event pending: register the pollhead for later wakeup */
+	if (((*reventsp) == 0) && (!anyyet)) {
+		*phpp = &vport->poll;
+		if (events & POLLIN) {
+			mutex_enter(&vport->lock);
+			vport->pollflag |= VCC_POLL_CONFIG;
+			mutex_exit(&vport->lock);
+		} else {
+			return (ENXIO);
+		}
+	}
+
+	D1("vcc_chpoll: virtual-console-concentrator@%d:%d ev=0x%x, "
+	    "rev=0x%x pev=0x%x, flag=0x%x\n",
+	    instance, portno, events, (*reventsp),
+	    vport->pollevent, vport->pollflag);
+
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vdc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,3560 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * LDoms virtual disk client (vdc) device driver
+ *
+ * This driver runs on a guest logical domain and communicates with the virtual
+ * disk server (vds) driver running on the service domain which is exporting
+ * virtualized "disks" to the guest logical domain.
+ *
+ * The driver can be divided into four sections:
+ *
+ * 1) generic device driver housekeeping
+ *	_init, _fini, attach, detach, ops structures, etc.
+ *
+ * 2) communication channel setup
+ *	Setup the communications link over the LDC channel that vdc uses to
+ *	talk to the vDisk server. Initialise the descriptor ring which
+ *	allows the LDC clients to transfer data via memory mappings.
+ *
+ * 3) Support exported to upper layers (filesystems, etc)
+ *	The upper layers call into vdc via strategy(9E) and DKIO(7I)
+ *	ioctl calls. vdc will copy the data to be written to the descriptor
+ *	ring or maps the buffer to store the data read by the vDisk
+ *	server into the descriptor ring. It then sends a message to the
+ *	vDisk server requesting it to complete the operation.
+ *
+ * 4) Handling responses from vDisk server.
+ *	The vDisk server will ACK some or all of the messages vdc sends to it
+ *	(this is configured during the handshake). Upon receipt of an ACK
+ *	vdc will check the descriptor ring and signal to the upper layer
+ *	code waiting on the IO.
+ */
+
+#include <sys/conf.h>
+#include <sys/disp.h>
+#include <sys/ddi.h>
+#include <sys/dkio.h>
+#include <sys/efi_partition.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/mach_descrip.h>
+#include <sys/modctl.h>
+#include <sys/mdeg.h>
+#include <sys/note.h>
+#include <sys/open.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/types.h>
+#include <sys/promif.h>
+#include <sys/vtoc.h>
+#include <sys/archsystm.h>
+#include <sys/sysmacros.h>
+
+#include <sys/cdio.h>
+#include <sys/dktp/cm.h>
+#include <sys/dktp/fdisk.h>
+#include <sys/scsi/generic/sense.h>
+#include <sys/scsi/impl/uscsi.h>	/* Needed for defn of USCSICMD ioctl */
+#include <sys/scsi/targets/sddef.h>
+
+#include <sys/ldoms.h>
+#include <sys/ldc.h>
+#include <sys/vio_common.h>
+#include <sys/vio_mailbox.h>
+#include <sys/vdsk_common.h>
+#include <sys/vdsk_mailbox.h>
+#include <sys/vdc.h>
+
+/*
+ * function prototypes
+ */
+
+/* standard driver functions */
+static int	vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred);
+static int	vdc_close(dev_t dev, int flag, int otyp, cred_t *cred);
+static int	vdc_strategy(struct buf *buf);
+static int	vdc_print(dev_t dev, char *str);
+static int	vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk);
+static int	vdc_read(dev_t dev, struct uio *uio, cred_t *cred);
+static int	vdc_write(dev_t dev, struct uio *uio, cred_t *cred);
+static int	vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
+			cred_t *credp, int *rvalp);
+static int	vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred);
+static int	vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred);
+
+static int	vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,
+			void *arg, void **resultp);
+static int	vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int	vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+
+/* setup */
+static int	vdc_send(ldc_handle_t ldc_handle, caddr_t pkt, size_t *msglen);
+static int	vdc_do_ldc_init(vdc_t *vdc);
+static int	vdc_start_ldc_connection(vdc_t *vdc);
+static int	vdc_create_device_nodes(vdc_t *vdc);
+static int	vdc_create_device_nodes_props(vdc_t *vdc);
+static int	vdc_get_ldc_id(dev_info_t *dip, uint64_t *ldc_id);
+static void	vdc_terminate_ldc(vdc_t *vdc);
+static int	vdc_init_descriptor_ring(vdc_t *vdc);
+static void	vdc_destroy_descriptor_ring(vdc_t *vdc);
+
+/* handshake with vds */
+static void		vdc_init_handshake_negotiation(void *arg);
+static int		vdc_init_ver_negotiation(vdc_t *vdc);
+static int		vdc_init_attr_negotiation(vdc_t *vdc);
+static int		vdc_init_dring_negotiate(vdc_t *vdc);
+static int		vdc_handle_ver_negotiate();
+static int		vdc_handle_attr_negotiate();
+static void		vdc_reset_connection(vdc_t *vdc, boolean_t resetldc);
+static boolean_t	vdc_is_able_to_tx_data(vdc_t *vdc, int flag);
+
+/* processing */
+static void	vdc_process_msg_thread(vdc_t *vdc);
+static uint_t	vdc_handle_cb(uint64_t event, caddr_t arg);
+static void	vdc_process_msg(void *arg);
+static int	vdc_process_ctrl_msg(vdc_t *vdc, vio_msg_t msg);
+static int	vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg);
+static int	vdc_process_err_msg(vdc_t *vdc, vio_msg_t msg);
+static void	vdc_do_process_msg(vdc_t *vdc);
+static int	vdc_get_next_dring_entry_id(vdc_t *vdc, uint_t needed);
+static int	vdc_populate_descriptor(vdc_t *vdc, caddr_t addr,
+			size_t nbytes, int op, uint64_t arg, uint64_t slice);
+static int	vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx,
+			vio_dring_msg_t dmsg);
+static int	vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx);
+static int	vdc_get_response(vdc_t *vdc, int start, int end);
+static int	vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx,
+			caddr_t addr, size_t nbytes, int operation);
+static boolean_t vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int
+			num_msgs);
+
+/* dkio */
+static int	vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode);
+static int	vdc_create_fake_geometry(vdc_t *vdc);
+
+/*
+ * Module variables
+ */
+uint64_t	vdc_hz_timeout;		/* vdc_usec_timeout in clock ticks */
+uint64_t	vdc_usec_timeout = VDC_USEC_TIMEOUT_MIN;
+uint64_t	vdc_dump_usec_timeout = VDC_USEC_TIMEOUT_MIN / 300;
+static int	vdc_retries = VDC_RETRIES;	/* cv_timedwait retry cap */
+static int	vdc_dump_retries = VDC_RETRIES * 10;
+
+/* Soft state pointer */
+static void	*vdc_state;
+
+/* variable level controlling the verbosity of the error/debug messages */
+int	vdc_msglevel = 0;
+
+
+/*
+ * printf-style wrapper around vcmn_err(CE_CONT, ...) used for the
+ * driver's error/debug messages.
+ */
+static void
+vdc_msg(const char *format, ...)
+{
+	va_list	args;
+
+	va_start(args, format);
+	vcmn_err(CE_CONT, format, args);
+	va_end(args);
+}
+
+/* character/block device entry points for vdc (cb_ops(9S)) */
+static struct cb_ops vdc_cb_ops = {
+	vdc_open,	/* cb_open */
+	vdc_close,	/* cb_close */
+	vdc_strategy,	/* cb_strategy */
+	vdc_print,	/* cb_print */
+	vdc_dump,	/* cb_dump */
+	vdc_read,	/* cb_read */
+	vdc_write,	/* cb_write */
+	vdc_ioctl,	/* cb_ioctl */
+	nodev,		/* cb_devmap */
+	nodev,		/* cb_mmap */
+	nodev,		/* cb_segmap */
+	nochpoll,	/* cb_chpoll */
+	ddi_prop_op,	/* cb_prop_op */
+	NULL,		/* cb_str */
+	D_MP | D_64BIT,	/* cb_flag */
+	CB_REV,		/* cb_rev */
+	vdc_aread,	/* cb_aread */
+	vdc_awrite	/* cb_awrite */
+};
+
+/* autoconfiguration entry points for vdc (dev_ops(9S)) */
+static struct dev_ops vdc_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	vdc_getinfo,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	vdc_attach,	/* devo_attach */
+	vdc_detach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	&vdc_cb_ops,	/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	nulldev		/* devo_power */
+};
+
+/* module linkage; "%I%" in the name is expanded by SCCS at checkin */
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"virtual disk client %I%",
+	&vdc_ops,
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Device Driver housekeeping and setup
+ */
+
+/*
+ * _init(9E): initialise the soft state list and install the module.
+ * The soft state list is torn down again if mod_install() fails.
+ */
+int
+_init(void)
+{
+	int	status;
+
+	if ((status = ddi_soft_state_init(&vdc_state, sizeof (vdc_t), 1)) != 0)
+		return (status);
+	if ((status = mod_install(&modlinkage)) != 0)
+		ddi_soft_state_fini(&vdc_state);
+	return (status);
+}
+
+/* _info(9E): report module information */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * _fini(9E): remove the module and, only if that succeeds, free the
+ * soft state list.
+ */
+int
+_fini(void)
+{
+	int	status;
+
+	if ((status = mod_remove(&modlinkage)) != 0)
+		return (status);
+	ddi_soft_state_fini(&vdc_state);
+	return (0);
+}
+
+/*
+ * getinfo(9E): translate a dev_t into the matching dev_info pointer
+ * (DDI_INFO_DEVT2DEVINFO) or instance number (DDI_INFO_DEVT2INSTANCE).
+ */
+static int
+vdc_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd,  void *arg, void **resultp)
+{
+	_NOTE(ARGUNUSED(dip))
+
+	/* instance is encoded in the minor number, decoded by SDUNIT */
+	int	instance = SDUNIT(getminor((dev_t)arg));
+	vdc_t	*vdc = NULL;
+
+	switch (cmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+			*resultp = NULL;
+			return (DDI_FAILURE);
+		}
+		*resultp = vdc->dip;
+		return (DDI_SUCCESS);
+	case DDI_INFO_DEVT2INSTANCE:
+		*resultp = (void *)(uintptr_t)instance;
+		return (DDI_SUCCESS);
+	default:
+		*resultp = NULL;
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * detach(9E).
+ *
+ * Refuses to detach while the device is open.  Otherwise disables the
+ * LDC callback, stops the handshake and the message processing thread,
+ * tears down the descriptor ring and the LDC channel, removes minor
+ * nodes and properties, and frees all per-instance allocations.  The
+ * vdc->initialized bit mask records what was set up and so guides the
+ * unwind; this also lets vdc_attach() call here on a partial attach.
+ */
+static int
+vdc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int	instance;
+	int	rv;
+	uint_t	retries = 0;
+	vdc_t	*vdc = NULL;
+
+	switch (cmd) {
+	case DDI_DETACH:
+		/* the real work happens below */
+		break;
+	case DDI_SUSPEND:
+		/* nothing to do for this non-device */
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	ASSERT(cmd == DDI_DETACH);
+	instance = ddi_get_instance(dip);
+	PR1("%s[%d] Entered\n", __func__, instance);
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s[%d]:  Could not get state structure.",
+		    __func__, instance);
+		return (DDI_FAILURE);
+	}
+
+	if (vdc->open) {
+		PR0("%s[%d]: Cannot detach: device is open",
+				__func__, instance);
+		return (DDI_FAILURE);
+	}
+
+	PR0("%s[%d] proceeding...\n", __func__, instance);
+
+	/*
+	 * try and disable callbacks to prevent another handshake
+	 */
+	rv = ldc_set_cb_mode(vdc->ldc_handle, LDC_CB_DISABLE);
+	PR0("%s[%d] callback disabled (rv=%d)\n", __func__, instance, rv);
+
+	/*
+	 * Prevent any more attempts to start a handshake with the vdisk
+	 * server and tear down the existing connection.
+	 */
+	mutex_enter(&vdc->lock);
+	vdc->initialized |= VDC_HANDSHAKE_STOP;
+	vdc_reset_connection(vdc, B_TRUE);
+	mutex_exit(&vdc->lock);
+
+	/* ask the message processing thread to stop and wait for it */
+	if (vdc->initialized & VDC_THREAD) {
+		mutex_enter(&vdc->msg_proc_lock);
+		vdc->msg_proc_thr_state = VDC_THR_STOP;
+		vdc->msg_pending = B_TRUE;
+		cv_signal(&vdc->msg_proc_cv);
+
+		while (vdc->msg_proc_thr_state != VDC_THR_DONE) {
+			PR0("%s[%d]: Waiting for thread to exit\n",
+				__func__, instance);
+			rv = cv_timedwait(&vdc->msg_proc_cv,
+				&vdc->msg_proc_lock, VD_GET_TIMEOUT_HZ(1));
+			/* give up after vdc_retries timeouts */
+			if ((rv == -1) && (retries++ > vdc_retries))
+				break;
+		}
+		mutex_exit(&vdc->msg_proc_lock);
+	}
+
+	mutex_enter(&vdc->lock);
+
+	if (vdc->initialized & VDC_DRING)
+		vdc_destroy_descriptor_ring(vdc);
+
+	if (vdc->initialized & VDC_LDC)
+		vdc_terminate_ldc(vdc);
+
+	mutex_exit(&vdc->lock);
+
+	if (vdc->initialized & VDC_MINOR) {
+		ddi_prop_remove_all(dip);
+		ddi_remove_minor_node(dip, NULL);
+	}
+
+	if (vdc->initialized & VDC_LOCKS) {
+		mutex_destroy(&vdc->lock);
+		mutex_destroy(&vdc->attach_lock);
+		mutex_destroy(&vdc->msg_proc_lock);
+		mutex_destroy(&vdc->dring_lock);
+		cv_destroy(&vdc->cv);
+		cv_destroy(&vdc->attach_cv);
+		cv_destroy(&vdc->msg_proc_cv);
+	}
+
+	if (vdc->minfo)
+		kmem_free(vdc->minfo, sizeof (struct dk_minfo));
+
+	if (vdc->cinfo)
+		kmem_free(vdc->cinfo, sizeof (struct dk_cinfo));
+
+	if (vdc->vtoc)
+		kmem_free(vdc->vtoc, sizeof (struct vtoc));
+
+	if (vdc->initialized & VDC_SOFT_STATE)
+		ddi_soft_state_free(vdc_state, instance);
+
+	PR0("%s[%d] End %p\n", __func__, instance, vdc);
+
+	return (DDI_SUCCESS);
+}
+
+
+/*
+ * Do the real work of attach(9E): allocate and initialise the per
+ * instance soft state, create the message processing thread, bring up
+ * the LDC channel to vds and wait (with bounded retries) until the
+ * handshake reaches VD_STATE_DATA, then fetch the VTOC and create the
+ * device nodes and their properties.
+ *
+ * On any failure the caller (vdc_attach) invokes vdc_detach() to
+ * undo whatever vdc->initialized records as having been set up.
+ */
+static int
+vdc_do_attach(dev_info_t *dip)
+{
+	int		instance;
+	vdc_t		*vdc = NULL;
+	int		status;
+	uint_t		retries = 0;
+
+	ASSERT(dip != NULL);
+
+	instance = ddi_get_instance(dip);
+	if (ddi_soft_state_zalloc(vdc_state, instance) != DDI_SUCCESS) {
+		vdc_msg("%s:(%d): Couldn't alloc state structure",
+		    __func__, instance);
+		return (DDI_FAILURE);
+	}
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s:(%d): Could not get state structure.",
+		    __func__, instance);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * We assign the value to initialized in this case to zero out the
+	 * variable and then set bits in it to indicate what has been done
+	 */
+	vdc->initialized = VDC_SOFT_STATE;
+
+	vdc_hz_timeout = drv_usectohz(vdc_usec_timeout);
+
+	vdc->dip	= dip;
+	vdc->instance	= instance;
+	vdc->open	= 0;
+	vdc->vdisk_type	= VD_DISK_TYPE_UNK;
+	vdc->state	= VD_STATE_INIT;
+	vdc->ldc_state	= 0;
+	vdc->session_id = 0;
+	vdc->block_size = DEV_BSIZE;
+	vdc->max_xfer_sz = VD_MAX_BLOCK_SIZE / DEV_BSIZE;
+
+	vdc->vtoc = NULL;
+	vdc->cinfo = NULL;
+	vdc->minfo = NULL;
+
+	mutex_init(&vdc->lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&vdc->attach_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&vdc->msg_proc_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&vdc->dring_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&vdc->cv, NULL, CV_DRIVER, NULL);
+	cv_init(&vdc->attach_cv, NULL, CV_DRIVER, NULL);
+	cv_init(&vdc->msg_proc_cv, NULL, CV_DRIVER, NULL);
+	vdc->initialized |= VDC_LOCKS;
+
+	/* start the thread that processes incoming messages from vds */
+	vdc->msg_pending = B_FALSE;
+	vdc->msg_proc_thr_id = thread_create(NULL, 0, vdc_process_msg_thread,
+		vdc, 0, &p0, TS_RUN, minclsyspri);
+	if (vdc->msg_proc_thr_id == NULL) {
+		cmn_err(CE_NOTE, "[%d] Failed to create msg processing thread",
+				instance);
+		return (DDI_FAILURE);
+	}
+	vdc->initialized |= VDC_THREAD;
+
+	/* initialise LDC channel which will be used to communicate with vds */
+	if (vdc_do_ldc_init(vdc) != 0) {
+		cmn_err(CE_NOTE, "[%d] Couldn't initialize LDC", instance);
+		return (DDI_FAILURE);
+	}
+
+	/* Bring up connection with vds via LDC */
+	status = vdc_start_ldc_connection(vdc);
+	if (status != 0) {
+		vdc_msg("%s[%d]  Could not start LDC", __func__, instance);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * We need to wait until the handshake has completed before leaving
+	 * the attach(). This is to allow the device node(s) to be created
+	 * and the first usage of the filesystem to succeed.
+	 */
+	mutex_enter(&vdc->attach_lock);
+	while ((vdc->ldc_state != LDC_UP) ||
+		(vdc->state != VD_STATE_DATA)) {
+
+		PR0("%s[%d] handshake in progress [VD %d (LDC %d)]\n",
+			__func__, instance, vdc->state, vdc->ldc_state);
+
+		status = cv_timedwait(&vdc->attach_cv, &vdc->attach_lock,
+				VD_GET_TIMEOUT_HZ(1));
+		if (status == -1) {
+			if (retries >= vdc_retries) {
+				PR0("%s[%d] Give up handshake wait.\n",
+						__func__, instance);
+				mutex_exit(&vdc->attach_lock);
+				return (DDI_FAILURE);
+			} else {
+				PR0("%s[%d] Retry #%d for handshake.\n",
+						__func__, instance, retries);
+				retries++;
+			}
+		}
+	}
+	mutex_exit(&vdc->attach_lock);
+
+	/* fetch the VTOC from vds; needed to size the device properties */
+	if (vdc->vtoc == NULL)
+		vdc->vtoc = kmem_zalloc(sizeof (struct vtoc), KM_SLEEP);
+
+	status = vdc_populate_descriptor(vdc, (caddr_t)vdc->vtoc,
+			P2ROUNDUP(sizeof (struct vtoc), sizeof (uint64_t)),
+			VD_OP_GET_VTOC, FKIOCTL, 0);
+	if (status) {
+		cmn_err(CE_NOTE, "[%d] Failed to get VTOC", instance);
+		return (status);
+	}
+
+	/*
+	 * Now that we have the device info we can create the
+	 * device nodes and properties
+	 */
+	status = vdc_create_device_nodes(vdc);
+	if (status) {
+		cmn_err(CE_NOTE, "[%d] Failed to create device nodes",
+				instance);
+		return (status);
+	}
+	status = vdc_create_device_nodes_props(vdc);
+	if (status) {
+		cmn_err(CE_NOTE, "[%d] Failed to create device nodes"
+				" properties", instance);
+		return (status);
+	}
+
+	ddi_report_dev(dip);
+
+	PR0("%s[%d] Attach completed\n", __func__, instance);
+	return (status);
+}
+
+/*
+ * attach(9E): dispatch DDI_ATTACH to vdc_do_attach(), calling
+ * vdc_detach() to unwind a partially completed attach on failure.
+ */
+static int
+vdc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int	status;
+
+	PR0("%s[%d]  Entered.  Built %s %s\n", __func__, ddi_get_instance(dip),
+		__DATE__, __TIME__);
+
+	switch (cmd) {
+	case DDI_ATTACH:
+		if ((status = vdc_do_attach(dip)) != 0)
+			(void) vdc_detach(dip, DDI_DETACH);
+		return (status);
+	case DDI_RESUME:
+		/* nothing to do for this non-device */
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * Initialise the LDC channel used to talk to vds: read the channel id
+ * from the MD, ldc_init() the channel, register the event callback
+ * and ldc_open() the channel.  Each completed step is recorded in
+ * vdc->initialized so that detach can unwind, and the checks on those
+ * bits make this function safe to call more than once.
+ */
+static int
+vdc_do_ldc_init(vdc_t *vdc)
+{
+	int			status = 0;
+	ldc_status_t		ldc_state;
+	ldc_attr_t		ldc_attr;
+	uint64_t		ldc_id = 0;
+	dev_info_t		*dip = NULL;
+
+	ASSERT(vdc != NULL);
+
+	dip = vdc->dip;
+	vdc->initialized |= VDC_LDC;
+
+	if ((status = vdc_get_ldc_id(dip, &ldc_id)) != 0) {
+		vdc_msg("%s:  Failed to get <ldc_id> property\n", __func__);
+		return (EIO);
+	}
+	vdc->ldc_id = ldc_id;
+
+	ldc_attr.devclass = LDC_DEV_BLK;
+	ldc_attr.instance = vdc->instance;
+	ldc_attr.mode = LDC_MODE_UNRELIABLE;	/* unreliable transport */
+	ldc_attr.qlen = VD_LDC_QLEN;
+
+	if ((vdc->initialized & VDC_LDC_INIT) == 0) {
+		status = ldc_init(ldc_id, &ldc_attr, &vdc->ldc_handle);
+		if (status != 0) {
+			cmn_err(CE_NOTE, "[%d] ldc_init(chan %ld) returned %d",
+					vdc->instance, ldc_id, status);
+			return (status);
+		}
+		vdc->initialized |= VDC_LDC_INIT;
+	}
+	status = ldc_status(vdc->ldc_handle, &ldc_state);
+	if (status != 0) {
+		vdc_msg("Cannot discover LDC status [err=%d].", status);
+		return (status);
+	}
+	vdc->ldc_state = ldc_state;
+
+	if ((vdc->initialized & VDC_LDC_CB) == 0) {
+		status = ldc_reg_callback(vdc->ldc_handle, vdc_handle_cb,
+		    (caddr_t)vdc);
+		if (status != 0) {
+			vdc_msg("%s: ldc_reg_callback()=%d", __func__, status);
+			return (status);
+		}
+		vdc->initialized |= VDC_LDC_CB;
+	}
+
+	/* NOTE(review): VDC_LDC was already set above; this is redundant */
+	vdc->initialized |= VDC_LDC;
+
+	/*
+	 * At this stage we have initialised LDC, we will now try and open
+	 * the connection.
+	 */
+	if (vdc->ldc_state == LDC_INIT) {
+		status = ldc_open(vdc->ldc_handle);
+		if (status != 0) {
+			cmn_err(CE_NOTE, "[%d] ldc_open(chan %ld) returned %d",
+					vdc->instance, vdc->ldc_id, status);
+			return (status);
+		}
+		vdc->initialized |= VDC_LDC_OPEN;
+	}
+
+	return (status);
+}
+
+/*
+ * Bring the LDC link up with ldc_up().  ECONNREFUSED (the peer is not
+ * listening yet) is not treated as an error -- the link can come up
+ * later.  Returns 0 if the link is already up or the attempt did not
+ * hard-fail.
+ */
+static int
+vdc_start_ldc_connection(vdc_t *vdc)
+{
+	int		status = 0;
+
+	ASSERT(vdc != NULL);
+
+	mutex_enter(&vdc->lock);
+
+	if (vdc->ldc_state == LDC_UP) {
+		PR0("%s:  LDC is already UP ..\n", __func__);
+		mutex_exit(&vdc->lock);
+		return (0);
+	}
+
+	if ((status = ldc_up(vdc->ldc_handle)) != 0) {
+		switch (status) {
+		case ECONNREFUSED:	/* listener not ready at other end */
+			PR0("%s: ldc_up(%d,...) return %d\n",
+					__func__, vdc->ldc_id, status);
+			status = 0;
+			break;
+		default:
+			cmn_err(CE_NOTE, "[%d] Failed to bring up LDC: "
+					"channel=%ld, err=%d",
+					vdc->instance, vdc->ldc_id, status);
+		}
+	}
+
+	PR0("%s[%d] Finished bringing up LDC\n", __func__, vdc->instance);
+
+	mutex_exit(&vdc->lock);
+
+	return (status);
+}
+
+
+/*
+ * Function:
+ *	vdc_create_device_nodes
+ *
+ * Description:
+ *	This function creates the block and character device nodes under
+ *	/devices along with the node properties. It is called as part of
+ *	the attach(9E) of the instance during the handshake with vds after
+ *	vds has sent the attributes to vdc.
+ *
+ *	If the device is of type VD_DISK_TYPE_SLICE then the minor node
+ *	of 2 is used in keeping with the Solaris convention that slice 2
+ *	refers to a whole disk. Slices start at 'a'
+ *
+ * Parameters:
+ *	vdc 		- soft state pointer
+ *
+ * Return Values
+ *	0		- Success
+ *	EIO		- Failed to create node
+ *	EINVAL		- Unknown type of disk exported
+ */
+static int
+vdc_create_device_nodes(vdc_t *vdc)
+{
+	/* uses NNNN which is OK as long as # of disks <= 10000 */
+	char		name[sizeof ("disk@NNNN:s,raw")];
+	dev_info_t	*dip = NULL;
+	int		instance;
+	int		num_slices = 1;
+	int		i;
+
+	ASSERT(vdc != NULL);
+
+	instance = vdc->instance;
+	dip = vdc->dip;
+
+	switch (vdc->vdisk_type) {
+	case VD_DISK_TYPE_DISK:
+		num_slices = V_NUMPAR;
+		break;
+	case VD_DISK_TYPE_SLICE:
+		num_slices = 1;
+		break;
+	case VD_DISK_TYPE_UNK:
+	default:
+		return (EINVAL);
+	}
+
+	/* one block node and one raw (character) node per slice */
+	for (i = 0; i < num_slices; i++) {
+		(void) snprintf(name, sizeof (name), "%c", 'a' + i);
+		if (ddi_create_minor_node(dip, name, S_IFBLK,
+		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
+			vdc_msg("%s[%d]: Couldn't add block node %s.",
+				__func__, instance, name);
+			return (EIO);
+		}
+
+		/* if any device node is created we set this flag */
+		vdc->initialized |= VDC_MINOR;
+
+		(void) snprintf(name, sizeof (name), "%c%s",
+			'a' + i, ",raw");
+		if (ddi_create_minor_node(dip, name, S_IFCHR,
+		    VD_MAKE_DEV(instance, i), DDI_NT_BLOCK, 0) != DDI_SUCCESS) {
+			vdc_msg("%s[%d]:  Could not add raw node %s.",
+				__func__, instance, name);
+			return (EIO);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_create_device_nodes_props
+ *
+ * Description:
+ *	This function creates the "size" and "nblocks" properties for each
+ *	minor node, derived from the slice sizes in the VTOC.  It is called
+ *	as part of the attach(9E) of the instance during the handshake with
+ *	vds after vds has sent the attributes to vdc.
+ *
+ * Parameters:
+ *	vdc 		- soft state pointer
+ *
+ * Return Values
+ *	0		- Success
+ *	EIO		- Failed to create device node property
+ *	ENXIO		- No valid VTOC available
+ *	EINVAL		- Unknown type of disk exported
+ */
+static int
+vdc_create_device_nodes_props(vdc_t *vdc)
+{
+	dev_info_t	*dip = NULL;
+	int		instance;
+	int		num_slices = 1;
+	int64_t		size = 0;
+	dev_t		dev;
+	int		rv;
+	int		i;
+
+	ASSERT(vdc != NULL);
+
+	instance = vdc->instance;
+	dip = vdc->dip;
+
+	if ((vdc->vtoc == NULL) || (vdc->vtoc->v_sanity != VTOC_SANE)) {
+		cmn_err(CE_NOTE, "![%d] Could not create device node property."
+				" No VTOC available", instance);
+		return (ENXIO);
+	}
+
+	switch (vdc->vdisk_type) {
+	case VD_DISK_TYPE_DISK:
+		num_slices = V_NUMPAR;
+		break;
+	case VD_DISK_TYPE_SLICE:
+		num_slices = 1;
+		break;
+	case VD_DISK_TYPE_UNK:
+	default:
+		return (EINVAL);
+	}
+
+	for (i = 0; i < num_slices; i++) {
+		dev = makedevice(ddi_driver_major(dip),
+			VD_MAKE_DEV(instance, i));
+
+		/* slice size in bytes = sectors * sector size */
+		size = vdc->vtoc->v_part[i].p_size * vdc->vtoc->v_sectorsz;
+		PR0("%s[%d] sz %ld (%ld Mb)  p_size %lx\n",
+				__func__, instance, size, size / (1024 * 1024),
+				vdc->vtoc->v_part[i].p_size);
+
+		/* NOTE(review): "%d" below is used for an int64_t size */
+		rv = ddi_prop_update_int64(dev, dip, VDC_SIZE_PROP_NAME, size);
+		if (rv != DDI_PROP_SUCCESS) {
+			vdc_msg("%s:(%d): Couldn't add \"%s\" [%d]\n",
+				__func__, instance, VDC_SIZE_PROP_NAME, size);
+			return (EIO);
+		}
+
+		rv = ddi_prop_update_int64(dev, dip, VDC_NBLOCKS_PROP_NAME,
+			lbtodb(size));
+		if (rv != DDI_PROP_SUCCESS) {
+			vdc_msg("%s:(%d): Couldn't add \"%s\" [%d]\n", __func__,
+				instance, VDC_NBLOCKS_PROP_NAME, lbtodb(size));
+			return (EIO);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_open()
+ *
+ * Description:
+ *	open(9E) entry point.  Validates the open type, looks up the soft
+ *	state for the instance encoded in the minor number, and verifies
+ *	that the handshake with vds has completed (blocking in
+ *	vdc_is_able_to_tx_data() unless O_NONBLOCK/O_NDELAY is set in
+ *	'flag') before counting the open.
+ *
+ * Return Values
+ *	0		- Success
+ *	EINVAL		- open type is neither OTYP_CHR nor OTYP_BLK
+ *	ENXIO		- no soft state for this instance
+ *	ENOLINK		- not able to communicate with vds
+ */
+static int
+vdc_open(dev_t *dev, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	int		instance;
+	int		status = 0;
+	vdc_t		*vdc;
+
+	ASSERT(dev != NULL);
+	instance = SDUNIT(getminor(*dev));
+
+	PR0("%s[%d] minor = %d flag = %x, otyp = %x\n", __func__, instance,
+			getminor(*dev), flag, otyp);
+
+	if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK))
+		return (EINVAL);
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s[%d] Could not get state.", __func__, instance);
+		return (ENXIO);
+	}
+
+	/*
+	 * Check to see if we can communicate with vds
+	 */
+	status = vdc_is_able_to_tx_data(vdc, flag);
+	if (status == B_FALSE) {
+		PR0("%s[%d] Not ready to transmit data\n", __func__, instance);
+		return (ENOLINK);
+	}
+
+	/* count the open; balanced by the decrement in vdc_close() */
+	mutex_enter(&vdc->lock);
+	vdc->open++;
+	mutex_exit(&vdc->lock);
+
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_close()
+ *
+ * Description:
+ *	close(9E) entry point.  Validates the open type and soft state,
+ *	checks that vds is still reachable and that no DKIO flushes are
+ *	outstanding, then decrements the open count taken in vdc_open().
+ *
+ * Return Values
+ *	0		- Success
+ *	EINVAL		- open type is neither OTYP_CHR nor OTYP_BLK
+ *	ENXIO		- no soft state for this instance
+ *	ETIMEDOUT	- not able to communicate with vds
+ *	EBUSY		- DKIO flushes still outstanding
+ */
+static int
+vdc_close(dev_t dev, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	int	instance;
+	vdc_t	*vdc;
+
+	instance = SDUNIT(getminor(dev));
+
+	PR0("%s[%d] flag = %x, otyp = %x\n", __func__, instance, flag, otyp);
+
+	if ((otyp != OTYP_CHR) && (otyp != OTYP_BLK))
+		return (EINVAL);
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s[%d] Could not get state.", __func__, instance);
+		return (ENXIO);
+	}
+
+	/*
+	 * Check to see if we can communicate with vds
+	 */
+	if (vdc_is_able_to_tx_data(vdc, 0) == B_FALSE) {
+		PR0("%s[%d] Not ready to transmit data\n", __func__, instance);
+		return (ETIMEDOUT);
+	}
+
+	/*
+	 * NOTE(review): an early error return above leaves vdc->open
+	 * elevated even though this close will not be retried -- confirm
+	 * this is the intended behaviour for detach accounting.
+	 */
+	if (vdc->dkio_flush_pending) {
+		PR0("%s[%d]: Cannot detach: %d outstanding DKIO flushes",
+			__func__, instance, vdc->dkio_flush_pending);
+		return (EBUSY);
+	}
+
+	/*
+	 * Should not need the mutex here, since the framework should protect
+	 * against more opens on this device, but just in case.
+	 */
+	mutex_enter(&vdc->lock);
+	vdc->open--;
+	mutex_exit(&vdc->lock);
+
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_ioctl()
+ *
+ * Description:
+ *	ioctl(9E) entry point; all processing is delegated to
+ *	vd_process_ioctl() with the user argument passed through as-is.
+ */
+static int
+vdc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
+{
+	_NOTE(ARGUNUSED(credp))
+	_NOTE(ARGUNUSED(rvalp))
+
+	return (vd_process_ioctl(dev, cmd, (caddr_t)arg, mode));
+}
+
+/*
+ * Function:
+ *	vdc_print()
+ *
+ * Description:
+ *	print(9E) entry point - logs 'str' via cmn_err(9F), prefixed with
+ *	the driver name and instance number.  Always returns 0.
+ */
+static int
+vdc_print(dev_t dev, char *str)
+{
+	cmn_err(CE_NOTE, "vdc%d:  %s", SDUNIT(getminor(dev)), str);
+	return (0);
+}
+
+/*
+ * Function:
+ *	vdc_dump()
+ *
+ * Description:
+ *	dump(9E) entry point - writes 'nblk' blocks of crash dump data
+ *	starting at block 'blkno' through the normal descriptor ring path.
+ *
+ * Return Values
+ *	0		- Success
+ *	ENXIO		- no soft state for this instance
+ *	xxx		- error returned by vdc_populate_descriptor()
+ */
+static int
+vdc_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
+{
+	int			rv = 0;
+	size_t			nbytes = (nblk * DEV_BSIZE);
+	int			instance = SDUNIT(getminor(dev));
+	vdc_t			*vdc;
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s (%d):  Could not get state.", __func__, instance);
+		return (ENXIO);
+	}
+
+	rv = vdc_populate_descriptor(vdc, addr, nbytes, VD_OP_BWRITE,
+					blkno, SDPART(getminor(dev)));
+
+	PR1("%s: status=%d\n", __func__, rv);
+
+	return (rv);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Disk access routines
+ *
+ */
+
+/*
+ * vdc_strategy()
+ *
+ * Description:
+ *	strategy(9E) entry point.  Maps the buffer in, writes the request
+ *	into the next free descriptor ring entry and waits for vds to
+ *	process it.  All failures are reported through bioerror(9F) and
+ *	biodone(9F) rather than through the return value.
+ *
+ * Return Value:
+ *	0:	As per strategy(9E), the strategy() function must return 0
+ *		[ bioerror(9f) sets b_flags to the proper error code ]
+ */
+static int
+vdc_strategy(struct buf *buf)
+{
+	int		rv = -1;
+	vdc_t		*vdc = NULL;
+	int		instance = SDUNIT(getminor(buf->b_edev));
+	int	op = (buf->b_flags & B_READ) ? VD_OP_BREAD : VD_OP_BWRITE;
+
+	PR1("%s: %s %ld bytes at block %ld : b_addr=0x%p",
+	    __func__, (buf->b_flags & B_READ) ? "Read" : "Write",
+	    buf->b_bcount, buf->b_lblkno, buf->b_un.b_addr);
+
+	if ((vdc = ddi_get_soft_state(vdc_state, instance)) == NULL) {
+		vdc_msg("%s[%d]:  Could not get state.", __func__, instance);
+		bioerror(buf, ENXIO);
+		biodone(buf);
+		return (0);
+	}
+
+	ASSERT(buf->b_bcount <= (vdc->max_xfer_sz * vdc->block_size));
+
+	/* O_NONBLOCK: strategy(9E) must not block waiting for the handshake */
+	if (vdc_is_able_to_tx_data(vdc, O_NONBLOCK) == B_FALSE) {
+		vdc_msg("%s: Not ready to transmit data", __func__);
+		bioerror(buf, ENXIO);
+		biodone(buf);
+		return (0);
+	}
+	bp_mapin(buf);
+
+	rv = vdc_populate_descriptor(vdc, buf->b_un.b_addr, buf->b_bcount, op,
+			buf->b_lblkno, SDPART(getminor(buf->b_edev)));
+
+	PR1("%s: status=%d", __func__, rv);
+	bioerror(buf, rv);
+	biodone(buf);
+	return (0);
+}
+
+
+/*
+ * vdc_read() - read(9E) entry point; hands off to vdc_strategy() via
+ * physio(9F).
+ */
+static int
+vdc_read(dev_t dev, struct uio *uio, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	PR1("vdc_read():  Entered");
+	return (physio(vdc_strategy, NULL, dev, B_READ, minphys, uio));
+}
+
+/*
+ * vdc_write() - write(9E) entry point; hands off to vdc_strategy() via
+ * physio(9F).
+ */
+static int
+vdc_write(dev_t dev, struct uio *uio, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	PR1("vdc_write():  Entered");
+	return (physio(vdc_strategy, NULL, dev, B_WRITE, minphys, uio));
+}
+
+/*
+ * vdc_aread() - aread(9E) entry point; asynchronous read through
+ * vdc_strategy() via aphysio(9F).
+ */
+static int
+vdc_aread(dev_t dev, struct aio_req *aio, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	PR1("vdc_aread():  Entered");
+	return (aphysio(vdc_strategy, anocancel, dev, B_READ, minphys, aio));
+}
+
+/*
+ * vdc_awrite() - awrite(9E) entry point; asynchronous write through
+ * vdc_strategy() via aphysio(9F).
+ */
+static int
+vdc_awrite(dev_t dev, struct aio_req *aio, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(cred))
+
+	PR1("vdc_awrite():  Entered");
+	return (aphysio(vdc_strategy, anocancel, dev, B_WRITE, minphys, aio));
+}
+
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Handshake support
+ */
+
+/*
+ * vdc_init_handshake_negotiation
+ *
+ * Description:
+ *	This function is called to trigger the handshake negotiations between
+ *	the client (vdc) and the server (vds). It may be called multiple times.
+ *	Depending on how far the previous negotiation got (vdc->state), the
+ *	next phase (version, attribute or DRing registration) is kicked off.
+ *
+ * Parameters:
+ *	vdc - soft state pointer
+ */
+static void
+vdc_init_handshake_negotiation(void *arg)
+{
+	vdc_t		*vdc = (vdc_t *)(void *)arg;
+	vd_state_t	state;
+
+	ASSERT(vdc != NULL);
+	ASSERT(vdc->ldc_state == LDC_UP);
+
+	mutex_enter(&vdc->lock);
+
+	/*
+	 * Do not continue if another thread has triggered a handshake which
+	 * is in progress or detach() has stopped further handshakes.
+	 */
+	if (vdc->initialized & (VDC_HANDSHAKE | VDC_HANDSHAKE_STOP)) {
+		PR0("%s[%d] Negotiation not triggered. [init=%x]\n",
+			__func__, vdc->instance, vdc->initialized);
+		mutex_exit(&vdc->lock);
+		return;
+	}
+
+	PR0("Initializing vdc<->vds handshake\n");
+
+	vdc->initialized |= VDC_HANDSHAKE;
+
+	state = vdc->state;
+
+	/* dispatch on how far the handshake state machine has progressed */
+	if (state == VD_STATE_INIT) {
+		(void) vdc_init_ver_negotiation(vdc);
+	} else if (state == VD_STATE_VER) {
+		(void) vdc_init_attr_negotiation(vdc);
+	} else if (state == VD_STATE_ATTR) {
+		(void) vdc_init_dring_negotiate(vdc);
+	} else if (state == VD_STATE_DATA) {
+		/*
+		 * nothing to do - we have already completed the negotiation
+		 * and we can transmit data when ready.
+		 */
+		PR0("%s[%d] Negotiation triggered after handshake completed",
+			__func__, vdc->instance);
+	}
+
+	mutex_exit(&vdc->lock);
+}
+
+/*
+ * Function:
+ *	vdc_init_ver_negotiation()
+ *
+ * Description:
+ *	Builds and sends the version info (VIO_VER_INFO) message to vds as
+ *	the first step of the handshake.  Also picks a fresh session id
+ *	which is used to tag all subsequent messages on this connection.
+ *	Caller must hold vdc->lock.
+ *
+ * Return Code:
+ *	0		- Success
+ *	ENOMSG		- not all of the message was written to the channel
+ *	xxx		- error returned by vdc_send()
+ */
+static int
+vdc_init_ver_negotiation(vdc_t *vdc)
+{
+	vio_ver_msg_t	pkt;
+	size_t		msglen = sizeof (pkt);
+	int		status = -1;
+
+	PR0("%s: Entered.\n", __func__);
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	/*
+	 * set the Session ID to a unique value
+	 * (the lower 32 bits of the clock tick)
+	 */
+	vdc->session_id = ((uint32_t)gettick() & 0xffffffff);
+
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_VER_INFO;
+	pkt.tag.vio_sid = vdc->session_id;
+	pkt.dev_class = VDEV_DISK;
+	pkt.ver_major = VD_VER_MAJOR;
+	pkt.ver_minor = VD_VER_MINOR;
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	PR0("%s: vdc_send(status = %d)\n", __func__, status);
+
+	/* a short write means the message did not make it out intact */
+	if ((status != 0) || (msglen != sizeof (vio_ver_msg_t))) {
+		PR0("%s[%d] vdc_send failed: id(%lx) rv(%d) size(%d)\n",
+				__func__, vdc->instance, vdc->ldc_handle,
+				status, msglen);
+		if (msglen != sizeof (vio_ver_msg_t))
+			status = ENOMSG;
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_init_attr_negotiation()
+ *
+ * Description:
+ *	Builds and sends the attribute info (VIO_ATTR_INFO) message to vds
+ *	as the second step of the handshake.  The server fills in the
+ *	operations/vdisk_type/vdisk_size fields in its reply.
+ *	Caller must hold vdc->lock.
+ *
+ * Return Code:
+ *	0		- Success
+ *	ENOMSG		- not all of the message was written to the channel
+ *	xxx		- error returned by vdc_send()
+ */
+static int
+vdc_init_attr_negotiation(vdc_t *vdc)
+{
+	vd_attr_msg_t	pkt;
+	size_t		msglen = sizeof (pkt);
+	int		status;
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	PR0("%s[%d] entered\n", __func__, vdc->instance);
+
+	/* fill in tag */
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_ATTR_INFO;
+	pkt.tag.vio_sid = vdc->session_id;
+	/* fill in payload */
+	pkt.max_xfer_sz = vdc->max_xfer_sz;
+	pkt.vdisk_block_size = vdc->block_size;
+	pkt.xfer_mode = VIO_DRING_MODE;
+	pkt.operations = 0;	/* server will set bits of valid operations */
+	pkt.vdisk_type = 0;	/* server will set to valid device type */
+	pkt.vdisk_size = 0;	/* server will set to valid size */
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	PR0("%s: vdc_send(status = %d)\n", __func__, status);
+
+	/*
+	 * Compare against the size of the attr message actually sent
+	 * (previously this incorrectly checked sizeof (vio_ver_msg_t)).
+	 */
+	if ((status != 0) || (msglen != sizeof (vd_attr_msg_t))) {
+		PR0("%s[%d] ldc_write failed: id(%lx) rv(%d) size (%d)\n",
+			__func__, vdc->instance, vdc->ldc_handle,
+			status, msglen);
+		if (msglen != sizeof (vd_attr_msg_t))
+			status = ENOMSG;
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_init_dring_negotiate()
+ *
+ * Description:
+ *	Creates and binds the local descriptor ring, then sends the DRing
+ *	registration (VIO_DRING_REG) message to vds as the third step of
+ *	the handshake.  On any failure the connection is reset (without
+ *	resetting the LDC channel itself).  Caller must hold vdc->lock.
+ *
+ * Return Code:
+ *	0		- Success
+ *	xxx		- error from vdc_init_descriptor_ring() or vdc_send()
+ */
+static int
+vdc_init_dring_negotiate(vdc_t *vdc)
+{
+	vio_dring_reg_msg_t	pkt;
+	size_t			msglen = sizeof (pkt);
+	int			status = -1;
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	status = vdc_init_descriptor_ring(vdc);
+	PR0("%s[%d] Init of descriptor ring completed (status = %d)\n",
+			__func__, vdc->instance, status);
+	if (status != 0) {
+		cmn_err(CE_CONT, "[%d] Failed to init DRing (status = %d)\n",
+				vdc->instance, status);
+		vdc_reset_connection(vdc, B_FALSE);
+		return (status);
+	}
+
+	/* fill in tag */
+	pkt.tag.vio_msgtype = VIO_TYPE_CTRL;
+	pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	pkt.tag.vio_subtype_env = VIO_DRING_REG;
+	pkt.tag.vio_sid = vdc->session_id;
+	/* fill in payload */
+	pkt.dring_ident = 0;
+	pkt.num_descriptors = VD_DRING_LEN;
+	pkt.descriptor_size = VD_DRING_ENTRY_SZ;
+	pkt.options = (VIO_TX_DRING | VIO_RX_DRING);
+	pkt.ncookies = vdc->dring_cookie_count;
+	pkt.cookie[0] = vdc->dring_cookie[0];	/* for now just one cookie */
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&pkt, &msglen);
+	if (status != 0) {
+		PR0("%s[%d] Failed to register DRing (status = %d)\n",
+				__func__, vdc->instance, status);
+		vdc_reset_connection(vdc, B_FALSE);
+	}
+
+	return (status);
+}
+
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * LDC helper routines
+ */
+
+/*
+ * Function:
+ *	vdc_send()
+ *
+ * Description:
+ *	The function encapsulates the call to write a message using LDC.
+ *	If LDC indicates that the call failed due to the queue being full,
+ *	we retry the ldc_write() [ up to 'vdc_retries' time ], otherwise
+ *	we return the error returned by LDC.
+ *
+ * Arguments:
+ *	ldc_handle	- LDC handle for the channel this instance of vdc uses
+ *	pkt		- address of LDC message to be sent
+ *	msglen		- the size of the message being sent. When the function
+ *			  returns, this contains the number of bytes written.
+ *
+ * Return Code:
+ *	0		- Success.
+ *	EWOULDBLOCK	- LDC queue was still full after all retries
+ *	xxx		- other error codes returned by ldc_write
+ *
+ *	Note: pkt/msglen validity is only enforced by ASSERTs; no EINVAL
+ *	is returned for bad arguments on non-DEBUG kernels.
+ */
+static int
+vdc_send(ldc_handle_t ldc_handle, caddr_t pkt, size_t *msglen)
+{
+	size_t	size = 0;
+	int	retries = 0;
+	int	status = 0;
+
+	ASSERT(msglen != NULL);
+	ASSERT(*msglen != 0);
+
+	/* reload 'size' each pass: ldc_write() updates it in place */
+	do {
+		size = *msglen;
+		status = ldc_write(ldc_handle, pkt, &size);
+	} while (status == EWOULDBLOCK && retries++ < vdc_retries);
+
+	/* return the last size written */
+	*msglen = size;
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_get_ldc_id()
+ *
+ * Description:
+ *	This function gets the 'ldc-id' for this particular instance of vdc.
+ *	The id returned is the guest domain channel endpoint LDC uses for
+ *	communication with vds.  It walks the Machine Description (MD),
+ *	matches a virtual-disk node against the OBP "reg" property of this
+ *	devinfo node, and reads the id off that node's first channel.
+ *
+ * Arguments:
+ *	dip	- dev info pointer for this instance of the device driver.
+ *	ldc_id	- pointer to variable used to return the 'ldc-id' found.
+ *
+ * Return Code:
+ *	0	- Success.
+ *	ENOENT	- Expected node or property did not exist.
+ *	ENXIO	- Unexpected error communicating with MD framework
+ */
+static int
+vdc_get_ldc_id(dev_info_t *dip, uint64_t *ldc_id)
+{
+	int		status = ENOENT;
+	char		*node_name = NULL;
+	md_t		*mdp = NULL;
+	int		num_nodes;
+	int		num_vdevs;
+	int		num_chans;
+	mde_cookie_t	rootnode;
+	mde_cookie_t	*listp = NULL;
+	mde_cookie_t	*chanp = NULL;
+	boolean_t	found_inst = B_FALSE;
+	int		listsz;
+	int		idx;
+	uint64_t	md_inst;
+	int		obp_inst;
+	int		instance = ddi_get_instance(dip);
+
+	ASSERT(ldc_id != NULL);
+	*ldc_id = 0;
+
+	/*
+	 * Get the OBP instance number for comparison with the MD instance
+	 *
+	 * The "cfg-handle" property of a vdc node in an MD contains the MD's
+	 * notion of "instance", or unique identifier, for that node; OBP
+	 * stores the value of the "cfg-handle" MD property as the value of
+	 * the "reg" property on the node in the device tree it builds from
+	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
+	 * "reg" property value to uniquely identify this device instance.
+	 * If the "reg" property cannot be found, the device tree state is
+	 * presumably so broken that there is no point in continuing.
+	 */
+	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, OBP_REG)) {
+		cmn_err(CE_WARN, "'%s' property does not exist", OBP_REG);
+		return (ENOENT);
+	}
+	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+			OBP_REG, -1);
+	PR1("%s[%d]: OBP inst=%d\n", __func__, instance, obp_inst);
+
+	/*
+	 * We now walk the MD nodes and if an instance of a vdc node matches
+	 * the instance got from OBP we get the ldc-id property.
+	 */
+	if ((mdp = md_get_handle()) == NULL) {
+		cmn_err(CE_WARN, "unable to init machine description");
+		return (ENXIO);
+	}
+
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes > 0);
+
+	/* worst case: every MD node could appear in a scan result */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+
+	/* allocate memory for nodes */
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+	chanp = kmem_zalloc(listsz, KM_SLEEP);
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	/*
+	 * Search for all the virtual devices, we will then check to see which
+	 * ones are disk nodes.
+	 */
+	num_vdevs = md_scan_dag(mdp, rootnode,
+			md_find_name(mdp, VDC_MD_VDEV_NAME),
+			md_find_name(mdp, "fwd"), listp);
+
+	if (num_vdevs <= 0) {
+		cmn_err(CE_NOTE, "No '%s' node found", VDC_MD_VDEV_NAME);
+		status = ENOENT;
+		goto done;
+	}
+
+	PR1("%s[%d] num_vdevs=%d\n", __func__, instance, num_vdevs);
+	for (idx = 0; idx < num_vdevs; idx++) {
+		status = md_get_prop_str(mdp, listp[idx], "name", &node_name);
+		if ((status != 0) || (node_name == NULL)) {
+			cmn_err(CE_NOTE, "Unable to get name of node type '%s'"
+					": err %d", VDC_MD_VDEV_NAME, status);
+			continue;
+		}
+
+		PR1("%s[%d] Found node %s\n", __func__, instance, node_name);
+		if (strcmp(VDC_MD_DISK_NAME, node_name) == 0) {
+			/* NOTE(review): md_inst is uint64_t but printed %d */
+			status = md_get_prop_val(mdp, listp[idx],
+					VDC_MD_CFG_HDL, &md_inst);
+			PR1("%s[%d] vdc inst# in MD=%d\n",
+					__func__, instance, md_inst);
+			if ((status == 0) && (md_inst == obp_inst)) {
+				found_inst = B_TRUE;
+				break;
+			}
+		}
+	}
+
+	if (found_inst == B_FALSE) {
+		cmn_err(CE_NOTE, "Unable to find correct '%s' node",
+				VDC_MD_DISK_NAME);
+		status = ENOENT;
+		goto done;
+	}
+	PR0("%s[%d] MD inst=%d\n", __func__, instance, md_inst);
+
+	/* get the channels for this node */
+	num_chans = md_scan_dag(mdp, listp[idx],
+			md_find_name(mdp, VDC_MD_CHAN_NAME),
+			md_find_name(mdp, "fwd"), chanp);
+
+	/* expecting at least one channel */
+	if (num_chans <= 0) {
+		cmn_err(CE_NOTE, "No '%s' node for '%s' port",
+				VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME);
+		status = ENOENT;
+		goto done;
+
+	} else if (num_chans != 1) {
+		PR0("%s[%d] Expected 1 '%s' node for '%s' port, found %d\n",
+			__func__, instance, VDC_MD_CHAN_NAME, VDC_MD_VDEV_NAME,
+			num_chans);
+	}
+
+	/*
+	 * We use the first channel found (index 0), irrespective of how
+	 * many are there in total.
+	 */
+	if (md_get_prop_val(mdp, chanp[0], VDC_ID_PROP, ldc_id) != 0) {
+		cmn_err(CE_NOTE, "Channel '%s' property not found",
+				VDC_ID_PROP);
+		status = ENOENT;
+	}
+
+	PR0("%s[%d] LDC id is 0x%lx\n", __func__, instance, *ldc_id);
+
+done:
+	if (chanp)
+		kmem_free(chanp, listsz);
+	if (listp)
+		kmem_free(listp, listsz);
+
+	(void) md_fini_handle(mdp);
+
+	return (status);
+}
+
+
+/*
+ * vdc_is_able_to_tx_data()
+ *
+ * Description:
+ *	This function checks if we are able to send data to the
+ *	vDisk server (vds). The LDC connection needs to be up and
+ *	vdc & vds need to have completed the handshake negotiation.
+ *
+ * Parameters:
+ *	vdc 		- soft state pointer
+ *	flag		- flag to indicate if we can block or not
+ *			  [ If O_NONBLOCK or O_NDELAY (which are defined in
+ *			    open(2)) are set then do not block)
+ *
+ * Return Values
+ *	B_TRUE		- can talk to vds
+ *	B_FALSE		- unable to talk to vds
+ */
+static boolean_t
+vdc_is_able_to_tx_data(vdc_t *vdc, int flag)
+{
+	vd_state_t	state;
+	uint32_t	ldc_state;
+	uint_t		retries = 0;
+	int		rv = -1;
+
+	ASSERT(vdc != NULL);
+
+	/* snapshot the states under the lock for a consistent fast-path read */
+	mutex_enter(&vdc->lock);
+	state = vdc->state;
+	ldc_state = vdc->ldc_state;
+	mutex_exit(&vdc->lock);
+
+	if ((state == VD_STATE_DATA) && (ldc_state == LDC_UP))
+		return (B_TRUE);
+
+	if ((flag & O_NONBLOCK) || (flag & O_NDELAY)) {
+		PR0("%s[%d] Not ready to tx - state %d LDC state %d\n",
+			__func__, vdc->instance, state, ldc_state);
+		return (B_FALSE);
+	}
+
+	/*
+	 * We want to check and see if any negotiations triggered earlier
+	 * have succeeded. We are prepared to wait a little while in case
+	 * they are still in progress.
+	 */
+	mutex_enter(&vdc->lock);
+	while ((vdc->ldc_state != LDC_UP) || (vdc->state != VD_STATE_DATA)) {
+		PR0("%s: Waiting for connection at state %d (LDC state %d)\n",
+			__func__, vdc->state, vdc->ldc_state);
+
+		/* wait, with a timeout that grows with each retry */
+		rv = cv_timedwait(&vdc->cv, &vdc->lock,
+			VD_GET_TIMEOUT_HZ(retries));
+
+		/*
+		 * An rv of -1 indicates that we timed out without the LDC
+		 * state changing so it looks like the other side (vdc) is
+		 * not yet ready/responding.
+		 *
+		 * Any other value of rv indicates that the LDC triggered an
+		 * interrupt so we just loop again, check the handshake state
+		 * and keep waiting if necessary.
+		 */
+		if (rv == -1) {
+			if (retries >= vdc_retries) {
+				PR0("%s[%d] handshake wait timed out.\n",
+						__func__, vdc->instance);
+				mutex_exit(&vdc->lock);
+				return (B_FALSE);
+			} else {
+				PR1("%s[%d] Retry #%d for handshake timedout\n",
+					__func__, vdc->instance, retries);
+				retries++;
+			}
+		}
+	}
+
+	ASSERT(vdc->ldc_state == LDC_UP);
+	ASSERT(vdc->state == VD_STATE_DATA);
+
+	mutex_exit(&vdc->lock);
+
+	return (B_TRUE);
+}
+
+
+/*
+ * Function:
+ *	vdc_terminate_ldc()
+ *
+ * Description:
+ *	Tears down the LDC channel: closes it, unregisters the callback and
+ *	finalizes the handle - performing only the steps whose corresponding
+ *	bits are set in vdc->initialized, then clearing those bits.
+ *	Caller must hold vdc->lock.
+ */
+static void
+vdc_terminate_ldc(vdc_t *vdc)
+{
+	int	instance = ddi_get_instance(vdc->dip);
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	PR0("%s[%d] initialized=%x\n", __func__, instance, vdc->initialized);
+
+	if (vdc->initialized & VDC_LDC_OPEN) {
+		PR0("%s[%d]: ldc_close()\n", __func__, instance);
+		(void) ldc_close(vdc->ldc_handle);
+	}
+	if (vdc->initialized & VDC_LDC_CB) {
+		PR0("%s[%d]: ldc_unreg_callback()\n", __func__, instance);
+		(void) ldc_unreg_callback(vdc->ldc_handle);
+	}
+	if (vdc->initialized & VDC_LDC) {
+		PR0("%s[%d]: ldc_fini()\n", __func__, instance);
+		(void) ldc_fini(vdc->ldc_handle);
+		vdc->ldc_handle = NULL;
+	}
+
+	vdc->initialized &= ~(VDC_LDC | VDC_LDC_CB | VDC_LDC_OPEN);
+}
+
+/*
+ * Function:
+ *	vdc_reset_connection()
+ *
+ * Description:
+ *	Rewinds the handshake state machine to VD_STATE_INIT and, if
+ *	'reset_ldc' is B_TRUE, also resets the underlying LDC channel.
+ *	Clears VDC_HANDSHAKE so a fresh negotiation can be triggered.
+ *	Caller must hold vdc->lock.
+ */
+static void
+vdc_reset_connection(vdc_t *vdc, boolean_t reset_ldc)
+{
+	int	status;
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	PR0("%s[%d] Entered\n", __func__, vdc->instance);
+
+	vdc->state = VD_STATE_INIT;
+
+	if (reset_ldc == B_TRUE) {
+		status = ldc_reset(vdc->ldc_handle);
+		PR0("%s[%d]  ldc_reset() = %d\n",
+				__func__, vdc->instance, status);
+	}
+
+	vdc->initialized &= ~VDC_HANDSHAKE;
+	PR0("%s[%d] init=%x\n", __func__, vdc->instance, vdc->initialized);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * Descriptor Ring helper routines
+ */
+
+/*
+ * Function:
+ *	vdc_init_descriptor_ring()
+ *
+ * Description:
+ *	Creates the LDC descriptor ring, binds it to the channel, and sets
+ *	up the local shadow of the ring (one vdc_local_desc_t per entry,
+ *	each with its own mem handle, lock and cv).  Progress is recorded
+ *	in vdc->initialized so a partial setup can be torn down later.
+ *	Caller must hold vdc->lock.
+ *
+ *	NOTE(review): this sets VDC_DRING on create, but the teardown path
+ *	in vdc_destroy_descriptor_ring() tests VDC_DRING_INIT before
+ *	destroying the ring - confirm these two flags refer to the same bit.
+ *
+ * Return Code:
+ *	0	- Success
+ *	xxx	- error from the failing ldc_mem_* call
+ */
+static int
+vdc_init_descriptor_ring(vdc_t *vdc)
+{
+	vd_dring_entry_t	*dep = NULL;	/* DRing Entry pointer */
+	int	status = -1;
+	int	i;
+
+	PR0("%s\n", __func__);
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+	ASSERT(vdc->ldc_handle != NULL);
+
+	status = ldc_mem_dring_create(VD_DRING_LEN, VD_DRING_ENTRY_SZ,
+			&vdc->ldc_dring_hdl);
+	if ((vdc->ldc_dring_hdl == NULL) || (status != 0)) {
+		PR0("%s: Failed to create a descriptor ring", __func__);
+		return (status);
+	}
+	vdc->initialized |= VDC_DRING;
+	vdc->dring_entry_size = VD_DRING_ENTRY_SZ;
+	vdc->dring_len = VD_DRING_LEN;
+
+	vdc->dring_cookie = kmem_zalloc(sizeof (ldc_mem_cookie_t), KM_SLEEP);
+
+	status = ldc_mem_dring_bind(vdc->ldc_handle, vdc->ldc_dring_hdl,
+			LDC_SHADOW_MAP, LDC_MEM_RW, &vdc->dring_cookie[0],
+			&vdc->dring_cookie_count);
+	if (status != 0) {
+		PR0("%s: Failed to bind descriptor ring (%p) to channel (%p)\n",
+			__func__, vdc->ldc_dring_hdl, vdc->ldc_handle);
+		return (status);
+	}
+	ASSERT(vdc->dring_cookie_count == 1);
+	vdc->initialized |= VDC_DRING_BOUND;
+
+	status = ldc_mem_dring_info(vdc->ldc_dring_hdl, &vdc->dring_mem_info);
+	if (status != 0) {
+		PR0("%s: Failed to get info for descriptor ring (%p)\n",
+			__func__, vdc->ldc_dring_hdl);
+		return (status);
+	}
+
+	/* Allocate the local copy of this dring */
+	vdc->local_dring = kmem_zalloc(VD_DRING_LEN * sizeof (vdc_local_desc_t),
+						KM_SLEEP);
+	vdc->initialized |= VDC_DRING_LOCAL;
+
+	/*
+	 * Mark all DRing entries as free and init priv desc memory handles
+	 * If any entry is initialized, we need to free it later so we set
+	 * the bit in 'initialized' at the start.
+	 */
+	vdc->initialized |= VDC_DRING_ENTRY;
+	for (i = 0; i < VD_DRING_LEN; i++) {
+		dep = VDC_GET_DRING_ENTRY_PTR(vdc, i);
+		dep->hdr.dstate = VIO_DESC_FREE;
+
+		status = ldc_mem_alloc_handle(vdc->ldc_handle,
+				&vdc->local_dring[i].desc_mhdl);
+		if (status != 0) {
+			cmn_err(CE_NOTE, "![%d] Failed to alloc mem handle for"
+					" descriptor %d", vdc->instance, i);
+			return (status);
+		}
+		vdc->local_dring[i].flags = VIO_DESC_FREE;
+		vdc->local_dring[i].flags |= VDC_ALLOC_HANDLE;
+		vdc->local_dring[i].dep = dep;
+
+		mutex_init(&vdc->local_dring[i].lock, NULL, MUTEX_DRIVER, NULL);
+		cv_init(&vdc->local_dring[i].cv, NULL, CV_DRIVER, NULL);
+	}
+
+	/*
+	 * We init the index of the last DRing entry used. Since the code to
+	 * get the next available entry increments it before selecting one,
+	 * we set it to the last DRing entry so that it wraps around to zero
+	 * for the 1st entry to be used.
+	 */
+	vdc->dring_curr_idx = VD_DRING_LEN - 1;
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_destroy_descriptor_ring()
+ *
+ * Description:
+ *	Tears down, in reverse order of setup, the parts of the descriptor
+ *	ring that were initialized, as recorded by the bits in
+ *	vdc->initialized; each bit is cleared once its resource is freed.
+ *	Caller must hold vdc->lock.
+ *
+ *	NOTE(review): the destroy step below tests VDC_DRING_INIT while
+ *	vdc_init_descriptor_ring() sets VDC_DRING on create - confirm the
+ *	two names refer to the same flag bit.
+ */
+static void
+vdc_destroy_descriptor_ring(vdc_t *vdc)
+{
+	ldc_mem_handle_t	mhdl = NULL;
+	int	status = -1;
+	int	i;	/* loop */
+
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+	ASSERT(vdc->state == VD_STATE_INIT);
+
+	PR0("%s: Entered\n", __func__);
+
+	if (vdc->initialized & VDC_DRING_ENTRY) {
+		for (i = 0; i < VD_DRING_LEN; i++) {
+			mhdl = vdc->local_dring[i].desc_mhdl;
+
+			/*
+			 * Only free handles that were actually allocated.
+			 * (This must be a bitwise AND test; '|' was a bug
+			 * that made the condition always true.)
+			 */
+			if (vdc->local_dring[i].flags & VDC_ALLOC_HANDLE)
+				(void) ldc_mem_free_handle(mhdl);
+
+			mutex_destroy(&vdc->local_dring[i].lock);
+			cv_destroy(&vdc->local_dring[i].cv);
+
+			bzero(&vdc->local_dring[i].desc_mhdl,
+				sizeof (ldc_mem_handle_t));
+		}
+		vdc->initialized &= ~VDC_DRING_ENTRY;
+	}
+
+	if (vdc->initialized & VDC_DRING_LOCAL) {
+		kmem_free(vdc->local_dring,
+				VD_DRING_LEN * sizeof (vdc_local_desc_t));
+		vdc->initialized &= ~VDC_DRING_LOCAL;
+	}
+
+	if (vdc->initialized & VDC_DRING_BOUND) {
+		status = ldc_mem_dring_unbind(vdc->ldc_dring_hdl);
+		if (status == 0) {
+			vdc->initialized &= ~VDC_DRING_BOUND;
+		} else {
+			/* supply __func__ for the leading %s in the format */
+			vdc_msg("%s: Failed to unbind Descriptor Ring (%lx)\n",
+				__func__, vdc->ldc_dring_hdl);
+		}
+	}
+
+	if (vdc->initialized & VDC_DRING_INIT) {
+		status = ldc_mem_dring_destroy(vdc->ldc_dring_hdl);
+		if (status == 0) {
+			vdc->ldc_dring_hdl = NULL;
+			bzero(&vdc->dring_mem_info, sizeof (ldc_mem_info_t));
+			vdc->initialized &= ~VDC_DRING_INIT;
+		} else {
+			vdc_msg("%s: Failed to destroy Descriptor Ring (%lx)\n",
+				__func__, vdc->ldc_dring_hdl);
+		}
+	}
+}
+
+/*
+ * vdc_get_next_dring_entry_idx()
+ *
+ * Description:
+ *	This function gets the index of the next Descriptor Ring entry available
+ *	Caller must hold vdc->dring_lock.
+ *
+ * Return Value:
+ *	0 <= rv < VD_DRING_LEN		Next available slot
+ *	-1 				DRing is full
+ */
+static int
+vdc_get_next_dring_entry_idx(vdc_t *vdc, uint_t num_slots_needed)
+{
+	_NOTE(ARGUNUSED(num_slots_needed))
+
+	vd_dring_entry_t	*dep = NULL;	/* Dring Entry Pointer */
+	int			idx = -1;
+	int			start_idx = 0;
+
+	ASSERT(vdc != NULL);
+	ASSERT(vdc->dring_len == VD_DRING_LEN);
+	ASSERT(vdc->dring_curr_idx >= 0);
+	ASSERT(vdc->dring_curr_idx < VD_DRING_LEN);
+	ASSERT(mutex_owned(&vdc->dring_lock));
+
+	/* Start at the last entry used */
+	idx = start_idx = vdc->dring_curr_idx;
+
+	/*
+	 * Loop through Descriptor Ring checking for a free entry until we reach
+	 * the entry we started at. We should never come close to filling the
+	 * Ring at any stage, instead this is just to prevent an entry which
+	 * gets into an inconsistent state (e.g. due to a request timing out)
+	 * from blocking progress.
+	 */
+	do {
+		/* Get the next entry after the last known index tried */
+		idx = (idx + 1) % VD_DRING_LEN;
+
+		dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx);
+		ASSERT(dep != NULL);
+
+		if (dep->hdr.dstate == VIO_DESC_FREE) {
+			ASSERT(idx >= 0);
+			ASSERT(idx < VD_DRING_LEN);
+			vdc->dring_curr_idx = idx;
+			return (idx);
+
+		} else if (dep->hdr.dstate == VIO_DESC_READY) {
+			PR0("%s: Entry %d waiting to be accepted\n",
+					__func__, idx);
+			continue;
+
+		} else if (dep->hdr.dstate == VIO_DESC_ACCEPTED) {
+			PR0("%s: Entry %d waiting to be processed\n",
+					__func__, idx);
+			continue;
+
+		} else if (dep->hdr.dstate == VIO_DESC_DONE) {
+			PR0("%s: Entry %d done but not marked free\n",
+					__func__, idx);
+
+			/*
+			 * If we are currently panicking, interrupts are
+			 * disabled and we will not be getting ACKs from the
+			 * vDisk server so we mark the descriptor ring entries
+			 * as FREE here instead of in the ACK handler.
+			 */
+			if (panicstr) {
+				(void) vdc_depopulate_descriptor(vdc, idx);
+				dep->hdr.dstate = VIO_DESC_FREE;
+				vdc->local_dring[idx].flags = VIO_DESC_FREE;
+			}
+			continue;
+
+		} else {
+			/*
+			 * NOTE(review): vdc->lock is taken here while
+			 * dring_lock is already held - confirm this matches
+			 * the lock ordering used elsewhere in the driver.
+			 */
+			vdc_msg("Public Descriptor Ring entry corrupted");
+			mutex_enter(&vdc->lock);
+			vdc_reset_connection(vdc, B_TRUE);
+			mutex_exit(&vdc->lock);
+			return (-1);
+		}
+
+	} while (idx != start_idx);
+
+	return (-1);
+}
+
+/*
+ * Function:
+ *	vdc_populate_descriptor
+ *
+ * Description:
+ *	This routine writes the data to be transmitted to vds into the
+ *	descriptor, notifies vds that the ring has been updated and
+ *	then waits for the request to be processed.
+ *
+ * Arguments:
+ *	vdc	- the soft state pointer
+ *	addr	- start address of memory region.
+ *	nbytes	- number of bytes to read/write
+ *	operation - operation we want vds to perform (VD_OP_XXX)
+ *	arg	- parameter to be sent to server (depends on VD_OP_XXX type)
+ *			. mode for ioctl(9e)
+ *			. LP64 diskaddr_t (block I/O)
+ *	slice	- the disk slice this request is for
+ *
+ * Return Codes:
+ *	0
+ *	EAGAIN
+ *		EFAULT
+ *		ENXIO
+ *		EIO
+ */
+static int
+vdc_populate_descriptor(vdc_t *vdc, caddr_t addr, size_t nbytes, int operation,
+				uint64_t arg, uint64_t slice)
+{
+	vdc_local_desc_t *local_dep = NULL;	/* Local Dring Entry Pointer */
+	vd_dring_entry_t *dep = NULL;		/* Dring Entry Pointer */
+	int			idx = 0;	/* Index of DRing entry used */
+	vio_dring_msg_t		dmsg;
+	size_t			msglen = sizeof (dmsg);
+	int			status = 0;
+	int			rv;
+	int			retries = 0;
+
+	ASSERT(vdc != NULL);
+	ASSERT(slice < V_NUMPAR);
+
+	/*
+	 * Get next available DRing entry.
+	 */
+	mutex_enter(&vdc->dring_lock);
+	idx = vdc_get_next_dring_entry_idx(vdc, 1);
+	if (idx == -1) {
+		mutex_exit(&vdc->dring_lock);
+		vdc_msg("%s[%d]: no descriptor ring entry avail, seq=%d\n",
+			__func__, vdc->instance, vdc->seq_num);
+
+		/*
+		 * Since strategy should not block we don't wait for the DRing
+		 * to empty and instead return
+		 */
+		return (EAGAIN);
+	}
+
+	ASSERT(idx < VD_DRING_LEN);
+	local_dep = &vdc->local_dring[idx];
+	dep = local_dep->dep;
+	ASSERT(dep != NULL);
+
+	/*
+	 * Wait for anybody still using the DRing entry to finish.
+	 * (e.g. still waiting for vds to respond to a request)
+	 */
+	mutex_enter(&local_dep->lock);
+
+	switch (operation) {
+	case VD_OP_BREAD:
+	case VD_OP_BWRITE:
+		PR1("buf=%p, block=%lx, nbytes=%lx\n", addr, arg, nbytes);
+		dep->payload.addr = (diskaddr_t)arg;
+		rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes, operation);
+		break;
+
+	case VD_OP_FLUSH:
+	case VD_OP_GET_VTOC:
+	case VD_OP_SET_VTOC:
+	case VD_OP_GET_DISKGEOM:
+	case VD_OP_SET_DISKGEOM:
+	case VD_OP_SCSICMD:
+		if (nbytes > 0) {
+			rv = vdc_populate_mem_hdl(vdc, idx, addr, nbytes,
+							operation);
+		}
+		break;
+	default:
+		cmn_err(CE_NOTE, "[%d] Unsupported vDisk operation [%d]\n",
+				vdc->instance, operation);
+		rv = EINVAL;
+	}
+
+	if (rv != 0) {
+		mutex_exit(&local_dep->lock);
+		mutex_exit(&vdc->dring_lock);
+		return (rv);
+	}
+
+	/*
+	 * fill in the data details into the DRing
+	 */
+	dep->payload.req_id = VDC_GET_NEXT_REQ_ID(vdc);
+	dep->payload.operation = operation;
+	dep->payload.nbytes = nbytes;
+	dep->payload.status = EINPROGRESS;	/* vds will set valid value */
+	dep->payload.slice = slice;
+	dep->hdr.dstate = VIO_DESC_READY;
+	dep->hdr.ack = 1;		/* request an ACK for every message */
+
+	local_dep->flags = VIO_DESC_READY;
+	local_dep->addr = addr;
+
+	/*
+	 * Send a msg with the DRing details to vds
+	 */
+	VIO_INIT_DRING_DATA_TAG(dmsg);
+	VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc);
+	dmsg.dring_ident = vdc->dring_ident;
+	dmsg.start_idx = idx;
+	dmsg.end_idx = idx;
+
+	PR1("ident=0x%llx, st=%d, end=%d, seq=%d req=%d dep=%p\n",
+			vdc->dring_ident, dmsg.start_idx, dmsg.end_idx,
+			dmsg.seq_num, dep->payload.req_id, dep);
+
+	status = vdc_send(vdc->ldc_handle, (caddr_t)&dmsg, &msglen);
+	PR1("%s[%d]: ldc_write() status=%d\n", __func__, vdc->instance, status);
+	if (status != 0) {
+		mutex_exit(&local_dep->lock);
+		mutex_exit(&vdc->dring_lock);
+		vdc_msg("%s: ldc_write(%d)\n", __func__, status);
+		return (EAGAIN);
+	}
+
+	/*
+	 * XXX - potential performance enhancement (Investigate at a later date)
+	 *
+	 * for calls from strategy(9E), instead of waiting for a response from
+	 * vds, we could return at this stage and let the ACK handling code
+	 * trigger the biodone(9F)
+	 */
+
+	/*
+	 * When a guest is panicking, the completion of requests needs to be
+	 * handled differently because interrupts are disabled and vdc
+	 * will not get messages. We have to poll for the messages instead.
+	 */
+	if (ddi_in_panic()) {
+		int start = 0;
+		retries = 0;
+		for (;;) {
+			msglen = sizeof (dmsg);
+			status = ldc_read(vdc->ldc_handle, (caddr_t)&dmsg,
+					&msglen);
+			if (status) {
+				status = EINVAL;
+				break;
+			}
+
+			/*
+			 * if there are no packets wait and check again
+			 */
+			if ((status == 0) && (msglen == 0)) {
+				if (retries++ > vdc_dump_retries) {
+					PR0("[%d] Giving up waiting, idx %d\n",
+							vdc->instance, idx);
+					status = EAGAIN;
+					break;
+				}
+
+				PR1("Waiting for next packet @ %d\n", idx);
+				delay(drv_usectohz(vdc_dump_usec_timeout));
+				continue;
+			}
+
+			/*
+			 * Ignore all messages that are not ACKs/NACKs to
+			 * DRing requests.
+			 */
+			if ((dmsg.tag.vio_msgtype != VIO_TYPE_DATA) ||
+			    (dmsg.tag.vio_subtype_env != VIO_DRING_DATA)) {
+				PR0("discarding pkt: type=%d sub=%d env=%d\n",
+					dmsg.tag.vio_msgtype,
+					dmsg.tag.vio_subtype,
+					dmsg.tag.vio_subtype_env);
+				continue;
+			}
+
+			/*
+			 * set the appropriate return value for the
+			 * current request.
+			 */
+			switch (dmsg.tag.vio_subtype) {
+			case VIO_SUBTYPE_ACK:
+				status = 0;
+				break;
+			case VIO_SUBTYPE_NACK:
+				status = EAGAIN;
+				break;
+			default:
+				continue;
+			}
+
+			start = dmsg.start_idx;
+			if (start >= VD_DRING_LEN) {
+				PR0("[%d] Bogus ack data : start %d\n",
+					vdc->instance, start);
+				continue;
+			}
+
+			dep = VDC_GET_DRING_ENTRY_PTR(vdc, start);
+
+			PR1("[%d] Dumping start=%d idx=%d state=%d\n",
+				vdc->instance, start, idx, dep->hdr.dstate);
+
+			if (dep->hdr.dstate != VIO_DESC_DONE) {
+				PR0("[%d] Entry @ %d - state !DONE %d\n",
+					vdc->instance, start, dep->hdr.dstate);
+				continue;
+			}
+
+			(void) vdc_depopulate_descriptor(vdc, start);
+
+			/*
+			 * We want to process all Dring entries up to
+			 * the current one so that we can return an
+			 * error with the correct request.
+			 */
+			if (idx > start) {
+				PR0("[%d] Looping: start %d, idx %d\n",
+						vdc->instance, idx, start);
+				continue;
+			}
+
+			/* exit - all outstanding requests are completed */
+			break;
+		}
+
+		mutex_exit(&local_dep->lock);
+		mutex_exit(&vdc->dring_lock);
+
+		return (status);
+	}
+
+	/*
+	 * Now watch the DRing entries we modified to get the response
+	 * from vds.
+	 */
+	status = vdc_wait_for_descriptor_update(vdc, idx, dmsg);
+	if (status == ETIMEDOUT) {
+		/* debug info when dumping state on vds side */
+		dep->payload.status = ECANCELED;
+	}
+
+	status = vdc_depopulate_descriptor(vdc, idx);
+	PR1("%s[%d] Status=%d\n", __func__, vdc->instance, status);
+
+	mutex_exit(&local_dep->lock);
+	mutex_exit(&vdc->dring_lock);
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_wait_for_descriptor_update()
+ *
+ * Description:
+ *	Wait for the DRing entry at 'idx' to be marked VIO_DESC_DONE by vds.
+ *	If a timeout expires and the entry has not even been accepted yet,
+ *	the original DRing data message is resent (with fresh sequence IDs)
+ *	on the assumption that it was dropped by LDC.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *	idx	- index of the DRing entry being waited upon.
+ *	dmsg	- copy of the DRing data message sent for this entry; it is
+ *		  reused (IDs updated) if a resend is needed.
+ *
+ * Return Codes:
+ *	0		- the descriptor reached VIO_DESC_DONE.
+ *	ETIMEDOUT	- vds did not respond within vdc_retries timeouts,
+ *			  or a resend attempt failed.
+ *
+ * Assumptions:
+ *	Caller holds local_dep->lock; cv_timedwait() drops it while asleep.
+ */
+static int
+vdc_wait_for_descriptor_update(vdc_t *vdc, uint_t idx, vio_dring_msg_t dmsg)
+{
+	vd_dring_entry_t *dep = NULL;		/* Dring Entry Pointer */
+	vdc_local_desc_t *local_dep = NULL;	/* Local Dring Entry Pointer */
+	size_t	msglen = sizeof (dmsg);
+	int	retries = 0;
+	/*
+	 * Success unless we time out below.  (Previously initialized to
+	 * ENXIO, which was returned even when the descriptor completed.)
+	 */
+	int	status = 0;
+	int	rv = 0;
+
+	ASSERT(vdc != NULL);
+	ASSERT(idx < VD_DRING_LEN);
+	local_dep = &vdc->local_dring[idx];
+	ASSERT(local_dep != NULL);
+	dep = local_dep->dep;
+	ASSERT(dep != NULL);
+
+	while (dep->hdr.dstate != VIO_DESC_DONE) {
+		rv = cv_timedwait(&local_dep->cv, &local_dep->lock,
+			VD_GET_TIMEOUT_HZ(retries));
+		if (rv == -1) {
+			/*
+			 * If they persist in ignoring us we'll storm off in a
+			 * huff and return ETIMEDOUT to the upper layers.
+			 */
+			if (retries >= vdc_retries) {
+				PR0("%s: Finished waiting on entry %d\n",
+					__func__, idx);
+				status = ETIMEDOUT;
+				break;
+			} else {
+				retries++;
+				PR0("%s[%d]: Timeout #%d on entry %d "
+				    "[seq %d][req %d]\n", __func__,
+				    vdc->instance,
+				    retries, idx, dmsg.seq_num,
+				    dep->payload.req_id);
+			}
+
+			/*
+			 * vds has touched the entry but not ACK'ed it yet;
+			 * the message clearly arrived, so just keep waiting.
+			 */
+			if (dep->hdr.dstate & VIO_DESC_ACCEPTED) {
+				PR0("%s[%d]: vds has accessed entry %d [seq %d]"
+				    "[req %d] but not ack'ed it yet\n",
+				    __func__, vdc->instance, idx, dmsg.seq_num,
+				    dep->payload.req_id);
+				continue;
+			}
+
+			/*
+			 * we resend the message as it may have been dropped
+			 * and have never made it to the other side (vds).
+			 * (We reuse the original message but update seq ID)
+			 */
+			VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc);
+			retries = 0;
+			status = vdc_send(vdc->ldc_handle, (caddr_t)&dmsg,
+					&msglen);
+			if (status != 0) {
+				vdc_msg("%s: Error (%d) while resending after "
+					"timeout\n", __func__, status);
+				status = ETIMEDOUT;
+				break;
+			}
+		}
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_get_response()
+ *
+ * Description:
+ *	Walk the DRing from entry 'start', processing and clearing each
+ *	consecutive entry that vds has marked VIO_DESC_DONE.  Stops at the
+ *	first entry that is not DONE, or just past 'end' if end != -1.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *	start	- first DRing entry to examine.
+ *	end	- last entry to process, or -1 to walk until a non-DONE
+ *		  entry is found.
+ *
+ * Return Code:
+ *	0 if the 'end' bound was reached; otherwise the status of the last
+ *	descriptor depopulated (ENXIO if no entry was DONE).
+ */
+static int
+vdc_get_response(vdc_t *vdc, int start, int end)
+{
+	vdc_local_desc_t	*ldep = NULL;	/* Local Dring Entry Pointer */
+	vd_dring_entry_t	*dep = NULL;	/* Dring Entry Pointer */
+	int			status = ENXIO;
+	int			idx = -1;
+
+	ASSERT(vdc != NULL);
+	ASSERT(start >= 0);
+	ASSERT(start <= VD_DRING_LEN);
+	/* fixed: these two previously re-checked 'start' instead of 'end' */
+	ASSERT(end >= -1);
+	ASSERT(end <= VD_DRING_LEN);
+
+	idx = start;
+	ldep = &vdc->local_dring[idx];
+	ASSERT(ldep != NULL);
+	dep = ldep->dep;
+	ASSERT(dep != NULL);
+
+	PR0("%s[%d] DRING entry=%d status=%d\n", __func__, vdc->instance,
+			idx, VIO_GET_DESC_STATE(dep->hdr.dstate));
+	while (VIO_GET_DESC_STATE(dep->hdr.dstate) == VIO_DESC_DONE) {
+		if ((end != -1) && (idx > end))
+			return (0);
+
+		switch (ldep->operation) {
+		case VD_OP_BREAD:
+		case VD_OP_BWRITE:
+			/* call bioxxx */
+			break;
+		default:
+			/* signal waiter */
+			break;
+		}
+
+		/* Clear the DRing entry */
+		status = vdc_depopulate_descriptor(vdc, idx);
+		PR0("%s[%d] Status=%d\n", __func__, vdc->instance, status);
+
+		/*
+		 * loop accounting to get next DRing entry
+		 * NOTE(review): idx does not wrap at VD_DRING_LEN - confirm
+		 * that callers bound 'end' so we cannot run off the ring.
+		 */
+		idx++;
+		ldep = &vdc->local_dring[idx];
+		dep = ldep->dep;
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_depopulate_descriptor()
+ *
+ * Description:
+ *	Clean up the DRing entry at 'idx' after vds has completed it:
+ *	fetch the payload status, mark the exported and local descriptors
+ *	free, copy bounce-buffer contents back to the caller's original
+ *	(misaligned) buffer if one was used, and unbind the LDC memory
+ *	handle.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *	idx	- index of the DRing entry to depopulate.
+ *
+ * Return Code:
+ *	The status vds stored in the descriptor payload, or the
+ *	ldc_mem_unbind_handle() error if the unbind fails.  (Previously the
+ *	unbind return value unconditionally overwrote the payload status.)
+ */
+static int
+vdc_depopulate_descriptor(vdc_t *vdc, uint_t idx)
+{
+	vd_dring_entry_t *dep = NULL;		/* Dring Entry Pointer */
+	vdc_local_desc_t *ldep = NULL;	/* Local Dring Entry Pointer */
+	int	status = ENXIO;
+	int	rv;
+
+	ASSERT(vdc != NULL);
+	ASSERT(idx < VD_DRING_LEN);
+	ldep = &vdc->local_dring[idx];
+	ASSERT(ldep != NULL);
+	dep = ldep->dep;
+	ASSERT(dep != NULL);
+
+	/* request status as reported back by vds */
+	status = dep->payload.status;
+	VDC_MARK_DRING_ENTRY_FREE(vdc, idx);
+	VIO_SET_DESC_STATE(ldep->flags, VIO_DESC_FREE);
+
+	/*
+	 * If the upper layer passed in a misaligned address we copied the
+	 * data into an aligned buffer before sending it to LDC - we now
+	 * copy it back to the original buffer.
+	 */
+	if (ldep->align_addr) {
+		ASSERT(ldep->addr != NULL);
+		ASSERT(dep->payload.nbytes > 0);
+
+		bcopy(ldep->align_addr, ldep->addr, dep->payload.nbytes);
+		kmem_free(ldep->align_addr,
+				sizeof (caddr_t) * dep->payload.nbytes);
+		ldep->align_addr = NULL;
+	}
+
+	rv = ldc_mem_unbind_handle(ldep->desc_mhdl);
+	if (rv != 0) {
+		cmn_err(CE_NOTE, "[%d] unbind mem hdl 0x%lx @ idx %d failed:%d",
+			vdc->instance, ldep->desc_mhdl, idx, rv);
+		/* report the unbind failure instead of the payload status */
+		status = rv;
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_populate_mem_hdl()
+ *
+ * Description:
+ *	Bind the caller's data buffer to the LDC memory handle associated
+ *	with DRing entry 'idx' and store the resulting cookies in the
+ *	exported descriptor.  Misaligned buffers are first copied into an
+ *	8-byte aligned bounce buffer (ldep->align_addr), which is copied
+ *	back and freed in vdc_depopulate_descriptor().
+ *
+ * Arguments:
+ *	vdc		- soft state pointer for this instance of the driver.
+ *	idx		- DRing entry whose memory handle is populated.
+ *	addr		- data buffer supplied by the upper layer.
+ *	nbytes		- size of that buffer in bytes.
+ *	operation	- VD_OP_XXX code; selects the LDC map permissions.
+ *
+ * Return Code:
+ *	0	- success.
+ *	EAGAIN	- binding the handle or fetching a cookie failed.
+ */
+static int
+vdc_populate_mem_hdl(vdc_t *vdc, uint_t idx, caddr_t addr, size_t nbytes,
+			int operation)
+{
+	vd_dring_entry_t	*dep = NULL;
+	vdc_local_desc_t	*ldep = NULL;
+	ldc_mem_handle_t	mhdl;
+	caddr_t			vaddr;
+	int			perm = LDC_MEM_RW;
+	int			rv = 0;
+	int			i;
+
+	ASSERT(vdc != NULL);
+	ASSERT(idx < VD_DRING_LEN);
+
+	dep = VDC_GET_DRING_ENTRY_PTR(vdc, idx);
+	ldep = &vdc->local_dring[idx];
+	mhdl = ldep->desc_mhdl;
+
+	/* map permissions follow the direction of the data transfer */
+	switch (operation) {
+	case VD_OP_BREAD:
+		perm = LDC_MEM_W;
+		break;
+
+	case VD_OP_BWRITE:
+		perm = LDC_MEM_R;
+		break;
+
+	case VD_OP_FLUSH:
+	case VD_OP_GET_VTOC:
+	case VD_OP_SET_VTOC:
+	case VD_OP_GET_DISKGEOM:
+	case VD_OP_SET_DISKGEOM:
+	case VD_OP_SCSICMD:
+		perm = LDC_MEM_RW;
+		break;
+
+	default:
+		ASSERT(0);	/* catch bad programming in vdc */
+	}
+
+	/*
+	 * LDC expects any addresses passed in to be 8-byte aligned. We need
+	 * to copy the contents of any misaligned buffers to a newly allocated
+	 * buffer and bind it instead (and copy the contents back to the
+	 * original buffer passed in when depopulating the descriptor)
+	 */
+	vaddr = addr;
+	if (((uint64_t)addr & 0x7) != 0) {
+		ldep->align_addr =
+			kmem_zalloc(sizeof (caddr_t) * nbytes, KM_SLEEP);
+		PR0("%s[%d] Misaligned address %lx reallocating "
+		    "(buf=%lx entry=%d)\n",
+		    __func__, vdc->instance, addr, ldep->align_addr, idx);
+		bcopy(addr, ldep->align_addr, nbytes);
+		vaddr = ldep->align_addr;
+	}
+
+	rv = ldc_mem_bind_handle(mhdl, vaddr, P2ROUNDUP(nbytes, 8),
+		vdc->dring_mem_info.mtype, perm, &dep->payload.cookie[0],
+		&dep->payload.ncookies);
+	PR1("%s[%d] bound mem handle; ncookies=%d\n",
+			__func__, vdc->instance, dep->payload.ncookies);
+	if (rv != 0) {
+		vdc_msg("%s[%d] failed to ldc_mem_bind_handle "
+		    "(mhdl=%lx, buf=%lx entry=%d err=%d)\n",
+		    __func__, vdc->instance, mhdl, addr, idx, rv);
+		if (ldep->align_addr) {
+			/*
+			 * Free using the size we allocated with ('nbytes');
+			 * dep->payload.nbytes may not be valid yet and a
+			 * size mismatch would panic kmem_free().
+			 */
+			kmem_free(ldep->align_addr,
+				sizeof (caddr_t) * nbytes);
+			ldep->align_addr = NULL;
+		}
+		return (EAGAIN);
+	}
+
+	/*
+	 * Get the other cookies (if any).
+	 */
+	for (i = 1; i < dep->payload.ncookies; i++) {
+		rv = ldc_mem_nextcookie(mhdl, &dep->payload.cookie[i]);
+		if (rv != 0) {
+			(void) ldc_mem_unbind_handle(mhdl);
+			vdc_msg("%s: failed to get next cookie(mhdl=%lx "
+				"cnum=%d), err=%d", __func__, mhdl, i, rv);
+			if (ldep->align_addr) {
+				/* see free-size note above */
+				kmem_free(ldep->align_addr,
+					sizeof (caddr_t) * nbytes);
+				ldep->align_addr = NULL;
+			}
+			return (EAGAIN);
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * Interrupt handlers for messages from LDC
+ */
+
+/*
+ * Function:
+ *	vdc_handle_cb()
+ *
+ * Description:
+ *	LDC callback registered for this channel.  Handles the channel
+ *	events reported in 'event': UP restarts sequence numbers and kicks
+ *	off handshake negotiation; READ wakes the message processing
+ *	thread; RESET/DOWN reset the connection state.
+ *
+ * Arguments:
+ *	event	- bitmask of LDC_EVT_* events being reported.
+ *	arg	- vdc soft state pointer registered with LDC.
+ *
+ * Return Code:
+ *	LDC_SUCCESS in all cases.
+ */
+static uint_t
+vdc_handle_cb(uint64_t event, caddr_t arg)
+{
+	ldc_status_t	ldc_state;
+	int		rv = 0;
+
+	vdc_t	*vdc = (vdc_t *)(void *)arg;
+
+	ASSERT(vdc != NULL);
+
+	PR1("%s[%d] event=%x seqID=%d\n",
+			__func__, vdc->instance, event, vdc->seq_num);
+
+	/*
+	 * Depending on the type of event that triggered this callback,
+	 * we modify the handshake state or read the data.
+	 *
+	 * NOTE: not done as a switch() as event could be triggered by
+	 * a state change and a read request. Also the ordering of the
+	 * check for the event types is deliberate.
+	 */
+	if (event & LDC_EVT_UP) {
+		PR0("%s[%d] Received LDC_EVT_UP\n", __func__, vdc->instance);
+
+		/* get LDC state */
+		rv = ldc_status(vdc->ldc_handle, &ldc_state);
+		if (rv != 0) {
+			cmn_err(CE_NOTE, "[%d] Couldn't get LDC status %d",
+					vdc->instance, rv);
+			vdc_reset_connection(vdc, B_TRUE);
+			return (LDC_SUCCESS);
+		}
+
+		/*
+		 * Reset the transaction sequence numbers when LDC comes up.
+		 * We then kick off the handshake negotiation with the vDisk
+		 * server.
+		 */
+		mutex_enter(&vdc->lock);
+		vdc->seq_num = 0;
+		vdc->seq_num_reply = 0;
+		vdc->ldc_state = ldc_state;
+		ASSERT(ldc_state == LDC_UP);
+		mutex_exit(&vdc->lock);
+
+		vdc_init_handshake_negotiation(vdc);
+
+		/* UP should never be reported together with RESET/DOWN */
+		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
+	}
+
+	if (event & LDC_EVT_READ) {
+		/*
+		 * Wake up the worker thread to process the message
+		 */
+		mutex_enter(&vdc->msg_proc_lock);
+		vdc->msg_pending = B_TRUE;
+		cv_signal(&vdc->msg_proc_cv);
+		mutex_exit(&vdc->msg_proc_lock);
+
+		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
+
+		/* that's all we have to do - no need to handle DOWN/RESET */
+		return (LDC_SUCCESS);
+	}
+
+	if (event & LDC_EVT_RESET) {
+		PR0("%s[%d] Recvd LDC RESET event\n", __func__, vdc->instance);
+	}
+
+	if (event & LDC_EVT_DOWN) {
+		PR0("%s[%d] Recvd LDC DOWN event\n", __func__, vdc->instance);
+
+		/* get LDC state */
+		rv = ldc_status(vdc->ldc_handle, &ldc_state);
+		if (rv != 0) {
+			cmn_err(CE_NOTE, "[%d] Couldn't get LDC status %d",
+					vdc->instance, rv);
+			/* fall back to a plausible state if status failed */
+			ldc_state = LDC_OPEN;
+		}
+		mutex_enter(&vdc->lock);
+		vdc->ldc_state = ldc_state;
+		mutex_exit(&vdc->lock);
+
+		vdc_reset_connection(vdc, B_TRUE);
+	}
+
+	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET | LDC_EVT_DOWN | LDC_EVT_READ))
+		cmn_err(CE_NOTE, "![%d] Unexpected LDC event (%lx) received",
+				vdc->instance, event);
+
+	return (LDC_SUCCESS);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * The following functions process the incoming messages from vds
+ */
+
+
+/*
+ * Function:
+ *	vdc_process_msg_thread()
+ *
+ * Description:
+ *	Body of the dedicated message processing thread.  Sleeps on
+ *	msg_proc_cv until the LDC callback flags msg_pending, then checks
+ *	the LDC queue and drains it via vdc_process_msg().  Loops until
+ *	msg_proc_thr_state leaves VDC_THR_RUNNING, then marks itself
+ *	VDC_THR_DONE, signals any waiter and exits.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ */
+static void
+vdc_process_msg_thread(vdc_t *vdc)
+{
+	int		status = 0;
+	boolean_t	q_is_empty = B_TRUE;
+
+	ASSERT(vdc != NULL);
+
+	mutex_enter(&vdc->msg_proc_lock);
+	PR0("%s[%d]: Starting\n", __func__, vdc->instance);
+
+	vdc->msg_proc_thr_state = VDC_THR_RUNNING;
+
+	while (vdc->msg_proc_thr_state == VDC_THR_RUNNING) {
+
+		PR1("%s[%d] Waiting\n", __func__, vdc->instance);
+		/* wait for the LDC callback to flag pending data */
+		while (vdc->msg_pending == B_FALSE)
+			cv_wait(&vdc->msg_proc_cv, &vdc->msg_proc_lock);
+
+		PR1("%s[%d] Message Received\n", __func__, vdc->instance);
+
+		/* check if there is data */
+		status = ldc_chkq(vdc->ldc_handle, &q_is_empty);
+		if ((status != 0) &&
+		    (vdc->msg_proc_thr_state == VDC_THR_RUNNING)) {
+			cmn_err(CE_NOTE, "[%d] Unable to communicate with vDisk"
+					" server. Cannot check LDC queue: %d",
+					vdc->instance, status);
+			mutex_enter(&vdc->lock);
+			vdc_reset_connection(vdc, B_TRUE);
+			mutex_exit(&vdc->lock);
+			/* give up: stop this thread on the next iteration */
+			vdc->msg_proc_thr_state = VDC_THR_STOP;
+			continue;
+		}
+
+		if (q_is_empty == B_FALSE) {
+			PR1("%s: new pkt(s) available\n", __func__);
+			vdc_process_msg(vdc);
+		}
+
+		vdc->msg_pending = B_FALSE;
+	}
+
+	PR0("Message processing thread stopped\n");
+	vdc->msg_pending = B_FALSE;
+	vdc->msg_proc_thr_state = VDC_THR_DONE;
+	/* wake anyone waiting for this thread to finish */
+	cv_signal(&vdc->msg_proc_cv);
+	mutex_exit(&vdc->msg_proc_lock);
+	thread_exit();
+}
+
+
+/*
+ * Function:
+ *	vdc_process_msg()
+ *
+ * Description:
+ *	This function is called by the message processing thread each time it
+ *	is triggered when LDC sends an interrupt to indicate that there are
+ *	more packets on the queue. When it is called it will continue to loop
+ *	and read the messages until there are no more left on the queue. If it
+ *	encounters an invalid sized message it will drop it and check the next
+ *	message.
+ *
+ * Arguments:
+ *	arg	- soft state pointer for this instance of the device driver.
+ *
+ * Return Code:
+ *	None.
+ */
+static void
+vdc_process_msg(void *arg)
+{
+	vdc_t		*vdc = (vdc_t *)(void *)arg;
+	vio_msg_t	vio_msg;
+	size_t		nbytes;
+	int		status;
+
+	ASSERT(vdc != NULL);
+
+	mutex_enter(&vdc->lock);
+
+	PR1("%s\n", __func__);
+
+	for (;;) {
+
+		/*
+		 * Reset the buffer size on every pass: ldc_read() updates
+		 * nbytes to the number of bytes actually received, so
+		 * without this the available size would shrink with each
+		 * message read.
+		 */
+		nbytes = sizeof (vio_msg);
+
+		/* read all messages - until no more left */
+		status = ldc_read(vdc->ldc_handle, (caddr_t)&vio_msg, &nbytes);
+
+		if (status) {
+			vdc_msg("%s: ldc_read() failed = %d", __func__, status);
+
+			/* if status is ECONNRESET --- reset vdc state */
+			if (status == EIO || status == ECONNRESET) {
+				vdc_reset_connection(vdc, B_FALSE);
+			}
+
+			mutex_exit(&vdc->lock);
+			return;
+		}
+
+		if ((nbytes > 0) && (nbytes < sizeof (vio_msg_tag_t))) {
+			cmn_err(CE_CONT, "![%d] Expect %lu bytes; recv'd %lu\n",
+				vdc->instance, sizeof (vio_msg_tag_t), nbytes);
+			mutex_exit(&vdc->lock);
+			return;
+		}
+
+		if (nbytes == 0) {
+			PR2("%s[%d]: ldc_read() done..\n",
+					__func__, vdc->instance);
+			mutex_exit(&vdc->lock);
+			return;
+		}
+
+		PR1("%s[%d] (%x/%x/%x)\n", __func__, vdc->instance,
+		    vio_msg.tag.vio_msgtype,
+		    vio_msg.tag.vio_subtype,
+		    vio_msg.tag.vio_subtype_env);
+
+		/*
+		 * Verify the Session ID of the message
+		 *
+		 * Every message after the Version has been negotiated should
+		 * have the correct session ID set.
+		 */
+		if ((vio_msg.tag.vio_sid != vdc->session_id) &&
+		    (vio_msg.tag.vio_subtype_env != VIO_VER_INFO)) {
+			PR0("%s: Incorrect SID 0x%x msg 0x%lx, expected 0x%x\n",
+				__func__, vio_msg.tag.vio_sid, &vio_msg,
+				vdc->session_id);
+			vdc_reset_connection(vdc, B_FALSE);
+			mutex_exit(&vdc->lock);
+			return;
+		}
+
+		/* dispatch on the message type */
+		switch (vio_msg.tag.vio_msgtype) {
+		case VIO_TYPE_CTRL:
+			status = vdc_process_ctrl_msg(vdc, vio_msg);
+			break;
+		case VIO_TYPE_DATA:
+			status = vdc_process_data_msg(vdc, vio_msg);
+			break;
+		case VIO_TYPE_ERR:
+			status = vdc_process_err_msg(vdc, vio_msg);
+			break;
+		default:
+			PR1("%s", __func__);
+			status = EINVAL;
+			break;
+		}
+
+		if (status != 0) {
+			PR0("%s[%d] Error (%d) occcurred processing msg\n",
+					__func__, vdc->instance, status);
+			vdc_reset_connection(vdc, B_FALSE);
+		}
+	}
+	_NOTE(NOTREACHED)
+}
+
+/*
+ * Function:
+ *	vdc_process_ctrl_msg()
+ *
+ * Description:
+ *	This function is called by the message processing thread each time
+ *	an LDC message with a msgtype of VIO_TYPE_CTRL is received.  It
+ *	drives the handshake state machine:
+ *	INIT -> VER -> ATTR -> RDX -> DATA.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *	msg	- the LDC message sent by vds
+ *
+ * Return Codes:
+ *	0	- Success.
+ *	EPROTO	- A message was received which shouldn't have happened according
+ *		  to the protocol
+ *	ENOTSUP	- An action which is allowed according to the protocol but which
+ *		  isn't (or doesn't need to be) implemented yet.
+ *	EINVAL	- An invalid value was returned as part of a message.
+ */
+static int
+vdc_process_ctrl_msg(vdc_t *vdc, vio_msg_t msg)
+{
+	size_t			msglen = sizeof (msg);
+	vd_attr_msg_t		*attr_msg = NULL;
+	vio_dring_reg_msg_t	*dring_msg = NULL;
+	int			status = -1;
+
+	ASSERT(msg.tag.vio_msgtype == VIO_TYPE_CTRL);
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	/* Depending on which state we are in; process the message */
+	switch (vdc->state) {
+	case VD_STATE_INIT:
+		/* only a version-negotiation reply is valid here */
+		if (msg.tag.vio_subtype_env != VIO_VER_INFO) {
+			status = EPROTO;
+			break;
+		}
+
+		switch (msg.tag.vio_subtype) {
+		case VIO_SUBTYPE_ACK:
+			vdc->state = VD_STATE_VER;
+			status = vdc_init_attr_negotiation(vdc);
+			break;
+		case VIO_SUBTYPE_NACK:
+			/*
+			 * For now there is only one version number so we
+			 * cannot step back to an earlier version but in the
+			 * future we may need to add further logic here
+			 * to try negotiating an earlier version as the VIO
+			 * design allows for it.
+			 */
+
+			/*
+			 * vds could not handle the version we sent so we just
+			 * stop negotiating.
+			 */
+			status = EPROTO;
+			break;
+
+		case VIO_SUBTYPE_INFO:
+			/*
+			 * Handle the case where vds starts handshake
+			 * (for now only vdc is the instigator)
+			 */
+			status = ENOTSUP;
+			break;
+
+		default:
+			status = ENOTSUP;
+			break;
+		}
+		break;
+
+	case VD_STATE_VER:
+		/* only an attribute-negotiation reply is valid here */
+		if (msg.tag.vio_subtype_env != VIO_ATTR_INFO) {
+			status = EPROTO;
+			break;
+		}
+
+		switch (msg.tag.vio_subtype) {
+		case VIO_SUBTYPE_ACK:
+			/*
+			 * We now verify the attributes sent by vds.
+			 */
+			attr_msg = (vd_attr_msg_t *)&msg;
+			vdc->vdisk_size = attr_msg->vdisk_size;
+			vdc->vdisk_type = attr_msg->vdisk_type;
+
+			if ((attr_msg->max_xfer_sz != vdc->max_xfer_sz) ||
+			    (attr_msg->vdisk_block_size != vdc->block_size)) {
+				/*
+				 * Future support: step down to the block size
+				 * and max transfer size suggested by the
+				 * server. (If this value is less than 128K
+				 * then multiple Dring entries per request
+				 * would need to be implemented)
+				 */
+				cmn_err(CE_NOTE, "[%d] Couldn't process block "
+					"attrs from vds", vdc->instance);
+				status = EINVAL;
+				break;
+			}
+
+			if ((attr_msg->xfer_mode != VIO_DRING_MODE) ||
+			    (attr_msg->vdisk_size > INT64_MAX) ||
+			    (attr_msg->vdisk_type > VD_DISK_TYPE_DISK)) {
+				vdc_msg("%s[%d] Couldn't process attrs "
+				    "from vds", __func__, vdc->instance);
+				status = EINVAL;
+				break;
+			}
+
+			vdc->state = VD_STATE_ATTR;
+			status = vdc_init_dring_negotiate(vdc);
+			break;
+
+		case VIO_SUBTYPE_NACK:
+			/*
+			 * vds could not handle the attributes we sent so we
+			 * stop negotiating.
+			 */
+			status = EPROTO;
+			break;
+
+		case VIO_SUBTYPE_INFO:
+			/*
+			 * Handle the case where vds starts the handshake
+			 * (for now; vdc is the only supported instigator)
+			 */
+			status = ENOTSUP;
+			break;
+
+		default:
+			status = ENOTSUP;
+			break;
+		}
+		break;
+
+
+	case VD_STATE_ATTR:
+		/* only a DRing-registration reply is valid here */
+		if (msg.tag.vio_subtype_env != VIO_DRING_REG) {
+			status = EPROTO;
+			break;
+		}
+
+		switch (msg.tag.vio_subtype) {
+		case VIO_SUBTYPE_ACK:
+			/* Verify that we have sent all the descr. ring info */
+			/* nop for now as we have just 1 dring */
+			dring_msg = (vio_dring_reg_msg_t *)&msg;
+
+			/* save the received dring_ident */
+			vdc->dring_ident = dring_msg->dring_ident;
+			PR0("%s[%d] Received dring ident=0x%lx\n",
+				__func__, vdc->instance, vdc->dring_ident);
+
+			/*
+			 * Send an RDX message to vds to indicate we are ready
+			 * to send data
+			 */
+			msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+			msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+			msg.tag.vio_subtype_env = VIO_RDX;
+			msg.tag.vio_sid = vdc->session_id;
+			status = vdc_send(vdc->ldc_handle, (caddr_t)&msg,
+					&msglen);
+			if (status != 0) {
+				cmn_err(CE_NOTE, "[%d] Failed to send RDX"
+					" message (%d)", vdc->instance, status);
+				break;
+			}
+
+			status = vdc_create_fake_geometry(vdc);
+			if (status != 0) {
+				cmn_err(CE_NOTE, "[%d] Failed to create disk "
+					"geometery(%d)", vdc->instance, status);
+				break;
+			}
+
+			vdc->state = VD_STATE_RDX;
+			break;
+
+		case VIO_SUBTYPE_NACK:
+			/*
+			 * vds could not handle the DRing info we sent so we
+			 * stop negotiating.
+			 */
+			cmn_err(CE_CONT, "server could not register DRing\n");
+			vdc_reset_connection(vdc, B_FALSE);
+			vdc_destroy_descriptor_ring(vdc);
+			status = EPROTO;
+			break;
+
+		case VIO_SUBTYPE_INFO:
+			/*
+			 * Handle the case where vds starts handshake
+			 * (for now only vdc is the instigator)
+			 */
+			status = ENOTSUP;
+			break;
+		default:
+			status = ENOTSUP;
+		}
+		break;
+
+	case VD_STATE_RDX:
+		if (msg.tag.vio_subtype_env != VIO_RDX) {
+			status = EPROTO;
+			break;
+		}
+
+		PR0("%s: Received RDX - handshake successful\n", __func__);
+
+		status = 0;
+		vdc->state = VD_STATE_DATA;
+
+		/* wake up any thread (e.g. attach) waiting for the handshake */
+		cv_broadcast(&vdc->attach_cv);
+		break;
+
+	default:
+		cmn_err(CE_NOTE, "[%d] unknown handshake negotiation state %d",
+				vdc->instance, vdc->state);
+		break;
+	}
+
+	return (status);
+}
+
+
+/*
+ * Function:
+ *	vdc_process_data_msg()
+ *
+ * Description:
+ *	This function is called by the message processing thread each time
+ *	a message with a msgtype of VIO_TYPE_DATA is received. It will either
+ *	be an ACK or NACK from vds[1] which vdc handles as follows.
+ *		ACK	- wake up the waiting thread
+ *		NACK	- resend any messages necessary
+ *
+ *	[1] Although the message format allows it, vds should not send a
+ *	    VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
+ *	    some bizarre reason it does, vdc will reset the connection.
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *	msg	- the LDC message sent by vds
+ *
+ * Return Code:
+ *	0	- Success.
+ *	> 0	- error value returned by LDC
+ */
+static int
+vdc_process_data_msg(vdc_t *vdc, vio_msg_t msg)
+{
+	int			status = 0;
+	vdc_local_desc_t	*local_dep = NULL;
+	vio_dring_msg_t		*dring_msg = NULL;
+	size_t			msglen = sizeof (*dring_msg);
+	uint_t			num_msgs;
+	uint_t			start;
+	uint_t			end;
+	uint_t			i;
+
+	ASSERT(msg.tag.vio_msgtype == VIO_TYPE_DATA);
+	ASSERT(vdc != NULL);
+	ASSERT(mutex_owned(&vdc->lock));
+
+	dring_msg = (vio_dring_msg_t *)&msg;
+
+	/*
+	 * Check to see if the message has bogus data
+	 */
+	start = dring_msg->start_idx;
+	end = dring_msg->end_idx;
+	if ((start >= VD_DRING_LEN) || (end >= VD_DRING_LEN)) {
+		vdc_msg("%s: Bogus ACK data : start %d, end %d\n",
+			__func__, start, end);
+		return (EPROTO);
+	}
+
+	/*
+	 * calculate the number of messages that vds ACK'ed
+	 *
+	 * Assumes, (like the rest of vdc) that there is a 1:1 mapping
+	 * between requests and Dring entries.
+	 */
+	num_msgs = (end >= start) ?
+			(end - start + 1) :
+			(VD_DRING_LEN - start + end + 1);
+
+	/*
+	 * Verify that the sequence number is what vdc expects.
+	 */
+	if (vdc_verify_seq_num(vdc, dring_msg, num_msgs) == B_FALSE) {
+		return (ENXIO);
+	}
+
+	switch (msg.tag.vio_subtype) {
+	case VIO_SUBTYPE_ACK:
+		PR2("%s: DATA ACK\n", __func__);
+
+		/*
+		 * Wake the thread waiting for each DRing entry ACK'ed
+		 */
+		for (i = 0; i < num_msgs; i++) {
+			int idx = (start + i) % VD_DRING_LEN;
+
+			local_dep = &vdc->local_dring[idx];
+			mutex_enter(&local_dep->lock);
+			cv_signal(&local_dep->cv);
+			mutex_exit(&local_dep->lock);
+		}
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		PR0("%s: DATA NACK\n", __func__);
+		dring_msg = (vio_dring_msg_t *)&msg;
+		VDC_DUMP_DRING_MSG(dring_msg);
+
+		/* Resend necessary messages */
+		for (i = 0; i < num_msgs; i++) {
+			int idx = (start + i) % VD_DRING_LEN;
+
+			local_dep = &vdc->local_dring[idx];
+			ASSERT(local_dep != NULL);
+			mutex_enter(&local_dep->lock);
+
+			if (local_dep->dep->hdr.dstate != VIO_DESC_READY) {
+				PR0("%s[%d]: Won't resend entry %d [flag=%d]\n",
+					__func__, vdc->instance, idx,
+					local_dep->dep->hdr.dstate);
+				mutex_exit(&local_dep->lock);
+				break;
+			}
+
+			/* we'll reuse the message passed in */
+			VIO_INIT_DRING_DATA_TAG(msg);
+			dring_msg->tag.vio_sid = vdc->session_id;
+			dring_msg->seq_num = ++(vdc->seq_num);
+			VDC_DUMP_DRING_MSG(dring_msg);
+
+			/*
+			 * Send the message itself, not the address of the
+			 * local pointer variable (the original code passed
+			 * (caddr_t)&dring_msg, i.e. a pointer-to-pointer,
+			 * sending garbage over the channel).
+			 */
+			status = vdc_send(vdc->ldc_handle, (caddr_t)dring_msg,
+					&msglen);
+			PR1("%s: ldc_write() status=%d\n", __func__, status);
+			if (status != 0) {
+				vdc_msg("%s ldc_write(%d)\n", __func__, status);
+				mutex_exit(&local_dep->lock);
+				break;
+			}
+
+			mutex_exit(&local_dep->lock);
+		}
+		break;
+
+	case VIO_SUBTYPE_INFO:
+	default:
+		cmn_err(CE_NOTE, "[%d] Got an unexpected DATA msg [subtype %d]",
+				vdc->instance, msg.tag.vio_subtype);
+		break;
+	}
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_process_err_msg()
+ *
+ * Description:
+ *	Handler for VIO_TYPE_ERR messages: logs and rejects them.
+ *
+ * NOTE: No error messages are used as part of the vDisk protocol
+ *
+ * Return Code:
+ *	ENOTSUP always.
+ */
+static int
+vdc_process_err_msg(vdc_t *vdc, vio_msg_t msg)
+{
+	/*
+	 * 'msg' is only referenced by the (DEBUG-only) ASSERT below;
+	 * 'vdc' IS used (cmn_err), so it must not be tagged ARGUNUSED
+	 * as it previously was.
+	 */
+	_NOTE(ARGUNUSED(msg))
+
+	int	status = ENOTSUP;
+
+	ASSERT(msg.tag.vio_msgtype == VIO_TYPE_ERR);
+	cmn_err(CE_NOTE, "[%d] Got an ERR msg", vdc->instance);
+
+	return (status);
+}
+
+/*
+ * Function:
+ *	vdc_verify_seq_num()
+ *
+ * Description:
+ *	This function verifies that the sequence number sent back by vds with
+ *	the latest message correctly follows the last request processed.
+ *
+ * Arguments:
+ *	vdc		- soft state pointer for this instance of the driver.
+ *	dring_msg	- pointer to the LDC message sent by vds
+ *	num_msgs	- the number of requests being acknowledged
+ *
+ * Return Code:
+ *	B_TRUE	- Success.
+ *	B_FALSE	- The seq numbers are so out of sync, vdc cannot deal with them
+ */
+static boolean_t
+vdc_verify_seq_num(vdc_t *vdc, vio_dring_msg_t *dring_msg, int num_msgs)
+{
+	ASSERT(vdc != NULL);
+	ASSERT(dring_msg != NULL);
+
+	/*
+	 * Check to see if the messages were responded to in the correct
+	 * order by vds.  Whether the returned seq_num is lower or higher
+	 * than expected, something is seriously wrong, so the caller will
+	 * reset the connection.  (The original code distinguished the two
+	 * cases with an if/else whose branches were identical - both
+	 * returned B_FALSE - so they are collapsed here.)
+	 */
+	if (dring_msg->seq_num != (vdc->seq_num_reply + num_msgs)) {
+		vdc_msg("%s[%d]: Bogus seq_num %d, expected %d\n",
+			__func__, vdc->instance, dring_msg->seq_num,
+			vdc->seq_num_reply + num_msgs);
+		return (B_FALSE);
+	}
+
+	/* account for the requests just acknowledged */
+	vdc->seq_num_reply += num_msgs;
+
+	return (B_TRUE);
+}
+
+/* -------------------------------------------------------------------------- */
+
+/*
+ * DKIO(7) support
+ *
+ * XXX FIXME - needs to be converted to use the structures defined in the
+ * latest VIO spec to communicate with the vDisk server.
+ */
+
+/*
+ * Argument block handed to vdc_dkio_flush_cb() when a
+ * DKIOCFLUSHWRITECACHE request is dispatched asynchronously.
+ */
+typedef struct vdc_dk_arg {
+	struct dk_callback	dkc;	/* caller's completion callback */
+	int			mode;	/* ioctl mode flags (FKIOCTL, etc.) */
+	dev_t			dev;	/* device the flush was issued on */
+	vdc_t			*vdc;	/* soft state of this instance */
+} vdc_dk_arg_t;
+
+/*
+ * Function:
+ * 	vdc_dkio_flush_cb()
+ *
+ * Description:
+ *	This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
+ *	by kernel code.  It issues the flush request to vds and then notifies
+ *	the caller through the supplied dk_callback with the actual status.
+ *
+ * Arguments:
+ *	arg	- a pointer to a vdc_dk_arg_t structure.
+ */
+void
+vdc_dkio_flush_cb(void *arg)
+{
+	struct vdc_dk_arg	*dk_arg = (struct vdc_dk_arg *)arg;
+	struct dk_callback	*dkc = NULL;
+	vdc_t			*vdc = NULL;
+	int			rv;
+
+	if (dk_arg == NULL) {
+		vdc_msg("%s[?] DKIOCFLUSHWRITECACHE arg is NULL\n", __func__);
+		return;
+	}
+	dkc = &dk_arg->dkc;
+	vdc = dk_arg->vdc;
+	ASSERT(vdc != NULL);
+
+	rv = vdc_populate_descriptor(vdc, NULL, 0, VD_OP_FLUSH,
+		dk_arg->mode, SDPART(getminor(dk_arg->dev)));
+	if (rv != 0) {
+		/*
+		 * Log the failure but fall through: the original early
+		 * return here skipped the completion callback AND leaked
+		 * dkio_flush_pending, so the count never drained.
+		 */
+		PR0("%s[%d] DKIOCFLUSHWRITECACHE failed : model %x\n",
+			__func__, vdc->instance,
+			ddi_model_convert_from(dk_arg->mode & FMODELS));
+	}
+
+	/*
+	 * Trigger the call back to notify the caller the ioctl call has
+	 * been completed.  Report the real status 'rv' (previously a
+	 * hard-coded ENOTSUP was passed even on success).
+	 */
+	if ((dk_arg->mode & FKIOCTL) &&
+	    (dkc != NULL) &&
+	    (dkc->dkc_callback != NULL)) {
+		ASSERT(dkc->dkc_cookie != NULL);
+		(*dkc->dkc_callback)(dkc->dkc_cookie, rv);
+	}
+
+	/* Indicate that one less DKIO write flush is outstanding */
+	mutex_enter(&vdc->lock);
+	vdc->dkio_flush_pending--;
+	ASSERT(vdc->dkio_flush_pending >= 0);
+	mutex_exit(&vdc->lock);
+}
+
+
+/*
+ * This structure is used in the DKIO(7I) array below.  It maps a Solaris
+ * ioctl command onto the corresponding vDisk protocol operation and
+ * records how much data (if any) must be copied between userland and the
+ * kernel for it.
+ */
+typedef struct vdc_dk_ioctl {
+	uint8_t		op;		/* VD_OP_XXX value */
+	int		cmd;		/* Solaris ioctl operation number */
+	uint8_t		copy;		/* VD_COPYIN and/or VD_COPYOUT flags */
+	size_t		nbytes;		/* size of structure to be copied */
+	size_t		nbytes32;	/* size of 32bit struct if different */
+					/*   to 64bit struct (zero otherwise) */
+} vdc_dk_ioctl_t;
+
+/*
+ * Subset of DKIO(7I) operations currently supported.
+ *
+ * NOTE: lookups (vd_process_ioctl) stop at the FIRST matching 'cmd', so
+ * each ioctl may appear only once.  A duplicate DKIOCSGEOM entry (with a
+ * contradictory VD_COPYOUT flag) was unreachable and has been removed.
+ * Entries with op == 0 are handled locally and never sent to vds.
+ */
+static vdc_dk_ioctl_t	dk_ioctl[] = {
+	{VD_OP_FLUSH, DKIOCFLUSHWRITECACHE, 0,
+		0, 0},
+	{VD_OP_GET_WCE, DKIOCGETWCE, 0,
+		0, 0},
+	{VD_OP_SET_WCE, DKIOCSETWCE, 0,
+		0, 0},
+	{VD_OP_GET_VTOC, DKIOCGVTOC, VD_COPYOUT,
+		sizeof (struct vtoc), sizeof (struct vtoc32)},
+	{VD_OP_SET_VTOC, DKIOCSVTOC, VD_COPYIN,
+		sizeof (struct vtoc), sizeof (struct vtoc32)},
+	{VD_OP_SET_DISKGEOM, DKIOCSGEOM, VD_COPYIN,
+		sizeof (struct dk_geom), 0},
+	{VD_OP_GET_DISKGEOM, DKIOCGGEOM, VD_COPYOUT,
+		sizeof (struct dk_geom), 0},
+	{VD_OP_GET_DISKGEOM, DKIOCG_PHYGEOM, VD_COPYOUT,
+		sizeof (struct dk_geom), 0},
+	{VD_OP_GET_DISKGEOM, DKIOCG_VIRTGEOM, VD_COPYOUT,
+		sizeof (struct dk_geom), 0},
+	{VD_OP_SCSICMD, USCSICMD, VD_COPYIN|VD_COPYOUT,
+		sizeof (struct uscsi_cmd), sizeof (struct uscsi_cmd32)},
+	{0, DKIOCINFO, VD_COPYOUT,
+		sizeof (struct dk_cinfo), 0},
+	{0, DKIOCGMEDIAINFO, VD_COPYOUT,
+		sizeof (struct dk_minfo), 0},
+	{0, DKIOCREMOVABLE, 0,
+		0, 0},
+	{0, CDROMREADOFFSET, 0,
+		0, 0}
+};
+
+/*
+ * Function:
+ *	vd_process_ioctl()
+ *
+ * Description:
+ *	This routine is the driver entry point for handling user
+ *	requests to get the device geometry.
+ *
+ * Arguments:
+ *	dev	- the device number
+ *	cmd	- the operation [dkio(7I)] to be processed
+ *	arg	- pointer to user provided structure
+ *		  (contains data to be set or reference parameter for get)
+ *	mode	- bit flag, indicating open settings, 32/64 bit type, etc
+ *	rvalp	- calling process return value, used in some ioctl calls
+ *		  (passed throught to vds who fills in the value)
+ *
+ * Assumptions:
+ *	vds will make the ioctl calls in the 64 bit address space so vdc
+ *	will convert the data to/from 32 bit as necessary before doing
+ *	the copyin or copyout.
+ *
+ * Return Code:
+ *	0
+ *	EFAULT
+ *	ENXIO
+ *	EIO
+ *	ENOTSUP
+ */
+static int
+vd_process_ioctl(dev_t dev, int cmd, caddr_t arg, int mode)
+{
+	int		instance = SDUNIT(getminor(dev));
+	vdc_t		*vdc = NULL;
+	int		op = -1;		/* VD_OP_XXX value */
+	int		rv = -1;
+	int		idx = 0;		/* index into dk_ioctl[] */
+	size_t		len = 0;		/* #bytes to send to vds */
+	size_t		alloc_len = 0;		/* #bytes to allocate mem for */
+	size_t		copy_len = 0;		/* #bytes to copy in/out */
+	caddr_t		mem_p = NULL;
+	boolean_t	do_convert_32to64 = B_FALSE;
+	size_t		nioctls = (sizeof (dk_ioctl)) / (sizeof (dk_ioctl[0]));
+
+	PR0("%s: Processing ioctl(%x) for dev %x : model %x\n",
+		__func__, cmd, dev, ddi_model_convert_from(mode & FMODELS));
+
+	vdc = ddi_get_soft_state(vdc_state, instance);
+	if (vdc == NULL) {
+		cmn_err(CE_NOTE, "![%d] Could not get soft state structure",
+		    instance);
+		return (ENXIO);
+	}
+
+	/*
+	 * Check to see if we can communicate with the vDisk server
+	 */
+	rv = vdc_is_able_to_tx_data(vdc, O_NONBLOCK);
+	if (rv == B_FALSE) {
+		PR0("%s[%d] Not ready to transmit data\n", __func__, instance);
+		return (ENOLINK);
+	}
+
+	/*
+	 * Validate the ioctl operation to be performed.
+	 *
+	 * If we have looped through the array without finding a match then we
+	 * don't support this ioctl.
+	 */
+	for (idx = 0; idx < nioctls; idx++) {
+		if (cmd == dk_ioctl[idx].cmd)
+			break;
+	}
+
+	if (idx >= nioctls) {
+		PR0("%s[%d] Unsupported ioctl(%x)\n",
+				__func__, vdc->instance, cmd);
+		return (ENOTSUP);
+	}
+
+	copy_len = len = dk_ioctl[idx].nbytes;
+	op = dk_ioctl[idx].op;
+
+	/*
+	 * Some ioctl operations have different sized structures for 32 bit
+	 * and 64 bit. If the userland caller is 32 bit, we need to check
+	 * to see if the operation is one of those special cases and
+	 * flag that we need to convert to and/or from 32 bit since vds
+	 * will make the call as 64 bit.
+	 */
+	if ((ddi_model_convert_from(mode & FMODELS) == DDI_MODEL_ILP32) &&
+	    (dk_ioctl[idx].nbytes != 0) &&
+	    (dk_ioctl[idx].nbytes32 != 0)) {
+		do_convert_32to64 = B_TRUE;
+		copy_len = dk_ioctl[idx].nbytes32;
+	}
+
+	/*
+	 * Deal with the ioctls which the server does not provide.
+	 */
+	switch (cmd) {
+	case CDROMREADOFFSET:
+	case DKIOCREMOVABLE:
+		return (ENOTTY);
+
+	case DKIOCINFO:
+		{
+			struct dk_cinfo	cinfo;
+			if (vdc->cinfo == NULL)
+				return (ENXIO);
+
+			bcopy(vdc->cinfo, &cinfo, sizeof (struct dk_cinfo));
+			cinfo.dki_partition = SDPART(getminor(dev));
+
+			rv = ddi_copyout(&cinfo, (void *)arg,
+					sizeof (struct dk_cinfo), mode);
+			if (rv != 0)
+				return (EFAULT);
+
+			return (0);
+		}
+
+	case DKIOCGMEDIAINFO:
+		if (vdc->minfo == NULL)
+			return (ENXIO);
+
+		rv = ddi_copyout(vdc->minfo, (void *)arg,
+				sizeof (struct dk_minfo), mode);
+		if (rv != 0)
+			return (EFAULT);
+
+		return (0);
+	}
+
+	/* catch programming error in vdc - should be a VD_OP_XXX ioctl */
+	ASSERT(op != 0);
+
+	/* LDC requires that the memory being mapped is 8-byte aligned */
+	alloc_len = P2ROUNDUP(len, sizeof (uint64_t));
+	PR1("%s[%d]: struct size %d alloc %d\n",
+			__func__, instance, len, alloc_len);
+
+	if (alloc_len != 0)
+		mem_p = kmem_zalloc(alloc_len, KM_SLEEP);
+
+	if (dk_ioctl[idx].copy & VD_COPYIN) {
+		if (arg == NULL) {
+			if (mem_p != NULL)
+				kmem_free(mem_p, alloc_len);
+			return (EINVAL);
+		}
+
+		ASSERT(copy_len != 0);
+
+		rv = ddi_copyin((void *)arg, mem_p, copy_len, mode);
+		if (rv != 0) {
+			if (mem_p != NULL)
+				kmem_free(mem_p, alloc_len);
+			return (EFAULT);
+		}
+
+		/*
+		 * some operations need the data to be converted from 32 bit
+		 * to 64 bit structures so that vds can process them on the
+		 * other side.
+		 */
+		if (do_convert_32to64) {
+			switch (cmd) {
+			case DKIOCSVTOC:
+			{
+				struct vtoc	vt;
+				struct vtoc32	vt32;
+
+				ASSERT(mem_p != NULL);
+				vt32 = *((struct vtoc32 *)(mem_p));
+
+				vtoc32tovtoc(vt32, vt);
+				bcopy(&vt, mem_p, len);
+				break;
+			}
+
+			case USCSICMD:
+			{
+				struct uscsi_cmd	scmd;
+				struct uscsi_cmd	*uscmd = &scmd;
+				struct uscsi_cmd32	*uscmd32;
+
+				ASSERT(mem_p != NULL);
+				uscmd32 = (struct uscsi_cmd32 *)mem_p;
+
+				/*
+				 * Convert the ILP32 uscsi data from the
+				 * application to LP64 for internal use.
+				 */
+				uscsi_cmd32touscsi_cmd(uscmd32, uscmd);
+				bcopy(uscmd, mem_p, len);
+				break;
+			}
+			default:
+				break;
+			}
+		}
+	}
+
+	/*
+	 * handle the special case of DKIOCFLUSHWRITECACHE
+	 */
+	if (cmd == DKIOCFLUSHWRITECACHE) {
+		struct dk_callback *dkc = (struct dk_callback *)arg;
+
+		PR0("%s[%d]: DKIOCFLUSHWRITECACHE\n", __func__, instance);
+
+		/* no mem should have been allocated hence no need to free it */
+		ASSERT(mem_p == NULL);
+
+		/*
+		 * If arg is NULL, we break here and the call operates
+		 * synchronously; waiting for vds to return.
+		 *
+		 * i.e. after the request to vds returns successfully,
+		 * all writes completed prior to the ioctl will have been
+		 * flushed from the disk write cache to persistent media.
+		 */
+		if (dkc != NULL) {
+			vdc_dk_arg_t	arg;
+			arg.mode = mode;
+			arg.dev = dev;
+			bcopy(dkc, &arg.dkc, sizeof (*dkc));
+
+			mutex_enter(&vdc->lock);
+			vdc->dkio_flush_pending++;
+			arg.vdc = vdc;
+			mutex_exit(&vdc->lock);
+
+			/* put the request on a task queue */
+			rv = taskq_dispatch(system_taskq, vdc_dkio_flush_cb,
+				(void *)&arg, DDI_SLEEP);
+
+			return (rv == NULL ? ENOMEM : 0);
+		}
+	}
+
+	/*
+	 * send request to vds to service the ioctl.
+	 */
+	rv = vdc_populate_descriptor(vdc, mem_p, alloc_len, op, mode,
+			SDPART((getminor(dev))));
+	if (rv != 0) {
+		/*
+		 * This is not necessarily an error. The ioctl could
+		 * be returning a value such as ENOTTY to indicate
+		 * that the ioctl is not applicable.
+		 */
+		PR0("%s[%d]: vds returned %d for ioctl 0x%x\n",
+			__func__, instance, rv, cmd);
+		if (mem_p != NULL)
+			kmem_free(mem_p, alloc_len);
+		return (rv);
+	}
+
+	/*
+	 * If the VTOC has been changed, then vdc needs to update the copy
+ * it saved in the soft state structure and try to update the device
+ * node properties. Failing to set the properties should not cause
+ * an error to be returned to the caller though.
+	 */
+	if (cmd == DKIOCSVTOC) {
+		bcopy(mem_p, vdc->vtoc, sizeof (struct vtoc));
+		if (vdc_create_device_nodes_props(vdc)) {
+			cmn_err(CE_NOTE, "![%d] Failed to update device nodes"
+				" properties", instance);
+		}
+	}
+
+	/*
+	 * if we don't have to do a copyout, we have nothing left to do
+	 * so we just return.
+	 */
+	if ((dk_ioctl[idx].copy & VD_COPYOUT) == 0) {
+		if (mem_p != NULL)
+			kmem_free(mem_p, alloc_len);
+		return (0);
+	}
+
+	/* sanity check */
+	if (mem_p == NULL)
+		return (EFAULT);
+
+
+	/*
+	 * some operations need the data to be converted from 64 bit
+	 * back to 32 bit structures after vds has processed them.
+	 */
+	if (do_convert_32to64) {
+		switch (cmd) {
+		case DKIOCGVTOC:
+		{
+			struct vtoc	vt;
+			struct vtoc32	vt32;
+
+			ASSERT(mem_p != NULL);
+			vt = *((struct vtoc *)(mem_p));
+
+			vtoctovtoc32(vt, vt32);
+			bcopy(&vt32, mem_p, copy_len);
+			break;
+		}
+
+		case USCSICMD:
+		{
+			struct uscsi_cmd32	*uc32;
+			struct uscsi_cmd	*uc;
+
+			len = sizeof (struct uscsi_cmd32);
+
+			ASSERT(mem_p != NULL);
+			uc = (struct uscsi_cmd *)mem_p;
+			uc32 = kmem_zalloc(len, KM_SLEEP);
+
+			uscsi_cmdtouscsi_cmd32(uc, uc32);
+			bcopy(uc32, mem_p, copy_len);
+			PR0("%s[%d]: uscsi_cmd32:%x\n", __func__, instance,
+				((struct uscsi_cmd32 *)mem_p)->uscsi_cdblen);
+			kmem_free(uc32, len);
+			break;
+		}
+		default:
+			PR1("%s[%d]: This mode (%x) should just work for(%x)\n",
+				__func__, instance, mode, cmd);
+			break;
+		}
+	}
+
+	ASSERT(len != 0);
+	ASSERT(mem_p != NULL);
+
+	rv = ddi_copyout(mem_p, (void *)arg, copy_len, mode);
+	if (rv != 0) {
+		vdc_msg("%s[%d]: Could not do copy out for ioctl (%x)\n",
+			__func__, instance, cmd);
+		rv = EFAULT;
+	}
+
+	if (mem_p != NULL)
+		kmem_free(mem_p, alloc_len);
+
+	return (rv);
+}
+
+/*
+ * Function:
+ *	vdc_create_fake_geometry()
+ *
+ * Description:
+ *	This routine fakes up the disk info needed for some DKIO ioctls.
+ *		- DKIOCINFO
+ *		- DKIOCGMEDIAINFO
+ *
+ *	[ just like lofi(7D) and ramdisk(7D) ]
+ *
+ * Arguments:
+ *	vdc	- soft state pointer for this instance of the device driver.
+ *
+ * Return Code:
+ *	0	- Success
+ */
+static int
+vdc_create_fake_geometry(vdc_t *vdc)
+{
+	ASSERT(vdc != NULL);
+
+	/*
+	 * DKIOCINFO support
+	 */
+	vdc->cinfo = kmem_zalloc(sizeof (struct dk_cinfo), KM_SLEEP);
+
+	(void) strcpy(vdc->cinfo->dki_cname, VDC_DRIVER_NAME);
+	(void) strcpy(vdc->cinfo->dki_dname, VDC_DRIVER_NAME);
+	vdc->cinfo->dki_maxtransfer = vdc->max_xfer_sz / vdc->block_size;
+	vdc->cinfo->dki_ctype = DKC_SCSI_CCS;
+	vdc->cinfo->dki_flags = DKI_FMTVOL;
+	vdc->cinfo->dki_cnum = 0;
+	vdc->cinfo->dki_addr = 0;
+	vdc->cinfo->dki_space = 0;
+	vdc->cinfo->dki_prio = 0;
+	vdc->cinfo->dki_vec = 0;
+	vdc->cinfo->dki_unit = vdc->instance;
+	vdc->cinfo->dki_slave = 0;
+	/*
+	 * The partition number will be created on the fly depending on the
+	 * actual slice (i.e. minor node) that is used to request the data.
+	 */
+	vdc->cinfo->dki_partition = 0;
+
+	/*
+	 * DKIOCGMEDIAINFO support
+	 */
+	vdc->minfo = kmem_zalloc(sizeof (struct dk_minfo), KM_SLEEP);
+	vdc->minfo->dki_media_type = DK_FIXED_DISK;
+	vdc->minfo->dki_capacity = 1;
+	vdc->minfo->dki_lbsize = DEV_BSIZE;
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vds.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,2013 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Virtual disk server
+ */
+
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/dkio.h>
+#include <sys/file.h>
+#include <sys/mdeg.h>
+#include <sys/modhash.h>
+#include <sys/note.h>
+#include <sys/pathname.h>
+#include <sys/sunddi.h>
+#include <sys/sunldi.h>
+#include <sys/sysmacros.h>
+#include <sys/vio_common.h>
+#include <sys/vdsk_mailbox.h>
+#include <sys/vdsk_common.h>
+#include <sys/vtoc.h>
+#include <sys/scsi/impl/uscsi.h>
+
+
+/* Virtual disk server initialization flags */
+#define	VDS_LOCKING		0x01
+#define	VDS_LDI			0x02
+#define	VDS_MDEG		0x04
+
+/* Virtual disk server tunable parameters */
+#define	VDS_LDC_RETRIES		3
+#define	VDS_NCHAINS		32
+
+/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
+#define	VDS_NAME		"virtual-disk-server"
+
+#define	VD_NAME			"vd"
+#define	VD_VOLUME_NAME		"vdisk"
+#define	VD_ASCIILABEL		"Virtual Disk"
+
+#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
+#define	VD_ID_PROP		"id"
+#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
+
+/* Virtual disk initialization flags */
+#define	VD_LOCKING		0x01
+#define	VD_TASKQ		0x02
+#define	VD_LDC			0x04
+#define	VD_DRING		0x08
+#define	VD_SID			0x10
+#define	VD_SEQ_NUM		0x20
+
+/* Flags for opening/closing backing devices via LDI */
+#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)
+
+/*
+ * By Solaris convention, slice/partition 2 represents the entire disk;
+ * unfortunately, this convention does not appear to be codified.
+ */
+#define	VD_ENTIRE_DISK_SLICE	2
+
+/* Return a cpp token as a string */
+#define	STRINGIZE(token)	#token
+
+/*
+ * Print a message prefixed with the current function name to the message log
+ * (and optionally to the console for verbose boots); these macros use cpp's
+ * concatenation of string literals and C99 variable-length-argument-list
+ * macros
+ */
+#define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
+#define	_PRN(format, ...)					\
+	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
+
+/* Return a pointer to the "i"th vdisk dring element */
+#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
+	    (vd->dring + (i)*vd->descriptor_size))
+
+/* Return the virtual disk client's type as a string (for use in messages) */
+#define	VD_CLIENT(vd)							\
+	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
+	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
+		(((vd)->xfer_mode == 0) ? "null client" :		\
+		    "unsupported client")))
+
+/* Debugging macros */
+#ifdef DEBUG
+#define	PR0 if (vd_msglevel > 0)	PRN
+#define	PR1 if (vd_msglevel > 1)	PRN
+#define	PR2 if (vd_msglevel > 2)	PRN
+
+#define	VD_DUMP_DRING_ELEM(elem)					\
+	PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
+	    elem->hdr.dstate,						\
+	    elem->payload.operation,					\
+	    elem->payload.status,					\
+	    elem->payload.nbytes,					\
+	    elem->payload.addr,						\
+	    elem->payload.ncookies);
+
+#else	/* !DEBUG */
+#define	PR0(...)
+#define	PR1(...)
+#define	PR2(...)
+
+#define	VD_DUMP_DRING_ELEM(elem)
+
+#endif	/* DEBUG */
+
+
+typedef struct vds {
+	uint_t		initialized;	/* driver inst initialization flags */
+	dev_info_t	*dip;		/* driver inst devinfo pointer */
+	kmutex_t	lock;		/* lock for this structure */
+	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
+	mod_hash_t	*vd_table;	/* table of virtual disks served */
+	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
+} vds_t;
+
+typedef struct vd {
+	uint_t			initialized;	/* vdisk initialization flags */
+	kmutex_t		lock;		/* lock for this structure */
+	vds_t			*vds;		/* server for this vdisk */
+	ddi_taskq_t		*taskq;		/* taskq for this vdisk */
+	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
+	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
+	uint_t			nslices;	/* number of slices */
+	size_t			vdisk_size;	/* number of blocks in vdisk */
+	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
+	boolean_t		pseudo;		/* underlying pseudo dev */
+	struct dk_geom		dk_geom;	/* synthetic for slice type */
+	struct vtoc		vtoc;		/* synthetic for slice type */
+	ldc_status_t		ldc_state;	/* LDC connection state */
+	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
+	size_t			max_msglen;	/* largest LDC message len */
+	boolean_t		enabled;	/* whether vdisk is enabled */
+	vd_state_t		state;		/* client handshake state */
+	uint8_t			xfer_mode;	/* transfer mode with client */
+	uint32_t		sid;		/* client's session ID */
+	uint64_t		seq_num;	/* message sequence number */
+	uint64_t		dring_ident;	/* identifier of dring */
+	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
+	uint32_t		descriptor_size;	/* num bytes in desc */
+	uint32_t		dring_len;	/* number of dring elements */
+	caddr_t			dring;		/* address of dring */
+} vd_t;
+
+typedef struct vds_operation {
+	uint8_t	operation;
+	int	(*function)(vd_t *vd, vd_dring_payload_t *request);
+} vds_operation_t;
+
+typedef struct ioctl {
+	uint8_t		operation;
+	const char	*operation_name;
+	int		cmd;
+	const char	*cmd_name;
+	uint_t		copy;
+	size_t		nbytes;
+} ioctl_t;
+
+
+static int	vds_ldc_retries = VDS_LDC_RETRIES;
+static void	*vds_state;
+static uint64_t	vds_operations;	/* see vds_operation[] definition below */
+
+static int	vd_open_flags = VD_OPEN_FLAGS;
+
+#ifdef DEBUG
+static int	vd_msglevel;
+#endif /* DEBUG */
+
+
+static int
+vd_bread(vd_t *vd, vd_dring_payload_t *request)
+{
+	int		status;
+	struct buf	buf;
+
+	PR1("Read %lu bytes at block %lu", request->nbytes, request->addr);
+	if (request->nbytes == 0)
+		return (EINVAL);	/* no service for trivial requests */
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(request->slice < vd->nslices);
+
+	bioinit(&buf);
+	buf.b_flags	= B_BUSY | B_READ;
+	buf.b_bcount	= request->nbytes;
+	buf.b_un.b_addr = kmem_alloc(buf.b_bcount, KM_SLEEP);
+	buf.b_lblkno	= request->addr;
+	buf.b_edev	= vd->dev[request->slice];
+
+	if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0)
+		status = biowait(&buf);
+	biofini(&buf);
+	if ((status == 0) &&
+	    ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0,
+		    &request->nbytes, request->cookie, request->ncookies,
+		    LDC_COPY_OUT)) != 0)) {
+		PRN("ldc_mem_copy() returned errno %d copying to client",
+		    status);
+	}
+	kmem_free(buf.b_un.b_addr, buf.b_bcount);	/* nbytes can change */
+	return (status);
+}
+
+static int
+vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes,
+    ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data)
+{
+	int		status;
+	struct buf	buf;
+
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(slice < vd->nslices);
+	ASSERT(nbytes != 0);
+	ASSERT(data != NULL);
+
+	/* Get data from client */
+	if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes,
+		    cookie, ncookies, LDC_COPY_IN)) != 0) {
+		PRN("ldc_mem_copy() returned errno %d copying from client",
+		    status);
+		return (status);
+	}
+
+	bioinit(&buf);
+	buf.b_flags	= B_BUSY | B_WRITE;
+	buf.b_bcount	= nbytes;
+	buf.b_un.b_addr	= data;
+	buf.b_lblkno	= block;
+	buf.b_edev	= vd->dev[slice];
+
+	if ((status = ldi_strategy(vd->ldi_handle[slice], &buf)) == 0)
+		status = biowait(&buf);
+	biofini(&buf);
+	return (status);
+}
+
+static int
+vd_bwrite(vd_t *vd, vd_dring_payload_t *request)
+{
+	int	status;
+	caddr_t	data;
+
+
+	PR1("Write %ld bytes at block %lu", request->nbytes, request->addr);
+	if (request->nbytes == 0)
+		return (EINVAL);	/* no service for trivial requests */
+	data = kmem_alloc(request->nbytes, KM_SLEEP);
+	status = vd_do_bwrite(vd, request->slice, request->addr,
+	    request->nbytes, request->cookie, request->ncookies, data);
+	kmem_free(data, request->nbytes);
+	return (status);
+}
+
+static int
+vd_do_slice_ioctl(vd_t *vd, int cmd, void *buf)
+{
+	switch (cmd) {
+	case DKIOCGGEOM:
+		ASSERT(buf != NULL);
+		bcopy(&vd->dk_geom, buf, sizeof (vd->dk_geom));
+		return (0);
+	case DKIOCGVTOC:
+		ASSERT(buf != NULL);
+		bcopy(&vd->vtoc, buf, sizeof (vd->vtoc));
+		return (0);
+	default:
+		return (ENOTSUP);
+	}
+}
+
+static int
+vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, ioctl_t *ioctl)
+{
+	int	rval = 0, status;
+	size_t	nbytes = request->nbytes;	/* modifiable copy */
+
+
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(request->slice < vd->nslices);
+	PR0("Performing %s", ioctl->operation_name);
+
+	/* Get data from client, if necessary */
+	if (ioctl->copy & VD_COPYIN)  {
+		ASSERT(nbytes != 0 && buf != NULL);
+		PR1("Getting \"arg\" data from client");
+		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
+			    request->cookie, request->ncookies,
+			    LDC_COPY_IN)) != 0) {
+			PRN("ldc_mem_copy() returned errno %d "
+			    "copying from client", status);
+			return (status);
+		}
+	}
+
+	/*
+	 * Handle single-slice block devices internally; otherwise, have the
+	 * real driver perform the ioctl()
+	 */
+	if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
+		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, buf)) != 0)
+			return (status);
+	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
+		    ioctl->cmd, (intptr_t)buf, FKIOCTL, kcred, &rval)) != 0) {
+		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
+		return (status);
+	}
+#ifdef DEBUG
+	if (rval != 0) {
+		PRN("%s set rval = %d, which is not being returned to client",
+		    ioctl->cmd_name, rval);
+	}
+#endif /* DEBUG */
+
+	/* Send data to client, if necessary */
+	if (ioctl->copy & VD_COPYOUT)  {
+		ASSERT(nbytes != 0 && buf != NULL);
+		PR1("Sending \"arg\" data to client");
+		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
+			    request->cookie, request->ncookies,
+			    LDC_COPY_OUT)) != 0) {
+			PRN("ldc_mem_copy() returned errno %d "
+			    "copying to client", status);
+			return (status);
+		}
+	}
+
+	return (status);
+}
+
+#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
+static int
+vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
+{
+	static ioctl_t	ioctl[] = {
+		/* Command (no-copy) operations */
+		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), DKIOCFLUSHWRITECACHE,
+		    STRINGIZE(DKIOCFLUSHWRITECACHE), 0, 0},
+
+		/* "Get" (copy-out) operations */
+		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), DKIOCGETWCE,
+		    STRINGIZE(DKIOCGETWCE), VD_COPYOUT, RNDSIZE(int)},
+		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), DKIOCGGEOM,
+		    STRINGIZE(DKIOCGGEOM), VD_COPYOUT, RNDSIZE(struct dk_geom)},
+		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), DKIOCGVTOC,
+		    STRINGIZE(DKIOCGVTOC), VD_COPYOUT, RNDSIZE(struct vtoc)},
+
+		/* "Set" (copy-in) operations */
+		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), DKIOCSETWCE,
+		    STRINGIZE(DKIOCSETWCE), VD_COPYOUT, RNDSIZE(int)},
+		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), DKIOCSGEOM,
+		    STRINGIZE(DKIOCSGEOM), VD_COPYIN, RNDSIZE(struct dk_geom)},
+		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), DKIOCSVTOC,
+		    STRINGIZE(DKIOCSVTOC), VD_COPYIN, RNDSIZE(struct vtoc)},
+
+		/* "Get/set" (copy-in/copy-out) operations */
+		{VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), USCSICMD,
+		    STRINGIZE(USCSICMD), VD_COPYIN|VD_COPYOUT,
+		    RNDSIZE(struct uscsi_cmd)}
+
+	};
+	int		i, status;
+	void		*buf = NULL;
+	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
+
+
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(request->slice < vd->nslices);
+
+	/*
+	 * Determine ioctl corresponding to caller's "operation" and
+	 * validate caller's "nbytes"
+	 */
+	for (i = 0; i < nioctls; i++) {
+		if (request->operation == ioctl[i].operation) {
+			if (request->nbytes > ioctl[i].nbytes) {
+				PRN("%s:  Expected <= %lu \"nbytes\", "
+				    "got %lu", ioctl[i].operation_name,
+				    ioctl[i].nbytes, request->nbytes);
+				return (EINVAL);
+			} else if ((request->nbytes % sizeof (uint64_t)) != 0) {
+				PRN("%s:  nbytes = %lu not a multiple of %lu",
+				    ioctl[i].operation_name, request->nbytes,
+				    sizeof (uint64_t));
+				return (EINVAL);
+			}
+
+			break;
+		}
+	}
+	ASSERT(i < nioctls);	/* because "operation" already validated */
+
+	if (request->nbytes)
+		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
+	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
+	if (request->nbytes)
+		kmem_free(buf, request->nbytes);
+	return (status);
+}
+
+/*
+ * Define the supported operations once the functions for performing them have
+ * been defined
+ */
+static const vds_operation_t	vds_operation[] = {
+	{VD_OP_BREAD,		vd_bread},
+	{VD_OP_BWRITE,		vd_bwrite},
+	{VD_OP_FLUSH,		vd_ioctl},
+	{VD_OP_GET_WCE,		vd_ioctl},
+	{VD_OP_SET_WCE,		vd_ioctl},
+	{VD_OP_GET_VTOC,	vd_ioctl},
+	{VD_OP_SET_VTOC,	vd_ioctl},
+	{VD_OP_GET_DISKGEOM,	vd_ioctl},
+	{VD_OP_SET_DISKGEOM,	vd_ioctl},
+	{VD_OP_SCSICMD,		vd_ioctl}
+};
+
+static const size_t	vds_noperations =
+	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
+
+/*
+ * Process a request using a defined operation
+ */
+static int
+vd_process_request(vd_t *vd, vd_dring_payload_t *request)
+{
+	int	i;
+
+
+	PR1("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+
+	/* Range-check slice */
+	if (request->slice >= vd->nslices) {
+		PRN("Invalid \"slice\" %u (max %u) for virtual disk",
+		    request->slice, (vd->nslices - 1));
+		return (EINVAL);
+	}
+
+	/* Perform the requested operation */
+	for (i = 0; i < vds_noperations; i++)
+		if (request->operation == vds_operation[i].operation)
+			return (vds_operation[i].function(vd, request));
+
+	/* No matching operation found */
+	PRN("Unsupported operation %u", request->operation);
+	return (ENOTSUP);
+}
+
+static int
+send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
+{
+	int	retry, status;
+	size_t	nbytes;
+
+
+	for (retry = 0, status = EWOULDBLOCK;
+	    retry < vds_ldc_retries && status == EWOULDBLOCK;
+	    retry++) {
+		PR1("ldc_write() attempt %d", (retry + 1));
+		nbytes = msglen;
+		status = ldc_write(ldc_handle, msg, &nbytes);
+	}
+
+	if (status != 0) {
+		PRN("ldc_write() returned errno %d", status);
+		return (status);
+	} else if (nbytes != msglen) {
+		PRN("ldc_write() performed only partial write");
+		return (EIO);
+	}
+
+	PR1("SENT %lu bytes", msglen);
+	return (0);
+}
+
+/*
+ * Return 1 if the "type", "subtype", and "env" fields of the "tag" first
+ * argument match the corresponding remaining arguments; otherwise, return 0
+ */
+int
+vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
+{
+	return ((tag->vio_msgtype == type) &&
+		(tag->vio_subtype == subtype) &&
+		(tag->vio_subtype_env == env)) ? 1 : 0;
+}
+
+static int
+process_ver_msg(vio_msg_t *msg, size_t msglen)
+{
+	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
+
+
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
+		VIO_VER_INFO)) {
+		return (ENOMSG);	/* not a version message */
+	}
+
+	if (msglen != sizeof (*ver_msg)) {
+		PRN("Expected %lu-byte version message; "
+		    "received %lu bytes", sizeof (*ver_msg), msglen);
+		return (EBADMSG);
+	}
+
+	if (ver_msg->dev_class != VDEV_DISK) {
+		PRN("Expected device class %u (disk); received %u",
+		    VDEV_DISK, ver_msg->dev_class);
+		return (EBADMSG);
+	}
+
+	if ((ver_msg->ver_major != VD_VER_MAJOR) ||
+	    (ver_msg->ver_minor != VD_VER_MINOR)) {
+		/* Unsupported version; send back supported version */
+		ver_msg->ver_major = VD_VER_MAJOR;
+		ver_msg->ver_minor = VD_VER_MINOR;
+		return (EBADMSG);
+	}
+
+	/* Valid message, version accepted */
+	ver_msg->dev_class = VDEV_DISK_SERVER;
+	return (0);
+}
+
+static int
+vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
+
+
+	PR0("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
+		VIO_ATTR_INFO)) {
+		return (ENOMSG);	/* not an attribute message */
+	}
+
+	if (msglen != sizeof (*attr_msg)) {
+		PRN("Expected %lu-byte attribute message; "
+		    "received %lu bytes", sizeof (*attr_msg), msglen);
+		return (EBADMSG);
+	}
+
+	if (attr_msg->max_xfer_sz == 0) {
+		PRN("Received maximum transfer size of 0 from client");
+		return (EBADMSG);
+	}
+
+	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
+	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
+		PRN("Client requested unsupported transfer mode");
+		return (EBADMSG);
+	}
+
+
+	/* Success:  valid message and transfer mode */
+	vd->xfer_mode = attr_msg->xfer_mode;
+	if (vd->xfer_mode == VIO_DESC_MODE) {
+		/*
+		 * The vd_dring_inband_msg_t contains one cookie; need room
+		 * for up to n-1 more cookies, where "n" is the number of full
+		 * pages plus possibly one partial page required to cover
+		 * "max_xfer_sz".  Add room for one more cookie if
+		 * "max_xfer_sz" isn't an integral multiple of the page size.
+		 * Must first get the maximum transfer size in bytes.
+		 */
+#if 1	/* NEWOBP */
+		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
+		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
+		    attr_msg->max_xfer_sz;
+		size_t	max_inband_msglen =
+		    sizeof (vd_dring_inband_msg_t) +
+		    ((max_xfer_bytes/PAGESIZE +
+			((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
+			(sizeof (ldc_mem_cookie_t)));
+#else	/* NEWOBP */
+		size_t	max_inband_msglen =
+		    sizeof (vd_dring_inband_msg_t) +
+		    ((attr_msg->max_xfer_sz/PAGESIZE
+			+ (attr_msg->max_xfer_sz % PAGESIZE ? 1 : 0))*
+			(sizeof (ldc_mem_cookie_t)));
+#endif	/* NEWOBP */
+
+		/*
+		 * Set the maximum expected message length to
+		 * accommodate in-band-descriptor messages with all
+		 * their cookies
+		 */
+		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
+	}
+
+	attr_msg->vdisk_size = vd->vdisk_size;
+	attr_msg->vdisk_type = vd->vdisk_type;
+	attr_msg->operations = vds_operations;
+	PR0("%s", VD_CLIENT(vd));
+	return (0);
+}
+
+static int
+vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	int			status;
+	size_t			expected;
+	ldc_mem_info_t		dring_minfo;
+	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
+
+
+	PR0("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
+		VIO_DRING_REG)) {
+		return (ENOMSG);	/* not a register-dring message */
+	}
+
+	if (msglen < sizeof (*reg_msg)) {
+		PRN("Expected at least %lu-byte register-dring message; "
+		    "received %lu bytes", sizeof (*reg_msg), msglen);
+		return (EBADMSG);
+	}
+
+	expected = sizeof (*reg_msg) +
+	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
+	if (msglen != expected) {
+		PRN("Expected %lu-byte register-dring message; "
+		    "received %lu bytes", expected, msglen);
+		return (EBADMSG);
+	}
+
+	if (vd->initialized & VD_DRING) {
+		PRN("A dring was previously registered; only support one");
+		return (EBADMSG);
+	}
+
+	if (reg_msg->ncookies != 1) {
+		/*
+		 * In addition to fixing the assertion in the success case
+		 * below, supporting drings which require more than one
+		 * "cookie" requires increasing the value of vd->max_msglen
+		 * somewhere in the code path prior to receiving the message
+		 * which results in calling this function.  Note that without
+		 * making this change, the larger message size required to
+		 * accommodate multiple cookies cannot be successfully
+		 * received, so this function will not even get called.
+		 * Gracefully accommodating more dring cookies might
+		 * reasonably demand exchanging an additional attribute or
+		 * making a minor protocol adjustment
+		 */
+		PRN("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
+		return (EBADMSG);
+	}
+
+	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
+	    reg_msg->ncookies, reg_msg->num_descriptors,
+	    reg_msg->descriptor_size, LDC_SHADOW_MAP, &vd->dring_handle);
+	if (status != 0) {
+		PRN("ldc_mem_dring_map() returned errno %d", status);
+		return (status);
+	}
+
+	/*
+	 * To remove the need for this assertion, must call
+	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
+	 * successful call to ldc_mem_dring_map()
+	 */
+	ASSERT(reg_msg->ncookies == 1);
+
+	if ((status =
+		ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
+		PRN("ldc_mem_dring_info() returned errno %d", status);
+		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
+			PRN("ldc_mem_dring_unmap() returned errno %d", status);
+		return (status);
+	}
+
+	if (dring_minfo.vaddr == NULL) {
+		PRN("Descriptor ring virtual address is NULL");
+		return (EBADMSG);	/* FIXME appropriate status? */
+	}
+
+
+	/* Valid message and dring mapped */
+	PR1("descriptor size = %u, dring length = %u",
+	    vd->descriptor_size, vd->dring_len);
+	vd->initialized |= VD_DRING;
+	vd->dring_ident = 1;	/* "There Can Be Only One" */
+	vd->dring = dring_minfo.vaddr;
+	vd->descriptor_size = reg_msg->descriptor_size;
+	vd->dring_len = reg_msg->num_descriptors;
+	reg_msg->dring_ident = vd->dring_ident;
+	return (0);
+}
+
+static int
+vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
+
+
+	PR0("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
+		VIO_DRING_UNREG)) {
+		return (ENOMSG);	/* not an unregister-dring message */
+	}
+
+	if (msglen != sizeof (*unreg_msg)) {
+		PRN("Expected %lu-byte unregister-dring message; "
+		    "received %lu bytes", sizeof (*unreg_msg), msglen);
+		return (EBADMSG);
+	}
+
+	if (unreg_msg->dring_ident != vd->dring_ident) {
+		PRN("Expected dring ident %lu; received %lu",
+		    vd->dring_ident, unreg_msg->dring_ident);
+		return (EBADMSG);
+	}
+
+	/* FIXME set ack in unreg_msg? */
+	return (0);
+}
+
+static int
+process_rdx_msg(vio_msg_t *msg, size_t msglen)
+{
+	PR0("Entered");
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX))
+		return (ENOMSG);	/* not an RDX message */
+
+	if (msglen != sizeof (vio_rdx_msg_t)) {
+		PRN("Expected %lu-byte RDX message; received %lu bytes",
+		    sizeof (vio_rdx_msg_t), msglen);
+		return (EBADMSG);
+	}
+
+	return (0);
+}
+
+/*
+ * Tear down the per-connection handshake state:  unmap the dring if one
+ * was mapped, optionally reset the underlying LDC channel, clear the
+ * SID/sequence-number/dring initialization flags, and return the protocol
+ * state machine to VD_STATE_INIT with the baseline message size.
+ * Caller must hold vd->lock.
+ */
+static void
+vd_reset_connection(vd_t *vd, boolean_t reset_ldc)
+{
+	int	status = 0;
+
+
+	ASSERT(mutex_owned(&vd->lock));
+	PR0("Resetting connection with %s", VD_CLIENT(vd));
+	if ((vd->initialized & VD_DRING) &&
+	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
+		PRN("ldc_mem_dring_unmap() returned errno %d", status);
+	if ((reset_ldc == B_TRUE) &&
+	    ((status = ldc_reset(vd->ldc_handle)) != 0))
+		PRN("ldc_reset() returned errno %d", status);
+	vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
+	vd->state = VD_STATE_INIT;
+	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
+}
+
+/*
+ * Check that a data message's sequence number is exactly one greater than
+ * the last one received (once sequencing has started).  On a mismatch the
+ * connection is reset (without resetting LDC) and nonzero is returned;
+ * otherwise the new sequence number is recorded and 0 is returned.
+ * Caller must hold vd->lock.
+ */
+static int
+vd_check_seq_num(vd_t *vd, uint64_t seq_num)
+{
+	ASSERT(mutex_owned(&vd->lock));
+	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
+		PRN("Received seq_num %lu; expected %lu",
+		    seq_num, (vd->seq_num + 1));
+		vd_reset_connection(vd, B_FALSE);
+		return (1);
+	}
+
+	vd->seq_num = seq_num;
+	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
+	return (0);
+}
+
+/*
+ * Return the expected size of an inband-descriptor message with all the
+ * cookies it claims to include
+ */
+static size_t
+expected_inband_size(vd_dring_inband_msg_t *msg)
+{
+	/* the message structure itself already includes room for one cookie */
+	return ((sizeof (*msg)) +
+	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
+}
+
+/*
+ * Process an in-band descriptor message:  used with clients like OBP, with
+ * which vds exchanges descriptors within VIO message payloads, rather than
+ * operating on them within a descriptor ring
+ */
+/*
+ * Validate and process one in-band descriptor message.  Caller must hold
+ * vd->lock.  Returns 0 on success (the I/O status is placed in the
+ * descriptor payload, not the return value), ENOMSG if the message is not
+ * an in-band descriptor, or EBADMSG if its size or sequence number is bad.
+ */
+static int
+vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	size_t			expected;
+	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
+
+
+	PR1("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
+		VIO_DESC_DATA))
+		return (ENOMSG);	/* not an in-band-descriptor message */
+
+	/* First check the fixed part, so the cookie count can be read... */
+	if (msglen < sizeof (*desc_msg)) {
+		PRN("Expected at least %lu-byte descriptor message; "
+		    "received %lu bytes", sizeof (*desc_msg), msglen);
+		return (EBADMSG);
+	}
+
+	/* ...then the exact size implied by that cookie count */
+	if (msglen != (expected = expected_inband_size(desc_msg))) {
+		PRN("Expected %lu-byte descriptor message; "
+		    "received %lu bytes", expected, msglen);
+		return (EBADMSG);
+	}
+
+	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) {
+		return (EBADMSG);
+	}
+
+	/* Valid message; process the request */
+	desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload);
+	return (0);
+}
+
+/*
+ * Verify that the "ndesc" descriptors starting at index "start" (wrapping
+ * modulo the dring length) are all in the READY state, then mark them all
+ * ACCEPTED.  Returns B_FALSE without accepting any descriptor if one of
+ * them is not READY.
+ */
+static boolean_t
+vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc)
+{
+	uint32_t	i, n;
+
+
+	/* Check descriptor states */
+	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
+		if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) {
+			PRN("descriptor %u not ready", i);
+			VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i));
+			return (B_FALSE);
+		}
+	}
+
+	/* Descriptors are valid; accept them */
+	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len)
+		VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED;
+
+	return (B_TRUE);
+}
+
+/*
+ * Process the inclusive range of dring descriptors [start, end] (the range
+ * may wrap around the end of the ring).  The range is first acquired and
+ * every descriptor marked ACCEPTED; each descriptor's request is then
+ * processed outside the acquire/release bracket, and the element is
+ * re-acquired to write back its I/O status and DONE state.  Caller must
+ * hold vd->lock.  Returns 0 on success, EINVAL for an invalid range or a
+ * descriptor in the wrong state, or the errno from an LDC operation.
+ */
+static int
+vd_process_dring(vd_t *vd, uint32_t start, uint32_t end)
+{
+	int		status;
+	boolean_t	accepted;
+	uint32_t	i, io_status, n, ndesc;
+
+
+	ASSERT(mutex_owned(&vd->lock));
+	PR1("start = %u, end = %u", start, end);
+
+	/* Validate descriptor range */
+	if ((start >= vd->dring_len) || (end >= vd->dring_len)) {
+		PRN("\"start\" = %u, \"end\" = %u; both must be less than %u",
+		    start, end, vd->dring_len);
+		return (EINVAL);
+	}
+
+	/* Acquire updated dring elements */
+	if ((status = ldc_mem_dring_acquire(vd->dring_handle,
+		    start, end)) != 0) {
+		PRN("ldc_mem_dring_acquire() returned errno %d", status);
+		return (status);
+	}
+	/* Accept updated dring elements (count handles wrap-around) */
+	ndesc = ((end < start) ? end + vd->dring_len : end) - start + 1;
+	PR1("ndesc = %u", ndesc);
+	accepted = vd_accept_dring_elems(vd, start, ndesc);
+	/* Release dring elements */
+	if ((status = ldc_mem_dring_release(vd->dring_handle,
+		    start, end)) != 0) {
+		PRN("ldc_mem_dring_release() returned errno %d", status);
+		return (status);
+	}
+	/* If a descriptor was in the wrong state, return an error */
+	if (!accepted)
+		return (EINVAL);
+
+
+	/* Process accepted dring elements */
+	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
+		vd_dring_entry_t	*elem = VD_DRING_ELEM(i);
+
+		/* Process descriptor outside acquire/release bracket */
+		PR1("Processing dring element %u", i);
+		io_status = vd_process_request(vd, &elem->payload);
+
+		/* Re-acquire client's dring element */
+		if ((status = ldc_mem_dring_acquire(vd->dring_handle,
+			    i, i)) != 0) {
+			PRN("ldc_mem_dring_acquire() returned errno %d",
+			    status);
+			return (status);
+		}
+		/* Update processed element */
+		if (elem->hdr.dstate == VIO_DESC_ACCEPTED) {
+			elem->payload.status	= io_status;
+			elem->hdr.dstate	= VIO_DESC_DONE;
+		} else {
+			/* Perhaps client timed out waiting for I/O... */
+			accepted = B_FALSE;
+			PRN("element %u no longer \"accepted\"", i);
+			VD_DUMP_DRING_ELEM(elem);
+		}
+		/* Release updated processed element */
+		if ((status = ldc_mem_dring_release(vd->dring_handle,
+			    i, i)) != 0) {
+			PRN("ldc_mem_dring_release() returned errno %d",
+			    status);
+			return (status);
+		}
+		/* If the descriptor was in the wrong state, return an error */
+		if (!accepted)
+			return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * Validate a dring-data message and process the descriptor range it names.
+ * Caller must hold vd->lock.  Returns ENOMSG if the message is not a
+ * dring-data message, EBADMSG for a bad size, sequence number, or dring
+ * ident, or the result of vd_process_dring() otherwise.  On success the
+ * message is pre-marked ACK for the caller to send back.
+ */
+static int
+vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
+
+
+	PR1("Entered");
+	ASSERT(mutex_owned(&vd->lock));
+	ASSERT(msglen >= sizeof (msg->tag));
+
+	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
+		VIO_DRING_DATA)) {
+		return (ENOMSG);	/* not a dring-data message */
+	}
+
+	if (msglen != sizeof (*dring_msg)) {
+		PRN("Expected %lu-byte dring message; received %lu bytes",
+		    sizeof (*dring_msg), msglen);
+		return (EBADMSG);
+	}
+
+	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) {
+		return (EBADMSG);
+	}
+
+	if (dring_msg->dring_ident != vd->dring_ident) {
+		PRN("Expected dring ident %lu; received ident %lu",
+		    vd->dring_ident, dring_msg->dring_ident);
+		return (EBADMSG);
+	}
+
+
+	/* Valid message; process dring */
+	dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
+	return (vd_process_dring(vd, dring_msg->start_idx, dring_msg->end_idx));
+}
+
+/*
+ * Read one message from the LDC channel.  On entry *nbytes is the buffer
+ * size; on successful return it is the length of the message read.
+ * ldc_read() is retried up to vds_ldc_retries times while it returns
+ * ETIMEDOUT.  Returns 0 on success, ENOMSG if the queue is empty or
+ * nothing was read, or the errno from ldc_read().
+ */
+static int
+recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
+{
+	int	retry, status;
+	size_t	size = *nbytes;
+	boolean_t	isempty = B_FALSE;
+
+
+	/* FIXME work around interrupt problem */
+	if ((ldc_chkq(ldc_handle, &isempty) != 0) || isempty)
+		return (ENOMSG);
+
+	for (retry = 0, status = ETIMEDOUT;
+	    retry < vds_ldc_retries && status == ETIMEDOUT;
+	    retry++) {
+		PR1("ldc_read() attempt %d", (retry + 1));
+		*nbytes = size;	/* reset to full buffer size each attempt */
+		status = ldc_read(ldc_handle, msg, nbytes);
+	}
+
+	if (status != 0) {
+		PRN("ldc_read() returned errno %d", status);
+		return (status);
+	} else if (*nbytes == 0) {
+		PR1("ldc_read() returned 0 and no message read");
+		return (ENOMSG);
+	}
+
+	PR1("RCVD %lu-byte message", *nbytes);
+	return (0);
+}
+
+/*
+ * Dispatch one received VIO message according to the connection's
+ * handshake state machine:  INIT expects a version message, VER an
+ * attribute message, ATTR an RDX or register-dring message (depending on
+ * the negotiated transfer mode), DRING an RDX/register-dring/
+ * unregister-dring message, and DATA the actual data messages.  Caller
+ * must hold vd->lock.  Returns 0 on success, ENOMSG for an unexpected
+ * message type, or another errno describing the failure.
+ */
+static int
+vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	int		status;
+
+
+	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
+	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
+	ASSERT(mutex_owned(&vd->lock));
+
+	/*
+	 * Validate session ID up front, since it applies to all messages
+	 * once set
+	 */
+	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
+		PRN("Expected SID %u, received %u", vd->sid,
+		    msg->tag.vio_sid);
+		return (EBADMSG);
+	}
+
+
+	/*
+	 * Process the received message based on connection state
+	 */
+	switch (vd->state) {
+	case VD_STATE_INIT:	/* expect version message */
+		if ((status = process_ver_msg(msg, msglen)) != 0)
+			return (status);
+
+		/* The first version message sets the SID */
+		ASSERT(!(vd->initialized & VD_SID));
+		vd->sid = msg->tag.vio_sid;
+		vd->initialized |= VD_SID;
+
+		/* Version negotiated, move to that state */
+		vd->state = VD_STATE_VER;
+		return (0);
+
+	case VD_STATE_VER:	/* expect attribute message */
+		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
+			return (status);
+
+		/* Attributes exchanged, move to that state */
+		vd->state = VD_STATE_ATTR;
+		return (0);
+
+	case VD_STATE_ATTR:
+		switch (vd->xfer_mode) {
+		case VIO_DESC_MODE:	/* expect RDX message */
+			if ((status = process_rdx_msg(msg, msglen)) != 0)
+				return (status);
+
+			/* Ready to receive in-band descriptors */
+			vd->state = VD_STATE_DATA;
+			return (0);
+
+		case VIO_DRING_MODE:	/* expect register-dring message */
+			if ((status =
+				vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
+				return (status);
+
+			/* One dring negotiated, move to that state */
+			vd->state = VD_STATE_DRING;
+			return (0);
+
+		default:
+			/*
+			 * NOTE(review):  ASSERT of a non-NULL string literal
+			 * is always true, so this assertion never fires
+			 */
+			ASSERT("Unsupported transfer mode");
+			PRN("Unsupported transfer mode");
+			return (ENOTSUP);
+		}
+
+	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
+		if ((status = process_rdx_msg(msg, msglen)) == 0) {
+			/* Ready to receive data */
+			vd->state = VD_STATE_DATA;
+			return (0);
+		} else if (status != ENOMSG) {
+			return (status);
+		}
+
+
+		/*
+		 * If another register-dring message is received, stay in
+		 * dring state in case the client sends RDX; although the
+		 * protocol allows multiple drings, this server does not
+		 * support using more than one
+		 */
+		if ((status =
+			vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
+			return (status);
+
+		/*
+		 * Acknowledge an unregister-dring message, but reset the
+		 * connection anyway:  Although the protocol allows
+		 * unregistering drings, this server cannot serve a vdisk
+		 * without its only dring
+		 */
+		status = vd_process_dring_unreg_msg(vd, msg, msglen);
+		return ((status == 0) ? ENOTSUP : status);
+
+	case VD_STATE_DATA:
+		switch (vd->xfer_mode) {
+		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
+			return (vd_process_desc_msg(vd, msg, msglen));
+
+		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
+			/*
+			 * Typically expect dring-data messages, so handle
+			 * them first
+			 */
+			if ((status = vd_process_dring_msg(vd, msg,
+				    msglen)) != ENOMSG)
+				return (status);
+
+			/*
+			 * Acknowledge an unregister-dring message, but reset
+			 * the connection anyway:  Although the protocol
+			 * allows unregistering drings, this server cannot
+			 * serve a vdisk without its only dring
+			 */
+			status = vd_process_dring_unreg_msg(vd, msg, msglen);
+			return ((status == 0) ? ENOTSUP : status);
+
+		default:
+			/* NOTE(review):  string ASSERT never fires (above) */
+			ASSERT("Unsupported transfer mode");
+			PRN("Unsupported transfer mode");
+			return (ENOTSUP);
+		}
+
+	default:
+		/* NOTE(review):  string ASSERT never fires (above) */
+		ASSERT("Invalid client connection state");
+		PRN("Invalid client connection state");
+		return (ENOTSUP);
+	}
+}
+
+/*
+ * Process one received message end-to-end:  validate minimum size,
+ * dispatch via vd_do_process_msg(), mark the message ACK or NACK based on
+ * the result, send the reply, and reset the connection (optionally
+ * resetting LDC) for nack'ed or failed messages.  Caller must hold
+ * vd->lock.
+ */
+static void
+vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
+{
+	int		status;
+	boolean_t	reset_ldc = B_FALSE;
+
+
+	ASSERT(mutex_owned(&vd->lock));
+
+	/*
+	 * Check that the message is at least big enough for a "tag", so that
+	 * message processing can proceed based on tag-specified message type
+	 */
+	if (msglen < sizeof (vio_msg_tag_t)) {
+		PRN("Received short (%lu-byte) message", msglen);
+		/* Can't "nack" short message, so drop the big hammer */
+		vd_reset_connection(vd, B_TRUE);
+		return;
+	}
+
+	/*
+	 * Process the message
+	 */
+	switch (status = vd_do_process_msg(vd, msg, msglen)) {
+	case 0:
+		/* "ack" valid, successfully-processed messages */
+		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
+		break;
+
+	case ENOMSG:
+		PRN("Received unexpected message");
+		_NOTE(FALLTHROUGH);
+	case EBADMSG:
+	case ENOTSUP:
+		/* "nack" invalid messages */
+		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
+		break;
+
+	default:
+		/* "nack" failed messages */
+		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
+		/* An LDC error probably occurred, so try resetting it */
+		reset_ldc = B_TRUE;
+		break;
+	}
+
+	/* "ack" or "nack" the message */
+	PR1("Sending %s",
+	    (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
+	if (send_msg(vd->ldc_handle, msg, msglen) != 0)
+		reset_ldc = B_TRUE;
+
+	/* Reset the connection for nack'ed or failed messages */
+	if ((status != 0) || reset_ldc)
+		vd_reset_connection(vd, reset_ldc);
+}
+
+/*
+ * Taskq callback (dispatched from the LDC event handler):  drain and
+ * process messages from the channel while the vdisk is enabled and
+ * messages remain.  The receive buffer is sized from max_msglen, which is
+ * snapshotted because message processing can change it.
+ */
+static void
+vd_process_queue(void *arg)
+{
+	vd_t		*vd = (vd_t *)arg;
+	size_t		max_msglen, nbytes;
+	vio_msg_t	*vio_msg;
+
+
+	PR2("Entered");
+	ASSERT(vd != NULL);
+	mutex_enter(&vd->lock);
+	max_msglen = vd->max_msglen;	/* vd->maxmsglen can change */
+	vio_msg = kmem_alloc(max_msglen, KM_SLEEP);
+	for (nbytes = vd->max_msglen;
+		vd->enabled && recv_msg(vd->ldc_handle, vio_msg, &nbytes) == 0;
+		nbytes = vd->max_msglen)
+		vd_process_msg(vd, vio_msg, nbytes);
+	kmem_free(vio_msg, max_msglen);
+	mutex_exit(&vd->lock);
+	PR2("Returning");
+}
+
+/*
+ * LDC event callback for a vdisk's channel:  on READ events, dispatch
+ * vd_process_queue() to the vdisk's taskq; on RESET, attempt to bring the
+ * channel back up; on UP, reset the per-connection handshake state.
+ * Always returns LDC_SUCCESS.
+ */
+static uint_t
+vd_handle_ldc_events(uint64_t event, caddr_t arg)
+{
+	uint_t	status;
+	vd_t	*vd = (vd_t *)(void *)arg;
+
+
+	ASSERT(vd != NULL);
+	mutex_enter(&vd->lock);
+	if (event & LDC_EVT_READ) {
+		PR1("New packet(s) available");
+		/* Queue a task to process the new data */
+		if (ddi_taskq_dispatch(vd->taskq, vd_process_queue, vd, 0) !=
+		    DDI_SUCCESS)
+			PRN("Unable to dispatch vd_process_queue()");
+	} else if (event & LDC_EVT_RESET) {
+		PR0("Attempting to bring up reset channel");
+		/* ECONNREFUSED means the peer is not ready yet; not fatal */
+		if (((status = ldc_up(vd->ldc_handle)) != 0) &&
+		    (status != ECONNREFUSED)) {
+			PRN("ldc_up() returned errno %d", status);
+		}
+	} else if (event & LDC_EVT_UP) {
+		/* Reset the connection state when channel comes (back) up */
+		vd_reset_connection(vd, B_FALSE);
+	}
+	mutex_exit(&vd->lock);
+	return (LDC_SUCCESS);
+}
+
+/*
+ * mod_hash walker callback:  bump the caller's counter and terminate on
+ * the first entry — used only to determine whether the vdisk table is
+ * non-empty.
+ */
+static uint_t
+vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+{
+	_NOTE(ARGUNUSED(key, val))
+	(*((uint_t *)arg))++;
+	return (MH_WALK_TERMINATE);
+}
+
+
+/*
+ * DDI detach entry point.  DDI_SUSPEND is a no-op for this non-device;
+ * DDI_DETACH refuses to detach while any vdisk is being served, and
+ * otherwise unwinds the resources vds_do_attach() set up (MDEG
+ * registration, LDI ident, vdisk table, lock, soft state).
+ */
+static int
+vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	uint_t	vd_present = 0;
+	minor_t	instance;
+	vds_t	*vds;
+
+
+	PR0("Entered");
+	switch (cmd) {
+	case DDI_DETACH:
+		/* the real work happens below */
+		break;
+	case DDI_SUSPEND:
+		/* nothing to do for this non-device */
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+
+	ASSERT(cmd == DDI_DETACH);
+	instance = ddi_get_instance(dip);
+	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
+		PRN("Could not get state for instance %u", instance);
+		/* NOTE(review): freeing state that lookup says is absent —
+		 * confirm this is intentional */
+		ddi_soft_state_free(vds_state, instance);
+		return (DDI_FAILURE);
+	}
+
+	/* Do no detach when serving any vdisks */
+	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
+	if (vd_present) {
+		PR0("Not detaching because serving vdisks");
+		return (DDI_FAILURE);
+	}
+
+	PR0("Detaching");
+	if (vds->initialized & VDS_MDEG)
+		(void) mdeg_unregister(vds->mdeg);
+	if (vds->initialized & VDS_LDI)
+		(void) ldi_ident_release(vds->ldi_ident);
+	mod_hash_destroy_hash(vds->vd_table);
+	if (vds->initialized & VDS_LOCKING)
+		mutex_destroy(&vds->lock);
+	ddi_soft_state_free(vds_state, instance);
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Walk up the devinfo tree from "dip" toward the root; return B_TRUE if
+ * any ancestor is the pseudo nexus (DEVI_PSEUDO_NEXNAME), B_FALSE
+ * otherwise.
+ */
+static boolean_t
+is_pseudo_device(dev_info_t *dip)
+{
+	dev_info_t	*parent, *root = ddi_root_node();
+
+
+	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
+	    parent = ddi_get_parent(parent)) {
+		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Interrogate the backing block device (already opened as "lh") and fill
+ * in the vdisk's parameters:  device number(s), size, disk-vs-slice type,
+ * slice count, and — for a single-slice non-pseudo device — synthesized
+ * dk_geom and vtoc structures.  Returns 0 on success or an errno.
+ */
+static int
+vd_get_params(ldi_handle_t lh, char *block_device, vd_t *vd)
+{
+	int		otyp, rval, status;
+	dev_info_t	*dip;
+	struct dk_cinfo	dk_cinfo;
+
+
+	/* Get block device's device number, otyp, and size */
+	if ((status = ldi_get_dev(lh, &vd->dev[0])) != 0) {
+		PRN("ldi_get_dev() returned errno %d for %s",
+		    status, block_device);
+		return (status);
+	}
+	if ((status = ldi_get_otyp(lh, &otyp)) != 0) {
+		PRN("ldi_get_otyp() returned errno %d for %s",
+		    status, block_device);
+		return (status);
+	}
+	if (otyp != OTYP_BLK) {
+		PRN("Cannot serve non-block device %s", block_device);
+		return (ENOTBLK);
+	}
+	if (ldi_get_size(lh, &vd->vdisk_size) != DDI_SUCCESS) {
+		PRN("ldi_get_size() failed for %s", block_device);
+		return (EIO);
+	}
+
+	/* Determine if backing block device is a pseudo device */
+	if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]),
+		    dev_to_instance(vd->dev[0]), 0))  == NULL) {
+		PRN("%s is no longer accessible", block_device);
+		return (EIO);
+	}
+	vd->pseudo = is_pseudo_device(dip);
+	ddi_release_devi(dip);
+	/* Pseudo devices are always exported as a single slice */
+	if (vd->pseudo) {
+		vd->vdisk_type	= VD_DISK_TYPE_SLICE;
+		vd->nslices	= 1;
+		return (0);	/* ...and we're done */
+	}
+
+	/* Get dk_cinfo to determine slice of backing block device */
+	if ((status = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&dk_cinfo,
+		    FKIOCTL, kcred, &rval)) != 0) {
+		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
+		    status, block_device);
+		return (status);
+	}
+
+	if (dk_cinfo.dki_partition >= V_NUMPAR) {
+		PRN("slice %u >= maximum slice %u for %s",
+		    dk_cinfo.dki_partition, V_NUMPAR, block_device);
+		return (EIO);
+	}
+
+	/* If block device slice is entire disk, fill in all slice devices */
+	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) {
+		uint_t	slice;
+		major_t	major = getmajor(vd->dev[0]);
+		/* compute the minor of slice 0 from the entire-disk slice */
+		minor_t	minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
+
+		vd->vdisk_type	= VD_DISK_TYPE_DISK;
+		vd->nslices	= V_NUMPAR;
+		for (slice = 0; slice < vd->nslices; slice++)
+			vd->dev[slice] = makedevice(major, (minor + slice));
+		return (0);	/* ...and we're done */
+	}
+
+	/* Otherwise, we have a (partial) slice of a block device */
+	vd->vdisk_type	= VD_DISK_TYPE_SLICE;
+	vd->nslices	= 1;
+
+
+	/* Initialize dk_geom structure for single-slice block device */
+	if ((status = ldi_ioctl(lh, DKIOCGGEOM, (intptr_t)&vd->dk_geom,
+		    FKIOCTL, kcred, &rval)) != 0) {
+		PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
+		    status, block_device);
+		return (status);
+	}
+	/* Guard the divisions below against zero sectors/heads */
+	if (vd->dk_geom.dkg_nsect == 0) {
+		PRN("%s geometry claims 0 sectors per track", block_device);
+		return (EIO);
+	}
+	if (vd->dk_geom.dkg_nhead == 0) {
+		PRN("%s geometry claims 0 heads", block_device);
+		return (EIO);
+	}
+	/* Derive cylinder counts from the slice size and disk geometry */
+	vd->dk_geom.dkg_ncyl =
+	    lbtodb(vd->vdisk_size)/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
+	vd->dk_geom.dkg_acyl = 0;
+	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
+
+
+	/* Initialize vtoc structure for single-slice block device */
+	if ((status = ldi_ioctl(lh, DKIOCGVTOC, (intptr_t)&vd->vtoc,
+		    FKIOCTL, kcred, &rval)) != 0) {
+		PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d for %s",
+		    status, block_device);
+		return (status);
+	}
+	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
+	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
+	/* Present the single slice as partition 0 spanning the whole slice */
+	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
+	vd->vtoc.v_nparts = 1;
+	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
+	vd->vtoc.v_part[0].p_flag = 0;
+	vd->vtoc.v_part[0].p_start = 0;
+	vd->vtoc.v_part[0].p_size = lbtodb(vd->vdisk_size);
+	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
+	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
+
+
+	return (0);
+}
+
+/*
+ * Construct the state for one vdisk:  allocate the vd_t (assigned to *vdp
+ * immediately so the caller can clean up on any failure), determine the
+ * backing device's parameters, initialize the lock, open the backing
+ * slices, create the per-vdisk taskq, bring up the LDC channel, and
+ * insert the vdisk into the server's table.  Returns 0 or an errno; on
+ * failure the partially-initialized vd_t is left for vds_destroy_vd().
+ */
+static int
+vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id,
+    vd_t **vdp)
+{
+	char			tq_name[TASKQ_NAMELEN];
+	int			param_status, status;
+	uint_t			slice;
+	ddi_iblock_cookie_t	iblock = NULL;
+	ldc_attr_t		ldc_attr;
+	ldi_handle_t		lh = NULL;
+	vd_t			*vd;
+
+
+	ASSERT(vds != NULL);
+	ASSERT(block_device != NULL);
+	ASSERT(vdp != NULL);
+	PR0("Adding vdisk for %s", block_device);
+
+	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
+		PRN("No memory for virtual disk");
+		return (EAGAIN);
+	}
+	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
+	vd->vds = vds;
+
+
+	/* Get device parameters (open read-only just to interrogate) */
+	if ((status = ldi_open_by_name(block_device, FREAD, kcred, &lh,
+		    vds->ldi_ident)) != 0) {
+		PRN("ldi_open_by_name(%s) = errno %d", block_device, status);
+		return (status);
+	}
+	param_status = vd_get_params(lh, block_device, vd);
+	if ((status = ldi_close(lh, FREAD, kcred)) != 0) {
+		PRN("ldi_close(%s) = errno %d", block_device, status);
+		return (status);
+	}
+	if (param_status != 0)
+		return (param_status);
+	ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
+	PR0("vdisk_type = %s, pseudo = %s, nslices = %u",
+	    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
+	    (vd->pseudo ? "yes" : "no"), vd->nslices);
+
+
+	/* Initialize locking (iblock cookie so the LDC callback can lock) */
+	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
+		&iblock) != DDI_SUCCESS) {
+		PRN("Could not get iblock cookie.");
+		return (EIO);
+	}
+
+	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
+	vd->initialized |= VD_LOCKING;
+
+
+	/* Open the backing-device slices */
+	for (slice = 0; slice < vd->nslices; slice++) {
+		ASSERT(vd->ldi_handle[slice] == NULL);
+		PR0("Opening device %u, minor %u = slice %u",
+		    getmajor(vd->dev[slice]), getminor(vd->dev[slice]), slice);
+		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
+			    vd_open_flags, kcred, &vd->ldi_handle[slice],
+			    vds->ldi_ident)) != 0) {
+			PRN("ldi_open_by_dev() returned errno %d for slice %u",
+			    status, slice);
+			/* vds_destroy_vd() will close any open slices */
+#if 0	/* FIXME */
+			return (status);
+#endif
+		}
+	}
+
+
+	/* Create the task queue for the vdisk */
+	(void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id);
+	PR1("tq_name = %s", tq_name);
+	if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1,
+		    TASKQ_DEFAULTPRI, 0)) == NULL) {
+		PRN("Could not create task queue");
+		return (EIO);
+	}
+	vd->initialized |= VD_TASKQ;
+	vd->enabled = 1;	/* before callback can dispatch to taskq */
+
+
+	/* Bring up LDC */
+	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
+	ldc_attr.instance	= ddi_get_instance(vds->dip);
+	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
+	ldc_attr.qlen		= VD_LDC_QLEN;
+	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
+		PRN("ldc_init(%lu) = errno %d", ldc_id, status);
+		return (status);
+	}
+	vd->initialized |= VD_LDC;
+
+	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
+		(caddr_t)vd)) != 0) {
+		PRN("ldc_reg_callback() returned errno %d", status);
+		return (status);
+	}
+
+	if ((status = ldc_open(vd->ldc_handle)) != 0) {
+		PRN("ldc_open() returned errno %d", status);
+		return (status);
+	}
+
+	/* ECONNREFUSED means the peer is not ready yet; not fatal */
+	if (((status = ldc_up(vd->ldc_handle)) != 0) &&
+	    (status != ECONNREFUSED)) {
+		PRN("ldc_up() returned errno %d", status);
+		return (status);
+	}
+
+
+	/* Add the successfully-initialized vdisk to the server's table */
+	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
+		PRN("Error adding vdisk ID %lu to table", id);
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Destroy the state associated with a virtual disk.  Used both directly
+ * (when vds_do_init_vd() fails partway) and as the value destructor for
+ * the vds vd_table hash, so it must tolerate any partially-initialized
+ * vd_t, including NULL.
+ */
+static void
+vds_destroy_vd(void *arg)
+{
+	vd_t	*vd = (vd_t *)arg;
+
+
+	PR0("Entered");
+	if (vd == NULL)
+		return;
+
+	/* Disable queuing requests for the vdisk */
+	if (vd->initialized & VD_LOCKING) {
+		mutex_enter(&vd->lock);
+		vd->enabled = 0;
+		mutex_exit(&vd->lock);
+	}
+
+	/* Drain and destroy the task queue (*before* shutting down LDC) */
+	if (vd->initialized & VD_TASKQ)
+		ddi_taskq_destroy(vd->taskq);	/* waits for queued tasks */
+
+	/* Shut down LDC */
+	if (vd->initialized & VD_LDC) {
+		if (vd->initialized & VD_DRING)
+			(void) ldc_mem_dring_unmap(vd->dring_handle);
+		(void) ldc_unreg_callback(vd->ldc_handle);
+		(void) ldc_close(vd->ldc_handle);
+		(void) ldc_fini(vd->ldc_handle);
+	}
+
+	/* Close any open backing-device slices */
+	for (uint_t slice = 0; slice < vd->nslices; slice++) {
+		if (vd->ldi_handle[slice] != NULL) {
+			PR0("Closing slice %u", slice);
+			(void) ldi_close(vd->ldi_handle[slice],
+			    vd_open_flags, kcred);
+		}
+	}
+
+	/* Free lock */
+	if (vd->initialized & VD_LOCKING)
+		mutex_destroy(&vd->lock);
+
+	/* Finally, free the vdisk structure itself */
+	kmem_free(vd, sizeof (*vd));
+}
+
+/*
+ * Initialize one vdisk, destroying any partially-constructed state if
+ * vds_do_init_vd() fails.  Returns 0 or the errno from vds_do_init_vd().
+ */
+static int
+vds_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id)
+{
+	int	status;
+	vd_t	*vd = NULL;
+
+
+#ifdef lint
+	(void) vd;
+#endif	/* lint */
+
+	if ((status = vds_do_init_vd(vds, id, block_device, ldc_id, &vd)) != 0)
+		vds_destroy_vd(vd);
+
+	return (status);
+}
+
+/*
+ * Find the channel-endpoint child(ren) of the given vdisk MD node (the
+ * "channel" array is pre-allocated by the caller) and return the "id"
+ * property of the first endpoint in *ldc_id.  If multiple channels exist,
+ * only the first is used.  Returns 0 on success, -1 on failure.
+ */
+static int
+vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
+    uint64_t *ldc_id)
+{
+	int	num_channels;
+
+
+	/* Look for channel endpoint child(ren) of the vdisk MD node */
+	if ((num_channels = md_scan_dag(md, vd_node,
+		    md_find_name(md, VD_CHANNEL_ENDPOINT),
+		    md_find_name(md, "fwd"), channel)) <= 0) {
+		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
+		return (-1);
+	}
+
+	/* Get the "id" value for the first channel endpoint node */
+	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
+		PRN("No \"%s\" property found for \"%s\" of vdisk",
+		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
+		return (-1);
+	}
+
+	if (num_channels > 1) {
+		PRN("Using ID of first of multiple channels for this vdisk");
+	}
+
+	return (0);
+}
+
+/*
+ * Get the LDC channel ID for a vdisk MD node.  Allocates a scratch
+ * channel-cookie array sized to the MD node count (an upper bound on
+ * the number of endpoints md_scan_dag() can find) and delegates to
+ * vds_do_get_ldc_id().  Returns 0 on success, -1 on failure.
+ */
+static int
+vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
+{
+	int		num_nodes, status;
+	size_t		size;
+	mde_cookie_t	*channel;
+
+
+	if ((num_nodes = md_node_count(md)) <= 0) {
+		PRN("Invalid node count in Machine Description subtree");
+		return (-1);
+	}
+	size = num_nodes*(sizeof (*channel));
+	channel = kmem_zalloc(size, KM_SLEEP);
+	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
+	kmem_free(channel, size);
+
+	return (status);
+}
+
+/*
+ * Handle an "added" vdisk MD node:  extract its ID, backing block-device
+ * path, and LDC channel ID from the MD, then create the vdisk.  Errors
+ * are logged and the node is skipped.
+ */
+static void
+vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
+{
+	char		*block_device = NULL;
+	uint64_t	id = 0, ldc_id = 0;
+
+
+	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
+		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
+		return;
+	}
+	PR0("Adding vdisk ID %lu", id);
+	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
+		&block_device) != 0) {
+		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
+		return;
+	}
+
+	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
+		PRN("Error getting LDC ID for vdisk %lu", id);
+		return;
+	}
+
+	if (vds_init_vd(vds, id, block_device, ldc_id) != 0) {
+		PRN("Failed to add vdisk ID %lu", id);
+		return;
+	}
+}
+
+/*
+ * Handle a "removed" vdisk MD node:  look up its ID and remove the vdisk
+ * from the table.  mod_hash_destroy() invokes vds_destroy_vd() (the
+ * table's value destructor) to tear the vdisk down.
+ */
+static void
+vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
+{
+	uint64_t	id = 0;
+
+
+	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
+		PRN("Unable to get \"%s\" property from vdisk's MD node",
+		    VD_ID_PROP);
+		return;
+	}
+	PR0("Removing vdisk ID %lu", id);
+	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
+		PRN("No vdisk entry found for vdisk ID %lu", id);
+}
+
+/*
+ * Handle a "matched" (possibly changed) vdisk MD node:  verify that the
+ * vdisk ID and LDC ID are unchanged, then — if the backing device path
+ * changed — destroy the old vdisk state and re-initialize it with the new
+ * path.  Any other change is rejected with a warning.
+ */
+static void
+vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
+    md_t *curr_md, mde_cookie_t curr_vd_node)
+{
+	char		*curr_dev, *prev_dev;
+	uint64_t	curr_id = 0, curr_ldc_id = 0;
+	uint64_t	prev_id = 0, prev_ldc_id = 0;
+	size_t		len;
+
+
+	/* Validate that vdisk ID has not changed */
+	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
+		PRN("Error getting previous vdisk \"%s\" property",
+		    VD_ID_PROP);
+		return;
+	}
+	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
+		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
+		return;
+	}
+	if (curr_id != prev_id) {
+		PRN("Not changing vdisk:  ID changed from %lu to %lu",
+		    prev_id, curr_id);
+		return;
+	}
+
+	/* Validate that LDC ID has not changed */
+	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
+		PRN("Error getting LDC ID for vdisk %lu", prev_id);
+		return;
+	}
+
+	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
+		PRN("Error getting LDC ID for vdisk %lu", curr_id);
+		return;
+	}
+	if (curr_ldc_id != prev_ldc_id) {
+		_NOTE(NOTREACHED);	/* FIXME is there a better way? */
+		PRN("Not changing vdisk:  "
+		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
+		return;
+	}
+
+	/* Determine whether device path has changed */
+	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
+		&prev_dev) != 0) {
+		PRN("Error getting previous vdisk \"%s\"",
+		    VD_BLOCK_DEVICE_PROP);
+		return;
+	}
+	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
+		&curr_dev) != 0) {
+		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
+		return;
+	}
+	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
+	    (strncmp(curr_dev, prev_dev, len) == 0))
+		return;	/* no relevant (supported) change */
+
+	PR0("Changing vdisk ID %lu", prev_id);
+	/* Remove old state, which will close vdisk and reset */
+	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
+		PRN("No entry found for vdisk ID %lu", prev_id);
+	/* Re-initialize vdisk with new state */
+	if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
+		PRN("Failed to change vdisk ID %lu", curr_id);
+		return;
+	}
+}
+
+/*
+ * MDEG callback invoked when the Machine Description changes:  process
+ * removed vdisk nodes first, then changed (matched) nodes, then added
+ * nodes.  Returns MDEG_SUCCESS, or MDEG_FAILURE if no result was given.
+ */
+static int
+vds_process_md(void *arg, mdeg_result_t *md)
+{
+	int	i;
+	vds_t	*vds = arg;
+
+
+	if (md == NULL)
+		return (MDEG_FAILURE);
+	ASSERT(vds != NULL);
+
+	for (i = 0; i < md->removed.nelem; i++)
+		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
+	for (i = 0; i < md->match_curr.nelem; i++)
+		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
+		    md->match_curr.mdp, md->match_curr.mdep[i]);
+	for (i = 0; i < md->added.nelem; i++)
+		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Do the real work of attaching a vds instance:  allocate and populate
+ * soft state (vdisk hash table, lock, LDI ident) and register with the
+ * MDEG framework for vdisk MD updates, keyed by this instance's
+ * "cfg-handle"/"reg" value.  On failure the caller (vds_attach) invokes
+ * vds_detach() to unwind whatever was initialized.
+ */
+static int
+vds_do_attach(dev_info_t *dip)
+{
+	static char	reg_prop[] = "reg";	/* devinfo ID prop */
+
+	/* MDEG specification for a (particular) vds node */
+	static mdeg_prop_spec_t	vds_prop_spec[] = {
+		{MDET_PROP_STR, "name", {VDS_NAME}},
+		{MDET_PROP_VAL, "cfg-handle", {0}},
+		{MDET_LIST_END, NULL, {0}}};
+	static mdeg_node_spec_t	vds_spec = {"virtual-device", vds_prop_spec};
+
+	/* MDEG specification for matching a vd node */
+	static md_prop_match_t	vd_prop_spec[] = {
+		{MDET_PROP_VAL, VD_ID_PROP},
+		{MDET_LIST_END, NULL}};
+	static mdeg_node_match_t vd_spec = {"virtual-device-port",
+					    vd_prop_spec};
+
+	int			status;
+	uint64_t		cfg_handle;
+	minor_t			instance = ddi_get_instance(dip);
+	vds_t			*vds;
+
+
+	/*
+	 * The "cfg-handle" property of a vds node in an MD contains the MD's
+	 * notion of "instance", or unique identifier, for that node; OBP
+	 * stores the value of the "cfg-handle" MD property as the value of
+	 * the "reg" property on the node in the device tree it builds from
+	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
+	 * "reg" property value to uniquely identify this device instance when
+	 * registering with the MD event-generation framework.  If the "reg"
+	 * property cannot be found, the device tree state is presumably so
+	 * broken that there is no point in continuing.
+	 */
+	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) {
+		PRN("vds \"%s\" property does not exist", reg_prop);
+		return (DDI_FAILURE);
+	}
+
+	/* Get the MD instance for later MDEG registration */
+	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    reg_prop, -1);
+
+	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
+		PRN("Could not allocate state for instance %u", instance);
+		return (DDI_FAILURE);
+	}
+
+	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
+		PRN("Could not get state for instance %u", instance);
+		ddi_soft_state_free(vds_state, instance);
+		return (DDI_FAILURE);
+	}
+
+
+	/* Table entries are torn down by vds_destroy_vd() when removed */
+	vds->dip	= dip;
+	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
+							vds_destroy_vd,
+							sizeof (void *));
+	ASSERT(vds->vd_table != NULL);
+
+	mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL);
+	vds->initialized |= VDS_LOCKING;
+
+	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
+		PRN("ldi_ident_from_dip() returned errno %d", status);
+		return (DDI_FAILURE);
+	}
+	vds->initialized |= VDS_LDI;
+
+	/* Register for MD updates */
+	vds_prop_spec[1].ps_val = cfg_handle;
+	if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds,
+		&vds->mdeg) != MDEG_SUCCESS) {
+		PRN("Unable to register for MD updates");
+		return (DDI_FAILURE);
+	}
+	vds->initialized |= VDS_MDEG;
+
+	ddi_report_dev(dip);
+	return (DDI_SUCCESS);
+}
+
+/*
+ * DDI attach entry point.  DDI_RESUME is a no-op for this non-device;
+ * DDI_ATTACH delegates to vds_do_attach() and invokes vds_detach() to
+ * unwind partial initialization if it fails.
+ */
+static int
+vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int	status;
+
+	PR0("Entered");
+	switch (cmd) {
+	case DDI_ATTACH:
+		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
+			(void) vds_detach(dip, DDI_DETACH);
+		return (status);
+	case DDI_RESUME:
+		/* nothing to do for this non-device */
+		return (DDI_SUCCESS);
+	default:
+		return (DDI_FAILURE);
+	}
+}
+
+/* Device operations vector; vds has no cb_ops (it is not user-accessible) */
+static struct dev_ops vds_ops = {
+	DEVO_REV,	/* devo_rev */
+	0,		/* devo_refcnt */
+	ddi_no_info,	/* devo_getinfo */
+	nulldev,	/* devo_identify */
+	nulldev,	/* devo_probe */
+	vds_attach,	/* devo_attach */
+	vds_detach,	/* devo_detach */
+	nodev,		/* devo_reset */
+	NULL,		/* devo_cb_ops */
+	NULL,		/* devo_bus_ops */
+	nulldev		/* devo_power */
+};
+
+/* Loadable-module (driver) linkage */
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"virtual disk server v%I%",
+	&vds_ops,
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&modldrv,
+	NULL
+};
+
+
+/*
+ * Loadable-module entry point:  initialize the soft-state framework,
+ * install the module, and build the bit-mask of server-supported
+ * operations from the vds_operation table.
+ */
+int
+_init(void)
+{
+	int		i, status;
+
+
+	PR0("Built %s %s", __DATE__, __TIME__);
+	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
+		return (status);
+	if ((status = mod_install(&modlinkage)) != 0) {
+		ddi_soft_state_fini(&vds_state);
+		return (status);
+	}
+
+	/* Fill in the bit-mask of server-supported operations */
+	for (i = 0; i < vds_noperations; i++)
+		vds_operations |= 1 << (vds_operation[i].operation - 1);
+
+	return (0);
+}
+
+/* Loadable-module entry point:  report module information */
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * Loadable-module entry point:  remove the module and tear down the
+ * soft-state framework.
+ */
+int
+_fini(void)
+{
+	int	status;
+
+
+	PR0("Entered");
+	if ((status = mod_remove(&modlinkage)) != 0)
+		return (status);
+	ddi_soft_state_fini(&vds_state);
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vldc.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,1581 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/file.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/open.h>
+#include <sys/cred.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/ksynch.h>
+#include <sys/modctl.h>
+#include <sys/stat.h>			/* needed for S_IFBLK and S_IFCHR */
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/cred.h>
+#include <sys/promif.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cyclic.h>
+#include <sys/note.h>
+#include <sys/mach_descrip.h>
+#include <sys/mdeg.h>
+#include <sys/ldc.h>
+#include <sys/vldc_impl.h>
+
+/*
+ * Function prototypes.
+ */
+
+/* DDI entrypoints (dev_ops and cb_ops vectors reference these) */
+static int vldc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int vldc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred);
+static int vldc_close(dev_t dev, int flag, int otyp, cred_t *cred);
+static int vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
+    cred_t *credp, int *rvalp);
+static int vldc_read(dev_t dev, struct uio *uiop, cred_t *credp);
+static int vldc_write(dev_t dev, struct uio *uiop, cred_t *credp);
+static int vldc_chpoll(dev_t dev, short events, int anyyet,
+    short *reventsp, struct pollhead **phpp);
+
+/* Internal functions (LDC/MDEG callbacks and port management) */
+static uint_t i_vldc_cb(uint64_t event, caddr_t arg);
+static int i_vldc_mdeg_cb(void *cb_argp, mdeg_result_t *resp);
+static int i_vldc_mdeg_register(vldc_t *vldcp);
+static int i_vldc_mdeg_unregister(vldc_t *vldcp);
+static int i_vldc_add_port(vldc_t *vldcp, md_t *mdp, mde_cookie_t node);
+static int i_vldc_remove_port(vldc_t *vldcp, uint_t portno);
+static int i_vldc_close_port(vldc_t *vldcp, uint_t portno);
+
+/* soft state structure (one vldc_t per driver instance) */
+static void *vldc_ssp;
+
+/*
+ * Matching criteria passed to the MDEG to register interest
+ * in changes to 'virtual-device-port' nodes identified by their
+ * 'id' property.
+ */
+static md_prop_match_t vport_prop_match[] = {
+	{ MDET_PROP_VAL,    "id"   },
+	{ MDET_LIST_END,    NULL    }
+};
+
+static mdeg_node_match_t vport_match = { "virtual-device-port",
+					vport_prop_match };
+
+/*
+ * Specification of an MD node passed to the MDEG to filter any
+ * 'virtual-device-port' nodes that do not belong to the specified
+ * node. This template is copied for each vldc instance and filled
+ * in with the appropriate 'name' and 'cfg-handle' values before
+ * being passed to the MDEG.
+ */
+static mdeg_prop_spec_t vldc_prop_template[] = {
+	{ MDET_PROP_STR,    "name",		NULL	},
+	{ MDET_PROP_VAL,    "cfg-handle",	NULL    },
+	{ MDET_LIST_END,    NULL,		NULL    }
+};
+
+/*
+ * Accessors for the two variable slots of a per-instance copy of
+ * vldc_prop_template: slot 0 is the 'name' string, slot 1 the
+ * 'cfg-handle' value.
+ */
+#define	VLDC_MDEG_PROP_NAME(specp)		((specp)[0].ps_str)
+#define	VLDC_SET_MDEG_PROP_NAME(specp, name)	((specp)[0].ps_str = (name))
+#define	VLDC_SET_MDEG_PROP_INST(specp, inst)	((specp)[1].ps_val = (inst))
+
+
+/* Character device entry points for the per-service minor nodes */
+static struct cb_ops vldc_cb_ops = {
+	vldc_open,	/* open */
+	vldc_close,	/* close */
+	nodev,		/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	vldc_read,	/* read */
+	vldc_write,	/* write */
+	vldc_ioctl,	/* ioctl */
+	nodev,		/* devmap */
+	nodev,		/* mmap */
+	ddi_segmap,	/* segmap */
+	vldc_chpoll,	/* chpoll */
+	ddi_prop_op,	/* prop_op */
+	NULL,		/* stream */
+	D_NEW | D_MP	/* flag */
+};
+
+static struct dev_ops vldc_ops = {
+	DEVO_REV,		/* rev */
+	0,			/* ref count */
+	ddi_getinfo_1to1,	/* getinfo */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	vldc_attach,		/* attach */
+	vldc_detach,		/* detach */
+	nodev,			/* reset */
+	&vldc_cb_ops,		/* cb_ops */
+	(struct bus_ops *)NULL	/* bus_ops */
+};
+
+extern struct mod_ops mod_driverops;
+
+static struct modldrv md = {
+	&mod_driverops, 			/* Type - it is a driver */
+	"sun4v Virtual LDC Driver %I%",	/* Name of the module */
+	&vldc_ops,				/* driver specific ops */
+};
+
+static struct modlinkage ml = {
+	MODREV_1,
+	&md,
+	NULL
+};
+
+/* maximum MTU and cookie size tunables (settable via /etc/system) */
+uint32_t vldc_max_mtu = VLDC_MAX_MTU;
+uint64_t vldc_max_cookie = VLDC_MAX_COOKIE;
+
+
+#ifdef DEBUG
+
+/*
+ * Print debug messages
+ *
+ * set vldcdbg to 0x7 to enable all messages
+ *
+ * 0x4 - Warnings
+ * 0x2 - All debug messages (most verbose)
+ * 0x1 - Minimal debug messages
+ */
+
+int vldcdbg = 0x0;
+
+/*
+ * Format a message into a local buffer and emit it via cmn_err();
+ * output longer than the buffer is silently truncated by vsnprintf().
+ * The leading '?' routes the message to the system log only.
+ */
+static void
+vldcdebug(const char *fmt, ...)
+{
+	char buf[512];
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
+	va_end(ap);
+
+	cmn_err(CE_CONT, "?%s", buf);
+}
+
+#define	D1	if (vldcdbg & 0x01) vldcdebug
+#define	D2	if (vldcdbg & 0x02) vldcdebug
+#define	DWARN	if (vldcdbg & 0x04) vldcdebug
+
+#else /* not DEBUG */
+
+/* non-DEBUG builds: compile the call sites away */
+#define	D1	if (0) printf
+#define	D2	if (0) printf
+#define	DWARN	if (0) printf
+
+#endif /* not DEBUG */
+
+
+/*
+ * _init(9E): initialize the loadable module.
+ *
+ * If mod_install() fails, the soft state container created just before
+ * it is torn down again so a failed load leaves no state behind
+ * (matching the _init()/_fini() pairing used elsewhere).
+ */
+int
+_init(void)
+{
+	int error;
+
+	/* init the soft state structure */
+	error = ddi_soft_state_init(&vldc_ssp, sizeof (vldc_t), 1);
+	if (error != 0) {
+		return (error);
+	}
+
+	/* Link the driver into the system */
+	error = mod_install(&ml);
+	if (error != 0) {
+		/* undo the soft state setup on failure */
+		ddi_soft_state_fini(&vldc_ssp);
+	}
+
+	return (error);
+}
+
+/* _info(9E): return information about the loadable module */
+int
+_info(struct modinfo *modinfop)
+{
+	int rv;
+
+	/* Report status of the dynamically loadable driver module */
+	rv = mod_info(&ml, modinfop);
+	return (rv);
+}
+
+/* _fini(9E): prepare the module for unloading. */
+int
+_fini(void)
+{
+	int rv;
+
+	/* Unlink the driver module from the system */
+	rv = mod_remove(&ml);
+	if (rv != 0)
+		return (rv);
+
+	/*
+	 * We have successfully "removed" the driver;
+	 * destroy the soft state container.
+	 */
+	ddi_soft_state_fini(&vldc_ssp);
+
+	return (0);
+}
+
+/*
+ * ldc callback
+ *
+ * Invoked by the LDC framework when an event occurs on the channel
+ * bound to a port.  Channel state transitions are translated into
+ * poll events and any threads polling on the port are woken up.
+ * On a channel reset this side retries ldc_up() itself.
+ *
+ * NOTE(review): the else-if chain assumes UP/DOWN/RESET are not
+ * delivered together in a single callback -- confirm against the LDC
+ * framework's event delivery semantics.
+ */
+static uint_t
+i_vldc_cb(uint64_t event, caddr_t arg)
+{
+	vldc_port_t *vport = (vldc_port_t *)arg;
+	short pollevents = 0;
+	int rv;
+
+	D1("i_vldc_cb: callback invoked port=%d, event=0x%lx\n",
+	    vport->number, event);
+
+	if (event & LDC_EVT_UP) {
+		/* channel is up: writers may proceed */
+		pollevents |= POLLOUT;
+		vport->hanged_up = B_FALSE;
+
+	} else if (event & LDC_EVT_DOWN) {
+		/* channel went down: report a hangup to pollers */
+		pollevents |= POLLHUP;
+		vport->hanged_up = B_TRUE;
+
+	} else if (event & LDC_EVT_RESET) {
+		/* do an ldc_up because we can't be sure the other side will */
+		if ((rv = ldc_up(vport->ldc_handle)) != 0)
+			if (rv != ECONNREFUSED)
+				DWARN("i_vldc_cb: port@%d failed to"
+				    " bring up LDC channel=%ld, err=%d\n",
+				    vport->number, vport->ldc_id, rv);
+	}
+
+	if (event & LDC_EVT_READ)
+		pollevents |= POLLIN;
+
+	if (pollevents != 0) {
+		D1("i_vldc_cb: port@%d pollwakeup=0x%x\n",
+		    vport->number, pollevents);
+		pollwakeup(&vport->poll, pollevents);
+	}
+
+	return (LDC_SUCCESS);
+}
+
+/*
+ * mdeg callback
+ *
+ * Invoked by the MDEG framework when the machine description changes.
+ * Walks the added/removed node lists under vldcp->lock, adding and
+ * removing ports accordingly.  Bails out early if a detach is in
+ * progress so no new ports appear while the driver is tearing down.
+ */
+static int
+i_vldc_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
+{
+	vldc_t		*vldcp;
+	int		idx;
+	uint64_t	portno;
+	int		rv;
+	md_t		*mdp;
+	mde_cookie_t	node;
+
+	if (resp == NULL) {
+		D1("i_vldc_mdeg_cb: no result returned\n");
+		return (MDEG_FAILURE);
+	}
+
+	vldcp = (vldc_t *)cb_argp;
+
+	mutex_enter(&vldcp->lock);
+	if (vldcp->detaching == B_TRUE) {
+		D1("i_vldc_mdeg_cb: detach in progress\n");
+		mutex_exit(&vldcp->lock);
+		return (MDEG_FAILURE);
+	}
+
+	D1("i_vldc_mdeg_cb: added=%d, removed=%d, matched=%d\n",
+	    resp->added.nelem, resp->removed.nelem, resp->match_prev.nelem);
+
+	/* process added ports */
+	for (idx = 0; idx < resp->added.nelem; idx++) {
+		mdp = resp->added.mdp;
+		node = resp->added.mdep[idx];
+
+		D1("i_vldc_mdeg_cb: processing added node 0x%lx\n", node);
+
+		/* attempt to add a port; a failure is logged but not fatal */
+		if ((rv = i_vldc_add_port(vldcp, mdp, node)) != MDEG_SUCCESS) {
+			cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: unable to add port, "
+			    "err = %d", rv);
+		}
+	}
+
+	/* process removed ports */
+	for (idx = 0; idx < resp->removed.nelem; idx++) {
+		mdp = resp->removed.mdp;
+		node = resp->removed.mdep[idx];
+
+		D1("i_vldc_mdeg_cb: processing removed node 0x%lx\n", node);
+
+		/* read in the port's id property */
+		if (md_get_prop_val(mdp, node, "id", &portno)) {
+			cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: node 0x%lx of "
+			    "removed list has no 'id' property", node);
+			continue;
+		}
+
+		/* attempt to remove a port; a failure is logged but not fatal */
+		if ((rv = i_vldc_remove_port(vldcp, portno)) != 0) {
+			cmn_err(CE_NOTE, "?i_vldc_mdeg_cb: unable to remove "
+			    "port %lu, err %d", portno, rv);
+		}
+	}
+
+	/*
+	 * Currently no support for updating already active ports. So, ignore
+	 * the match_curr and match_prev arrays for now.
+	 */
+
+	mutex_exit(&vldcp->lock);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * register callback to mdeg
+ *
+ * Builds a per-instance copy of vldc_prop_template, filled in with this
+ * instance's 'name' and 'reg' (cfg-handle) properties, and registers
+ * i_vldc_mdeg_cb() with the MDEG.  On success the node spec and MDEG
+ * handle are cached in the soft state for later unregistration.
+ *
+ * The string returned by ddi_prop_lookup_string() is copied into a
+ * kmem allocation and then released with ddi_prop_free(); the previous
+ * code leaked the DDI-allocated property string on every registration.
+ */
+static int
+i_vldc_mdeg_register(vldc_t *vldcp)
+{
+	mdeg_prop_spec_t *pspecp;
+	mdeg_node_spec_t *inst_specp;
+	mdeg_handle_t	mdeg_hdl;
+	size_t		templatesz;
+	int		inst;
+	char		*name;
+	size_t		namesz;
+	char		*nameprop;
+	int		rv;
+
+	/* get the unique vldc instance assigned by the LDom manager */
+	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vldcp->dip,
+	    DDI_PROP_DONTPASS, "reg", -1);
+	if (inst == -1) {
+		cmn_err(CE_NOTE, "?vldc%d has no 'reg' property",
+		    ddi_get_instance(vldcp->dip));
+		return (DDI_FAILURE);
+	}
+
+	/* get the name of the vldc instance */
+	rv = ddi_prop_lookup_string(DDI_DEV_T_ANY, vldcp->dip,
+	    DDI_PROP_DONTPASS, "name", &nameprop);
+	if (rv != DDI_PROP_SUCCESS) {
+		cmn_err(CE_NOTE, "?vldc%d has no 'name' property",
+		    ddi_get_instance(vldcp->dip));
+		return (DDI_FAILURE);
+	}
+
+	D1("i_vldc_mdeg_register: name=%s, instance=%d\n", nameprop, inst);
+
+	/*
+	 * Allocate and initialize a per-instance copy
+	 * of the global property spec array that will
+	 * uniquely identify this vldc instance.
+	 */
+	templatesz = sizeof (vldc_prop_template);
+	pspecp = kmem_alloc(templatesz, KM_SLEEP);
+
+	bcopy(vldc_prop_template, pspecp, templatesz);
+
+	/* copy in the name property */
+	namesz = strlen(nameprop) + 1;
+	name = kmem_alloc(namesz, KM_SLEEP);
+
+	bcopy(nameprop, name, namesz);
+	VLDC_SET_MDEG_PROP_NAME(pspecp, name);
+
+	/* done with the DDI-allocated property string */
+	ddi_prop_free(nameprop);
+
+	/* copy in the instance property */
+	VLDC_SET_MDEG_PROP_INST(pspecp, inst);
+
+	/* initialize the complete prop spec structure */
+	inst_specp = kmem_alloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
+	inst_specp->namep = "virtual-device";
+	inst_specp->specp = pspecp;
+
+	/* perform the registration */
+	rv = mdeg_register(inst_specp, &vport_match, i_vldc_mdeg_cb,
+	    vldcp, &mdeg_hdl);
+
+	if (rv != MDEG_SUCCESS) {
+		cmn_err(CE_NOTE, "?i_vldc_mdeg_register: mdeg_register "
+		    "failed, err = %d", rv);
+		kmem_free(name, namesz);
+		kmem_free(pspecp, templatesz);
+		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
+		return (DDI_FAILURE);
+	}
+
+	/* save off data that will be needed later */
+	vldcp->inst_spec = inst_specp;
+	vldcp->mdeg_hdl = mdeg_hdl;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * unregister callback from mdeg
+ *
+ * Undoes i_vldc_mdeg_register(): unregisters the MDEG callback and
+ * releases the cached per-instance node spec (the copied name string,
+ * the property spec array, and the node spec container itself).
+ * The cached data is freed only if unregistration succeeds.
+ */
+static int
+i_vldc_mdeg_unregister(vldc_t *vldcp)
+{
+	char	*name;
+	int	rv;
+
+	D1("i_vldc_mdeg_unregister: hdl=0x%lx\n", vldcp->mdeg_hdl);
+
+	rv = mdeg_unregister(vldcp->mdeg_hdl);
+	if (rv != MDEG_SUCCESS) {
+		return (rv);
+	}
+
+	/*
+	 * Clean up cached MDEG data
+	 */
+	name = VLDC_MDEG_PROP_NAME(vldcp->inst_spec->specp);
+	if (name != NULL) {
+		kmem_free(name, strlen(name) + 1);
+	}
+	kmem_free(vldcp->inst_spec->specp, sizeof (vldc_prop_template));
+	vldcp->inst_spec->specp = NULL;
+
+	kmem_free(vldcp->inst_spec, sizeof (mdeg_node_spec_t));
+	vldcp->inst_spec = NULL;
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Look up the LDC channel id for a port: scan the MD forward-DAG under
+ * 'node' for 'channel-endpoint' nodes and return the 'id' property of
+ * the first one found via *ldc_id.  Returns 0 on success, -1 on any
+ * failure (no nodes, or missing 'id' property).
+ */
+static int
+i_vldc_get_port_channel(md_t *mdp, mde_cookie_t node, uint64_t *ldc_id)
+{
+	int num_nodes, nchan;
+	size_t listsz;
+	mde_cookie_t *listp;
+
+	/*
+	 * Find the channel-endpoint node(s) (which should be under this
+	 * port node) which contain the channel id(s).
+	 */
+	if ((num_nodes = md_node_count(mdp)) <= 0) {
+		cmn_err(CE_NOTE, "?i_vldc_get_port_channel: invalid number of "
+		    "channel-endpoint nodes found (%d)", num_nodes);
+		return (-1);
+	}
+
+	/* allocate space for node list (sized for the whole MD) */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_alloc(listsz, KM_SLEEP);
+
+	nchan = md_scan_dag(mdp, node, md_find_name(mdp, "channel-endpoint"),
+	    md_find_name(mdp, "fwd"), listp);
+
+	if (nchan <= 0) {
+		cmn_err(CE_NOTE, "?i_vldc_get_port_channel: no channel-endpoint"
+		    " nodes found");
+		kmem_free(listp, listsz);
+		return (-1);
+	}
+
+	D2("i_vldc_get_port_channel: %d channel-endpoint nodes found", nchan);
+
+	/* use property from first node found */
+	if (md_get_prop_val(mdp, listp[0], "id", ldc_id)) {
+		cmn_err(CE_NOTE, "?i_vldc_get_port_channel: channel-endpoint "
+		    "has no 'id' property");
+		kmem_free(listp, listsz);
+		return (-1);
+	}
+
+	kmem_free(listp, listsz);
+
+	return (0);
+}
+
+/*
+ * add a vldc port
+ *
+ * Creates a port from the MD node 'node': reads its 'id' (port number),
+ * its LDC channel, and the 'vldc-svc-name' service, binds the port to a
+ * minor-table slot (reusing a slot previously assigned to the same
+ * service name, or allocating a new one), and creates the corresponding
+ * minor node.  The port is left in the VLDC_PORT_CLOSED state.
+ * Called from the MDEG callback with vldcp->lock held.
+ */
+static int
+i_vldc_add_port(vldc_t *vldcp, md_t *mdp, mde_cookie_t node)
+{
+	vldc_port_t	*vport;
+	char		*sname;
+	uint64_t	portno;
+	int		vldc_inst;
+	minor_t		minor;
+	int		minor_idx;
+	boolean_t	new_minor;
+	int		rv;
+
+	/* read in the port's id property */
+	if (md_get_prop_val(mdp, node, "id", &portno)) {
+		cmn_err(CE_NOTE, "?i_vldc_add_port: node 0x%lx of added "
+		    "list has no 'id' property", node);
+		return (MDEG_FAILURE);
+	}
+
+	if (portno >= VLDC_MAX_PORTS) {
+		cmn_err(CE_NOTE, "?i_vldc_add_port: found port number (%lu) "
+		    "larger than maximum supported number of ports", portno);
+		return (MDEG_FAILURE);
+	}
+
+	vport = &(vldcp->port[portno]);
+
+	/* a non-NULL minorp means the port is already bound */
+	if (vport->minorp != NULL) {
+		cmn_err(CE_NOTE, "?i_vldc_add_port: trying to add a port (%lu)"
+		    " which is already bound", portno);
+		return (MDEG_FAILURE);
+	}
+
+	vport->number = portno;
+
+	/* get all channels for this device (currently only one) */
+	if (i_vldc_get_port_channel(mdp, node, &vport->ldc_id) == -1) {
+		return (MDEG_FAILURE);
+	}
+
+	/* set the default MTU */
+	vport->mtu = VLDC_DEFAULT_MTU;
+
+	/* get the service being exported by this port */
+	if (md_get_prop_str(mdp, node, "vldc-svc-name", &sname)) {
+		cmn_err(CE_NOTE, "?i_vldc_add_port: vdevice has no "
+		    "'vldc-svc-name' property");
+		return (MDEG_FAILURE);
+	}
+
+	/* minor number look up: match on service name */
+	for (minor_idx = 0; minor_idx < vldcp->minors_assigned;
+	    minor_idx++) {
+		if (strcmp(vldcp->minor_tbl[minor_idx].sname, sname) == 0) {
+			/* found previously assigned minor number */
+			break;
+		}
+	}
+
+	new_minor = B_FALSE;
+	if (minor_idx == vldcp->minors_assigned) {
+		/* end of lookup - assign new minor number */
+		if (vldcp->minors_assigned == VLDC_MAX_MINORS) {
+			cmn_err(CE_NOTE, "?i_vldc_add_port: too many minor "
+			    "nodes (%d)", minor_idx);
+			return (MDEG_FAILURE);
+		}
+
+		(void) strlcpy(vldcp->minor_tbl[minor_idx].sname,
+		    sname, MAXPATHLEN);
+
+		vldcp->minors_assigned++;
+		new_minor = B_TRUE;
+	}
+
+	/* the chosen slot must not be bound to another port */
+	ASSERT(vldcp->minor_tbl[minor_idx].portno == VLDC_INVALID_PORTNO);
+
+	vport->minorp = &vldcp->minor_tbl[minor_idx];
+	vldcp->minor_tbl[minor_idx].portno = portno;
+	vldcp->minor_tbl[minor_idx].in_use = 0;
+
+	D1("i_vldc_add_port: port@%d  mtu=%d, ldc=%ld, service=%s\n",
+	    vport->number, vport->mtu, vport->ldc_id, sname);
+
+	/*
+	 * Create a minor node. The minor number is
+	 * (vldc_inst << VLDC_INST_SHIFT) | minor_idx
+	 */
+	vldc_inst = ddi_get_instance(vldcp->dip);
+
+	minor = (vldc_inst << VLDC_INST_SHIFT) | (minor_idx);
+
+	rv = ddi_create_minor_node(vldcp->dip, sname, S_IFCHR,
+	    minor, DDI_NT_SERIAL, 0);
+
+	if (rv != DDI_SUCCESS) {
+		cmn_err(CE_NOTE, "?i_vldc_add_port: failed to create minor"
+		    "node (%u), err = %d", minor, rv);
+		/* roll back the minor-table bookkeeping done above */
+		vldcp->minor_tbl[minor_idx].portno = VLDC_INVALID_PORTNO;
+		if (new_minor) {
+			vldcp->minors_assigned--;
+		}
+		return (MDEG_FAILURE);
+	}
+
+	/*
+	 * The port is now bound to a minor node and is initially in the
+	 * closed state.
+	 */
+	vport->status = VLDC_PORT_CLOSED;
+
+	D1("i_vldc_add_port: port %lu initialized\n", portno);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * remove a vldc port
+ *
+ * Unbinds the port from its minor-table slot: invalidates the slot so
+ * no new opens can succeed, wakes any pollers with POLLHUP, waits for
+ * all in-flight users of the minor node to drain, closes the port if
+ * it is open, and finally removes the minor node.
+ */
+static int
+i_vldc_remove_port(vldc_t *vldcp, uint_t portno)
+{
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+
+	vport = &(vldcp->port[portno]);
+	vminor = vport->minorp;
+	if (vminor == NULL) {
+		cmn_err(CE_NOTE, "?i_vldc_remove_port: trying to remove a "
+		    "port (%u) which is not bound", portno);
+		return (MDEG_FAILURE);
+	}
+
+	/*
+	 * Make sure that all new attempts to open or use the minor node
+	 * associated with the port will fail.
+	 */
+	mutex_enter(&vminor->lock);
+	vminor->portno = VLDC_INVALID_PORTNO;
+	mutex_exit(&vminor->lock);
+
+	/* send hangup to anyone polling */
+	pollwakeup(&vport->poll, POLLHUP);
+
+	/* Now wait for all current users of the minor node to finish. */
+	mutex_enter(&vminor->lock);
+	while (vminor->in_use > 0) {
+		cv_wait(&vminor->cv, &vminor->lock);
+	}
+
+	if ((vport->status == VLDC_PORT_READY) ||
+	    (vport->status == VLDC_PORT_OPEN)) {
+		/* close the port before it is torn down */
+		(void) i_vldc_close_port(vldcp, portno);
+	}
+
+	/* remove minor node */
+	ddi_remove_minor_node(vldcp->dip, vport->minorp->sname);
+	vport->minorp = NULL;
+
+	mutex_exit(&vminor->lock);
+
+	D1("i_vldc_remove_port: removed vldc port %u\n", portno);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * close a ldc channel
+ *
+ * Tears down the channel in three steps: close, unregister callback,
+ * finalize.  All three steps are always attempted; the FIRST error
+ * encountered is the one returned.  (The previous '(rv != 0)' test
+ * inverted this, discarding an unreg/fini error when the earlier
+ * steps had succeeded.)
+ */
+static int
+i_vldc_ldc_close(vldc_port_t *vport)
+{
+	int rv = 0;
+	int err;
+
+	err = ldc_close(vport->ldc_handle);
+	if (err != 0)
+		rv = err;
+	err = ldc_unreg_callback(vport->ldc_handle);
+	if ((err != 0) && (rv == 0))
+		rv = err;
+	err = ldc_fini(vport->ldc_handle);
+	if ((err != 0) && (rv == 0))
+		rv = err;
+
+	return (rv);
+}
+
+/*
+ * close a vldc port
+ *
+ * Releases the port's LDC channel (if it was brought up) and frees its
+ * send/receive buffers, returning the port to VLDC_PORT_CLOSED.
+ * Caller must hold the port's minor lock (asserted below); the MTU
+ * cannot change under us, so the buffers are freed at vport->mtu.
+ */
+static int
+i_vldc_close_port(vldc_t *vldcp, uint_t portno)
+{
+	vldc_port_t *vport;
+	int rv;
+
+	vport = &(vldcp->port[portno]);
+
+	ASSERT(MUTEX_HELD(&vport->minorp->lock));
+
+	if (vport->status == VLDC_PORT_CLOSED) {
+		/* nothing to do */
+		DWARN("i_vldc_close_port: port %d in an unexpected "
+		    "state (%d)\n", portno, vport->status);
+		return (DDI_SUCCESS);
+	}
+
+	rv = DDI_SUCCESS;
+	if (vport->status == VLDC_PORT_READY) {
+		/* channel was initialized; tear it down */
+		rv = i_vldc_ldc_close(vport);
+	} else {
+		ASSERT(vport->status == VLDC_PORT_OPEN);
+	}
+
+	/* free memory */
+	kmem_free(vport->send_buf, vport->mtu);
+	kmem_free(vport->recv_buf, vport->mtu);
+
+	vport->status = VLDC_PORT_CLOSED;
+
+	return (rv);
+}
+
+/*
+ * attach(9E): attach a device to the system.
+ * called once for each instance of the device on the system.
+ *
+ * On DDI_ATTACH the instance soft state, its lock, and the per-minor
+ * locks/condition variables are initialized before registering with
+ * the MDEG.  If MDEG registration fails, the synchronization
+ * primitives are destroyed again before the soft state is freed
+ * (previously they were leaked on this error path).
+ */
+static int
+vldc_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	int 	i, instance;
+	vldc_t	*vldcp;
+
+	switch (cmd) {
+
+	case DDI_ATTACH:
+
+		instance = ddi_get_instance(dip);
+
+		if (ddi_soft_state_zalloc(vldc_ssp, instance) != DDI_SUCCESS) {
+			return (DDI_FAILURE);
+		}
+
+		vldcp = ddi_get_soft_state(vldc_ssp, instance);
+		if (vldcp == NULL) {
+			ddi_soft_state_free(vldc_ssp, instance);
+			return (ENXIO);
+		}
+
+		D1("vldc_attach: DDI_ATTACH instance=%d\n", instance);
+
+		mutex_init(&vldcp->lock, NULL, MUTEX_DRIVER, NULL);
+		vldcp->dip = dip;
+		vldcp->detaching = B_FALSE;
+
+		for (i = 0; i < VLDC_MAX_PORTS; i++) {
+			/* No minor node association to start with */
+			vldcp->port[i].minorp = NULL;
+		}
+
+		for (i = 0; i < VLDC_MAX_MINORS; i++) {
+			mutex_init(&(vldcp->minor_tbl[i].lock), NULL,
+			    MUTEX_DRIVER, NULL);
+			cv_init(&(vldcp->minor_tbl[i].cv), NULL,
+			    CV_DRIVER, NULL);
+			/* No port association to start with */
+			vldcp->minor_tbl[i].portno = VLDC_INVALID_PORTNO;
+		}
+
+		/* Register for MD update notification */
+		if (i_vldc_mdeg_register(vldcp) != DDI_SUCCESS) {
+			/*
+			 * Roll back the synchronization primitives
+			 * initialized above before freeing the soft state.
+			 */
+			for (i = 0; i < VLDC_MAX_MINORS; i++) {
+				mutex_destroy(&(vldcp->minor_tbl[i].lock));
+				cv_destroy(&(vldcp->minor_tbl[i].cv));
+			}
+			mutex_destroy(&vldcp->lock);
+			ddi_soft_state_free(vldc_ssp, instance);
+			return (DDI_FAILURE);
+		}
+
+		return (DDI_SUCCESS);
+
+	case DDI_RESUME:
+
+		return (DDI_SUCCESS);
+
+	default:
+
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * detach(9E): detach a device from the system.
+ *
+ * DDI_DETACH fails unless every minor-table slot has already been
+ * unbound from its port.  The MDEG callback is then fenced off via the
+ * 'detaching' flag before unregistration, after which any remaining
+ * bound ports are torn down and the per-minor locks/cvs destroyed.
+ */
+static int
+vldc_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	int 		i, instance;
+	vldc_t		*vldcp;
+
+	switch (cmd) {
+
+	case DDI_DETACH:
+
+		instance = ddi_get_instance(dip);
+
+		vldcp = ddi_get_soft_state(vldc_ssp, instance);
+		if (vldcp == NULL) {
+			return (DDI_FAILURE);
+		}
+
+		D1("vldc_detach: DDI_DETACH instance=%d\n", instance);
+
+		mutex_enter(&vldcp->lock);
+
+		/* Fail the detach if all ports have not been removed. */
+		for (i = 0; i < VLDC_MAX_MINORS; i++) {
+			if (vldcp->minor_tbl[i].portno != VLDC_INVALID_PORTNO) {
+				D1("vldc_detach: vldc@%d:%d is bound, "
+				    "detach failed\n",
+				    instance, vldcp->minor_tbl[i].portno);
+				mutex_exit(&vldcp->lock);
+				return (DDI_FAILURE);
+			}
+		}
+
+		/*
+		 * Prevent MDEG from adding new ports before the callback can
+		 * be unregistered. The lock can't be held across the
+		 * unregistration call because a callback may be in progress
+		 * and blocked on the lock.
+		 */
+		vldcp->detaching = B_TRUE;
+
+		mutex_exit(&vldcp->lock);
+
+		if (i_vldc_mdeg_unregister(vldcp) != MDEG_SUCCESS) {
+			/* re-enable the callback and fail the detach */
+			vldcp->detaching = B_FALSE;
+			return (DDI_FAILURE);
+		}
+
+		/* Tear down all bound ports and free resources. */
+		for (i = 0; i < VLDC_MAX_MINORS; i++) {
+			if (vldcp->minor_tbl[i].portno != VLDC_INVALID_PORTNO) {
+				(void) i_vldc_remove_port(vldcp, i);
+			}
+			mutex_destroy(&(vldcp->minor_tbl[i].lock));
+			cv_destroy(&(vldcp->minor_tbl[i].cv));
+		}
+
+		mutex_destroy(&vldcp->lock);
+		ddi_soft_state_free(vldc_ssp, instance);
+
+		return (DDI_SUCCESS);
+
+	case DDI_SUSPEND:
+
+		return (DDI_SUCCESS);
+
+	default:
+
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * cb_open
+ *
+ * Opens the port bound to the opened minor node: allocates its
+ * send/receive buffers at the current MTU and moves the port from
+ * VLDC_PORT_CLOSED to VLDC_PORT_OPEN.  Only one open at a time is
+ * permitted (EBUSY otherwise).  Returns DDI_SUCCESS (0) on success.
+ */
+static int
+vldc_open(dev_t *devp, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(flag, otyp, cred))
+
+	int instance;
+	minor_t minor;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+
+	minor = getminor(*devp);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL)
+		return (ENXIO);
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		/* minor node is not (or no longer) bound to a port */
+		mutex_exit(&vminor->lock);
+		return (ENXIO);
+	}
+
+	vport = &(vldcp->port[portno]);
+
+	D1("vldc_open: opening vldc@%d:%lu\n", instance, portno);
+
+	if (vport->status != VLDC_PORT_CLOSED) {
+		/* exclusive open: the port is already open */
+		mutex_exit(&vminor->lock);
+		return (EBUSY);
+	}
+
+	vport->recv_buf = kmem_alloc(vport->mtu, KM_SLEEP);
+	vport->send_buf = kmem_alloc(vport->mtu, KM_SLEEP);
+
+	vport->is_stream = B_FALSE;	/* assume not a stream */
+	vport->hanged_up = B_FALSE;
+
+	vport->status = VLDC_PORT_OPEN;
+
+	mutex_exit(&vminor->lock);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * cb_close
+ *
+ * Closes the port bound to the minor node via i_vldc_close_port(),
+ * which is called with the minor lock held as it requires.
+ */
+static int
+vldc_close(dev_t dev, int flag, int otyp, cred_t *cred)
+{
+	_NOTE(ARGUNUSED(flag, otyp, cred))
+
+	int instance;
+	minor_t minor;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_minor_t *vminor;
+	int rv;
+
+	minor = getminor(dev);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL) {
+		return (ENXIO);
+	}
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		/* the port has been unbound since the open */
+		mutex_exit(&vminor->lock);
+		return (ENOLINK);
+	}
+
+	D1("vldc_close: closing vldc@%d:%lu\n", instance, portno);
+
+	rv = i_vldc_close_port(vldcp, portno);
+
+	mutex_exit(&vminor->lock);
+
+	return (rv);
+}
+
+/*
+ * Set the LDC channel mode for a port and (re)initialize its channel:
+ * validate the requested mode, tear down any previously initialized
+ * channel, then init/register/open the channel and attempt ldc_up()
+ * (ECONNREFUSED from ldc_up is tolerated -- the peer may not be up
+ * yet).  Caller must hold the port's minor lock.
+ *
+ * The debug messages previously misnamed this function as
+ * "vldc_ioctl_opt_op"; they now identify vldc_set_ldc_mode.
+ */
+static int
+vldc_set_ldc_mode(vldc_port_t *vport, vldc_t *vldcp, int channel_mode)
+{
+	ldc_attr_t attr;
+	int rv;
+
+	ASSERT(MUTEX_HELD(&vport->minorp->lock));
+
+	/* validate mode */
+	switch (channel_mode) {
+	case LDC_MODE_STREAM:
+		vport->is_stream = B_TRUE;
+		break;
+	case LDC_MODE_RAW:
+	case LDC_MODE_UNRELIABLE:
+	case LDC_MODE_RELIABLE:
+		vport->is_stream = B_FALSE;
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	if (vport->status == VLDC_PORT_READY) {
+		/* tear down the existing channel before changing mode */
+		rv = i_vldc_ldc_close(vport);
+		vport->status = VLDC_PORT_OPEN;
+		if (rv != 0) {
+			DWARN("vldc_set_ldc_mode: i_vldc_ldc_close "
+			    "failed, rv=%d\n", rv);
+			return (rv);
+		}
+	}
+
+	D1("vldc_set_ldc_mode: vport status %d, mode %d\n",
+	    vport->status, channel_mode);
+
+	vport->ldc_mode = channel_mode;
+
+	/* initialize the channel */
+	attr.devclass = LDC_DEV_SERIAL;
+	attr.instance = ddi_get_instance(vldcp->dip);
+	attr.qlen = VLDC_QUEUE_LEN;
+	attr.mode = vport->ldc_mode;
+
+	if ((rv = ldc_init(vport->ldc_id, &attr,
+	    &vport->ldc_handle)) != 0) {
+		DWARN("vldc_set_ldc_mode: ldc_init failed, rv=%d\n", rv);
+		goto error_init;
+	}
+
+	/* register it */
+	if ((rv = ldc_reg_callback(vport->ldc_handle,
+	    i_vldc_cb, (caddr_t)vport)) != 0) {
+		DWARN("vldc_set_ldc_mode: ldc_reg_callback failed, rv=%d\n",
+		    rv);
+		goto error_reg;
+	}
+
+	/* open the channel */
+	if ((rv = ldc_open(vport->ldc_handle)) != 0) {
+		DWARN("vldc_set_ldc_mode: ldc_open failed, rv=%d\n", rv);
+		goto error_open;
+	}
+
+	vport->status = VLDC_PORT_READY;
+
+	/*
+	 * Attempt to bring the channel up, but do not
+	 * fail if the other end is not up yet.
+	 */
+	rv = ldc_up(vport->ldc_handle);
+
+	if (rv == ECONNREFUSED) {
+		D1("vldc_set_ldc_mode: remote endpoint not up yet\n");
+	} else if (rv != 0) {
+		DWARN("vldc_set_ldc_mode: ldc_up failed, rv=%d\n", rv);
+		goto error_up;
+	}
+
+	D1("vldc_set_ldc_mode: ldc %ld initialized successfully\n",
+	    vport->ldc_id);
+
+	return (0);
+
+error_up:
+	vport->status = VLDC_PORT_OPEN;
+	(void) ldc_close(vport->ldc_handle);
+error_open:
+	(void) ldc_unreg_callback(vport->ldc_handle);
+error_reg:
+	(void) ldc_fini(vport->ldc_handle);
+error_init:
+	return (rv);
+}
+
+/*
+ * ioctl to read cookie
+ *
+ * Copies in a vldc_data_t describing the transfer, reads up to
+ * copy_info.length bytes (capped at vldc_max_cookie) from the HV
+ * address in dst_addr into a temporary buffer via ldc_mem_rdwr_pa(),
+ * then copies the data out to the user buffer at src_addr and writes
+ * back the actual length transferred.
+ */
+static int
+i_vldc_ioctl_read_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
+    int mode)
+{
+	vldc_data_t copy_info;
+	caddr_t buf;
+	uint64_t len;
+	int rv;
+
+	if (ddi_copyin(arg, &copy_info, sizeof (copy_info), mode) == -1) {
+		return (EFAULT);
+	}
+
+	len = copy_info.length;
+	if (len > vldc_max_cookie) {
+		return (EINVAL);
+	}
+
+	/* allocate a temporary buffer */
+	buf = kmem_alloc(len, KM_SLEEP);
+
+	mutex_enter(&vport->minorp->lock);
+
+	D2("i_vldc_ioctl_read_cookie: vldc@%d:%d reading from 0x%lx "
+	    "size 0x%lx to 0x%lx\n", vldc_instance, vport->number,
+	    copy_info.dst_addr, copy_info.length, copy_info.src_addr);
+
+	/* read from the HV into the temporary buffer */
+	rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
+	    (caddr_t)copy_info.dst_addr, LDC_COPY_IN);
+	if (rv != 0) {
+		DWARN("i_vldc_ioctl_read_cookie: vldc@%d:%d cannot read "
+		    "address 0x%lx, rv=%d\n", vldc_instance, vport->number,
+		    copy_info.dst_addr, rv);
+		mutex_exit(&vport->minorp->lock);
+		kmem_free(buf, copy_info.length);
+		return (EFAULT);
+	}
+
+	D2("i_vldc_ioctl_read_cookie: vldc@%d:%d read succeeded\n",
+	    vldc_instance, vport->number);
+
+	mutex_exit(&vport->minorp->lock);
+
+	/* copy data from temporary buffer out to the caller and free buffer */
+	rv = ddi_copyout(buf, (caddr_t)copy_info.src_addr, len, mode);
+	kmem_free(buf, copy_info.length);
+	if (rv != 0) {
+		return (EFAULT);
+	}
+
+	/* set the structure to reflect outcome (len may have been updated) */
+	copy_info.length = len;
+	if (ddi_copyout(&copy_info, arg, sizeof (copy_info), mode) != 0) {
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * ioctl to write cookie
+ *
+ * Mirror image of the read-cookie ioctl: copies in a vldc_data_t,
+ * copies the user data at src_addr into a temporary buffer (length
+ * capped at vldc_max_cookie), writes it to the HV address in dst_addr
+ * via ldc_mem_rdwr_pa(), and writes back the actual length written.
+ */
+static int
+i_vldc_ioctl_write_cookie(vldc_port_t *vport, int vldc_instance, void *arg,
+    int mode)
+{
+	vldc_data_t copy_info;
+	caddr_t buf;
+	uint64_t len;
+	int rv;
+
+	if (ddi_copyin((caddr_t)arg, &copy_info,
+	    sizeof (copy_info), mode) != 0) {
+		return (EFAULT);
+	}
+
+	len = copy_info.length;
+	if (len > vldc_max_cookie) {
+		return (EINVAL);
+	}
+
+	D2("i_vldc_ioctl_write_cookie: vldc@%d:%d writing 0x%lx size 0x%lx "
+	    "to 0x%lx\n", vldc_instance, vport->number, copy_info.src_addr,
+	    copy_info.length, copy_info.dst_addr);
+
+	/* allocate a temporary buffer */
+	buf = kmem_alloc(len, KM_SLEEP);
+
+	/* copy into the temporary buffer the data to be written to the HV */
+	if (ddi_copyin((caddr_t)copy_info.src_addr, buf,
+	    copy_info.length, mode) != 0) {
+		kmem_free(buf, copy_info.length);
+		return (EFAULT);
+	}
+
+	mutex_enter(&vport->minorp->lock);
+
+	/* write the data from the temporary buffer to the HV */
+	rv = ldc_mem_rdwr_pa(vport->ldc_handle, buf, &len,
+	    (caddr_t)copy_info.dst_addr, LDC_COPY_OUT);
+	if (rv != 0) {
+		DWARN("i_vldc_ioctl_write_cookie: vldc@%d:%d failed to write at"
+		    " address 0x%lx\n, rv=%d", vldc_instance, vport->number,
+		    copy_info.dst_addr, rv);
+		mutex_exit(&vport->minorp->lock);
+		kmem_free(buf, copy_info.length);
+		return (EFAULT);
+	}
+
+	D2("i_vldc_ioctl_write_cookie: vldc@%d:%d write succeeded\n",
+	    vldc_instance, vport->number);
+
+	mutex_exit(&vport->minorp->lock);
+
+	kmem_free(buf, copy_info.length);
+
+	/* set the structure to reflect outcome (len may have been updated) */
+	copy_info.length = len;
+	if (ddi_copyout(&copy_info, (caddr_t)arg,
+	    sizeof (copy_info), mode) != 0) {
+		return (EFAULT);
+	}
+
+	return (0);
+}
+
+/*
+ * vldc specific ioctl option commands
+ *
+ * Handles VLDC_IOCTL_OPT_OP: get/set of the port MTU, get of the port
+ * status, and get/set of the LDC channel mode.  Setting the MTU on a
+ * non-closed port reallocates the send/receive buffers at the new
+ * size; setting the mode delegates to vldc_set_ldc_mode().
+ */
+static int
+i_vldc_ioctl_opt_op(vldc_port_t *vport, vldc_t *vldcp, void *arg, int mode)
+{
+	vldc_opt_op_t 	vldc_cmd;
+	uint32_t	new_mtu;
+	int		rv = 0;
+
+	if (ddi_copyin(arg, &vldc_cmd, sizeof (vldc_cmd), mode) != 0) {
+		return (EFAULT);
+	}
+
+	D1("vldc_ioctl_opt_op: op %d\n", vldc_cmd.opt_sel);
+
+	switch (vldc_cmd.opt_sel) {
+
+	case VLDC_OPT_MTU_SZ:
+
+		if (vldc_cmd.op_sel == VLDC_OP_GET) {
+			vldc_cmd.opt_val = vport->mtu;
+			if (ddi_copyout(&vldc_cmd, arg,
+			    sizeof (vldc_cmd), mode) == -1) {
+				return (EFAULT);
+			}
+		} else {
+			new_mtu = vldc_cmd.opt_val;
+
+			/* MTU must fit at least one LDC packet */
+			if ((new_mtu < LDC_PACKET_SIZE) ||
+			    (new_mtu > vldc_max_mtu)) {
+				return (EINVAL);
+			}
+
+			mutex_enter(&vport->minorp->lock);
+
+			if ((vport->status != VLDC_PORT_CLOSED) &&
+			    (new_mtu != vport->mtu)) {
+				/*
+				 * The port has buffers allocated since it is
+				 * not closed plus the MTU size has changed.
+				 * Reallocate the buffers to the new MTU size.
+				 */
+				kmem_free(vport->recv_buf, vport->mtu);
+				vport->recv_buf = kmem_alloc(new_mtu, KM_SLEEP);
+
+				kmem_free(vport->send_buf, vport->mtu);
+				vport->send_buf = kmem_alloc(new_mtu, KM_SLEEP);
+
+				vport->mtu = new_mtu;
+			}
+
+			mutex_exit(&vport->minorp->lock);
+		}
+
+		break;
+
+	case VLDC_OPT_STATUS:
+
+		if (vldc_cmd.op_sel == VLDC_OP_GET) {
+			vldc_cmd.opt_val = vport->status;
+			if (ddi_copyout(&vldc_cmd, arg,
+			    sizeof (vldc_cmd), mode) == -1) {
+				return (EFAULT);
+			}
+		} else {
+			/* status is read-only */
+			return (ENOTSUP);
+		}
+
+		break;
+
+	case VLDC_OPT_MODE:
+
+		if (vldc_cmd.op_sel == VLDC_OP_GET) {
+			vldc_cmd.opt_val = vport->ldc_mode;
+			if (ddi_copyout(&vldc_cmd, arg,
+			    sizeof (vldc_cmd), mode) == -1) {
+				return (EFAULT);
+			}
+		} else {
+			mutex_enter(&vport->minorp->lock);
+			rv = vldc_set_ldc_mode(vport, vldcp, vldc_cmd.opt_val);
+			mutex_exit(&vport->minorp->lock);
+		}
+
+		break;
+
+	default:
+
+		D1("vldc_ioctl_opt_op: unsupported op %d\n", vldc_cmd.opt_sel);
+		return (ENOTSUP);
+	}
+
+	return (rv);
+}
+
+/*
+ * cb_ioctl
+ *
+ * Resolves the minor node to its bound port, bumps the minor's in_use
+ * count so a concurrent port removal waits for us, then dispatches the
+ * command.  The in_use count is dropped and the minor cv signalled on
+ * the way out so i_vldc_remove_port() can proceed.
+ */
+static int
+vldc_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+    int *rvalp)
+{
+	_NOTE(ARGUNUSED(credp, rvalp))
+
+	int rv = EINVAL;
+	int instance;
+	minor_t minor;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+
+	minor = getminor(dev);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL) {
+		return (ENXIO);
+	}
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		mutex_exit(&vminor->lock);
+		return (ENOLINK);
+	}
+	/* mark the minor busy so the port cannot be removed under us */
+	vminor->in_use += 1;
+	mutex_exit(&vminor->lock);
+
+	vport = &(vldcp->port[portno]);
+
+	D1("vldc_ioctl: vldc@%d:%lu cmd=0x%x\n", instance, portno, cmd);
+
+	switch (cmd) {
+
+	case VLDC_IOCTL_OPT_OP:
+
+		rv = i_vldc_ioctl_opt_op(vport, vldcp, (void *)arg,  mode);
+		break;
+
+	case VLDC_IOCTL_READ_COOKIE:
+
+		rv = i_vldc_ioctl_read_cookie(vport, instance,
+		    (void *)arg, mode);
+		break;
+
+	case VLDC_IOCTL_WRITE_COOKIE:
+
+		rv = i_vldc_ioctl_write_cookie(vport, instance,
+		    (void *)arg, mode);
+		break;
+
+	default:
+
+		DWARN("vldc_ioctl: vldc@%d:%lu unknown cmd=0x%x\n",
+		    instance, portno, cmd);
+		rv = EINVAL;
+		break;
+	}
+
+	/* drop the busy count and wake any waiting port removal */
+	mutex_enter(&vminor->lock);
+	vminor->in_use -= 1;
+	if (vminor->in_use == 0) {
+		cv_signal(&vminor->cv);
+	}
+	mutex_exit(&vminor->lock);
+
+	D1("vldc_ioctl: rv=%d\n", rv);
+
+	return (rv);
+}
+
+/*
+ * cb_read(9E) entry point.
+ *
+ * Reads at most one MTU's worth of data from the port's LDC channel into
+ * the port receive buffer and copies it out to the caller with uiomove().
+ * ldc_read() errors are remapped: ETIMEDOUT/EWOULDBLOCK become EWOULDBLOCK,
+ * ENOBUFS is passed through unchanged, and anything else becomes ECONNRESET.
+ * A successful read of zero bytes also returns EWOULDBLOCK.
+ */
+static int
+vldc_read(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	_NOTE(ARGUNUSED(credp))
+
+	int instance;
+	minor_t minor;
+	size_t size = 0;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+	int rv = 0;
+
+	minor = getminor(dev);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL) {
+		return (ENXIO);
+	}
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		mutex_exit(&vminor->lock);
+		return (ENOLINK);
+	}
+
+	D2("vldc_read: vldc@%d:%lu reading data\n", instance, portno);
+
+	vport = &(vldcp->port[portno]);
+
+	/* check the port status */
+	if (vport->status != VLDC_PORT_READY) {
+		DWARN("vldc_read: vldc@%d:%lu not in the ready state\n",
+		    instance, portno);
+		mutex_exit(&vminor->lock);
+		return (ENOTACTIVE);
+	}
+
+	/* read data, limited to the smaller of the MTU and the request */
+	size = MIN(vport->mtu, uiop->uio_resid);
+	rv = ldc_read(vport->ldc_handle, vport->recv_buf, &size);
+
+	D2("vldc_read: vldc@%d:%lu ldc_read size=%ld, rv=%d\n",
+	    instance, portno, size, rv);
+
+	if (rv == 0) {
+		if (size != 0) {
+			rv = uiomove(vport->recv_buf, size, UIO_READ, uiop);
+		} else {
+			/* no data available right now */
+			rv = EWOULDBLOCK;
+		}
+	} else {
+		/* map transport errors to errnos meaningful to consumers */
+		switch (rv) {
+		case ENOBUFS:
+			break;
+		case ETIMEDOUT:
+		case EWOULDBLOCK:
+			rv = EWOULDBLOCK;
+			break;
+		default:
+			rv = ECONNRESET;
+			break;
+		}
+	}
+
+	mutex_exit(&vminor->lock);
+
+	return (rv);
+}
+
+/*
+ * cb_write(9E) entry point.
+ *
+ * Copies user data into the port send buffer and transmits it over the
+ * port's LDC channel.  Requests larger than the MTU are truncated to one
+ * MTU for stream-mode ports and rejected with EMSGSIZE otherwise.  On
+ * return, uio_resid reflects the total bytes *not* transmitted.
+ */
+static int
+vldc_write(dev_t dev, struct uio *uiop, cred_t *credp)
+{
+	_NOTE(ARGUNUSED(credp))
+
+	int instance;
+	minor_t minor;
+	size_t size;
+	size_t orig_size;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+	int rv = EINVAL;
+
+	minor = getminor(dev);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL) {
+		return (ENXIO);
+	}
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		mutex_exit(&vminor->lock);
+		return (ENOLINK);
+	}
+
+	vport = &(vldcp->port[portno]);
+
+	/* check the port status */
+	if (vport->status != VLDC_PORT_READY) {
+		DWARN("vldc_write: vldc@%d:%lu not in the ready state\n",
+		    instance, portno);
+		mutex_exit(&vminor->lock);
+		return (ENOTACTIVE);
+	}
+
+	orig_size = uiop->uio_resid;
+	size = orig_size;
+
+	if (size > vport->mtu) {
+		if (vport->is_stream) {
+			/* can only send MTU size at a time */
+			size = vport->mtu;
+		} else {
+			mutex_exit(&vminor->lock);
+			return (EMSGSIZE);
+		}
+	}
+
+	D2("vldc_write: vldc@%d:%lu writing %lu bytes\n", instance, portno,
+	    size);
+
+	rv = uiomove(vport->send_buf, size, UIO_WRITE, uiop);
+	if (rv == 0) {
+		/*
+		 * NOTE(review): the resid computation below assumes
+		 * ldc_write() updates size with the bytes actually sent,
+		 * including on failure - confirm against the LDC API.
+		 */
+		rv = ldc_write(vport->ldc_handle, (caddr_t)vport->send_buf,
+			&size);
+		if (rv != 0) {
+			DWARN("vldc_write: vldc@%d:%lu failed writing %lu "
+			    "bytes rv=%d\n", instance, portno, size, rv);
+		}
+	} else {
+		size = 0;
+	}
+
+	mutex_exit(&vminor->lock);
+
+	/* resid is total number of bytes *not* sent */
+	uiop->uio_resid = orig_size - size;
+
+	return (rv);
+}
+
+/*
+ * cb_chpoll(9E) entry point.
+ *
+ * Reports poll events for the port: when the channel is up, POLLIN is set
+ * if the LDC receive queue is non-empty and POLLOUT is always set; if the
+ * channel is not up and a hangup was recorded, POLLHUP is reported once
+ * (hanged_up is consumed).  When no events are ready and anyyet is clear,
+ * the port's pollhead is handed back for pollwakeup() notification.
+ */
+static int
+vldc_chpoll(dev_t dev, short events, int anyyet,  short *reventsp,
+    struct pollhead **phpp)
+{
+	int instance;
+	minor_t minor;
+	uint64_t portno;
+	vldc_t *vldcp;
+	vldc_port_t *vport;
+	vldc_minor_t *vminor;
+	ldc_status_t ldc_state;
+	boolean_t isempty;
+	int rv;
+
+	minor = getminor(dev);
+	instance = VLDCINST(minor);
+	vldcp = ddi_get_soft_state(vldc_ssp, instance);
+	if (vldcp == NULL) {
+		return (ENXIO);
+	}
+
+	vminor = VLDCMINOR(vldcp, minor);
+	mutex_enter(&vminor->lock);
+	portno = vminor->portno;
+	if (portno == VLDC_INVALID_PORTNO) {
+		mutex_exit(&vminor->lock);
+		return (ENOLINK);
+	}
+
+	vport = &(vldcp->port[portno]);
+
+	/* check the port status */
+	if (vport->status != VLDC_PORT_READY) {
+		mutex_exit(&vminor->lock);
+		return (ENOTACTIVE);
+	}
+
+	D2("vldc_chpoll: vldc@%d:%lu polling events 0x%x\n",
+	    instance, portno, events);
+
+	rv = ldc_status(vport->ldc_handle, &ldc_state);
+	if (rv != 0) {
+		DWARN("vldc_chpoll: vldc@%d:%lu could not get ldc status, "
+		    "rv=%d\n", instance, portno, rv);
+		mutex_exit(&vminor->lock);
+		return (EBADFD);
+	}
+
+	*reventsp = 0;
+
+	if (ldc_state == LDC_UP) {
+		/*
+		 * Check if the receive queue is empty and if not, signal that
+		 * there is data ready to read.
+		 */
+		if (events & POLLIN) {
+			if ((ldc_chkq(vport->ldc_handle, &isempty) == 0) &&
+			    (isempty == B_FALSE)) {
+				*reventsp |= POLLIN;
+			}
+		}
+
+		/* the channel is up; writes can always be attempted */
+		if (events & POLLOUT)
+			*reventsp |= POLLOUT;
+
+	} else if (vport->hanged_up) {
+		/* report the hangup exactly once */
+		*reventsp |= POLLHUP;
+		vport->hanged_up = B_FALSE;
+	}
+
+	mutex_exit(&vminor->lock);
+
+	if (((*reventsp) == 0) && (!anyyet)) {
+		*phpp = &vport->poll;
+	}
+
+	D2("vldc_chpoll: vldc@%d:%lu ev=0x%x, rev=0x%x\n",
+	    instance, portno, events, *reventsp);
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vnet.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,1049 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/stream.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/ksynch.h>
+#include <sys/stat.h>
+#include <sys/modctl.h>
+#include <sys/debug.h>
+#include <sys/ethernet.h>
+#include <sys/dlpi.h>
+#include <net/if.h>
+#include <sys/mac.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/strsun.h>
+#include <sys/note.h>
+#include <sys/vnet.h>
+
+/*
+ * Function prototypes.
+ */
+
+/* DDI entrypoints */
+static int vnetdevinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static int vnetattach(dev_info_t *, ddi_attach_cmd_t);
+static int vnetdetach(dev_info_t *, ddi_detach_cmd_t);
+
+/* MAC entrypoints  */
+static uint64_t vnet_m_stat(void *arg, enum mac_stat stat);
+static int vnet_m_start(void *);
+static void vnet_m_stop(void *);
+static int vnet_m_promisc(void *, boolean_t);
+static int vnet_m_multicst(void *, boolean_t, const uint8_t *);
+static int vnet_m_unicst(void *, const uint8_t *);
+static void vnet_m_resources(void *);
+static void vnet_m_ioctl(void *, queue_t *, mblk_t *);
+mblk_t *vnet_m_tx(void *, mblk_t *);
+
+/* vnet internal functions */
+static int vnet_mac_register(vnet_t *);
+static int vnet_read_mac_address(vnet_t *vnetp);
+static void vnet_add_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp);
+static void vnet_del_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp);
+static vp_tl_t *vnet_get_vptl(vnet_t *vnetp, const char *devname);
+static fdb_t *vnet_lookup_fdb(fdb_fanout_t *fdbhp, uint8_t *macaddr);
+
+/* exported functions */
+void vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg);
+void vnet_del_fdb(void *arg, uint8_t *macaddr);
+void vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg);
+void vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg);
+void vnet_del_def_rte(void *arg);
+
+/* externs */
+extern int vgen_init(void *vnetp, dev_info_t *vnetdip, void *vnetmacp,
+	const uint8_t *macaddr, mac_t **vgenmacp);
+extern void vgen_uninit(void *arg);
+
+/*
+ * Linked list of "vnet_t" structures - one per instance.
+ */
+static vnet_t	*vnet_headp = NULL;
+static krwlock_t vnet_rw;
+
+/* Tunables */
+uint32_t vnet_ntxds = VNET_NTXDS;	/* power of 2 transmit descriptors */
+uint32_t vnet_reclaim_lowat = VNET_RECLAIM_LOWAT;  /* tx recl low watermark */
+uint32_t vnet_reclaim_hiwat = VNET_RECLAIM_HIWAT;  /* tx recl high watermark */
+uint32_t vnet_ldcwd_interval = VNET_LDCWD_INTERVAL; /* watchdog freq in msec */
+uint32_t vnet_ldcwd_txtimeout = VNET_LDCWD_TXTIMEOUT;  /* tx timeout in msec */
+uint32_t vnet_ldc_qlen = VNET_LDC_QLEN;		/* ldc qlen */
+uint32_t vnet_nfdb_hash = VNET_NFDB_HASH;	/* size of fdb hash table */
+
+/*
+ * Property names
+ */
+static char macaddr_propname[] = "local-mac-address";
+
+static struct ether_addr etherbroadcastaddr = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+/*
+ * MIB II broadcast/multicast packets
+ */
+#define	IS_BROADCAST(ehp) \
+		(ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+#define	IS_MULTICAST(ehp) \
+		((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
+
+/*
+ * This is the string displayed by modinfo(1m).
+ */
+static char vnet_ident[] = "vnet driver v1.0";
+extern struct mod_ops mod_driverops;
+static struct cb_ops cb_vnetops = {
+	nulldev,		/* cb_open */
+	nulldev,		/* cb_close */
+	nodev,			/* cb_strategy */
+	nodev,			/* cb_print */
+	nodev,			/* cb_dump */
+	nodev,			/* cb_read */
+	nodev,			/* cb_write */
+	nodev,			/* cb_ioctl */
+	nodev,			/* cb_devmap */
+	nodev,			/* cb_mmap */
+	nodev,			/* cb_segmap */
+	nochpoll,		/* cb_chpoll */
+	ddi_prop_op,		/* cb_prop_op */
+	NULL,			/* cb_stream */
+	(int)(D_MP)		/* cb_flag */
+};
+
+static struct dev_ops vnetops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* devo_refcnt */
+	NULL,			/* devo_getinfo */
+	nulldev,		/* devo_identify */
+	nulldev,		/* devo_probe */
+	vnetattach,		/* devo_attach */
+	vnetdetach,		/* devo_detach */
+	nodev,			/* devo_reset */
+	&cb_vnetops,		/* devo_cb_ops */
+	(struct bus_ops *)NULL	/* devo_bus_ops */
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,		/* Type of module.  This one is a driver */
+	vnet_ident,		/* ID string */
+	&vnetops		/* driver specific ops */
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, (void *)&modldrv, NULL
+};
+
+
+/*
+ * Print debug messages - set to 0xf to enable all msgs
+ */
+int _vnet_dbglevel = 0x8;
+
+/*
+ * Format a debug message and emit it via cmn_err(CE_CONT), prefixing it
+ * with the vnet instance number when a vnet_t is supplied.
+ *
+ * arg may be NULL for messages not tied to a particular instance.
+ */
+void
+_vnetdebug_printf(void *arg, const char *fmt, ...)
+{
+	char    buf[512];
+	va_list ap;
+	vnet_t *vnetp = (vnet_t *)arg;
+
+	va_start(ap, fmt);
+	/*
+	 * Bound the formatted output to the local buffer; vsprintf()
+	 * would silently overrun the 512-byte stack buffer for long
+	 * messages.
+	 */
+	(void) vsnprintf(buf, sizeof (buf), fmt, ap);
+	va_end(ap);
+
+	if (vnetp == NULL)
+		cmn_err(CE_CONT, "%s\n", buf);
+	else
+		cmn_err(CE_CONT, "vnet%d: %s\n", vnetp->instance, buf);
+}
+
+#ifdef DEBUG
+
+/*
+ * XXX: any changes to the definitions below need corresponding changes in
+ * vnet_gen.c
+ */
+
+/*
+ * debug levels:
+ * DBG_LEVEL1:	Function entry/exit tracing
+ * DBG_LEVEL2:	Info messages
+ * DBG_LEVEL3:	Warning messages
+ * DBG_LEVEL4:	Error messages
+ */
+
+enum	{ DBG_LEVEL1 = 0x01, DBG_LEVEL2 = 0x02, DBG_LEVEL3 = 0x04,
+	    DBG_LEVEL4 = 0x08 };
+
+#define	DBG1(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL1) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DBG2(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL2) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DWARN(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL3) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DERR(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL4) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define	DBG1(_s)	if (0)	_vnetdebug_printf _s
+#define	DBG2(_s)	if (0)	_vnetdebug_printf _s
+#define	DWARN(_s)	if (0)	_vnetdebug_printf _s
+#define	DERR(_s)	if (0)	_vnetdebug_printf _s
+
+#endif
+
+/*
+ * _init(9E): initialize the loadable module.
+ *
+ * Registers the MAC entry points on vnetops before installing the module,
+ * and undoes that registration if mod_install() fails.
+ */
+int
+_init(void)
+{
+	int status;
+
+	DBG1((NULL, "_init: enter\n"));
+
+	mac_init_ops(&vnetops, "vnet");
+	status = mod_install(&modlinkage);
+	if (status != 0) {
+		/* back out the MAC registration on failure */
+		mac_fini_ops(&vnetops);
+	}
+
+	DBG1((NULL, "_init: exit\n"));
+	return (status);
+}
+
+/*
+ * _fini(9E): prepare the module for unloading.
+ *
+ * Only tears down the MAC ops once mod_remove() has succeeded; if the
+ * module is still in use the MAC registration is left intact.
+ */
+int
+_fini(void)
+{
+	int status;
+
+	DBG1((NULL, "_fini: enter\n"));
+
+	status = mod_remove(&modlinkage);
+	if (status != 0)
+		return (status);
+	mac_fini_ops(&vnetops);
+
+	DBG1((NULL, "_fini: exit\n"));
+	return (status);
+}
+
+/* _info(9E): return information about the loadable module */
+int
+_info(struct modinfo *modinfop)
+{
+	/* delegate entirely to the module framework */
+	return (mod_info(&modlinkage, modinfop));
+}
+
+/*
+ * attach(9E): attach a device to the system.
+ * called once for each instance of the device on the system.
+ *
+ * Allocates the per-instance vnet_t and mac_t, reads the MAC address from
+ * the device properties, initializes the generic (LDC-based) transport via
+ * vgen_init(), allocates the forwarding-database hash table, registers with
+ * the MAC layer, and links the instance onto the global vnet list.  The
+ * attach_state bit-mask records how far setup got so that the failure path
+ * unwinds exactly the steps that completed.  Only DDI_ATTACH is supported;
+ * DDI_RESUME/DDI_PM_RESUME fail.
+ */
+static int
+vnetattach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	mac_t		*macp;
+	vnet_t		*vnetp;
+	vp_tl_t		*vp_tlp;
+	int		instance;
+	int		status;
+	enum		{ AST_init = 0x0, AST_vnet_alloc = 0x1,
+			    AST_mac_alloc = 0x2, AST_read_macaddr = 0x4,
+			    AST_vgen_init = 0x8, AST_vptl_alloc = 0x10,
+			    AST_fdbh_alloc = 0x20 }
+			attach_state;
+	mac_t		*vgenmacp = NULL;
+	uint32_t	nfdbh = 0;
+
+	attach_state = AST_init;
+
+	switch (cmd) {
+	case DDI_ATTACH:
+		break;
+	case DDI_RESUME:
+	case DDI_PM_RESUME:
+	default:
+		goto vnet_attach_fail;
+	}
+
+	instance = ddi_get_instance(dip);
+	DBG1((NULL, "vnetattach: instance(%d) enter\n", instance));
+
+	/* allocate vnet_t and mac_t structures */
+	vnetp = kmem_zalloc(sizeof (vnet_t), KM_SLEEP);
+	attach_state |= AST_vnet_alloc;
+
+	macp = kmem_zalloc(sizeof (mac_t), KM_SLEEP);
+	attach_state |= AST_mac_alloc;
+
+	/* setup links to vnet_t from both devinfo and mac_t */
+	ddi_set_driver_private(dip, (caddr_t)vnetp);
+	macp->m_driver = vnetp;
+	vnetp->dip = dip;
+	vnetp->macp = macp;
+	vnetp->instance = instance;
+
+	/* read the mac address */
+	status = vnet_read_mac_address(vnetp);
+	if (status != DDI_SUCCESS) {
+		goto vnet_attach_fail;
+	}
+	attach_state |= AST_read_macaddr;
+
+	/*
+	 * Initialize the generic vnet proxy transport. This is the first
+	 * and default transport used by vnet. The generic transport
+	 * is provided by using sun4v LDC (logical domain channel). On success,
+	 * vgen_init() provides a pointer to mac_t of generic transport.
+	 * Currently, this generic layer provides network connectivity to other
+	 * vnets within ldoms and also to remote hosts outside ldoms through
+	 * the virtual switch (vsw) device on domain0. In the future, when
+	 * physical adapters that are able to share their resources (such as
+	 * dma channels) with guest domains become available, the vnet device
+	 * will use hardware specific driver to communicate directly over the
+	 * physical device to reach remote hosts without going through vswitch.
+	 */
+	status = vgen_init(vnetp, vnetp->dip, vnetp->macp,
+	    (uint8_t *)vnetp->curr_macaddr, &vgenmacp);
+	if (status != DDI_SUCCESS) {
+		DERR((vnetp, "vgen_init() failed\n"));
+		goto vnet_attach_fail;
+	}
+	attach_state |= AST_vgen_init;
+
+	vp_tlp = kmem_zalloc(sizeof (vp_tl_t), KM_SLEEP);
+	vp_tlp->macp = vgenmacp;
+	(void) snprintf(vp_tlp->name, MAXNAMELEN, "%s%u", "vgen", instance);
+	(void) strcpy(vnetp->vgen_name, vp_tlp->name);
+
+	/* add generic transport to the list of vnet proxy transports */
+	vnet_add_vptl(vnetp, vp_tlp);
+	attach_state |= AST_vptl_alloc;
+
+	/* clamp the tunable hash size to the supported range */
+	nfdbh = vnet_nfdb_hash;
+	if ((nfdbh < VNET_NFDB_HASH) || (nfdbh > VNET_NFDB_HASH_MAX)) {
+		vnetp->nfdb_hash = VNET_NFDB_HASH;
+	}
+	else
+		vnetp->nfdb_hash = nfdbh;
+
+	/* allocate fdb hash table, with an extra slot for default route */
+	vnetp->fdbhp = kmem_zalloc(sizeof (fdb_fanout_t) *
+	    (vnetp->nfdb_hash + 1), KM_SLEEP);
+	attach_state |= AST_fdbh_alloc;
+
+	/* register with MAC layer */
+	status = vnet_mac_register(vnetp);
+	if (status != DDI_SUCCESS) {
+		goto vnet_attach_fail;
+	}
+
+	/* add to the list of vnet devices */
+	WRITE_ENTER(&vnet_rw);
+	vnetp->nextp = vnet_headp;
+	vnet_headp = vnetp;
+	RW_EXIT(&vnet_rw);
+
+	DBG1((NULL, "vnetattach: instance(%d) exit\n", instance));
+	return (DDI_SUCCESS);
+
+vnet_attach_fail:
+	/* unwind, in reverse order, exactly the steps that completed */
+	if (attach_state & AST_fdbh_alloc) {
+		kmem_free(vnetp->fdbhp,
+		    sizeof (fdb_fanout_t) * (vnetp->nfdb_hash + 1));
+	}
+	if (attach_state & AST_vptl_alloc) {
+		/* vnet_del_vptl() expects the caller to hold trwlock */
+		WRITE_ENTER(&vnetp->trwlock);
+		vnet_del_vptl(vnetp, vp_tlp);
+		RW_EXIT(&vnetp->trwlock);
+	}
+	if (attach_state & AST_vgen_init) {
+		vgen_uninit(vgenmacp->m_driver);
+	}
+	if (attach_state & AST_mac_alloc) {
+		KMEM_FREE(macp);
+	}
+	if (attach_state & AST_vnet_alloc) {
+		KMEM_FREE(vnetp);
+	}
+	return (DDI_FAILURE);
+}
+
+/*
+ * detach(9E): detach a device from the system.
+ *
+ * Unregisters from the MAC layer (which fails if DLPI style-2 streams are
+ * still open), unlinks the instance from the global vnet list, uninits and
+ * frees every proxy transport (calling vgen_uninit() for the generic one),
+ * and frees the instance structures.  Only DDI_DETACH is supported;
+ * DDI_SUSPEND/DDI_PM_SUSPEND fail.
+ */
+static int
+vnetdetach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	vnet_t		*vnetp;
+	vnet_t		**vnetpp;
+	vp_tl_t		*vp_tlp;
+	int		instance;
+
+	instance = ddi_get_instance(dip);
+	DBG1((NULL, "vnetdetach: instance(%d) enter\n", instance));
+
+	vnetp = ddi_get_driver_private(dip);
+	if (vnetp == NULL) {
+		goto vnet_detach_fail;
+	}
+
+	switch (cmd) {
+	case DDI_DETACH:
+		break;
+	case DDI_SUSPEND:
+	case DDI_PM_SUSPEND:
+	default:
+		goto vnet_detach_fail;
+	}
+
+	/*
+	 * Unregister from the MAC subsystem.  This can fail, in
+	 * particular if there are DLPI style-2 streams still open -
+	 * in which case we just return failure.
+	 */
+	if (mac_unregister(vnetp->macp) != 0)
+		goto vnet_detach_fail;
+
+	/* unlink from instance(vnet_t) list */
+	WRITE_ENTER(&vnet_rw);
+	for (vnetpp = &vnet_headp; *vnetpp; vnetpp = &(*vnetpp)->nextp) {
+		if (*vnetpp == vnetp) {
+			*vnetpp = vnetp->nextp;
+			break;
+		}
+	}
+	RW_EXIT(&vnet_rw);
+
+	/* uninit and free vnet proxy transports */
+	WRITE_ENTER(&vnetp->trwlock);
+	while ((vp_tlp = vnetp->tlp) != NULL) {
+		if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) {
+			/* uninitialize generic transport */
+			vgen_uninit(vp_tlp->macp->m_driver);
+		}
+		/* vnet_del_vptl() unlinks and frees the entry */
+		vnet_del_vptl(vnetp, vp_tlp);
+	}
+	RW_EXIT(&vnetp->trwlock);
+
+	KMEM_FREE(vnetp->macp);
+	KMEM_FREE(vnetp);
+
+	return (DDI_SUCCESS);
+
+vnet_detach_fail:
+	return (DDI_FAILURE);
+}
+
+/*
+ * enable the device for transmit/receive (MAC m_start entry point).
+ *
+ * Starts every registered proxy transport; always returns VNET_SUCCESS.
+ */
+static int
+vnet_m_start(void *arg)
+{
+	vnet_t		*vnetp = arg;
+	vp_tl_t		*vp_tlp;
+	mac_t		*vp_macp;
+
+	DBG1((vnetp, "vnet_m_start: enter\n"));
+
+	/*
+	 * XXX
+	 * Currently, we only have generic transport. m_start() invokes
+	 * vgen_start() which enables ports/channels in vgen and
+	 * initiates handshake with peer vnets and vsw. In the future when we
+	 * have support for hardware specific transports, this information
+	 * needs to be propagated back to vnet from vgen and we need to
+	 * revisit this code (see comments in vnet_attach()).
+	 *
+	 */
+	WRITE_ENTER(&vnetp->trwlock);
+	for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) {
+		vp_macp = vp_tlp->macp;
+		vp_macp->m_start(vp_macp->m_driver);
+	}
+	RW_EXIT(&vnetp->trwlock);
+
+	DBG1((vnetp, "vnet_m_start: exit\n"));
+	return (VNET_SUCCESS);
+
+}
+
+/*
+ * stop transmit/receive for the device (MAC m_stop entry point).
+ *
+ * Stops every registered proxy transport.
+ */
+static void
+vnet_m_stop(void *arg)
+{
+	vnet_t		*vnetp = arg;
+	vp_tl_t		*vp_tlp;
+	mac_t		*vp_macp;
+
+	DBG1((vnetp, "vnet_m_stop: enter\n"));
+
+	WRITE_ENTER(&vnetp->trwlock);
+	for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) {
+		vp_macp = vp_tlp->macp;
+		vp_macp->m_stop(vp_macp->m_driver);
+	}
+	RW_EXIT(&vnetp->trwlock);
+
+	DBG1((vnetp, "vnet_m_stop: exit\n"));
+}
+
+/*
+ * set the unicast mac address of the device (MAC m_unicst entry point).
+ *
+ * Dynamic MAC address changes are not supported: the request is ignored
+ * (the copy is compiled out below) and VNET_SUCCESS is returned.
+ */
+static int
+vnet_m_unicst(void *arg, const uint8_t *macaddr)
+{
+	_NOTE(ARGUNUSED(macaddr))
+
+	vnet_t *vnetp = arg;
+
+	DBG1((vnetp, "vnet_m_unicst: enter\n"));
+	/*
+	 * XXX: setting mac address dynamically is not supported.
+	 */
+#if 0
+	bcopy(macaddr, vnetp->curr_macaddr, ETHERADDRL);
+#endif
+	DBG1((vnetp, "vnet_m_unicst: exit\n"));
+
+	return (VNET_SUCCESS);
+}
+
+/*
+ * enable/disable a multicast address (MAC m_multicst entry point).
+ *
+ * Forwards the request to the generic transport only (matched by name);
+ * other transports are skipped.  Returns the transport's result, or
+ * VNET_SUCCESS if the generic transport is not found.
+ */
+static int
+vnet_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
+{
+	_NOTE(ARGUNUSED(add, mca))
+
+	vnet_t *vnetp = arg;
+	vp_tl_t		*vp_tlp;
+	mac_t		*vp_macp;
+	int rv = VNET_SUCCESS;
+
+	DBG1((vnetp, "vnet_m_multicst: enter\n"));
+	READ_ENTER(&vnetp->trwlock);
+	for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) {
+		if (strcmp(vnetp->vgen_name, vp_tlp->name) == 0) {
+			vp_macp = vp_tlp->macp;
+			rv = vp_macp->m_multicst(vp_macp->m_driver, add, mca);
+			break;
+		}
+	}
+	RW_EXIT(&vnetp->trwlock);
+	DBG1((vnetp, "vnet_m_multicst: exit\n"));
+	return (rv);
+}
+
+/*
+ * set or clear promiscuous mode on the device (MAC m_promisc entry point).
+ *
+ * Promiscuous mode is not supported; the request is a no-op that always
+ * returns VNET_SUCCESS.
+ */
+static int
+vnet_m_promisc(void *arg, boolean_t on)
+{
+	_NOTE(ARGUNUSED(on))
+
+	vnet_t *vnetp = arg;
+	DBG1((vnetp, "vnet_m_promisc: enter\n"));
+	/*
+	 * XXX: setting promiscuous mode is not supported, just return success.
+	 */
+	DBG1((vnetp, "vnet_m_promisc: exit\n"));
+	return (VNET_SUCCESS);
+}
+
+/*
+ * Transmit a chain of packets. This function provides switching functionality
+ * based on the destination mac address to reach other guests (within ldoms) or
+ * external hosts.
+ *
+ * For each packet: if the destination MAC is in the forwarding database it
+ * is sent via that entry's m_tx; otherwise (broadcast/multicast/unknown
+ * unicast) it goes to the default route (last fdb slot, i.e. vsw), or is
+ * dropped if no default route exists.  If a transport's m_tx cannot take
+ * the packet, transmission stops and the remaining chain (with the failed
+ * packet relinked at its head) is returned to the MAC layer for requeueing;
+ * NULL is returned when the whole chain was consumed.
+ */
+mblk_t *
+vnet_m_tx(void *arg, mblk_t *mp)
+{
+	vnet_t *vnetp;
+	mblk_t *next;
+	uint32_t fdbhash;
+	fdb_t *fdbp;
+	fdb_fanout_t *fdbhp;
+	struct ether_header *ehp;
+	uint8_t *macaddr;
+	mblk_t *resid_mp;
+
+	vnetp = (vnet_t *)arg;
+	DBG1((vnetp, "vnet_m_tx: enter\n"));
+	ASSERT(mp != NULL);
+
+	while (mp != NULL) {
+		next = mp->b_next;
+		mp->b_next = NULL;
+
+		/* get the destination mac address in the eth header */
+		ehp = (struct ether_header *)mp->b_rptr;
+		macaddr = (uint8_t *)&ehp->ether_dhost;
+
+		/* Calculate hash value and fdb fanout */
+		fdbhash = MACHASH(macaddr, vnetp->nfdb_hash);
+		fdbhp = &(vnetp->fdbhp[fdbhash]);
+
+		READ_ENTER(&fdbhp->rwlock);
+		fdbp = vnet_lookup_fdb(fdbhp, macaddr);
+		if (fdbp) {
+			/*
+			 * If the destination is in FDB, the destination is
+			 * a vnet device within ldoms and directly reachable,
+			 * invoke the tx function in the fdb entry.
+			 */
+			resid_mp = fdbp->m_tx(fdbp->txarg, mp);
+			if (resid_mp != NULL) {
+				/* m_tx failed; relink and return the rest */
+				mp->b_next = next;
+				RW_EXIT(&fdbhp->rwlock);
+				break;
+			}
+			RW_EXIT(&fdbhp->rwlock);
+		} else {
+			/* destination is not in FDB */
+			RW_EXIT(&fdbhp->rwlock);
+			/*
+			 * If the destination is broadcast/multicast
+			 * or an unknown unicast address, forward the
+			 * packet to vsw, using the last slot in fdb which is
+			 * reserved for default route.
+			 */
+			fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]);
+			READ_ENTER(&fdbhp->rwlock);
+			fdbp = fdbhp->headp;
+			if (fdbp) {
+				resid_mp = fdbp->m_tx(fdbp->txarg, mp);
+				if (resid_mp != NULL) {
+					/* m_tx failed; return the rest */
+					mp->b_next = next;
+					RW_EXIT(&fdbhp->rwlock);
+					break;
+				}
+			} else {
+				/* no default route: drop the packet */
+				freemsg(mp);
+			}
+			RW_EXIT(&fdbhp->rwlock);
+		}
+
+		mp = next;
+	}
+
+	DBG1((vnetp, "vnet_m_tx: exit\n"));
+	return (mp);
+}
+
+/*
+ * register resources with mac layer (MAC m_resources entry point).
+ *
+ * Propagates the request to every registered proxy transport.
+ */
+static void
+vnet_m_resources(void *arg)
+{
+	vnet_t *vnetp = arg;
+	vp_tl_t	*vp_tlp;
+	mac_t	*vp_macp;
+
+	DBG1((vnetp, "vnet_m_resources: enter\n"));
+
+	WRITE_ENTER(&vnetp->trwlock);
+	for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) {
+		vp_macp = vp_tlp->macp;
+		vp_macp->m_resources(vp_macp->m_driver);
+	}
+	RW_EXIT(&vnetp->trwlock);
+
+	DBG1((vnetp, "vnet_m_resources: exit\n"));
+}
+
+/*
+ * vnet specific ioctls (MAC m_ioctl entry point).
+ *
+ * No driver-private ioctls are currently implemented; every command is
+ * rejected with a negative M_IOCNAK (EINVAL) reply.
+ */
+static void
+vnet_m_ioctl(void *arg, queue_t *wq, mblk_t *mp)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	struct iocblk *iocp;
+	int cmd;
+
+	DBG1((vnetp, "vnet_m_ioctl: enter\n"));
+
+	iocp = (struct iocblk *)mp->b_rptr;
+	iocp->ioc_error = 0;
+	cmd = iocp->ioc_cmd;
+	switch (cmd) {
+	default:
+		/* unrecognized command: negative-acknowledge the message */
+		miocnak(wq, mp, 0, EINVAL);
+		break;
+	}
+	DBG1((vnetp, "vnet_m_ioctl: exit\n"));
+}
+
+/*
+ * get statistics from the device (MAC m_stat entry point).
+ *
+ * Returns the sum of the requested statistic across all registered
+ * proxy transports.
+ */
+uint64_t
+vnet_m_stat(void *arg, enum mac_stat stat)
+{
+	vnet_t *vnetp = arg;
+	vp_tl_t	*vp_tlp;
+	mac_t	*vp_macp;
+	uint64_t val = 0;
+
+	DBG1((vnetp, "vnet_m_stat: enter\n"));
+
+	/*
+	 * get the specified statistic from each transport
+	 * and return the aggregate val
+	 */
+	READ_ENTER(&vnetp->trwlock);
+	for (vp_tlp = vnetp->tlp; vp_tlp != NULL; vp_tlp = vp_tlp->nextp) {
+		vp_macp = vp_tlp->macp;
+		val += vp_macp->m_stat(vp_macp->m_driver, stat);
+	}
+	RW_EXIT(&vnetp->trwlock);
+
+	DBG1((vnetp, "vnet_m_stat: exit\n"));
+	return (val);
+}
+
+/*
+ * wrapper function for mac_register().
+ *
+ * Fills in the mac_info_t (Ethernet media, MTU, address lengths,
+ * broadcast/unicast addresses, supported statistics) and the MAC entry
+ * points on vnetp->macp, then registers with the MAC layer.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.  On failure, macp is *not* freed
+ * here: vnetattach() owns the allocation and frees it on its own failure
+ * path (AST_mac_alloc), so freeing it here as well would be a double free.
+ */
+static int
+vnet_mac_register(vnet_t *vnetp)
+{
+	mac_info_t *mip;
+	mac_t *macp;
+
+	macp = vnetp->macp;
+
+	mip = &(macp->m_info);
+	mip->mi_media = DL_ETHER;
+	mip->mi_sdu_min = 0;
+	mip->mi_sdu_max = ETHERMTU;
+	mip->mi_cksum = 0;
+	mip->mi_poll = 0; /* DL_CAPAB_POLL ? */
+	mip->mi_addr_length = ETHERADDRL;
+	bcopy(&etherbroadcastaddr, mip->mi_brdcst_addr, ETHERADDRL);
+	bcopy(vnetp->curr_macaddr, mip->mi_unicst_addr, ETHERADDRL);
+
+	MAC_STAT_MIB(mip->mi_stat);
+	mip->mi_stat[MAC_STAT_UNKNOWNS] = B_FALSE;
+	MAC_STAT_ETHER(mip->mi_stat);
+	mip->mi_stat[MAC_STAT_SQE_ERRORS] = B_FALSE;
+	mip->mi_stat[MAC_STAT_MACRCV_ERRORS] = B_FALSE;
+
+	macp->m_stat = vnet_m_stat;
+	macp->m_start = vnet_m_start;
+	macp->m_stop = vnet_m_stop;
+	macp->m_promisc = vnet_m_promisc;
+	macp->m_multicst = vnet_m_multicst;
+	macp->m_unicst = vnet_m_unicst;
+	macp->m_resources = vnet_m_resources;
+	macp->m_ioctl = vnet_m_ioctl;
+	macp->m_tx = vnet_m_tx;
+
+	macp->m_dip = vnetp->dip;
+	macp->m_ident = MAC_IDENT;
+
+	/*
+	 * Finally, we're ready to register ourselves with the MAC layer
+	 * interface; if this succeeds, we're all ready to start()
+	 */
+	if (mac_register(macp) != 0) {
+		/* cleanup of macp is left to the caller (see above) */
+		return (DDI_FAILURE);
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * add vp_tl to the list.
+ *
+ * Appends vp_tlp to the tail of the instance's transport list.  Unlike
+ * vnet_del_vptl(), this function acquires vnetp->trwlock itself.
+ */
+static void
+vnet_add_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp)
+{
+	vp_tl_t *ttlp;
+
+	WRITE_ENTER(&vnetp->trwlock);
+	if (vnetp->tlp == NULL) {
+		vnetp->tlp = vp_tlp;
+	} else {
+		/* walk to the tail and link the new entry there */
+		ttlp = vnetp->tlp;
+		while (ttlp->nextp)
+			ttlp = ttlp->nextp;
+		ttlp->nextp = vp_tlp;
+	}
+	RW_EXIT(&vnetp->trwlock);
+}
+
+/*
+ * remove vp_tl from the list.
+ *
+ * Unlinks vp_tlp from the instance's transport list and frees it if it
+ * was found.  Caller must hold vnetp->trwlock as writer (see vnetattach's
+ * failure path and vnetdetach).
+ */
+static void
+vnet_del_vptl(vnet_t *vnetp, vp_tl_t *vp_tlp)
+{
+	vp_tl_t *ttlp, **pretlp;
+	boolean_t found = B_FALSE;
+
+	pretlp = &vnetp->tlp;
+	ttlp = *pretlp;
+	while (ttlp) {
+		if (ttlp == vp_tlp) {
+			found = B_TRUE;
+			(*pretlp) = ttlp->nextp;
+			ttlp->nextp = NULL;
+			break;
+		}
+		pretlp = &(ttlp->nextp);
+		ttlp = *pretlp;
+	}
+
+	if (found) {
+		KMEM_FREE(vp_tlp);
+	}
+}
+
+/*
+ * get vp_tl corresponding to the given name.
+ *
+ * Linear search of the instance's transport list; returns the entry or
+ * NULL (with a warning) if no transport has that name.
+ * NOTE(review): no lock is taken here - presumably the caller holds
+ * vnetp->trwlock; confirm at the call sites.
+ */
+static vp_tl_t *
+vnet_get_vptl(vnet_t *vnetp, const char *name)
+{
+	vp_tl_t *tlp;
+
+	tlp = vnetp->tlp;
+	while (tlp) {
+		if (strcmp(tlp->name, name) == 0) {
+			return (tlp);
+		}
+		tlp = tlp->nextp;
+	}
+	DWARN((vnetp,
+	    "vnet_get_vptl: can't find vp_tl with name (%s)\n", name));
+	return (NULL);
+}
+
+/*
+ * read the mac address of the device.
+ *
+ * Looks up the "local-mac-address" byte-array property on the device node
+ * and, if it is exactly ETHERADDRL bytes, copies it into both
+ * vnetp->vendor_addr and vnetp->curr_macaddr.
+ * Returns DDI_SUCCESS or DDI_FAILURE.
+ */
+static int
+vnet_read_mac_address(vnet_t *vnetp)
+{
+	uchar_t 	*macaddr;
+	uint32_t 	size;
+	int 		rv;
+
+	rv = ddi_prop_lookup_byte_array(DDI_DEV_T_ANY, vnetp->dip,
+		DDI_PROP_DONTPASS, macaddr_propname, &macaddr, &size);
+	if ((rv != DDI_PROP_SUCCESS) || (size != ETHERADDRL)) {
+		DWARN((vnetp,
+		"vnet_read_mac_address: prop_lookup failed (%s) err (%d)\n",
+		macaddr_propname, rv));
+		return (DDI_FAILURE);
+	}
+	bcopy(macaddr, (caddr_t)vnetp->vendor_addr, ETHERADDRL);
+	bcopy(macaddr, (caddr_t)vnetp->curr_macaddr, ETHERADDRL);
+	/* release the property buffer allocated by the lookup */
+	ddi_prop_free(macaddr);
+
+	return (DDI_SUCCESS);
+}
+
+
+/*
+ * Functions below are called only by generic transport to add/remove/modify
+ * entries in forwarding database. See comments in vgen_port_init(vnet_gen.c).
+ */
+
+/*
+ * add an entry into the forwarding database.
+ *
+ * Hashes macaddr into its fanout list and prepends a new entry carrying
+ * the (m_tx, txarg) pair.  The KM_NOSLEEP allocation may fail, in which
+ * case the add is silently dropped.  Duplicate addresses are not checked
+ * for here.
+ */
+void
+vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	uint32_t fdbhash;
+	fdb_t *fdbp;
+	fdb_fanout_t *fdbhp;
+
+	/* Calculate hash value and fdb fanout */
+	fdbhash = MACHASH(macaddr, vnetp->nfdb_hash);
+	fdbhp = &(vnetp->fdbhp[fdbhash]);
+
+	WRITE_ENTER(&fdbhp->rwlock);
+
+	fdbp = kmem_zalloc(sizeof (fdb_t), KM_NOSLEEP);
+	if (fdbp == NULL) {
+		/* allocation failed; drop the add silently */
+		RW_EXIT(&fdbhp->rwlock);
+		return;
+	}
+	bcopy(macaddr, (caddr_t)fdbp->macaddr, ETHERADDRL);
+	fdbp->m_tx = m_tx;
+	fdbp->txarg = txarg;
+	/* prepend to the fanout list */
+	fdbp->nextp = fdbhp->headp;
+	fdbhp->headp = fdbp;
+
+	RW_EXIT(&fdbhp->rwlock);
+}
+
+/*
+ * delete an entry from the forwarding database.
+ *
+ * Finds the first entry in macaddr's fanout list with a matching address,
+ * unlinks it and frees it.  A miss is a silent no-op.
+ */
+void
+vnet_del_fdb(void *arg, uint8_t *macaddr)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	uint32_t fdbhash;
+	fdb_t *fdbp;
+	fdb_t **pfdbp;
+	fdb_fanout_t *fdbhp;
+
+	/* Calculate hash value and fdb fanout */
+	fdbhash = MACHASH(macaddr, vnetp->nfdb_hash);
+	fdbhp = &(vnetp->fdbhp[fdbhash]);
+
+	WRITE_ENTER(&fdbhp->rwlock);
+
+	for (pfdbp = &fdbhp->headp; (fdbp  = *pfdbp) != NULL;
+	    pfdbp = &fdbp->nextp) {
+		if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) {
+			/* Unlink it from the list */
+			*pfdbp = fdbp->nextp;
+			KMEM_FREE(fdbp);
+			break;
+		}
+	}
+
+	RW_EXIT(&fdbhp->rwlock);
+}
+
+/*
+ * modify an existing entry in the forwarding database.
+ *
+ * Finds the first entry with a matching address and replaces its
+ * (m_tx, txarg) transmit parameters in place.  A miss is a silent no-op.
+ */
+void
+vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	uint32_t fdbhash;
+	fdb_t *fdbp;
+	fdb_fanout_t *fdbhp;
+
+	/* Calculate hash value and fdb fanout */
+	fdbhash = MACHASH(macaddr, vnetp->nfdb_hash);
+	fdbhp = &(vnetp->fdbhp[fdbhash]);
+
+	WRITE_ENTER(&fdbhp->rwlock);
+
+	for (fdbp = fdbhp->headp; fdbp != NULL; fdbp = fdbp->nextp) {
+		if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) {
+			/* change the entry to have new tx params */
+			fdbp->m_tx = m_tx;
+			fdbp->txarg = txarg;
+			break;
+		}
+	}
+
+	RW_EXIT(&fdbhp->rwlock);
+}
+
+/*
+ * look up an fdb entry based on the mac address, caller holds lock.
+ *
+ * Returns the first entry in the fanout list whose address matches, or
+ * NULL if none does.  The caller must hold fdbhp->rwlock (at least as
+ * reader) across the call and while it uses the returned entry.
+ */
+static fdb_t *
+vnet_lookup_fdb(fdb_fanout_t *fdbhp, uint8_t *macaddr)
+{
+	fdb_t *fdbp = NULL;
+
+	for (fdbp = fdbhp->headp; fdbp != NULL; fdbp = fdbp->nextp) {
+		if (bcmp(fdbp->macaddr, macaddr, ETHERADDRL) == 0) {
+			break;
+		}
+	}
+
+	return (fdbp);
+}
+
+/*
+ * add default route entry into the forwarding database.
+ *
+ * Installs (m_tx, txarg) as the single entry in the reserved last fdb
+ * fanout slot (index nfdb_hash).  If a default route already exists, or
+ * if the KM_NOSLEEP allocation fails, the request is dropped (with a
+ * warning in the former case).  The entry's MAC address is all zeros.
+ */
+void
+vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	fdb_t *fdbp;
+	fdb_fanout_t *fdbhp;
+
+	/*
+	 * The last hash list is reserved for default route entry,
+	 * and for now, we have only one entry in this list.
+	 */
+	fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]);
+
+	WRITE_ENTER(&fdbhp->rwlock);
+
+	if (fdbhp->headp) {
+		DWARN((vnetp,
+		    "vnet_add_def_rte: default rte already exists\n"));
+		RW_EXIT(&fdbhp->rwlock);
+		return;
+	}
+	fdbp = kmem_zalloc(sizeof (fdb_t), KM_NOSLEEP);
+	if (fdbp == NULL) {
+		/* allocation failed; drop the add silently */
+		RW_EXIT(&fdbhp->rwlock);
+		return;
+	}
+	bzero(fdbp->macaddr, ETHERADDRL);
+	fdbp->m_tx = m_tx;
+	fdbp->txarg = txarg;
+	fdbp->nextp = NULL;
+	fdbhp->headp = fdbp;
+
+	RW_EXIT(&fdbhp->rwlock);
+}
+
+/*
+ * delete default route entry from the forwarding database.
+ *
+ * Removes and frees the single entry in the reserved last fdb fanout
+ * slot; a no-op if no default route is installed.
+ */
+void
+vnet_del_def_rte(void *arg)
+{
+	vnet_t *vnetp = (vnet_t *)arg;
+	fdb_t *fdbp;
+	fdb_fanout_t *fdbhp;
+
+	/*
+	 * The last hash list is reserved for default route entry,
+	 * and for now, we have only one entry in this list.
+	 */
+	fdbhp = &(vnetp->fdbhp[vnetp->nfdb_hash]);
+
+	WRITE_ENTER(&fdbhp->rwlock);
+
+	if (fdbhp->headp == NULL) {
+		RW_EXIT(&fdbhp->rwlock);
+		return;
+	}
+	fdbp = fdbhp->headp;
+	KMEM_FREE(fdbp);
+	fdbhp->headp = NULL;
+
+	RW_EXIT(&fdbhp->rwlock);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vnet_gen.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,4899 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/param.h>
+#include <sys/stream.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/devops.h>
+#include <sys/ksynch.h>
+#include <sys/stat.h>
+#include <sys/modctl.h>
+#include <sys/debug.h>
+#include <sys/ethernet.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/strsun.h>
+#include <sys/note.h>
+#include <sys/mac.h>
+#include <sys/ldc.h>
+#include <sys/mach_descrip.h>
+#include <sys/mdeg.h>
+#include <sys/vio_mailbox.h>
+#include <sys/vio_common.h>
+#include <sys/vnet_common.h>
+#include <sys/vnet_gen.h>
+#include <sys/vnet_mailbox.h>
+
+/*
+ * Implementation of the mac functionality for vnet using the
+ * generic (default) transport layer of sun4v Logical Domain Channels (LDC).
+ */
+
+/*
+ * Function prototypes.
+ */
+/* vgen proxy entry points */
+int vgen_init(void *vnetp, dev_info_t *vnetdip, void *vnetmacp,
+	const uint8_t *macaddr, mac_t **vgenmacp);
+void vgen_uninit(void *arg);
+static int vgen_start(void *arg);
+static void vgen_stop(void *arg);
+static mblk_t *vgen_tx(void *arg, mblk_t *mp);
+static void vgen_resources(void *arg);
+static int vgen_multicst(void *arg, boolean_t add,
+	const uint8_t *mca);
+static int vgen_promisc(void *arg, boolean_t on);
+static int vgen_unicst(void *arg, const uint8_t *mca);
+static uint64_t vgen_stat(void *arg, enum mac_stat stat);
+static void vgen_ioctl(void *arg, queue_t *wq, mblk_t *mp);
+
+/* externs - functions provided by vnet to add/remove/modify entries in fdb */
+void vnet_add_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg);
+void vnet_del_fdb(void *arg, uint8_t *macaddr);
+void vnet_modify_fdb(void *arg, uint8_t *macaddr, mac_tx_t m_tx, void *txarg);
+void vnet_add_def_rte(void *arg, mac_tx_t m_tx, void *txarg);
+void vnet_del_def_rte(void *arg);
+
+/* vgen internal functions */
+static void vgen_detach_ports(vgen_t *vgenp);
+static void vgen_port_detach(vgen_port_t *portp);
+static void vgen_port_list_insert(vgen_port_t *portp);
+static void vgen_port_list_remove(vgen_port_t *portp);
+static vgen_port_t *vgen_port_lookup(vgen_portlist_t *plistp,
+	int port_num);
+static int vgen_mdeg_reg(vgen_t *vgenp);
+static void vgen_mdeg_unreg(vgen_t *vgenp);
+static int vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp);
+static int vgen_add_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex);
+static int vgen_remove_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex);
+static int vgen_port_attach_mdeg(vgen_t *vgenp, int port_num, uint64_t *ldcids,
+	int num_ids, struct ether_addr *macaddr, boolean_t vsw_port);
+static void vgen_port_detach_mdeg(vgen_port_t *portp);
+static int vgen_update_port(vgen_t *vgenp, md_t *curr_mdp,
+	mde_cookie_t curr_mdex, md_t *prev_mdp, mde_cookie_t prev_mdex);
+static uint64_t	vgen_port_stat(vgen_port_t *portp, enum mac_stat stat);
+
+static int vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id);
+static void vgen_ldc_detach(vgen_ldc_t *ldcp);
+static int vgen_alloc_tx_ring(vgen_ldc_t *ldcp);
+static void vgen_free_tx_ring(vgen_ldc_t *ldcp);
+static void vgen_init_ports(vgen_t *vgenp);
+static void vgen_port_init(vgen_port_t *portp);
+static void vgen_uninit_ports(vgen_t *vgenp);
+static void vgen_port_uninit(vgen_port_t *portp);
+static void vgen_init_ldcs(vgen_port_t *portp);
+static void vgen_uninit_ldcs(vgen_port_t *portp);
+static int vgen_ldc_init(vgen_ldc_t *ldcp);
+static void vgen_ldc_uninit(vgen_ldc_t *ldcp);
+static int vgen_init_tbufs(vgen_ldc_t *ldcp);
+static void vgen_uninit_tbufs(vgen_ldc_t *ldcp);
+static void vgen_clobber_tbufs(vgen_ldc_t *ldcp);
+static void vgen_clobber_rxds(vgen_ldc_t *ldcp);
+static uint64_t	vgen_ldc_stat(vgen_ldc_t *ldcp, enum mac_stat stat);
+static void vgen_init_macp(vgen_t *vgenp, mac_t *macp);
+static uint_t vgen_ldc_cb(uint64_t event, caddr_t arg);
+static int vgen_portsend(vgen_port_t *portp, mblk_t *mp);
+static int vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp);
+static void vgen_reclaim(vgen_ldc_t *ldcp);
+static void vgen_reclaim_dring(vgen_ldc_t *ldcp);
+static int vgen_num_txpending(vgen_ldc_t *ldcp);
+static int vgen_tx_dring_full(vgen_ldc_t *ldcp);
+static int vgen_ldc_txtimeout(vgen_ldc_t *ldcp);
+static void vgen_ldc_watchdog(void *arg);
+static void vgen_copymsg(mblk_t *mp, void *bufp);
+static int vgen_setup_kstats(vgen_ldc_t *ldcp);
+static void vgen_destroy_kstats(vgen_ldc_t *ldcp);
+static int vgen_kstat_update(kstat_t *ksp, int rw);
+
+/* vgen handshake functions */
+static vgen_ldc_t *vh_nextphase(vgen_ldc_t *ldcp);
+static int vgen_supported_version(vgen_ldc_t *ldcp, uint16_t ver_major,
+	uint16_t ver_minor);
+static int vgen_next_version(vgen_ldc_t *ldcp, vgen_ver_t *verp);
+static int vgen_sendmsg(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen,
+	boolean_t caller_holds_lock);
+static int vgen_send_version_negotiate(vgen_ldc_t *ldcp);
+static int vgen_send_attr_info(vgen_ldc_t *ldcp);
+static int vgen_send_dring_reg(vgen_ldc_t *ldcp);
+static int vgen_send_rdx_info(vgen_ldc_t *ldcp);
+static int vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start,
+	uint32_t end, uint64_t next_txseq);
+static int vgen_send_mcast_info(vgen_ldc_t *ldcp);
+static int vgen_handshake_phase2(vgen_ldc_t *ldcp);
+static void vgen_handshake_reset(vgen_ldc_t *ldcp);
+static void vgen_reset_hphase(vgen_ldc_t *ldcp);
+static void vgen_handshake(vgen_ldc_t *ldcp);
+static int vgen_handshake_done(vgen_ldc_t *ldcp);
+static void vgen_handshake_retry(vgen_ldc_t *ldcp);
+static void vgen_handle_version_negotiate(vgen_ldc_t *ldcp,
+	vio_msg_tag_t *tagp);
+static void vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static void vgen_handle_dring_reg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static void vgen_handle_rdx_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static void vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static void vgen_handle_ctrlmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static void vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
+	mblk_t **headp, mblk_t **tailp);
+static void vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
+	mblk_t **headp, mblk_t **tailp);
+static void vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static int vgen_check_sid(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp);
+static uint64_t	vgen_macaddr_strtoul(const uint8_t *macaddr);
+static int vgen_macaddr_ultostr(uint64_t value, uint8_t *macaddr);
+static caddr_t vgen_print_ethaddr(uint8_t *a, char *ebuf);
+static void vgen_hwatchdog(void *arg);
+static void vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint);
+static void vgen_print_hparams(vgen_hparams_t *hp);
+static void vgen_print_ldcinfo(vgen_ldc_t *ldcp);
+
+/*
+ * The handshake process consists of 5 phases defined below, with VH_PHASE0
+ * being the pre-handshake phase and VH_DONE is the phase to indicate
+ * successful completion of all phases.
+ * Each phase may have one to several handshake states which are required
+ * to complete successfully to move to the next phase.
+ * Refer to the functions vgen_handshake() and vgen_handshake_done() for
+ * more details.
+ */
+/* handshake phases */
+enum {	VH_PHASE0, VH_PHASE1, VH_PHASE2, VH_PHASE3, VH_DONE = 0x80 };
+
+/* handshake states */
+enum {
+
+	VER_INFO_SENT	=	0x1,
+	VER_ACK_RCVD	=	0x2,
+	VER_INFO_RCVD	=	0x4,
+	VER_ACK_SENT	=	0x8,
+	VER_NEGOTIATED	=	(VER_ACK_RCVD | VER_ACK_SENT),
+
+	ATTR_INFO_SENT	=	0x10,
+	ATTR_ACK_RCVD	=	0x20,
+	ATTR_INFO_RCVD	=	0x40,
+	ATTR_ACK_SENT	=	0x80,
+	ATTR_INFO_EXCHANGED	=	(ATTR_ACK_RCVD | ATTR_ACK_SENT),
+
+	DRING_INFO_SENT	=	0x100,
+	DRING_ACK_RCVD	=	0x200,
+	DRING_INFO_RCVD	=	0x400,
+	DRING_ACK_SENT	=	0x800,
+	DRING_INFO_EXCHANGED	=	(DRING_ACK_RCVD | DRING_ACK_SENT),
+
+	RDX_INFO_SENT	=	0x1000,
+	RDX_ACK_RCVD	=	0x2000,
+	RDX_INFO_RCVD	=	0x4000,
+	RDX_ACK_SENT	=	0x8000,
+	RDX_EXCHANGED	=	(RDX_ACK_RCVD | RDX_ACK_SENT)
+
+};
+
+#define	LDC_LOCK(ldcp)	\
+				mutex_enter(&((ldcp)->cblock));\
+				mutex_enter(&((ldcp)->txlock));\
+				mutex_enter(&((ldcp)->tclock));
+#define	LDC_UNLOCK(ldcp)	\
+				mutex_exit(&((ldcp)->tclock));\
+				mutex_exit(&((ldcp)->txlock));\
+				mutex_exit(&((ldcp)->cblock));
+
+static struct ether_addr etherbroadcastaddr = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+/*
+ * MIB II broadcast/multicast packets
+ */
+#define	IS_BROADCAST(ehp) \
+		(ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+#define	IS_MULTICAST(ehp) \
+		((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
+
+/*
+ * Property names
+ */
+static char macaddr_propname[] = "mac-address";
+static char rmacaddr_propname[] = "remote-mac-address";
+static char channel_propname[] = "channel-endpoint";
+static char reg_propname[] = "reg";
+static char port_propname[] = "port";
+static char swport_propname[] = "switch-port";
+static char id_propname[] = "id";
+
+/* versions supported - in decreasing order */
+static vgen_ver_t vgen_versions[VGEN_NUM_VER] = { {1, 0} };
+
+/* Tunables */
+uint32_t vgen_hwd_interval = 1000;	/* handshake watchdog freq in msec */
+uint32_t vgen_max_hretries = 1;		/* max # of handshake retries */
+
+uint32_t vgen_ldcwr_retries = 10;	/* max # of ldc_write() retries */
+
+#ifdef DEBUG
+/* flags to simulate error conditions for debugging */
+int vgen_trigger_txtimeout = 0;
+int vgen_trigger_rxlost = 0;
+#endif
+
+/* MD update matching structure */
+static md_prop_match_t	vport_prop_match[] = {
+	{ MDET_PROP_VAL,	"id" },
+	{ MDET_LIST_END,	NULL }
+};
+
+static mdeg_node_match_t vport_match = { "virtual-device-port",
+					vport_prop_match };
+
+/* template for matching a particular vnet instance */
+static mdeg_prop_spec_t vgen_prop_template[] = {
+	{ MDET_PROP_STR,	"name",		"network" },
+	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
+	{ MDET_LIST_END,	NULL,		NULL }
+};
+
+#define	VGEN_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val)
+
+static int vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp);
+
+/* externs */
+extern uint32_t vnet_ntxds;
+extern uint32_t vnet_reclaim_lowat;
+extern uint32_t vnet_reclaim_hiwat;
+extern uint32_t vnet_ldcwd_interval;
+extern uint32_t vnet_ldcwd_txtimeout;
+extern uint32_t vnet_ldc_qlen;
+extern int _vnet_dbglevel;
+extern void _vnetdebug_printf(void *vnetp, const char *fmt, ...);
+
+#ifdef DEBUG
+
+/*
+ * XXX: definitions below need to be in sync with those in vnet.c
+ */
+
+/*
+ * debug levels:
+ * DBG_LEVEL1:	Function entry/exit tracing
+ * DBG_LEVEL2:	Info messages
+ * DBG_LEVEL3:	Warning messages
+ * DBG_LEVEL4:	Error messages
+ */
+
+enum	{ DBG_LEVEL1 = 0x01, DBG_LEVEL2 = 0x02, DBG_LEVEL3 = 0x04,
+	    DBG_LEVEL4 = 0x08 };
+
+#define	DBG1(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL1) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DBG2(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL2) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DWARN(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL3) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#define	DERR(_s)	do {						\
+			    if ((_vnet_dbglevel & DBG_LEVEL4) != 0) {	\
+					_vnetdebug_printf _s;		\
+			    }					\
+			_NOTE(CONSTCOND) } while (0)
+
+#else
+
+#define	DBG1(_s)	if (0)	_vnetdebug_printf _s
+#define	DBG2(_s)	if (0)	_vnetdebug_printf _s
+#define	DWARN(_s)	if (0)	_vnetdebug_printf _s
+#define	DERR(_s)	if (0)	_vnetdebug_printf _s
+
+#endif
+
+#ifdef DEBUG
+
+/* simulate handshake error conditions for debug */
+uint32_t vgen_hdbg;
+#define	HDBG_VERSION	0x1
+#define	HDBG_TIMEOUT	0x2
+#define	HDBG_BAD_SID	0x4
+#define	HDBG_OUT_STATE	0x8
+
+#if 0
+/* debug version negotiation, need to redefine VGEN_NUM_VER */
+vgen_ver_t dbg_vgen_versions[VGEN_NUM_VER] =
+	{ {5, 0}, {3, 0}, {2, 1}, {1, 2}, {1, 1} };
+#endif
+
+#endif
+
+/*
+ * vgen_init() is called by an instance of vnet driver to initialize the
+ * corresponding generic proxy transport layer. The arguments passed by vnet
+ * are - an opaque pointer to the vnet instance, pointers to dev_info_t and
+ * mac_t of the vnet device, mac address of the vnet device, and a pointer to
+ * the mac_t of the generic transport is returned in the last argument.
+ */
+int
+vgen_init(void *vnetp, dev_info_t *vnetdip, void *vnetmacp,
+	const uint8_t *macaddr, mac_t **vgenmacp)
+{
+	vgen_t *vgenp;
+	mac_t *macp;
+	int instance;
+
+	/* all three handles are required; fail fast on a bad call */
+	if ((vnetp == NULL) || (vnetdip == NULL) ||(vnetmacp == NULL))
+		return (DDI_FAILURE);
+
+	instance = ddi_get_instance(vnetdip);
+
+	DBG1((vnetp, "vgen_init: enter vnet_instance(%d)\n", instance));
+
+	vgenp = kmem_zalloc(sizeof (vgen_t), KM_SLEEP);
+
+	/* record the handles of the owning vnet instance */
+	vgenp->vnetp = vnetp;
+	vgenp->vnetdip = vnetdip;
+	vgenp->vnetmacp = vnetmacp;
+	bcopy(macaddr, &(vgenp->macaddr), ETHERADDRL);
+
+	/* allocate multicast table (grown on demand in vgen_multicst()) */
+	vgenp->mctab = kmem_zalloc(VGEN_INIT_MCTAB_SIZE *
+	    sizeof (struct ether_addr), KM_SLEEP);
+	vgenp->mccount = 0;
+	vgenp->mcsize = VGEN_INIT_MCTAB_SIZE;
+
+	mutex_init(&vgenp->lock, NULL, MUTEX_DRIVER, NULL);
+
+	/* register with MD event generator */
+	if (vgen_mdeg_reg(vgenp) != DDI_SUCCESS) {
+		/* undo everything allocated above before failing */
+		mutex_destroy(&vgenp->lock);
+		kmem_free(vgenp->mctab, VGEN_INIT_MCTAB_SIZE *
+		    sizeof (struct ether_addr));
+		KMEM_FREE(vgenp);
+		return (DDI_FAILURE);
+	}
+
+	macp = &vgenp->vgenmac;
+	vgen_init_macp(vgenp, macp);
+
+	/* register mac_t of this vgen_t with vnet */
+	*vgenmacp = macp;
+
+	DBG1((vnetp, "vgen_init: exit vnet_instance(%d)\n", instance));
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Called by vnet to undo the initializations done by vgen_init().
+ * The handle provided by generic transport during vgen_init() is the argument.
+ */
+void
+vgen_uninit(void *arg)
+{
+	vgen_t	*vgenp = (vgen_t *)arg;
+	void	*vnetp;
+	int instance;
+
+	/* nothing to do for a NULL handle */
+	if (vgenp == NULL)
+		return;
+
+	instance = ddi_get_instance(vgenp->vnetdip);
+	vnetp = vgenp->vnetp;
+
+	DBG1((vnetp, "vgen_uninit: enter vnet_instance(%d)\n", instance));
+
+	/* unregister with MD event generator */
+	vgen_mdeg_unreg(vgenp);
+
+	mutex_enter(&vgenp->lock);
+
+	/* detach all ports from the device */
+	vgen_detach_ports(vgenp);
+
+	/* free multicast table (mcsize tracks its current capacity) */
+	kmem_free(vgenp->mctab, vgenp->mcsize * sizeof (struct ether_addr));
+
+	mutex_exit(&vgenp->lock);
+
+	mutex_destroy(&vgenp->lock);
+
+	KMEM_FREE(vgenp);
+
+	DBG1((vnetp, "vgen_uninit: exit vnet_instance(%d)\n", instance));
+}
+
+/*
+ * MAC start entry point: enable transmit/receive for the device by
+ * bringing up all ports, then mark the device started.  Always succeeds.
+ */
+static int
+vgen_start(void *arg)
+{
+	vgen_t		*vgenp = (vgen_t *)arg;
+
+	DBG1((vgenp->vnetp, "vgen_start: enter\n"));
+
+	mutex_enter(&vgenp->lock);
+	vgen_init_ports(vgenp);
+	vgenp->flags |= VGEN_STARTED;
+	mutex_exit(&vgenp->lock);
+
+	DBG1((vgenp->vnetp, "vgen_start: exit\n"));
+	return (DDI_SUCCESS);
+}
+
+/*
+ * MAC stop entry point: disable transmit/receive by shutting down all
+ * ports and clearing the started flag.
+ */
+static void
+vgen_stop(void *arg)
+{
+	vgen_t		*vgenp = (vgen_t *)arg;
+
+	DBG1((vgenp->vnetp, "vgen_stop: enter\n"));
+
+	mutex_enter(&vgenp->lock);
+	vgen_uninit_ports(vgenp);
+	vgenp->flags &= ~(VGEN_STARTED);
+	mutex_exit(&vgenp->lock);
+
+	DBG1((vgenp->vnetp, "vgen_stop: exit\n"));
+}
+
+/*
+ * Transmit entry point for a port.  Hands the packet to
+ * vgen_portsend(); per the mac_tx_t convention, returns NULL when the
+ * packet was consumed and returns 'mp' back to the caller on failure.
+ */
+static mblk_t *
+vgen_tx(void *arg, mblk_t *mp)
+{
+	vgen_port_t *portp = (vgen_port_t *)arg;
+
+	if (vgen_portsend(portp, mp) != VGEN_SUCCESS)
+		return (mp);	/* failure: caller keeps the packet */
+
+	return (NULL);		/* success: packet consumed */
+}
+
+/*
+ * Transmit a packet over the given port.
+ *
+ * Returns VGEN_SUCCESS when the packet has been consumed (sent, or
+ * deliberately dropped when the channel is out of tx descriptors) and
+ * VGEN_FAILURE when the caller retains ownership of 'mp'.
+ */
+static int
+vgen_portsend(vgen_port_t *portp, mblk_t *mp)
+{
+	vgen_ldclist_t	*ldclp;
+	vgen_ldc_t *ldcp;
+	vgen_t *vgenp;
+	int status;
+
+	vgenp = portp->vgenp;
+	ldclp = &portp->ldclist;
+	READ_ENTER(&ldclp->rwlock);
+	/*
+	 * XXX - for now, we have a single channel.
+	 */
+	if (ldclp->headp == NULL) {
+		/* no channel attached to this port yet */
+		DWARN((vgenp->vnetp, "vgen_portsend: dropping packet\n"));
+		RW_EXIT(&ldclp->rwlock);
+		return (VGEN_FAILURE);
+	}
+	ldcp = ldclp->headp;
+
+	if (ldcp->need_resched) {
+		/* out of tx resources, see vgen_ldcsend() for details. */
+		DWARN((vgenp->vnetp, "vgen_portsend: dropping packet...\n"));
+
+		mutex_enter(&ldcp->txlock);
+		ldcp->statsp->tx_no_desc++;
+		mutex_exit(&ldcp->txlock);
+
+		/* drop the packet but report success to keep mac flowing */
+		RW_EXIT(&ldclp->rwlock);
+		freemsg(mp);
+		return (VGEN_SUCCESS);
+	}
+
+	status  = vgen_ldcsend(ldcp, mp);
+	RW_EXIT(&ldclp->rwlock);
+
+	if (status != VGEN_TX_SUCCESS)
+		return (VGEN_FAILURE);
+
+	return (VGEN_SUCCESS);
+}
+
+/*
+ * Transmit a single packet over the given channel.
+ *
+ * The packet is bound into an ldc memory handle, described by the next
+ * free tx descriptor, and announced to the peer via a dring data
+ * message.  Packets that need padding to ETHERMIN, span multiple
+ * mblks, or are not 8-byte aligned are first copied into a fresh,
+ * aligned mblk ('bp').
+ *
+ * Returns VGEN_TX_SUCCESS whenever the packet has been consumed (sent
+ * or dropped); VGEN_TX_NORESOURCES/VGEN_TX_FAILURE are only returned
+ * when compiled with VGEN_USE_MAC_TX_UPDATE (see comments below).
+ */
+static int
+vgen_ldcsend(vgen_ldc_t *ldcp, mblk_t *mp)
+{
+	void		*vnetp;
+	size_t		size;
+	uint64_t	datalen;
+	uchar_t		*rptr;
+	mblk_t 		*bp = NULL;
+	int		rv;
+	uint32_t	i;
+	uint32_t	start;
+	uint32_t	end;
+	int		txpending = 0;
+	int		ci;
+	uint32_t	ncookies;
+	uint64_t	nc;
+	vgen_private_desc_t	*tbufp;
+	vgen_private_desc_t	*ntbufp;
+	vnet_public_desc_t	*txdp;
+	vio_dring_entry_hdr_t		*hdrp;
+	vgen_stats_t		*statsp;
+	struct ether_header	*ehp;
+	boolean_t	is_bcast = B_FALSE;
+	boolean_t	is_mcast = B_FALSE;
+	boolean_t	reclaim = B_FALSE;
+	boolean_t	need_intr = B_FALSE;
+	boolean_t	err = B_FALSE;
+
+	vnetp = LDC_TO_VNET(ldcp);
+	statsp = ldcp->statsp;
+	DBG1((vnetp, "vgen_ldcsend: enter ldcid(%lx)\n", ldcp->ldc_id));
+
+	/* drop the packet if handshake is not done or ldc is not up */
+	if ((ldcp->hphase != VH_DONE) || (ldcp->ldc_status != LDC_UP)) {
+		DWARN((vnetp,
+		    "vgen_ldcsend: id(%lx) status(%d), dropping packet\n",
+		    ldcp->ldc_id, ldcp->ldc_status));
+		freemsg(mp);
+		return (VGEN_TX_SUCCESS);
+	}
+
+	/* oversized frames are dropped */
+	size = msgsize(mp);
+	if (size > (size_t)ETHERMAX) {
+		DWARN((vnetp, "vgen_ldcsend: id(%lx) invalid size(%d)\n",
+		    ldcp->ldc_id, size));
+		freemsg(mp);
+		return (VGEN_TX_SUCCESS);
+	}
+	if ((size < (size_t)ETHERMIN) ||	/* needs padding to ETHERMIN */
+	    (mp->b_cont) ||			/* more than 1 mblk */
+	    ((uintptr_t)mp->b_rptr & 0x7) ||	/* data not 8 byte aligned */
+	    ((mp->b_wptr - mp->b_rptr) & 0x7)) { /* datalen not multiple of 8 */
+		if (size < ETHERMIN)
+			size = ETHERMIN;
+		/*
+		 * The data buffer returned by allocb(9F) is 8byte aligned.
+		 * We allocate extra 8 bytes to ensure size is multiple of
+		 * 8 bytes for ldc_mem_bind_handle().
+		 */
+		bp = allocb(size + 8, BPRI_MED);
+		if (bp == NULL) {
+			/* drop the packet */
+			freemsg(mp);
+			mutex_enter(&ldcp->txlock);
+			statsp->tx_allocb_fail++;
+			mutex_exit(&ldcp->txlock);
+			return (VGEN_TX_SUCCESS);
+		}
+		/* flatten the chain into bp, then round size up to 8 bytes */
+		vgen_copymsg(mp, bp->b_rptr);
+		bp->b_wptr += size;
+		datalen = size;		/* actual data length without pad */
+		size = (datalen + 7) & ~7;
+		bp->b_wptr += (size - datalen);
+	} else { /* size/alignment are ok */
+		datalen = size;
+	}
+
+	mutex_enter(&ldcp->txlock);
+
+	/*  check if the channel is still up & running */
+	if ((ldcp->hphase != VH_DONE) || (ldcp->ldc_status != LDC_UP)) {
+		DWARN((vnetp,
+		    "vgen_ldcsend: id(%lx) status(%d), dropping packet\n",
+		    ldcp->ldc_id, ldcp->ldc_status));
+		err = B_TRUE;
+		goto vgen_tx_exit;
+	}
+
+	/*
+	 * allocate a descriptor
+	 */
+	tbufp = ldcp->next_tbufp;
+	ntbufp = NEXTTBUF(ldcp, tbufp);
+	if (tbufp->flags != VGEN_PRIV_DESC_FREE ||
+	    ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */
+
+		mutex_enter(&ldcp->tclock);
+		if (ntbufp == ldcp->cur_tbufp)
+			ldcp->need_resched = B_TRUE;
+		mutex_exit(&ldcp->tclock);
+
+		statsp->tx_no_desc++;
+		mutex_exit(&ldcp->txlock);
+		if (bp)
+			freemsg(bp);
+#ifdef VGEN_USE_MAC_TX_UPDATE
+		/*
+		 * This cflag is disabled by default. This can be enabled if we
+		 * want to return failure to the mac layer when we run out of
+		 * descriptors and use mac_tx_update() to restart tx when
+		 * descriptors become available. However, stopping tx would
+		 * affect traffic going over other ports, as upper mac layer
+		 * has no concept of multiple ports within a device.
+		 * So currently, to avoid this, drop packets when we run out
+		 * of descrs and just return success. See the corresponding
+		 * code in vgen_portsend() and vgen_reclaim_dring().
+		 */
+		return (VGEN_TX_NORESOURCES);
+#else
+		freemsg(mp);	/* drop the packet */
+		return (VGEN_TX_SUCCESS);
+#endif
+	}
+
+	txpending = vgen_num_txpending(ldcp);
+	if (txpending >= ldcp->reclaim_hiwat) {
+		/*
+		 * if num of pending transmits is more than hiwat,
+		 * reclaim now and also enable ack bit.
+		 */
+		reclaim = B_TRUE;
+		need_intr = B_TRUE;
+	} else {
+		if (txpending >= ldcp->reclaim_lowat) {
+			/*
+			 * if the num of pending transmits is more than lowat
+			 * enable ack bit in the descr and reclaim in intr().
+			 */
+			need_intr = B_TRUE;
+		}
+	}
+
+	/* index of this tbuf in the ring, used as the dring index below */
+	i = tbufp - ldcp->tbufp;
+
+	/* bind the packet's data buffer for the peer to copy from */
+	rptr = bp ? (bp->b_rptr) : (mp->b_rptr);
+	ci = 0;
+	rv = ldc_mem_bind_handle(tbufp->memhandle, (caddr_t)rptr, size,
+		LDC_SHADOW_MAP, LDC_MEM_R, &(tbufp->memcookie[ci]), &ncookies);
+	if (rv != 0) {
+		DWARN((vnetp, "vgen_ldcsend: id(%lx)ldc_mem_bind_handle failed"
+		    " rv(%d) tbufi(%d)\n", ldcp->ldc_id, rv, i));
+		err = B_TRUE;
+		statsp->oerrors++;
+		goto vgen_tx_exit;
+	}
+
+	/*
+	 * A successful bind returns at least one cookie.  ncookies is
+	 * unsigned, so the previous "(ncookies < 0)" lower-bound test
+	 * could never fire; check for zero instead, and enforce the
+	 * MAX_COOKIES upper bound.
+	 */
+	if ((ncookies == 0) || (ncookies > (uint64_t)MAX_COOKIES)) {
+		DWARN((vnetp,
+		    "vgen_ldcsend: id(%lx)ldc_mem_bind_handle returned"
+		    " invalid cookies (%d)\n", ldcp->ldc_id, ncookies));
+		err = B_TRUE;
+		statsp->oerrors++;
+		(void) ldc_mem_unbind_handle(tbufp->memhandle);
+		goto vgen_tx_exit;
+	}
+
+	/* fetch the remaining cookies beyond the first */
+	if (ncookies > 1) {
+		nc = ncookies - 1;
+		while (nc) {
+			ci++;
+			rv = ldc_mem_nextcookie(tbufp->memhandle,
+			    &(tbufp->memcookie[ci]));
+			if (rv != 0) {
+				DWARN((vnetp,
+				    "vgen_ldcsend: ldc_mem_nextcookie"
+				    " err(%d)\n", rv));
+				err = B_TRUE;
+				statsp->oerrors++;
+				(void) ldc_mem_unbind_handle(tbufp->memhandle);
+				goto vgen_tx_exit;
+			}
+			nc--;
+		}
+	}
+
+	ehp = (struct ether_header *)rptr;
+	is_bcast = IS_BROADCAST(ehp);
+	is_mcast = IS_MULTICAST(ehp);
+	/* save the packet, free when the descr done flag is set */
+	tbufp->mp = (bp ? bp : mp);
+	tbufp->flags = VGEN_PRIV_DESC_BUSY;
+	tbufp->datalen = datalen;
+	tbufp->ncookies = ncookies;
+	tbufp->seqnum = ldcp->next_txseq;
+
+	/* initialize the corresponding public descriptor (txd) */
+	txdp = tbufp->descp;
+	hdrp = &txdp->hdr;
+	hdrp->dstate = VIO_DESC_READY;
+	if (need_intr)
+		hdrp->ack = B_TRUE;
+	txdp->nbytes = datalen;
+	txdp->ncookies = ncookies;
+	bcopy((tbufp->memcookie), (txdp->memcookie),
+	    ncookies * sizeof (ldc_mem_cookie_t));
+
+	/* send dring datamsg to the peer */
+	start = end = i;
+	rv = vgen_send_dring_data(ldcp, start, end, ldcp->next_txseq);
+	if (rv != 0) {
+		/* vgen_send_dring_data() error: drop the packet */
+		DWARN((vnetp,
+		    "vgen_ldcsend: vgen_send_dring_data():  failed: "
+		    "id(%lx) rv(%d) len (%d)\n", ldcp->ldc_id, rv, datalen));
+		(void) ldc_mem_unbind_handle(tbufp->memhandle);
+		tbufp->flags = VGEN_PRIV_DESC_FREE;	/* free tbuf */
+		hdrp->dstate = VIO_DESC_FREE;	/* free txd */
+		hdrp->ack = B_FALSE;
+		statsp->oerrors++;
+		err = B_TRUE;
+		goto vgen_tx_exit;
+	}
+
+	/* update next available tbuf in the ring */
+	ldcp->next_tbufp = ntbufp;
+	/* update tx seqnum and index */
+	ldcp->next_txseq++;
+	INCR_TXI(ldcp->next_txi, ldcp);
+
+	/* update stats */
+	statsp->opackets++;
+	statsp->obytes += datalen;
+	if (is_bcast)
+		statsp->brdcstxmt++;
+	else if (is_mcast)
+		statsp->multixmt++;
+
+vgen_tx_exit:
+	mutex_exit(&ldcp->txlock);
+
+	/* reclaim is done outside txlock; see vgen_reclaim() */
+	if (reclaim) {
+		vgen_reclaim(ldcp);
+	}
+	DBG1((vnetp, "vgen_ldcsend: exit: ldcid (%lx)\n", ldcp->ldc_id));
+
+	if (err) {
+		if (bp)
+			freemsg(bp);
+#ifdef VGEN_USE_MAC_TX_UPDATE
+		return (VGEN_TX_FAILURE);	/* transmit failed */
+#else
+		freemsg(mp);			/* drop the packet */
+		return (VGEN_TX_SUCCESS);
+#endif
+	} else {
+		if (bp)	/* free original pkt, copy is in bp */
+			freemsg(mp);
+		return (VGEN_TX_SUCCESS);
+	}
+}
+
+/*
+ * Register this device's receive resource with the mac layer as a
+ * MAC_RX_FIFO.  No blanking routine or parameters are supplied (all
+ * NULL/zero); the returned resource handle is saved in vgenp->mrh.
+ */
+static void
+vgen_resources(void *arg)
+{
+	vgen_t *vgenp;
+	mac_rx_fifo_t mrf;
+
+	vgenp = (vgen_t *)arg;
+	DBG1((vgenp->vnetp, "vgen_resources: enter\n"));
+
+	mrf.mrf_type = MAC_RX_FIFO;
+	mrf.mrf_blank = NULL;
+	mrf.mrf_arg = NULL;
+	mrf.mrf_normal_blank_time = 0;
+	mrf.mrf_normal_pkt_count = 0;
+	vgenp->mrh = mac_resource_add(vgenp->vnetmacp, (mac_resource_t *)&mrf);
+
+	DBG1((vgenp->vnetp, "vgen_resources: exit\n"));
+}
+
+/*
+ * Enable (add) or disable (delete) a multicast address for the device.
+ *
+ * If the handshake with the vsw port's channel is complete, a
+ * VNET_MCAST_INFO control message is sent so the virtual switch updates
+ * its filter; otherwise need_mcast_sync is set to defer that until the
+ * handshake finishes.  The address is also tracked in the local mctab
+ * table, which is doubled in size on demand.  Always returns
+ * DDI_SUCCESS.
+ */
+static int
+vgen_multicst(void *arg, boolean_t add, const uint8_t *mca)
+{
+	vgen_t			*vgenp;
+	vnet_mcast_msg_t	mcastmsg;
+	vio_msg_tag_t		*tagp;
+	vgen_port_t		*portp;
+	vgen_portlist_t		*plistp;
+	vgen_ldc_t		*ldcp;
+	vgen_ldclist_t		*ldclp;
+	void			*vnetp;
+	struct ether_addr	*addrp;
+	int			rv;
+	uint32_t		i;
+
+	vgenp = (vgen_t *)arg;
+	vnetp = vgenp->vnetp;
+	addrp = (struct ether_addr *)mca;
+	tagp = &mcastmsg.tag;
+	bzero(&mcastmsg, sizeof (mcastmsg));
+
+	mutex_enter(&vgenp->lock);
+
+	plistp = &(vgenp->vgenports);
+
+	READ_ENTER(&plistp->rwlock);
+
+	/* no vsw port (or no channel on it) yet: nothing more to do */
+	portp = vgenp->vsw_portp;
+	if (portp == NULL) {
+		RW_EXIT(&plistp->rwlock);
+		goto vgen_mcast_exit;
+	}
+	ldclp = &portp->ldclist;
+
+	READ_ENTER(&ldclp->rwlock);
+
+	ldcp = ldclp->headp;
+	if (ldcp == NULL) {
+		RW_EXIT(&ldclp->rwlock);
+		RW_EXIT(&plistp->rwlock);
+		goto vgen_mcast_exit;
+	}
+
+	mutex_enter(&ldcp->cblock);
+
+	if (ldcp->hphase == VH_DONE) {
+		/*
+		 * If handshake is done, send a msg to vsw to add/remove
+		 * the multicast address.
+		 */
+		tagp->vio_msgtype = VIO_TYPE_CTRL;
+		tagp->vio_subtype = VIO_SUBTYPE_INFO;
+		tagp->vio_subtype_env = VNET_MCAST_INFO;
+		tagp->vio_sid = ldcp->local_sid;
+		bcopy(mca, &(mcastmsg.mca), ETHERADDRL);
+		mcastmsg.set = add;
+		mcastmsg.count = 1;
+		rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (mcastmsg),
+		    B_FALSE);
+		if (rv != VGEN_SUCCESS) {
+			DWARN((vnetp, "vgen_multicst: vgen_sendmsg failed"
+			    "id (%lx)\n", ldcp->ldc_id));
+		}
+	} else {
+		/* set the flag to send a msg to vsw after handshake is done */
+		ldcp->need_mcast_sync = B_TRUE;
+	}
+
+	mutex_exit(&ldcp->cblock);
+
+	if (add) {
+
+		/* expand multicast table if necessary */
+		if (vgenp->mccount >= vgenp->mcsize) {
+			struct ether_addr	*newtab;
+			uint32_t		newsize;
+
+			newsize = vgenp->mcsize * 2;
+
+			newtab = kmem_zalloc(newsize *
+			    sizeof (struct ether_addr), KM_NOSLEEP);
+
+			/*
+			 * kmem_zalloc(KM_NOSLEEP) can return NULL; bail
+			 * out rather than dereference a NULL table.  The
+			 * address is not recorded locally in that case,
+			 * but the request has already been sent to vsw.
+			 */
+			if (newtab == NULL) {
+				RW_EXIT(&ldclp->rwlock);
+				RW_EXIT(&plistp->rwlock);
+				goto vgen_mcast_exit;
+			}
+
+			bcopy(vgenp->mctab, newtab, vgenp->mcsize *
+			    sizeof (struct ether_addr));
+			kmem_free(vgenp->mctab,
+			    vgenp->mcsize * sizeof (struct ether_addr));
+
+			vgenp->mctab = newtab;
+			vgenp->mcsize = newsize;
+		}
+
+		/* add address to the table */
+		vgenp->mctab[vgenp->mccount++] = *addrp;
+
+	} else {
+
+		/* delete address from the table */
+		for (i = 0; i < vgenp->mccount; i++) {
+			if (ether_cmp(addrp, &(vgenp->mctab[i])) == 0) {
+
+				/*
+				 * If there's more than one address in this
+				 * table, delete the unwanted one by moving
+				 * the last one in the list over top of it;
+				 * otherwise, just remove it.
+				 */
+				if (vgenp->mccount > 1) {
+					vgenp->mctab[i] =
+						vgenp->mctab[vgenp->mccount-1];
+				}
+				vgenp->mccount--;
+				break;
+			}
+		}
+	}
+
+	RW_EXIT(&ldclp->rwlock);
+	RW_EXIT(&plistp->rwlock);
+
+vgen_mcast_exit:
+	mutex_exit(&vgenp->lock);
+	return (DDI_SUCCESS);
+}
+
+/*
+ * MAC entry point to set or clear promiscuous mode on the device.
+ * Nothing to do at this layer; always reports success.
+ */
+static int
+vgen_promisc(void *arg, boolean_t on)
+{
+	_NOTE(ARGUNUSED(arg, on))
+	return (DDI_SUCCESS);
+}
+
+/*
+ * MAC entry point to set the unicast mac address of the device.
+ * Nothing to do at this layer; always reports success.
+ */
+static int
+vgen_unicst(void *arg, const uint8_t *mca)
+{
+	_NOTE(ARGUNUSED(arg, mca))
+	return (DDI_SUCCESS);
+}
+
+/*
+ * MAC statistics entry point: return the requested statistic summed
+ * over every port attached to this device.
+ */
+static uint64_t
+vgen_stat(void *arg, enum mac_stat stat)
+{
+	vgen_t		*vgenp = (vgen_t *)arg;
+	vgen_portlist_t	*plistp;
+	vgen_port_t	*p;
+	uint64_t	total = 0;
+
+	plistp = &(vgenp->vgenports);
+	READ_ENTER(&plistp->rwlock);
+
+	for (p = plistp->headp; p != NULL; p = p->nextp)
+		total += vgen_port_stat(p, stat);
+
+	RW_EXIT(&plistp->rwlock);
+
+	return (total);
+}
+
+/*
+ * MAC ioctl entry point: no ioctls are supported by this layer.
+ * NOTE(review): the mblk is neither consumed nor freed here — confirm
+ * the caller (mac layer) retains ownership of 'mp' in this case.
+ */
+static void
+vgen_ioctl(void *arg, queue_t *wq, mblk_t *mp)
+{
+	 _NOTE(ARGUNUSED(arg, wq, mp))
+}
+
+/* vgen internal functions */
+/*
+ * Detach and free every port attached to the device.  Each
+ * vgen_port_detach() call unlinks the list head, so looping until the
+ * head is NULL visits every port.
+ */
+static void
+vgen_detach_ports(vgen_t *vgenp)
+{
+	vgen_port_t	*portp;
+	vgen_portlist_t	*plistp;
+
+	plistp = &(vgenp->vgenports);
+	WRITE_ENTER(&plistp->rwlock);
+
+	while ((portp = plistp->headp) != NULL) {
+		vgen_port_detach(portp);
+	}
+
+	RW_EXIT(&plistp->rwlock);
+}
+
+/*
+ * detach the given port.
+ *
+ * Unlinks the port from the port list, detaches all of its channels,
+ * clears the cached vsw-port pointer if this was the vsw port, and
+ * frees the port structure.  Called with the port list locked (see
+ * vgen_detach_ports()).
+ */
+static void
+vgen_port_detach(vgen_port_t *portp)
+{
+	vgen_t		*vgenp;
+	vgen_ldclist_t	*ldclp;
+	int		port_num;
+
+	vgenp = portp->vgenp;
+	port_num = portp->port_num;	/* saved for the exit trace below */
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_detach: enter: port_num(%d)\n", port_num));
+
+	/* remove it from port list */
+	vgen_port_list_remove(portp);
+
+	/* detach channels from this port */
+	ldclp = &portp->ldclist;
+	WRITE_ENTER(&ldclp->rwlock);
+	while (ldclp->headp) {
+		vgen_ldc_detach(ldclp->headp);
+	}
+	RW_EXIT(&ldclp->rwlock);
+
+	if (vgenp->vsw_portp == portp) {
+		vgenp->vsw_portp = NULL;
+	}
+	KMEM_FREE(portp);
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_detach: exit: port_num(%d)\n", port_num));
+}
+
+/*
+ * Append a port to the tail of the device's port list.  The caller is
+ * expected to hold the port list rwlock as writer.
+ */
+static void
+vgen_port_list_insert(vgen_port_t *portp)
+{
+	vgen_portlist_t *plistp = &(portp->vgenp->vgenports);
+
+	portp->nextp = NULL;
+	if (plistp->headp == NULL)
+		plistp->headp = portp;		/* first entry */
+	else
+		plistp->tailp->nextp = portp;	/* link after old tail */
+	plistp->tailp = portp;
+}
+
+/*
+ * remove a port from port list.  The caller is expected to hold the
+ * list's rwlock as writer.  A port that is not on the list is ignored;
+ * head/tail pointers are kept consistent in all cases.
+ */
+static void
+vgen_port_list_remove(vgen_port_t *portp)
+{
+	vgen_port_t *prevp;
+	vgen_port_t *nextp;
+	vgen_portlist_t *plistp;
+	vgen_t *vgenp;
+
+	vgenp = portp->vgenp;
+
+	plistp = &(vgenp->vgenports);
+
+	/* empty list: nothing to remove */
+	if (plistp->headp == NULL)
+		return;
+
+	if (portp == plistp->headp) {
+		plistp->headp = portp->nextp;
+		if (portp == plistp->tailp)
+			plistp->tailp = plistp->headp;
+	} else {
+		/* walk to the node preceding portp (loop body is empty) */
+		for (prevp = plistp->headp; ((nextp = prevp->nextp) != NULL) &&
+		    (nextp != portp); prevp = nextp);
+		if (nextp == portp) {
+			prevp->nextp = portp->nextp;
+		}
+		if (portp == plistp->tailp)
+			plistp->tailp = prevp;
+	}
+}
+
+/*
+ * Find the port with the given port number in 'plistp'.  Returns the
+ * matching port, or NULL when no port matches.
+ */
+static vgen_port_t *
+vgen_port_lookup(vgen_portlist_t *plistp, int port_num)
+{
+	vgen_port_t *p;
+
+	for (p = plistp->headp; p != NULL; p = p->nextp) {
+		if (p->port_num == port_num)
+			return (p);
+	}
+
+	return (NULL);
+}
+
+/* enable ports for transmit/receive */
+static void
+vgen_init_ports(vgen_t *vgenp)
+{
+	vgen_port_t	*portp;
+	vgen_portlist_t	*plistp;
+
+	/* walk the port list and bring up each port in turn */
+	plistp = &(vgenp->vgenports);
+	READ_ENTER(&plistp->rwlock);
+
+	for (portp = plistp->headp; portp != NULL; portp = portp->nextp) {
+		vgen_port_init(portp);
+	}
+
+	RW_EXIT(&plistp->rwlock);
+}
+
+/*
+ * Bring up one port: publish its fdb entry (initially routed through
+ * the vsw port), install the default route if this is the vsw port
+ * itself, then bring up the port's channels.
+ */
+static void
+vgen_port_init(vgen_port_t *portp)
+{
+	vgen_t *vgenp;
+
+	vgenp = portp->vgenp;
+	/*
+	 * Create fdb entry in vnet, corresponding to the mac
+	 * address of this port. Note that the port specified
+	 * is vsw-port. This is done so that vsw-port acts
+	 * as the route to reach this macaddr, until the
+	 * channel for this port comes up (LDC_UP) and
+	 * handshake is done successfully.
+	 * eg, if the peer is OBP-vnet, it may not bring the
+	 * channel up for this port and may communicate via
+	 * vsw to reach this port.
+	 * Later, when Solaris-vnet comes up at the other end
+	 * of the channel for this port and brings up the channel,
+	 * it is an indication that peer vnet is capable of
+	 * distributed switching, so the direct route through this
+	 * port is specified in fdb, using vnet_modify_fdb(macaddr);
+	 */
+	vnet_add_fdb(vgenp->vnetp, (uint8_t *)&portp->macaddr,
+	    vgen_tx, vgenp->vsw_portp);
+
+	if (portp == vgenp->vsw_portp) {
+		/*
+		 * create the default route entry in vnet's fdb.
+		 * This is the entry used by vnet to reach
+		 * unknown destinations, which basically goes
+		 * through vsw on domain0 and out through the
+		 * physical device bound to vsw.
+		 */
+		vnet_add_def_rte(vgenp->vnetp, vgen_tx, portp);
+	}
+
+	/* Bring up the channels of this port */
+	vgen_init_ldcs(portp);
+}
+
+/* disable transmit/receive on ports */
+static void
+vgen_uninit_ports(vgen_t *vgenp)
+{
+	vgen_portlist_t	*plistp = &(vgenp->vgenports);
+	vgen_port_t	*p;
+
+	/* hold the port list as reader while each port is quiesced */
+	READ_ENTER(&plistp->rwlock);
+	for (p = plistp->headp; p != NULL; p = p->nextp)
+		vgen_port_uninit(p);
+	RW_EXIT(&plistp->rwlock);
+}
+
+/*
+ * Disable transmit/receive on a single port: quiesce its channels and
+ * remove its forwarding database state from vnet.
+ */
+static void
+vgen_port_uninit(vgen_port_t *portp)
+{
+	vgen_t *vgenp = portp->vgenp;
+
+	/* stop all channels of this port first */
+	vgen_uninit_ldcs(portp);
+
+	/* delete the entry in vnet's fdb for this port */
+	vnet_del_fdb(vgenp->vnetp, (uint8_t *)&portp->macaddr);
+
+	/*
+	 * The vsw port also owns the default route entry in vnet's
+	 * fdb; remove that too when the vsw port is being torn down.
+	 */
+	if (portp == vgenp->vsw_portp)
+		vnet_del_def_rte(vgenp->vnetp);
+}
+
+/*
+ * Register with the MD event generator (mdeg) so vgen is notified of
+ * machine description updates for the "virtual-device" node matching
+ * this vnet instance.  The node spec and prop spec are cached in
+ * vgen_t for release in vgen_mdeg_unreg().
+ *
+ * Returns DDI_SUCCESS; DDI_FAILURE if the "reg" property is missing,
+ * an allocation fails, or mdeg_register() fails.
+ */
+static int
+vgen_mdeg_reg(vgen_t *vgenp)
+{
+	mdeg_prop_spec_t	*pspecp;
+	mdeg_node_spec_t	*parentp;
+	uint_t			templatesz;
+	int			rv;
+	mdeg_handle_t		hdl;
+	int			i;
+	void			*vnetp = vgenp->vnetp;
+
+	/* "reg" identifies this vnet instance in the MD (see note below) */
+	i = ddi_prop_get_int(DDI_DEV_T_ANY, vgenp->vnetdip,
+			DDI_PROP_DONTPASS, reg_propname, -1);
+	if (i == -1) {
+		return (DDI_FAILURE);
+	}
+	templatesz = sizeof (vgen_prop_template);
+	/* KM_NOSLEEP allocations: fail rather than block */
+	pspecp = kmem_zalloc(templatesz, KM_NOSLEEP);
+	if (pspecp == NULL) {
+		return (DDI_FAILURE);
+	}
+	parentp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_NOSLEEP);
+	if (parentp == NULL) {
+		kmem_free(pspecp, templatesz);
+		return (DDI_FAILURE);
+	}
+
+	/* start from the shared template, then patch in our instance */
+	bcopy(vgen_prop_template, pspecp, templatesz);
+
+	/*
+	 * NOTE: The instance here refers to the value of "reg" property and
+	 * not the dev_info instance (ddi_get_instance()) of vnet.
+	 */
+	VGEN_SET_MDEG_PROP_INST(pspecp, i);
+
+	parentp->namep = "virtual-device";
+	parentp->specp = pspecp;
+
+	/* save parentp in vgen_t */
+	vgenp->mdeg_parentp = parentp;
+
+	rv = mdeg_register(parentp, &vport_match, vgen_mdeg_cb, vgenp, &hdl);
+	if (rv != MDEG_SUCCESS) {
+		/* undo the allocations and the cached pointer */
+		DERR((vnetp, "vgen_mdeg_reg: mdeg_register failed\n"));
+		KMEM_FREE(parentp);
+		kmem_free(pspecp, templatesz);
+		vgenp->mdeg_parentp = NULL;
+		return (DDI_FAILURE);
+	}
+
+	/* save mdeg handle in vgen_t */
+	vgenp->mdeg_hdl = hdl;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Unregister with the MD event generator and release the node spec
+ * and prop spec that vgen_mdeg_reg() allocated and cached in vgen_t.
+ */
+static void
+vgen_mdeg_unreg(vgen_t *vgenp)
+{
+	(void) mdeg_unregister(vgenp->mdeg_hdl);
+	/*
+	 * Fix: also free the prop spec hanging off the node spec;
+	 * previously only the node spec itself was freed, leaking the
+	 * vgen_prop_template copy allocated in vgen_mdeg_reg().
+	 */
+	kmem_free(vgenp->mdeg_parentp->specp, sizeof (vgen_prop_template));
+	KMEM_FREE(vgenp->mdeg_parentp);
+	vgenp->mdeg_parentp = NULL;
+	vgenp->mdeg_hdl = NULL;
+}
+
+/*
+ * Callback function registered with the MD event generator; invoked
+ * when the machine description changes.  Removed, added and updated
+ * port nodes are processed under vgenp->lock.  The vsw port is added
+ * before any other new port because vgen_port_init() needs it for
+ * fdb entries.
+ *
+ * Returns MDEG_SUCCESS, or MDEG_FAILURE on bad arguments or if the
+ * vsw port cannot be found when one is required.
+ */
+static int
+vgen_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
+{
+	int idx;
+	int vsw_idx = -1;
+	uint64_t val;
+	vgen_t *vgenp;
+
+	if ((resp == NULL) || (cb_argp == NULL)) {
+		return (MDEG_FAILURE);
+	}
+
+	vgenp = (vgen_t *)cb_argp;
+	DBG1((vgenp->vnetp, "vgen_mdeg_cb: enter\n"));
+
+	mutex_enter(&vgenp->lock);
+
+	DBG1((vgenp->vnetp,
+	    "vgen_mdeg_cb: ports: removed(%x), added(%x), updated(%x)\n",
+	    resp->removed.nelem, resp->added.nelem, resp->match_curr.nelem));
+
+	/* first drop the ports that are no longer in the MD */
+	for (idx = 0; idx < resp->removed.nelem; idx++) {
+		(void) vgen_remove_port(vgenp, resp->removed.mdp,
+		    resp->removed.mdep[idx]);
+	}
+
+	if (vgenp->vsw_portp == NULL) {
+		/*
+		 * find vsw_port and add it first, because other ports need
+		 * this when adding fdb entry (see vgen_port_init()).
+		 */
+		for (idx = 0; idx < resp->added.nelem; idx++) {
+			if (!(md_get_prop_val(resp->added.mdp,
+			    resp->added.mdep[idx], swport_propname, &val))) {
+				if (val == 0) {
+					/*
+					 * This port is connected to the
+					 * vsw on dom0.
+					 */
+					vsw_idx = idx;
+					(void) vgen_add_port(vgenp,
+					    resp->added.mdp,
+					    resp->added.mdep[idx]);
+					break;
+				}
+			}
+		}
+		if (vsw_idx == -1) {
+			DWARN((vgenp->vnetp, "vgen_mdeg_cb: "
+			    "can't find vsw_port\n"));
+			/*
+			 * Fix: release vgenp->lock before bailing out;
+			 * the original returned with the mutex held.
+			 */
+			mutex_exit(&vgenp->lock);
+			return (MDEG_FAILURE);
+		}
+	}
+
+	/* add the remaining new ports (the vsw port was added above) */
+	for (idx = 0; idx < resp->added.nelem; idx++) {
+		if ((vsw_idx != -1) && (vsw_idx == idx)) /* skip vsw_port */
+			continue;
+		(void) vgen_add_port(vgenp, resp->added.mdp,
+		    resp->added.mdep[idx]);
+	}
+
+	/* process ports whose MD nodes changed */
+	for (idx = 0; idx < resp->match_curr.nelem; idx++) {
+		(void) vgen_update_port(vgenp, resp->match_curr.mdp,
+		    resp->match_curr.mdep[idx],
+		    resp->match_prev.mdp,
+		    resp->match_prev.mdep[idx]);
+	}
+
+	mutex_exit(&vgenp->lock);
+	DBG1((vgenp->vnetp, "vgen_mdeg_cb: exit\n"));
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Add a new port to the device from its MD node: read the port
+ * number ("id"), collect the channel-endpoint nodes beneath the port
+ * (ldc ids), read the remote mac address, determine whether this is
+ * the vsw port, and attach the port.
+ *
+ * Returns DDI_SUCCESS; DDI_FAILURE if a required property is missing
+ * or an allocation fails.
+ */
+static int
+vgen_add_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex)
+{
+	uint64_t	port_num;
+	uint64_t	*ldc_ids;
+	uint64_t	macaddr;
+	uint64_t	val;
+	int		num_ldcs;
+	int		vsw_port = B_FALSE;
+	int		i;
+	int		addrsz;
+	int		num_nodes = 0;
+	int		listsz = 0;
+	mde_cookie_t	*listp = NULL;
+	uint8_t		*addrp;
+	struct ether_addr	ea;
+
+	/* read "id" property to get the port number */
+	if (md_get_prop_val(mdp, mdex, id_propname, &port_num)) {
+		DWARN((vgenp->vnetp,
+		    "vgen_add_port: prop(%s) not found\n", id_propname));
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Find the channel endpoint node(s) under this port node.
+	 */
+	if ((num_nodes = md_node_count(mdp)) <= 0) {
+		DWARN((vgenp->vnetp,
+		    "vgen_add_port: invalid number of nodes found (%d)",
+		    num_nodes));
+		return (DDI_FAILURE);
+	}
+
+	/* allocate space for node list */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_NOSLEEP);
+	if (listp == NULL)
+		return (DDI_FAILURE);
+
+	/* scan forward arcs for channel-endpoint nodes under this port */
+	num_ldcs = md_scan_dag(mdp, mdex,
+		md_find_name(mdp, channel_propname),
+		md_find_name(mdp, "fwd"), listp);
+
+	if (num_ldcs <= 0) {
+		DWARN((vgenp->vnetp,
+		    "vgen_add_port: can't find %s nodes", channel_propname));
+		kmem_free(listp, listsz);
+		return (DDI_FAILURE);
+	}
+
+	DBG2((vgenp->vnetp, "vgen_add_port: num_ldcs %d", num_ldcs));
+
+	ldc_ids = kmem_zalloc(num_ldcs * sizeof (uint64_t), KM_NOSLEEP);
+	if (ldc_ids == NULL) {
+		kmem_free(listp, listsz);
+		return (DDI_FAILURE);
+	}
+
+	for (i = 0; i < num_ldcs; i++) {
+		/* read channel ids */
+		if (md_get_prop_val(mdp, listp[i], id_propname, &ldc_ids[i])) {
+			DWARN((vgenp->vnetp,
+			    "vgen_add_port: prop(%s) not found\n",
+			    id_propname));
+			kmem_free(listp, listsz);
+			kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t));
+			return (DDI_FAILURE);
+		}
+		DBG2((vgenp->vnetp, "vgen_add_port: ldc_id 0x%llx",
+		    ldc_ids[i]));
+	}
+
+	kmem_free(listp, listsz);
+
+	if (md_get_prop_data(mdp, mdex, rmacaddr_propname, &addrp,
+	    &addrsz)) {
+		DWARN((vgenp->vnetp,
+		    "vgen_add_port: prop(%s) not found\n", rmacaddr_propname));
+		kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t));
+		return (DDI_FAILURE);
+	}
+
+	if (addrsz < ETHERADDRL) {
+		DWARN((vgenp->vnetp,
+		    "vgen_add_port: invalid address size (%d)\n", addrsz));
+		kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t));
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * NOTE(review): this reads 8 bytes from the property data while
+	 * the check above only guarantees ETHERADDRL (6); assumes the MD
+	 * mac-address property is a full 64-bit word — confirm.
+	 */
+	macaddr = *((uint64_t *)addrp);
+
+	DBG2((vgenp->vnetp, "vgen_add_port: remote mac address 0x%llx\n",
+	    macaddr));
+
+	/* unpack the low-order 6 bytes, most-significant octet first */
+	for (i = ETHERADDRL - 1; i >= 0; i--) {
+		ea.ether_addr_octet[i] = macaddr & 0xFF;
+		macaddr >>= 8;
+	}
+
+	if (vgenp->vsw_portp == NULL) {
+		if (!(md_get_prop_val(mdp, mdex, swport_propname, &val))) {
+			if (val == 0) {
+				/* This port is connected to the vsw on dom0 */
+				vsw_port = B_TRUE;
+			}
+		}
+	}
+	(void) vgen_port_attach_mdeg(vgenp, (int)port_num, ldc_ids, num_ldcs,
+	    &ea, vsw_port);
+
+	kmem_free(ldc_ids, num_ldcs * sizeof (uint64_t));
+
+	return (DDI_SUCCESS);
+}
+
+/* remove a port from the device */
+static int
+vgen_remove_port(vgen_t *vgenp, md_t *mdp, mde_cookie_t mdex)
+{
+	uint64_t	port_num;
+	vgen_port_t	*portp;
+	vgen_portlist_t	*plistp = &(vgenp->vgenports);
+
+	/* the port node's "id" property gives the port number */
+	if (md_get_prop_val(mdp, mdex, id_propname, &port_num)) {
+		DWARN((vgenp->vnetp,
+		    "vgen_remove_port: prop(%s) not found\n", id_propname));
+		return (DDI_FAILURE);
+	}
+
+	/* detach under the port list writer lock */
+	WRITE_ENTER(&plistp->rwlock);
+	portp = vgen_port_lookup(plistp, (int)port_num);
+	if (portp != NULL) {
+		vgen_port_detach_mdeg(portp);
+		RW_EXIT(&plistp->rwlock);
+		return (DDI_SUCCESS);
+	}
+
+	DWARN((vgenp->vnetp, "vgen_remove_port: can't find port(%lx)\n",
+	    port_num));
+	RW_EXIT(&plistp->rwlock);
+	return (DDI_FAILURE);
+}
+
+/*
+ * Attach a port to the device based on mdeg data: allocate the port,
+ * attach each of its channels, link it onto the port list, record it
+ * as the vsw port if appropriate, and initialize it if the interface
+ * is already started.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE if the port allocation fails.
+ */
+static int
+vgen_port_attach_mdeg(vgen_t *vgenp, int port_num, uint64_t *ldcids,
+	int num_ids, struct ether_addr *macaddr, boolean_t vsw_port)
+{
+	vgen_port_t		*portp;
+	vgen_portlist_t		*plistp;
+	int			i;
+
+	portp = kmem_zalloc(sizeof (vgen_port_t), KM_NOSLEEP);
+	if (portp == NULL) {
+		return (DDI_FAILURE);
+	}
+	portp->vgenp = vgenp;
+	portp->port_num = port_num;
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_attach_mdeg: port_num(%d)\n", portp->port_num));
+
+	portp->ldclist.num_ldcs = 0;
+	portp->ldclist.headp = NULL;
+	rw_init(&portp->ldclist.rwlock, NULL, RW_DRIVER, NULL);
+
+	ether_copy(macaddr, &portp->macaddr);
+	for (i = 0; i < num_ids; i++) {
+		DBG2((vgenp->vnetp, "vgen_port_attach_mdeg: ldcid (%lx)\n",
+		    ldcids[i]));
+		/* NOTE(review): attach failures are ignored here */
+		(void) vgen_ldc_attach(portp, ldcids[i]);
+	}
+
+	/* link it into the list of ports */
+	plistp = &(vgenp->vgenports);
+	WRITE_ENTER(&plistp->rwlock);
+	vgen_port_list_insert(portp);
+	RW_EXIT(&plistp->rwlock);
+
+	/* This port is connected to the vsw on domain0 */
+	if (vsw_port)
+		vgenp->vsw_portp = portp;
+
+	if (vgenp->flags & VGEN_STARTED) {	/* interface is configured */
+		vgen_port_init(portp);
+	}
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_attach_mdeg: exit: port_num(%d)\n", portp->port_num));
+	return (DDI_SUCCESS);
+}
+
+/*
+ * Detach a port from the device based on mdeg data: quiesce it first
+ * if the interface is running, then tear down the port state.
+ */
+static void
+vgen_port_detach_mdeg(vgen_port_t *portp)
+{
+	vgen_t *vgenp = portp->vgenp;
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_detach_mdeg: enter: port_num(%d)\n", portp->port_num));
+	/* stop the port if needed */
+	if (vgenp->flags & VGEN_STARTED) {
+		vgen_port_uninit(portp);
+	}
+	vgen_port_detach(portp);
+
+	DBG1((vgenp->vnetp,
+	    "vgen_port_detach_mdeg: exit: port_num(%d)\n", portp->port_num));
+}
+
+/*
+ * Handle an updated port node in the machine description.  Port
+ * updates are not implemented yet; the arguments are accepted and
+ * success is reported so the mdeg callback can continue.
+ */
+static int
+vgen_update_port(vgen_t *vgenp, md_t *curr_mdp, mde_cookie_t curr_mdex,
+	md_t *prev_mdp, mde_cookie_t prev_mdex)
+{
+	 _NOTE(ARGUNUSED(vgenp, curr_mdp, curr_mdex, prev_mdp, prev_mdex))
+
+	/* XXX: TBD - port update handling not implemented */
+	return (DDI_SUCCESS);
+}
+
+/* sum the requested statistic across all channels of the port */
+static uint64_t
+vgen_port_stat(vgen_port_t *portp, enum mac_stat stat)
+{
+	vgen_ldclist_t	*ldclp = &portp->ldclist;
+	vgen_ldc_t	*ldcp;
+	uint64_t	total = 0;
+
+	/* hold the channel list as reader while accumulating */
+	READ_ENTER(&ldclp->rwlock);
+	for (ldcp = ldclp->headp; ldcp != NULL; ldcp = ldcp->nextp)
+		total += vgen_ldc_stat(ldcp, stat);
+	RW_EXIT(&ldclp->rwlock);
+
+	return (total);
+}
+
+/*
+ * Attach the channel corresponding to the given ldc_id to the port:
+ * allocate the per-channel state, initialize its locks, initialize
+ * the channel with the LDC framework and register a callback,
+ * allocate transmit resources and kstats, then link the channel onto
+ * the port's channel list.
+ *
+ * Returns DDI_SUCCESS or DDI_FAILURE.  The attach_state bitmask
+ * records each completed step so the failure path below unwinds only
+ * what was actually done, in reverse order.
+ */
+static int
+vgen_ldc_attach(vgen_port_t *portp, uint64_t ldc_id)
+{
+	vgen_t 		*vgenp;
+	vgen_ldclist_t	*ldclp;
+	vgen_ldc_t 	*ldcp, **prev_ldcp;
+	ldc_attr_t 	attr;
+	int 		status;
+	ldc_status_t	istatus;
+	enum		{AST_init = 0x0, AST_ldc_alloc = 0x1,
+			AST_mutex_init = 0x2, AST_ldc_init = 0x4,
+			AST_ldc_reg_cb = 0x8, AST_alloc_tx_ring = 0x10}
+			attach_state;
+
+	attach_state = AST_init;
+	vgenp = portp->vgenp;
+	ldclp = &portp->ldclist;
+
+	ldcp = kmem_zalloc(sizeof (vgen_ldc_t), KM_NOSLEEP);
+	if (ldcp == NULL) {
+		goto ldc_attach_failed;
+	}
+	ldcp->ldc_id = ldc_id;
+	ldcp->portp = portp;
+	/* tx-reclaim watermarks come from the module tunables */
+	ldcp->reclaim_lowat = vnet_reclaim_lowat;
+	ldcp->reclaim_hiwat = vnet_reclaim_hiwat;
+
+	attach_state |= AST_ldc_alloc;
+
+	mutex_init(&ldcp->txlock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->cblock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->tclock, NULL, MUTEX_DRIVER, NULL);
+
+	attach_state |= AST_mutex_init;
+
+	/* describe the channel to the LDC framework */
+	attr.devclass = LDC_DEV_NT;
+	attr.instance = ddi_get_instance(vgenp->vnetdip);
+	attr.mode = LDC_MODE_UNRELIABLE;
+	attr.qlen = vnet_ldc_qlen;
+	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
+	if (status != 0) {
+		DWARN((vgenp->vnetp, "ldc_init failed, id (%lx) rv (%d)\n",
+		    ldc_id, status));
+		goto ldc_attach_failed;
+	}
+	attach_state |= AST_ldc_init;
+
+	status = ldc_reg_callback(ldcp->ldc_handle, vgen_ldc_cb, (caddr_t)ldcp);
+	if (status != 0) {
+		DWARN((vgenp->vnetp,
+		    "ldc_reg_callback failed, id (%lx) rv (%d)\n",
+		    ldc_id, status));
+		goto ldc_attach_failed;
+	}
+	attach_state |= AST_ldc_reg_cb;
+
+	(void) ldc_status(ldcp->ldc_handle, &istatus);
+	ASSERT(istatus == LDC_INIT);
+	ldcp->ldc_status = istatus;
+
+	/* allocate transmit resources */
+	status = vgen_alloc_tx_ring(ldcp);
+	if (status != 0) {
+		goto ldc_attach_failed;
+	}
+	attach_state |= AST_alloc_tx_ring;
+
+	/* Setup kstats for the channel */
+	status = vgen_setup_kstats(ldcp);
+	if (status != VGEN_SUCCESS) {
+		goto ldc_attach_failed;
+	}
+
+	/* initialize vgen_versions supported */
+	bcopy(vgen_versions, ldcp->vgen_versions, sizeof (ldcp->vgen_versions));
+
+	/* link it into the list of channels for this port */
+	WRITE_ENTER(&ldclp->rwlock);
+	prev_ldcp = (vgen_ldc_t **)(&ldclp->headp);
+	ldcp->nextp = *prev_ldcp;
+	*prev_ldcp = ldcp;
+	ldclp->num_ldcs++;
+	RW_EXIT(&ldclp->rwlock);
+
+	ldcp->flags |= CHANNEL_ATTACHED;
+	return (DDI_SUCCESS);
+
+ldc_attach_failed:
+	/* unwind only the steps recorded in attach_state */
+	if (attach_state & AST_alloc_tx_ring) {
+		vgen_free_tx_ring(ldcp);
+	}
+	if (attach_state & AST_ldc_reg_cb) {
+		(void) ldc_unreg_callback(ldcp->ldc_handle);
+	}
+	if (attach_state & AST_ldc_init) {
+		(void) ldc_fini(ldcp->ldc_handle);
+	}
+	if (attach_state & AST_mutex_init) {
+		mutex_destroy(&ldcp->tclock);
+		mutex_destroy(&ldcp->txlock);
+		mutex_destroy(&ldcp->cblock);
+	}
+	if (attach_state & AST_ldc_alloc) {
+		KMEM_FREE(ldcp);
+	}
+	return (DDI_FAILURE);
+}
+
+/*
+ * Detach a channel from the port: verify it is actually on the
+ * port's channel list, then tear down kstats, transmit resources and
+ * the LDC registration, unlink it from the list and free it.
+ */
+static void
+vgen_ldc_detach(vgen_ldc_t *ldcp)
+{
+	vgen_port_t	*portp;
+	vgen_t 		*vgenp;
+	vgen_ldc_t 	*pldcp;
+	vgen_ldc_t	**prev_ldcp;
+	vgen_ldclist_t	*ldclp;
+
+	portp = ldcp->portp;
+	vgenp = portp->vgenp;
+	ldclp = &portp->ldclist;
+
+	/* locate ldcp on the list; prev_ldcp ends up at its link field */
+	prev_ldcp =  (vgen_ldc_t **)&ldclp->headp;
+	for (; (pldcp = *prev_ldcp) != NULL; prev_ldcp = &pldcp->nextp) {
+		if (pldcp == ldcp) {
+			break;
+		}
+	}
+
+	if (pldcp == NULL) {
+		/* invalid ldcp? */
+		return;
+	}
+
+	/* warn if the channel was not brought back to INIT before detach */
+	if (ldcp->ldc_status != LDC_INIT) {
+		DWARN((vgenp->vnetp,
+		    "vgen_ldc_detach: ldc_status is not INIT id(%lx)\n",
+			    ldcp->ldc_id));
+	}
+
+	if (ldcp->flags & CHANNEL_ATTACHED) {
+		ldcp->flags &= ~(CHANNEL_ATTACHED);
+
+		/* reverse of the setup done in vgen_ldc_attach() */
+		vgen_destroy_kstats(ldcp);
+		/* free transmit resources */
+		vgen_free_tx_ring(ldcp);
+		(void) ldc_unreg_callback(ldcp->ldc_handle);
+		(void) ldc_fini(ldcp->ldc_handle);
+		mutex_destroy(&ldcp->tclock);
+		mutex_destroy(&ldcp->txlock);
+		mutex_destroy(&ldcp->cblock);
+
+		/* unlink it from the list */
+		*prev_ldcp = ldcp->nextp;
+		ldclp->num_ldcs--;
+		KMEM_FREE(ldcp);
+	}
+}
+
+/*
+ * This function allocates transmit resources for the channel.
+ * The resources consist of a transmit descriptor ring and an associated
+ * transmit buffer ring.
+ *
+ * Returns DDI_SUCCESS, or DDI_FAILURE if an allocation or any of the
+ * ldc_mem_dring_* calls fail (everything allocated so far is freed).
+ */
+static int
+vgen_alloc_tx_ring(vgen_ldc_t *ldcp)
+{
+	void *tbufp;
+	ldc_mem_info_t minfo;
+	uint32_t txdsize;
+	uint32_t tbufsize;
+	int status;
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	/* ring size comes from the module tunable */
+	ldcp->num_txds = vnet_ntxds;
+	txdsize = sizeof (vnet_public_desc_t);
+	tbufsize = sizeof (vgen_private_desc_t);
+
+	/* allocate transmit buffer ring */
+	tbufp = kmem_zalloc(ldcp->num_txds * tbufsize, KM_NOSLEEP);
+	if (tbufp == NULL) {
+		return (DDI_FAILURE);
+	}
+
+	/* create transmit descriptor ring */
+	status = ldc_mem_dring_create(ldcp->num_txds, txdsize,
+	    &ldcp->tx_dhandle);
+	if (status) {
+		DWARN((vnetp, "vgen_alloc_tx_ring: ldc_mem_dring_create() "
+		    "failed, id(%lx)\n", ldcp->ldc_id));
+		kmem_free(tbufp, ldcp->num_txds * tbufsize);
+		return (DDI_FAILURE);
+	}
+
+	/* get the addr of descriptor ring */
+	status = ldc_mem_dring_info(ldcp->tx_dhandle, &minfo);
+	if (status) {
+		DWARN((vnetp, "vgen_alloc_tx_ring: ldc_mem_dring_info() "
+		    "failed, id(%lx)\n", ldcp->ldc_id));
+		kmem_free(tbufp, ldcp->num_txds * tbufsize);
+		(void) ldc_mem_dring_destroy(ldcp->tx_dhandle);
+		ldcp->tbufp = NULL;
+		return (DDI_FAILURE);
+	}
+	/* publish the ring pointers and their one-past-the-end markers */
+	ldcp->txdp = (vnet_public_desc_t *)(minfo.vaddr);
+	ldcp->tbufp = tbufp;
+
+	ldcp->txdendp = &((ldcp->txdp)[ldcp->num_txds]);
+	ldcp->tbufendp = &((ldcp->tbufp)[ldcp->num_txds]);
+
+	return (DDI_SUCCESS);
+}
+
+/* Free transmit resources for the channel */
+static void
+vgen_free_tx_ring(vgen_ldc_t *ldcp)
+{
+	int tbufsize = sizeof (vgen_private_desc_t);
+
+	/* tear down the descriptor ring created in vgen_alloc_tx_ring() */
+	(void) ldc_mem_dring_destroy(ldcp->tx_dhandle);
+
+	/* release the private transmit buffer ring */
+	kmem_free(ldcp->tbufp, ldcp->num_txds * tbufsize);
+
+	/* clear the ring pointers so stale references are caught */
+	ldcp->txdp = NULL;
+	ldcp->txdendp = NULL;
+	ldcp->tbufp = NULL;
+	ldcp->tbufendp = NULL;
+}
+
+/* enable transmit/receive on the channels for the port */
+static void
+vgen_init_ldcs(vgen_port_t *portp)
+{
+	vgen_ldclist_t	*ldclp = &portp->ldclist;
+	vgen_ldc_t	*lp;
+
+	/* walk the channel list under the reader lock */
+	READ_ENTER(&ldclp->rwlock);
+	for (lp = ldclp->headp; lp != NULL; lp = lp->nextp)
+		(void) vgen_ldc_init(lp);
+	RW_EXIT(&ldclp->rwlock);
+}
+
+/* stop transmit/receive on the channels for the port */
+static void
+vgen_uninit_ldcs(vgen_port_t *portp)
+{
+	vgen_ldclist_t	*ldclp = &portp->ldclist;
+	vgen_ldc_t	*lp;
+
+	/* walk the channel list under the reader lock */
+	READ_ENTER(&ldclp->rwlock);
+	for (lp = ldclp->headp; lp != NULL; lp = lp->nextp)
+		vgen_ldc_uninit(lp);
+	RW_EXIT(&ldclp->rwlock);
+}
+
+/*
+ * Enable transmit/receive on the channel: open it, initialize the tx
+ * buffer ring, bind the tx descriptor ring to the channel, attempt to
+ * bring the channel up and start the transmit watchdog timer.
+ *
+ * Note: ldc_up() failing or the channel not yet being UP is not
+ * fatal here — only a debug message is emitted; the handshake is
+ * presumably driven later from the LDC_UP callback event (confirm in
+ * vgen_ldc_cb()).  The init_state bitmask drives the error unwind.
+ */
+static int
+vgen_ldc_init(vgen_ldc_t *ldcp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+	ldc_status_t	istatus;
+	int		rv;
+	enum		{ ST_init = 0x0, ST_init_tbufs = 0x1,
+			    ST_ldc_open = 0x2, ST_dring_bind = 0x4
+			    }
+			init_state;
+	uint32_t	ncookies = 0;
+
+	init_state = ST_init;
+
+	LDC_LOCK(ldcp);
+
+	rv = ldc_open(ldcp->ldc_handle);
+	if (rv != 0) {
+		DWARN((vnetp,
+		    "vgen_ldcinit: ldc_open failed: id<%lx> rv(%d)\n",
+		    ldcp->ldc_id, rv));
+		goto ldcinit_failed;
+	}
+	init_state |= ST_ldc_open;
+
+	(void) ldc_status(ldcp->ldc_handle, &istatus);
+	if (istatus != LDC_OPEN && istatus != LDC_READY) {
+		DWARN((vnetp,
+		    "vgen_ldcinit: id (%lx) status(%d) is not OPEN/READY\n",
+		    ldcp->ldc_id, istatus));
+		goto ldcinit_failed;
+	}
+	ldcp->ldc_status = istatus;
+
+	rv = vgen_init_tbufs(ldcp);
+	if (rv != 0) {
+		DWARN((vnetp,
+		    "vgen_ldcinit: vgen_init_tbufs() failed: id(%lx)\n",
+		    ldcp->ldc_id));
+		goto ldcinit_failed;
+	}
+	init_state |= ST_init_tbufs;
+
+	/* Bind descriptor ring to the channel */
+	rv = ldc_mem_dring_bind(ldcp->ldc_handle, ldcp->tx_dhandle,
+	    LDC_SHADOW_MAP, LDC_MEM_RW, &ldcp->tx_dcookie, &ncookies);
+	if (rv != 0) {
+		DWARN((vnetp, "vgen_ldcinit: id (%lx) "
+		    "ldc_mem_dring_bind failed\n", ldcp->ldc_id));
+		goto ldcinit_failed;
+	}
+
+	/* the whole ring is expected to be described by one cookie */
+	ASSERT(ncookies == 1);
+	ldcp->num_txdcookies = ncookies;
+
+	init_state |= ST_dring_bind;
+
+	/* not fatal if this fails; see the note in the header comment */
+	rv = ldc_up(ldcp->ldc_handle);
+	if (rv != 0) {
+		DBG2((vnetp,
+		    "vgen_ldcinit: ldc_up err id(%lx) rv(%d)\n",
+		    ldcp->ldc_id, rv));
+	}
+
+	(void) ldc_status(ldcp->ldc_handle, &istatus);
+	if (istatus != LDC_UP) {
+		DBG2((vnetp, "vgen_ldcinit: id(%lx) status(%d) is not UP\n",
+		    ldcp->ldc_id, istatus));
+	}
+	ldcp->ldc_status = istatus;
+
+	/* initialize transmit watchdog timeout */
+	ldcp->wd_tid = timeout(vgen_ldc_watchdog, (caddr_t)ldcp,
+	    drv_usectohz(vnet_ldcwd_interval * 1000));
+
+	ldcp->flags |= CHANNEL_STARTED;
+
+	LDC_UNLOCK(ldcp);
+	return (DDI_SUCCESS);
+
+ldcinit_failed:
+	/* unwind only the steps recorded in init_state */
+	if (init_state & ST_dring_bind) {
+		(void) ldc_mem_dring_unbind(ldcp->tx_dhandle);
+	}
+	if (init_state & ST_init_tbufs) {
+		vgen_uninit_tbufs(ldcp);
+	}
+	if (init_state & ST_ldc_open) {
+		(void) ldc_close(ldcp->ldc_handle);
+	}
+	LDC_UNLOCK(ldcp);
+	return (DDI_FAILURE);
+}
+
+/*
+ * Stop transmit/receive on the channel: disable further callbacks,
+ * briefly drop the lock so pending transmit/callback activity can
+ * drain, reset the handshake state, cancel the watchdog, unbind the
+ * tx descriptor ring and close the channel.
+ */
+static void
+vgen_ldc_uninit(vgen_ldc_t *ldcp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+	int	rv;
+
+	DBG1((vnetp, "vgen_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id));
+	LDC_LOCK(ldcp);
+
+	/* nothing to do if the channel was never started */
+	if ((ldcp->flags & CHANNEL_STARTED) == 0) {
+		LDC_UNLOCK(ldcp);
+		DWARN((vnetp, "vgen_ldc_uninit: id(%lx) CHANNEL_STARTED"
+		    " flag is not set\n", ldcp->ldc_id));
+		return;
+	}
+
+	/* disable further callbacks */
+	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
+	if (rv != 0) {
+		DWARN((vnetp, "vgen_ldc_uninit: id (%lx) "
+		    "ldc_set_cb_mode failed\n", ldcp->ldc_id));
+	}
+
+	/* clear handshake done bit and wait for pending tx and cb to finish */
+	ldcp->hphase &= ~(VH_DONE);
+	LDC_UNLOCK(ldcp);
+	drv_usecwait(1000);
+	LDC_LOCK(ldcp);
+
+	vgen_reset_hphase(ldcp);
+
+	/* reset transmit watchdog timeout */
+	if (ldcp->wd_tid) {
+		(void) untimeout(ldcp->wd_tid);
+		ldcp->wd_tid = 0;
+	}
+
+	/* unbind tx descriptor ring from the channel */
+	rv = ldc_mem_dring_unbind(ldcp->tx_dhandle);
+	if (rv != 0) {
+		DWARN((vnetp, "vgen_ldcuninit: ldc_mem_dring_unbind "
+		    "failed id(%lx)\n", ldcp->ldc_id));
+	}
+
+	vgen_uninit_tbufs(ldcp);
+
+	rv = ldc_close(ldcp->ldc_handle);
+	if (rv != 0) {
+		DWARN((vnetp, "vgen_ldcuninit: ldc_close err id(%lx)\n",
+		    ldcp->ldc_id));
+	}
+	/* back to the state vgen_ldc_attach() left the channel in */
+	ldcp->ldc_status = LDC_INIT;
+	ldcp->flags &= ~(CHANNEL_STARTED);
+
+	LDC_UNLOCK(ldcp);
+
+	DBG1((vnetp, "vgen_ldc_uninit: exit: id(%lx)\n", ldcp->ldc_id));
+}
+
+/*
+ * Initialize the transmit buffer ring for the channel: zero both
+ * rings, allocate an ldc memory handle for each private descriptor,
+ * mark every descriptor free, and reset the ring pointers, transmit
+ * sequence number and index.
+ *
+ * Returns DDI_SUCCESS; DDI_FAILURE if a handle allocation fails (all
+ * handles allocated so far are released via vgen_uninit_tbufs()).
+ */
+static int
+vgen_init_tbufs(vgen_ldc_t *ldcp)
+{
+	vgen_private_desc_t	*tbufp;
+	vnet_public_desc_t	*txdp;
+	vio_dring_entry_hdr_t		*hdrp;
+	int 			i;
+	int 			rv;
+
+	bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds));
+	bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds));
+
+	/*
+	 * for each tx buf (priv_desc), allocate a ldc mem_handle which is
+	 * required to map the data during transmit, set the flags
+	 * to free (available for use by transmit routine).
+	 */
+
+	for (i = 0; i < ldcp->num_txds; i++) {
+		tbufp = &(ldcp->tbufp[i]);
+		rv = ldc_mem_alloc_handle(ldcp->ldc_handle,
+			&(tbufp->memhandle));
+		if (rv) {
+			tbufp->memhandle = 0;
+			goto init_tbufs_failed;
+		}
+		tbufp->flags = VGEN_PRIV_DESC_FREE;
+		txdp = &(ldcp->txdp[i]);
+		hdrp = &txdp->hdr;
+		hdrp->dstate = VIO_DESC_FREE;
+		hdrp->ack = B_FALSE;
+		/* cross-link the private descriptor to its public one */
+		tbufp->descp = txdp;
+	}
+
+	/* reset tbuf walking pointers */
+	ldcp->next_tbufp = ldcp->tbufp;
+	ldcp->cur_tbufp = ldcp->tbufp;
+
+	/* initialize tx seqnum and index */
+	ldcp->next_txseq = VNET_ISS;
+	ldcp->next_txi = 0;
+
+	return (DDI_SUCCESS);
+
+init_tbufs_failed:;
+	vgen_uninit_tbufs(ldcp);
+	return (DDI_FAILURE);
+}
+
+/*
+ * Uninitialize the transmit buffer ring for the channel: for each
+ * private descriptor, unbind the memory handle and free the pending
+ * mblk if it is still in use, free the memory handle, and finally
+ * zero both rings.
+ */
+static void
+vgen_uninit_tbufs(vgen_ldc_t *ldcp)
+{
+	vgen_private_desc_t	*tbufp = ldcp->tbufp;
+	vnet_public_desc_t	*txdp;
+	vio_dring_entry_hdr_t		*hdrp;
+	int 			i;
+
+	/* for each tbuf (priv_desc), free ldc mem_handle */
+	for (i = 0; i < ldcp->num_txds; i++) {
+
+		tbufp = &(ldcp->tbufp[i]);
+		txdp = tbufp->descp;
+		hdrp = &txdp->hdr;
+
+		/* in-use descriptor: release its mapping and packet first */
+		if (tbufp->flags != VGEN_PRIV_DESC_FREE) {
+			(void) ldc_mem_unbind_handle(tbufp->memhandle);
+			freemsg(tbufp->mp);
+			tbufp->mp = NULL;
+			tbufp->flags = VGEN_PRIV_DESC_FREE;
+			hdrp->dstate = VIO_DESC_FREE;
+			hdrp->ack = B_FALSE;
+		}
+		if (tbufp->memhandle) {
+			(void) ldc_mem_free_handle(tbufp->memhandle);
+			tbufp->memhandle = 0;
+		}
+		tbufp->descp = NULL;
+	}
+
+	bzero(ldcp->tbufp, sizeof (*tbufp) * (ldcp->num_txds));
+	bzero(ldcp->txdp, sizeof (*txdp) * (ldcp->num_txds));
+}
+
+/*
+ * Clobber the tx descriptor ring: reclaim every busy descriptor
+ * (unbind its memory handle, free its mblk, mark it free) and reset
+ * the ring pointers, sequence number and index, keeping the memory
+ * handles allocated.  NOTE(review): presumably used when the channel
+ * is reset and peer state is gone — confirm against the callers.
+ */
+static void
+vgen_clobber_tbufs(vgen_ldc_t *ldcp)
+{
+	vnet_public_desc_t	*txdp;
+	vgen_private_desc_t	*tbufp;
+	vio_dring_entry_hdr_t		*hdrp;
+	void *vnetp = LDC_TO_VNET(ldcp);
+	int i;
+#ifdef DEBUG
+	/* count of reclaimed descriptors the peer had marked DONE */
+	int ndone = 0;
+#endif
+
+	for (i = 0; i < ldcp->num_txds; i++) {
+
+		tbufp = &(ldcp->tbufp[i]);
+		txdp = tbufp->descp;
+		hdrp = &txdp->hdr;
+
+		if (tbufp->flags & VGEN_PRIV_DESC_BUSY) {
+			(void) ldc_mem_unbind_handle(tbufp->memhandle);
+			freemsg(tbufp->mp);
+			tbufp->mp = NULL;
+			tbufp->flags = VGEN_PRIV_DESC_FREE;
+#ifdef DEBUG
+			if (hdrp->dstate == VIO_DESC_DONE)
+				ndone++;
+#endif
+			hdrp->dstate = VIO_DESC_FREE;
+			hdrp->ack = B_FALSE;
+		}
+	}
+	/* reset tbuf walking pointers */
+	ldcp->next_tbufp = ldcp->tbufp;
+	ldcp->cur_tbufp = ldcp->tbufp;
+
+	/* reset tx seqnum and index */
+	ldcp->next_txseq = VNET_ISS;
+	ldcp->next_txi = 0;
+#ifdef DEBUG
+	DBG2((vnetp,
+	    "vgen_clobber_tbufs: id(0x%lx) num descrs done (%d)\n",
+	    ldcp->ldc_id, ndone));
+#endif
+}
+
+/* clobber receive descriptor ring */
+static void
+vgen_clobber_rxds(vgen_ldc_t *ldcp)
+{
+	/* forget the mapped rx dring state */
+	ldcp->rx_dhandle = 0;
+	bzero(&ldcp->rx_dcookie, sizeof (ldcp->rx_dcookie));
+	ldcp->rxdp = NULL;
+	ldcp->num_rxds = 0;
+
+	/* restart rx index and sequence number from their initial values */
+	ldcp->next_rxi = 0;
+	ldcp->next_rxseq = VNET_ISS;
+}
+
+/*
+ * Initialize the receive descriptor ring: map the peer's descriptor
+ * ring into this channel via the given cookie(s), look up the mapped
+ * address, and record the ring geometry in the channel state.
+ *
+ * Returns DDI_SUCCESS; DDI_FAILURE if mapping or the info lookup
+ * fails (the mapping is undone on the latter).
+ */
+static int
+vgen_init_rxds(vgen_ldc_t *ldcp, uint32_t num_desc, uint32_t desc_size,
+	ldc_mem_cookie_t *dcookie, uint32_t ncookies)
+{
+	int rv;
+	ldc_mem_info_t minfo;
+
+	rv = ldc_mem_dring_map(ldcp->ldc_handle, dcookie, ncookies, num_desc,
+	    desc_size, LDC_SHADOW_MAP, &(ldcp->rx_dhandle));
+	if (rv != 0) {
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * successfully mapped, now try to
+	 * get info about the mapped dring
+	 */
+	rv = ldc_mem_dring_info(ldcp->rx_dhandle, &minfo);
+	if (rv != 0) {
+		(void) ldc_mem_dring_unmap(ldcp->rx_dhandle);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * save ring address, number of descriptors.
+	 */
+	ldcp->rxdp = (vnet_public_desc_t *)(minfo.vaddr);
+	bcopy(dcookie, &(ldcp->rx_dcookie), sizeof (*dcookie));
+	ldcp->num_rxdcookies = ncookies;
+	ldcp->num_rxds = num_desc;
+	ldcp->next_rxi = 0;
+	ldcp->next_rxseq = VNET_ISS;
+
+	return (DDI_SUCCESS);
+}
+
+/* get channel statistics */
+static uint64_t
+vgen_ldc_stat(vgen_ldc_t *ldcp, enum mac_stat stat)
+{
+	vgen_stats_t	*statsp = ldcp->statsp;
+
+	/*
+	 * Return the counter corresponding to the requested MAC stat.
+	 * Stats that have no meaning for a virtual channel (link
+	 * capabilities, collision details, transceiver info, etc.)
+	 * read as 0 via the default case.
+	 */
+	switch (stat) {
+
+	case MAC_STAT_MULTIRCV:
+		return (statsp->multircv);
+
+	case MAC_STAT_BRDCSTRCV:
+		return (statsp->brdcstrcv);
+
+	case MAC_STAT_MULTIXMT:
+		return (statsp->multixmt);
+
+	case MAC_STAT_BRDCSTXMT:
+		return (statsp->brdcstxmt);
+
+	case MAC_STAT_NORCVBUF:
+		return (statsp->norcvbuf);
+
+	case MAC_STAT_IERRORS:
+		return (statsp->ierrors);
+
+	case MAC_STAT_NOXMTBUF:
+		return (statsp->noxmtbuf);
+
+	case MAC_STAT_OERRORS:
+		return (statsp->oerrors);
+
+	case MAC_STAT_RBYTES:
+		return (statsp->rbytes);
+
+	case MAC_STAT_IPACKETS:
+		return (statsp->ipackets);
+
+	case MAC_STAT_OBYTES:
+		return (statsp->obytes);
+
+	case MAC_STAT_OPACKETS:
+		return (statsp->opackets);
+
+	/* stats not relevant to ldc, return 0 */
+	default:
+		return (0);
+	}
+}
+
+/* populate the mac entry points for this vgen instance */
+static void
+vgen_init_macp(vgen_t *vgenp, mac_t *macp)
+{
+	/* the mac layer passes m_driver back to us in each callback */
+	macp->m_driver = (void *)vgenp;
+
+	/* start/stop and transmit entry points */
+	macp->m_start = vgen_start;
+	macp->m_stop = vgen_stop;
+	macp->m_tx = vgen_tx;
+	macp->m_resources = vgen_resources;
+
+	/* address and filter management */
+	macp->m_unicst = vgen_unicst;
+	macp->m_multicst = vgen_multicst;
+	macp->m_promisc = vgen_promisc;
+
+	/* statistics and ioctl */
+	macp->m_stat = vgen_stat;
+	macp->m_ioctl = vgen_ioctl;
+}
+
+/* Interrupt handler for the channel */
+static uint_t
+vgen_ldc_cb(uint64_t event, caddr_t arg)
+{
+	_NOTE(ARGUNUSED(event))
+	vgen_ldc_t	*ldcp;
+	void 		*vnetp;
+	vgen_t		*vgenp;
+	size_t		msglen;
+	ldc_status_t 	istatus;
+	uint64_t	ldcmsg[7];
+	int 		rv;
+	vio_msg_tag_t	*tagp;
+	mblk_t		*mp = NULL;
+	mblk_t		*bp = NULL;
+	mblk_t		*bpt = NULL;
+	mblk_t		*headp = NULL;
+	mblk_t		*tailp = NULL;
+	vgen_stats_t	*statsp;
+
+	ldcp = (vgen_ldc_t *)arg;
+	vgenp = LDC_TO_VGEN(ldcp);
+	vnetp = LDC_TO_VNET(ldcp);
+	statsp = ldcp->statsp;
+
+	DBG1((vnetp, "vgen_ldc_cb enter: ldcid (%lx)\n", ldcp->ldc_id));
+
+	mutex_enter(&ldcp->cblock);
+	statsp->callbacks++;
+	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
+		DWARN((vnetp, "vgen_ldc_cb: id(%lx), status(%d) is LDC_INIT\n",
+		    ldcp->ldc_id, ldcp->ldc_status));
+		mutex_exit(&ldcp->cblock);
+		return (LDC_SUCCESS);
+	}
+
+	/* check ldc status change events first */
+	(void) ldc_status(ldcp->ldc_handle, &istatus);
+
+	if (istatus != ldcp->ldc_status) {
+		switch (istatus) {
+		case LDC_UP:
+			ldcp->ldc_status = istatus;
+			DBG1((vnetp,
+			    "vgen_ldc_cb: id(%lx) status(%d) is LDC_UP\n",
+			    ldcp->ldc_id, ldcp->ldc_status));
+
+			if (ldcp->portp != vgenp->vsw_portp) {
+				/*
+				 * modify fdb entry to use this port as the
+				 * channel is up, instead of going through the
+				 * vsw-port (see comments in vgen_port_init())
+				 */
+				vnet_modify_fdb(vnetp,
+				    (uint8_t *)&ldcp->portp->macaddr,
+				    vgen_tx, ldcp->portp);
+			}
+			/* Initialize local session id */
+			ldcp->local_sid = ddi_get_lbolt();
+			/* clear peer session id */
+			ldcp->peer_sid = 0;
+			ldcp->hretries = 0;
+			/* Initiate Handshake process with peer ldc endpoint */
+			vgen_handshake_reset(ldcp);
+			vgen_handshake(vh_nextphase(ldcp));
+			break;
+
+		case LDC_OPEN:
+		case LDC_READY:
+			ldcp->ldc_status = istatus;
+			if ((ldcp->portp != vgenp->vsw_portp) &&
+				(vgenp->vsw_portp != NULL)) {
+				/*
+				 * modify fdb entry to use vsw-port  as the
+				 * channel is reset and we don't have a direct
+				 * link to the destination (see comments
+				 * in vgen_port_init()).
+				 */
+				vnet_modify_fdb(vnetp,
+				    (uint8_t *)&ldcp->portp->macaddr,
+				    vgen_tx, vgenp->vsw_portp);
+			}
+			/* clear sids */
+			ldcp->local_sid = 0;
+			ldcp->peer_sid = 0;
+			if (ldcp->hphase != VH_PHASE0) {
+				vgen_handshake_reset(ldcp);
+			}
+			DBG1((vnetp,
+			    "vgen_ldc_cb: id(%lx) status is (%d)\n",
+			    ldcp->ldc_id, ldcp->ldc_status));
+			break;
+
+		default:
+			DWARN((vnetp,
+			    "vgen_ldc_cb: id(%lx) istatus=(%d) status(%d) is"
+			    " *UNKNOWN*\n",
+			    ldcp->ldc_id, istatus, ldcp->ldc_status));
+			break;
+		}
+	}
+
+	if (istatus != LDC_UP) {
+		DBG1((vnetp, "vgen_ldc_cb: id(%lx) status(%d) is NOT LDC_UP\n",
+			    ldcp->ldc_id, ldcp->ldc_status));
+		mutex_exit(&ldcp->cblock);
+		return (LDC_SUCCESS);
+	}
+
+	/* if ldc_status is UP, receive all packets */
+	do {
+		msglen = sizeof (ldcmsg);
+		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&ldcmsg, &msglen);
+
+		if (rv != 0) {
+			DWARN((vnetp,
+			    "vgen_ldc_cb:ldc_read err id(%lx) rv(%d) "
+			    "len(%d)\n", ldcp->ldc_id, rv, msglen));
+			break;
+		}
+		if (msglen == 0) {
+			DBG2((vnetp, "vgen_ldc_cb: ldc_read id(%lx) NODATA",
+			ldcp->ldc_id));
+			break;
+		}
+		DBG2((vnetp, "vgen_ldc_cb: ldc_read id(%lx): msglen(%d)",
+		    ldcp->ldc_id, msglen));
+
+		tagp = (vio_msg_tag_t *)ldcmsg;
+
+		if (ldcp->peer_sid) {
+			/*
+			 * check sid only after we have received peer's sid
+			 * in the version negotiate msg.
+			 */
+#ifdef DEBUG
+			if (vgen_hdbg & HDBG_BAD_SID) {
+				/* simulate bad sid condition */
+				tagp->vio_sid = 0;
+				vgen_hdbg &= ~(HDBG_BAD_SID);
+			}
+#endif
+			if (vgen_check_sid(ldcp, tagp) == VGEN_FAILURE) {
+				/*
+				 * If sid mismatch is detected,
+				 * reset the channel.
+				 */
+				ldcp->need_ldc_reset = B_TRUE;
+				vgen_handshake_reset(ldcp);
+				mutex_exit(&ldcp->cblock);
+				return (LDC_SUCCESS);
+			}
+		}
+
+		switch (tagp->vio_msgtype) {
+		case VIO_TYPE_CTRL:
+			vgen_handle_ctrlmsg(ldcp, tagp);
+			break;
+
+		case VIO_TYPE_DATA:
+			headp = tailp = NULL;
+			vgen_handle_datamsg(ldcp, tagp, &headp, &tailp);
+			/* build a chain of received packets */
+			if (headp != NULL) {
+				if (bp == NULL) {
+					bp = headp;
+					bpt = tailp;
+				} else {
+					bpt->b_next = headp;
+					bpt = tailp;
+				}
+			}
+			break;
+
+		case VIO_TYPE_ERR:
+			vgen_handle_errmsg(ldcp, tagp);
+			break;
+
+		default:
+			DWARN((vnetp,
+			    "vgen_ldc_cb: Unknown VIO_TYPE(%x)\n",
+			    tagp->vio_msgtype));
+			break;
+		}
+
+	} while (msglen);
+
+	mutex_exit(&ldcp->cblock);
+	/* send up the received packets to MAC layer */
+	while (bp != NULL) {
+		mp = bp;
+		bp = bp->b_next;
+		mp->b_next = mp->b_prev = NULL;
+		DBG2((vnetp, "vgen_ldc_cb: id(%lx) rx pkt len (%lx)\n",
+		    ldcp->ldc_id, MBLKL(mp)));
+		mac_rx((mac_t *)vgenp->vnetmacp, vgenp->mrh, mp);
+	}
+	DBG1((vnetp, "vgen_ldc_cb exit: ldcid (%lx)\n", ldcp->ldc_id));
+
+	return (LDC_SUCCESS);
+}
+
+/* vgen handshake functions */
+
+/* change the hphase for the channel to the next phase */
+static vgen_ldc_t *
+vh_nextphase(vgen_ldc_t *ldcp)
+{
+	/* phases advance linearly; VH_PHASE3 is the last numbered phase */
+	if (ldcp->hphase == VH_PHASE3) {
+		ldcp->hphase = VH_DONE;
+	} else {
+		ldcp->hphase++;
+	}
+	/* returns ldcp so callers can write vgen_handshake(vh_nextphase(ldcp)) */
+	return (ldcp);
+}
+
+/*
+ * Check whether the given version is supported or not and
+ * return VGEN_SUCCESS if supported.
+ */
+static int
+vgen_supported_version(vgen_ldc_t *ldcp, uint16_t ver_major,
+uint16_t ver_minor)
+{
+	vgen_ver_t	*versions = ldcp->vgen_versions;
+	int		i = 0;
+
+	/*
+	 * Linear scan of the per-channel version table; a (0,0) entry
+	 * terminates the table before VGEN_NUM_VER slots are used.
+	 */
+	while (i < VGEN_NUM_VER) {
+		if ((versions[i].ver_major == 0) &&
+		    (versions[i].ver_minor == 0)) {
+			break;
+		}
+		/* exact (major,minor) match required */
+		if ((versions[i].ver_major == ver_major) &&
+			(versions[i].ver_minor == ver_minor)) {
+			return (VGEN_SUCCESS);
+		}
+		i++;
+	}
+	return (VGEN_FAILURE);
+}
+
+/*
+ * Given a version, return VGEN_SUCCESS if a lower version is supported.
+ */
+static int
+vgen_next_version(vgen_ldc_t *ldcp, vgen_ver_t *verp)
+{
+	vgen_ver_t	*versions = ldcp->vgen_versions;
+	int		i = 0;
+
+	/*
+	 * Scan for the first table entry strictly lower than *verp.
+	 * NOTE(review): this returns the FIRST lower entry found, which is
+	 * the next-lower version only if the table is sorted highest-first
+	 * (as vgen_reset_hphase() assumes when it picks versions[0]) —
+	 * TODO confirm the table is kept sorted.
+	 */
+	while (i < VGEN_NUM_VER) {
+		/* a (0,0) entry terminates the table */
+		if ((versions[i].ver_major == 0) &&
+		    (versions[i].ver_minor == 0)) {
+			break;
+		}
+		/*
+		 * if we support a lower minor version within the same major
+		 * version, or if we support a lower major version,
+		 * update the verp parameter with this lower version and
+		 * return success.
+		 */
+		if (((versions[i].ver_major == verp->ver_major) &&
+			(versions[i].ver_minor < verp->ver_minor)) ||
+			(versions[i].ver_major < verp->ver_major)) {
+				verp->ver_major = versions[i].ver_major;
+				verp->ver_minor = versions[i].ver_minor;
+				return (VGEN_SUCCESS);
+		}
+		i++;
+	}
+
+	return (VGEN_FAILURE);
+}
+
+/*
+ * wrapper routine to send the given message over ldc using ldc_write().
+ */
+static int
+vgen_sendmsg(vgen_ldc_t *ldcp, caddr_t msg,  size_t msglen,
+    boolean_t caller_holds_lock)
+{
+	int	rv;
+	size_t	len;
+	void *vnetp = LDC_TO_VNET(ldcp);
+	uint32_t retries = 0;
+
+	len = msglen;
+	/* reject empty or NULL messages up front */
+	if ((len == 0) || (msg == NULL))
+		return (VGEN_FAILURE);
+
+	/* caller_holds_lock is B_TRUE when txlock is already held */
+	if (!caller_holds_lock) {
+		mutex_enter(&ldcp->txlock);
+	}
+
+	/*
+	 * Retry ldc_write() while the channel would block, up to
+	 * vgen_ldcwr_retries attempts; len is reset each iteration
+	 * because ldc_write() updates it in place.
+	 */
+	do {
+		len = msglen;
+		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msg, &len);
+		if (retries++ >= vgen_ldcwr_retries)
+			break;
+	} while (rv == EWOULDBLOCK);
+
+	if (!caller_holds_lock) {
+		mutex_exit(&ldcp->txlock);
+	}
+
+	/* a short write (len != msglen) is treated as a failure too */
+	if ((rv != 0) || (len != msglen)) {
+		DWARN((vnetp,
+		    "vgen_sendmsg: ldc_write failed: id(%lx) rv(%d)"
+		    " msglen (%d)\n", ldcp->ldc_id, rv, msglen));
+		return (VGEN_FAILURE);
+	}
+	return (VGEN_SUCCESS);
+}
+
+/* send version negotiate message to the peer over ldc */
+static int
+vgen_send_version_negotiate(vgen_ldc_t *ldcp)
+{
+	vio_ver_msg_t	vermsg;
+	vio_msg_tag_t	*tagp = &vermsg.tag;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		rv;
+
+	bzero(&vermsg, sizeof (vermsg));
+
+	/* VIO control message carrying our proposed version */
+	tagp->vio_msgtype = VIO_TYPE_CTRL;
+	tagp->vio_subtype = VIO_SUBTYPE_INFO;
+	tagp->vio_subtype_env = VIO_VER_INFO;
+	tagp->vio_sid = ldcp->local_sid;
+
+	/* get version msg payload from ldcp->local */
+	vermsg.ver_major = ldcp->local_hparams.ver_major;
+	vermsg.ver_minor = ldcp->local_hparams.ver_minor;
+	vermsg.dev_class = ldcp->local_hparams.dev_class;
+
+	rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (vermsg), B_FALSE);
+	if (rv != VGEN_SUCCESS) {
+		DWARN((vnetp, "vgen_send_version_negotiate: vgen_sendmsg failed"
+		    "id (%lx)\n", ldcp->ldc_id));
+		return (VGEN_FAILURE);
+	}
+
+	/* record handshake progress for vgen_handshake_done() */
+	ldcp->hstate |= VER_INFO_SENT;
+	DBG2((vnetp,
+	    "vgen_send_version_negotiate: VER_INFO_SENT id (%lx) ver(%d,%d)\n",
+	    ldcp->ldc_id, vermsg.ver_major, vermsg.ver_minor));
+
+	return (VGEN_SUCCESS);
+}
+
+/* send attr info message to the peer over ldc */
+static int
+vgen_send_attr_info(vgen_ldc_t *ldcp)
+{
+	vnet_attr_msg_t	attrmsg;
+	vio_msg_tag_t	*tagp = &attrmsg.tag;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		rv;
+
+	bzero(&attrmsg, sizeof (attrmsg));
+
+	/* VIO control message carrying our link attributes */
+	tagp->vio_msgtype = VIO_TYPE_CTRL;
+	tagp->vio_subtype = VIO_SUBTYPE_INFO;
+	tagp->vio_subtype_env = VIO_ATTR_INFO;
+	tagp->vio_sid = ldcp->local_sid;
+
+	/* get attr msg payload from ldcp->local (set in vgen_reset_hphase()) */
+	attrmsg.mtu = ldcp->local_hparams.mtu;
+	attrmsg.addr = ldcp->local_hparams.addr;
+	attrmsg.addr_type = ldcp->local_hparams.addr_type;
+	attrmsg.xfer_mode = ldcp->local_hparams.xfer_mode;
+	attrmsg.ack_freq = ldcp->local_hparams.ack_freq;
+
+	rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (attrmsg), B_FALSE);
+	if (rv != VGEN_SUCCESS) {
+		DWARN((vnetp, "vgen_send_attr_info: vgen_sendmsg failed"
+		    "id (%lx)\n", ldcp->ldc_id));
+		return (VGEN_FAILURE);
+	}
+
+	/* record handshake progress for vgen_handshake_done() */
+	ldcp->hstate |= ATTR_INFO_SENT;
+	DBG2((vnetp, "vgen_send_attr_info: ATTR_INFO_SENT id (%lx)\n",
+	    ldcp->ldc_id));
+
+	return (VGEN_SUCCESS);
+}
+
+/* send descriptor ring register message to the peer over ldc */
+static int
+vgen_send_dring_reg(vgen_ldc_t *ldcp)
+{
+	vio_dring_reg_msg_t	msg;
+	vio_msg_tag_t		*tagp = &msg.tag;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		rv;
+
+	bzero(&msg, sizeof (msg));
+
+	/* VIO control message registering our transmit descriptor ring */
+	tagp->vio_msgtype = VIO_TYPE_CTRL;
+	tagp->vio_subtype = VIO_SUBTYPE_INFO;
+	tagp->vio_subtype_env = VIO_DRING_REG;
+	tagp->vio_sid = ldcp->local_sid;
+
+	/* get dring info msg payload from ldcp->local */
+	bcopy(&(ldcp->local_hparams.dring_cookie), (msg.cookie),
+		sizeof (ldc_mem_cookie_t));
+	msg.ncookies = ldcp->local_hparams.num_dcookies;
+	msg.num_descriptors = ldcp->local_hparams.num_desc;
+	msg.descriptor_size = ldcp->local_hparams.desc_size;
+
+	/*
+	 * dring_ident is set to 0. After mapping the dring, peer sets this
+	 * value and sends it in the ack, which is saved in
+	 * vgen_handle_dring_reg().
+	 */
+	msg.dring_ident = 0;
+
+	rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (msg), B_FALSE);
+	if (rv != VGEN_SUCCESS) {
+		DWARN((vnetp, "vgen_send_dring_reg: vgen_sendmsg failed"
+		    "id (%lx)\n", ldcp->ldc_id));
+		return (VGEN_FAILURE);
+	}
+
+	/* record handshake progress for vgen_handshake_done() */
+	ldcp->hstate |= DRING_INFO_SENT;
+	DBG2((vnetp, "vgen_send_dring_reg: DRING_INFO_SENT id (%lx)\n",
+	    ldcp->ldc_id));
+
+	return (VGEN_SUCCESS);
+}
+
+/* send rdx (ready-to-exchange-data) info message to the peer over ldc */
+static int
+vgen_send_rdx_info(vgen_ldc_t *ldcp)
+{
+	vio_rdx_msg_t	rdxmsg;
+	vio_msg_tag_t	*tagp = &rdxmsg.tag;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		rv;
+
+	bzero(&rdxmsg, sizeof (rdxmsg));
+
+	/* RDX has no payload beyond the tag itself */
+	tagp->vio_msgtype = VIO_TYPE_CTRL;
+	tagp->vio_subtype = VIO_SUBTYPE_INFO;
+	tagp->vio_subtype_env = VIO_RDX;
+	tagp->vio_sid = ldcp->local_sid;
+
+	rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (rdxmsg), B_FALSE);
+	if (rv != VGEN_SUCCESS) {
+		DWARN((vnetp, "vgen_send_rdx_info: vgen_sendmsg failed"
+		    "id (%lx)\n", ldcp->ldc_id));
+		return (VGEN_FAILURE);
+	}
+
+	/* record handshake progress for vgen_handshake_done() */
+	ldcp->hstate |= RDX_INFO_SENT;
+	DBG2((vnetp, "vgen_send_rdx_info: RDX_INFO_SENT id (%lx)\n",
+	    ldcp->ldc_id));
+
+	return (VGEN_SUCCESS);
+}
+
+/* send descriptor ring data message to the peer over ldc */
+static int
+vgen_send_dring_data(vgen_ldc_t *ldcp, uint32_t start, uint32_t end,
+	uint64_t next_txseq)
+{
+	vio_dring_msg_t	dringmsg, *msgp = &dringmsg;
+	vio_msg_tag_t	*tagp = &msgp->tag;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		rv;
+
+	bzero(msgp, sizeof (*msgp));
+
+	/* VIO data message announcing descriptors [start, end] are ready */
+	tagp->vio_msgtype = VIO_TYPE_DATA;
+	tagp->vio_subtype = VIO_SUBTYPE_INFO;
+	tagp->vio_subtype_env = VIO_DRING_DATA;
+	tagp->vio_sid = ldcp->local_sid;
+
+	msgp->seq_num = next_txseq;
+	/* dring_ident was assigned by the peer in its DRING_REG ack */
+	msgp->dring_ident = ldcp->local_hparams.dring_ident;
+	msgp->start_idx = start;
+	msgp->end_idx = end;
+
+	/* B_TRUE: caller is on the tx path and already holds txlock */
+	rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (dringmsg), B_TRUE);
+	if (rv != VGEN_SUCCESS) {
+		DWARN((vnetp, "vgen_send_dring_data: vgen_sendmsg failed"
+		    "id (%lx)\n", ldcp->ldc_id));
+		return (VGEN_FAILURE);
+	}
+
+	DBG2((vnetp, "vgen_send_dring_data: DRING_DATA_SENT id (%lx)\n",
+	    ldcp->ldc_id));
+
+	return (VGEN_SUCCESS);
+}
+
+/* send multicast addr info message to vsw */
+static int
+vgen_send_mcast_info(vgen_ldc_t *ldcp)
+{
+	vnet_mcast_msg_t	mcastmsg;
+	vnet_mcast_msg_t	*msgp;
+	vio_msg_tag_t		*tagp;
+	vgen_t			*vgenp;
+	void			*vnetp;
+	struct ether_addr	*mca;
+	int			rv;
+	int			i;
+	uint32_t		size;
+	uint32_t		mccount;
+	uint32_t		n;
+
+	msgp = &mcastmsg;
+	tagp = &msgp->tag;
+	vgenp = LDC_TO_VGEN(ldcp);
+	vnetp = LDC_TO_VNET(ldcp);
+
+	mccount = vgenp->mccount;
+	i = 0;
+
+	/*
+	 * The multicast table (vgenp->mctab) is sent in chunks of at most
+	 * VNET_NUM_MCAST addresses per message.
+	 * NOTE(review): if mccount is 0 on entry, the do-while still sends
+	 * one message with count == 0 — confirm the peer treats that as a
+	 * no-op.
+	 */
+	do {
+		tagp->vio_msgtype = VIO_TYPE_CTRL;
+		tagp->vio_subtype = VIO_SUBTYPE_INFO;
+		tagp->vio_subtype_env = VNET_MCAST_INFO;
+		tagp->vio_sid = ldcp->local_sid;
+
+		/* number of addresses carried by this chunk */
+		n = ((mccount >= VNET_NUM_MCAST) ? VNET_NUM_MCAST : mccount);
+		size = n * sizeof (struct ether_addr);
+
+		mca = &(vgenp->mctab[i]);
+		bcopy(mca, (msgp->mca), size);
+		msgp->set = B_TRUE;
+		msgp->count = n;
+
+		rv = vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msgp),
+		    B_FALSE);
+		if (rv != VGEN_SUCCESS) {
+			DWARN((vnetp, "vgen_send_mcast_info: vgen_sendmsg err"
+			    "id (%lx)\n", ldcp->ldc_id));
+			return (VGEN_FAILURE);
+		}
+
+		/* advance to the next chunk */
+		mccount -= n;
+		i += n;
+
+	} while (mccount);
+
+	return (VGEN_SUCCESS);
+}
+
+/* Initiate Phase 2 of handshake */
+/* Initiate Phase 2 of handshake: send attr info then dring register */
+static int
+vgen_handshake_phase2(vgen_ldc_t *ldcp)
+{
+	int rv;
+#ifdef DEBUG
+	/* fault-injection hooks controlled by the vgen_hdbg flag word */
+	if (vgen_hdbg & HDBG_OUT_STATE) {
+		/* simulate out of state condition: send RDX in phase 2 */
+		vgen_hdbg &= ~(HDBG_OUT_STATE);
+		rv = vgen_send_rdx_info(ldcp);
+		return (rv);
+	}
+	if (vgen_hdbg & HDBG_TIMEOUT) {
+		/* simulate timeout condition: send nothing this phase */
+		vgen_hdbg &= ~(HDBG_TIMEOUT);
+		return (VGEN_SUCCESS);
+	}
+#endif
+	if ((rv = vgen_send_attr_info(ldcp)) != VGEN_SUCCESS) {
+		return (rv);
+	}
+	if ((rv = vgen_send_dring_reg(ldcp)) != VGEN_SUCCESS) {
+		return (rv);
+	}
+
+	return (VGEN_SUCCESS);
+}
+
+/*
+ * This function resets the handshake phase to VH_PHASE0(pre-handshake phase).
+ * This can happen after a channel comes up (status: LDC_UP) or
+ * when handshake gets terminated due to various conditions.
+ */
+static void
+vgen_reset_hphase(vgen_ldc_t *ldcp)
+{
+	vgen_t	*vgenp = LDC_TO_VGEN(ldcp);
+	void	*vnetp = LDC_TO_VNET(ldcp);
+	ldc_status_t istatus;
+
+	DBG2((vnetp, "vgen_reset_hphase: id(0x%lx)\n", ldcp->ldc_id));
+	/* reset hstate and hphase */
+	ldcp->hstate = 0;
+	ldcp->hphase = VH_PHASE0;
+
+	/* reset handshake watchdog timeout (armed in vgen_handshake()) */
+	if (ldcp->htid) {
+		(void) untimeout(ldcp->htid);
+		ldcp->htid = 0;
+	}
+
+	/*
+	 * Unmap drings, if dring_ready is set.
+	 */
+	if (ldcp->local_hparams.dring_ready) {
+		ldcp->local_hparams.dring_ready = B_FALSE;
+		/* do not unbind our dring */
+	}
+
+	if (ldcp->peer_hparams.dring_ready) {
+		ldcp->peer_hparams.dring_ready = B_FALSE;
+		/* Unmap peer's dring */
+		(void) ldc_mem_dring_unmap(ldcp->rx_dhandle);
+		vgen_clobber_rxds(ldcp);
+	}
+
+	/* reclaim/reset our transmit buffers */
+	vgen_clobber_tbufs(ldcp);
+
+	/*
+	 * clear local handshake params and initialize.
+	 */
+	bzero(&(ldcp->local_hparams), sizeof (ldcp->local_hparams));
+
+#ifdef DEBUG
+#if 0
+	if (vgen_hdbg & HDBG_VERSION) {
+		bcopy(dbg_vgen_versions, ldcp->vgen_versions,
+		    sizeof (ldcp->vgen_versions));
+	}
+#endif
+#endif
+	/* set version to the highest version supported (table entry 0) */
+	ldcp->local_hparams.ver_major =
+			ldcp->vgen_versions[0].ver_major;
+	ldcp->local_hparams.ver_minor =
+			ldcp->vgen_versions[0].ver_minor;
+	ldcp->local_hparams.dev_class = VDEV_NETWORK;
+
+	/* set attr_info params advertised in vgen_send_attr_info() */
+	ldcp->local_hparams.mtu = ETHERMAX;
+	ldcp->local_hparams.addr =
+		vgen_macaddr_strtoul(vgenp->macaddr);
+	ldcp->local_hparams.addr_type = ADDR_TYPE_MAC;
+	ldcp->local_hparams.xfer_mode = VIO_DRING_MODE;
+	ldcp->local_hparams.ack_freq = 0;	/* don't need acks */
+
+#ifdef DEBUG
+#if 0
+	vgen_print_attr_info(ldcp, VGEN_LOCAL);
+#endif
+#endif
+
+	/*
+	 * set dring_info params.
+	 * Note: dring is already created and bound.
+	 */
+	bcopy(&(ldcp->tx_dcookie), &(ldcp->local_hparams.dring_cookie),
+		sizeof (ldc_mem_cookie_t));
+	ldcp->local_hparams.num_dcookies = ldcp->num_txdcookies;
+	ldcp->local_hparams.num_desc = ldcp->num_txds;
+	ldcp->local_hparams.desc_size = sizeof (vnet_public_desc_t);
+
+	/*
+	 * dring_ident is set to 0. After mapping the dring, peer sets this
+	 * value and sends it in the ack, which is saved in
+	 * vgen_handle_dring_reg().
+	 */
+	ldcp->local_hparams.dring_ident = 0;
+
+	/* clear peer_hparams */
+	bzero(&(ldcp->peer_hparams), sizeof (ldcp->peer_hparams));
+
+	/* reset the channel if required (flag set by error paths) */
+	if (ldcp->need_ldc_reset) {
+		DWARN((vnetp,
+		    "vgen_reset_hphase: id (%lx), Doing Channel Reset...\n",
+		    ldcp->ldc_id));
+		ldcp->need_ldc_reset = B_FALSE;
+		(void) ldc_reset(ldcp->ldc_handle);
+		(void) ldc_status(ldcp->ldc_handle, &istatus);
+		DBG2((vnetp,
+		    "vgen_reset_hphase: id (%lx), RESET Done,ldc_status(%x)\n",
+		    ldcp->ldc_id, istatus));
+		ldcp->ldc_status = istatus;
+		/* clear sids */
+		ldcp->local_sid = 0;
+		ldcp->peer_sid = 0;
+		/* bring the channel back up; handshake restarts on LDC_UP */
+		(void) ldc_up(ldcp->ldc_handle);
+	}
+}
+
+/* wrapper function for vgen_reset_hphase */
+/*
+ * wrapper function for vgen_reset_hphase: acquires the tx and tbuf-reclaim
+ * locks (cblock must already be held by the caller) so the reset cannot
+ * race with the transmit path.
+ */
+static void
+vgen_handshake_reset(vgen_ldc_t *ldcp)
+{
+	ASSERT(MUTEX_HELD(&ldcp->cblock));
+	mutex_enter(&ldcp->txlock);
+	mutex_enter(&ldcp->tclock);
+
+	vgen_reset_hphase(ldcp);
+
+	mutex_exit(&ldcp->tclock);
+	mutex_exit(&ldcp->txlock);
+}
+
+/*
+ * Initiate handshake with the peer by sending various messages
+ * based on the handshake-phase that the channel is currently in.
+ */
+static void
+vgen_handshake(vgen_ldc_t *ldcp)
+{
+	uint32_t hphase = ldcp->hphase;
+	void	*vnetp = LDC_TO_VNET(ldcp);
+	vgen_t	*vgenp = LDC_TO_VGEN(ldcp);
+
+	switch (hphase) {
+
+	case VH_PHASE1:
+
+		/*
+		 * start timer, for entire handshake process, turn this timer
+		 * off if all phases of handshake complete successfully and
+		 * hphase goes to VH_DONE(below) or
+		 * vgen_reset_hphase() gets called or
+		 * channel is reset due to errors or
+		 * vgen_ldc_uninit() is invoked(vgen_stop).
+		 */
+		ldcp->htid = timeout(vgen_hwatchdog, (caddr_t)ldcp,
+		    drv_usectohz(vgen_hwd_interval * 1000));
+
+		/* Phase 1 involves negotiating the version */
+		if (vgen_send_version_negotiate(ldcp) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+		}
+		break;
+
+	case VH_PHASE2:
+		/* Phase 2 sends attr info and dring register */
+		if (vgen_handshake_phase2(ldcp) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+		}
+		break;
+
+	case VH_PHASE3:
+		/* Phase 3 sends the RDX (ready-to-exchange-data) message */
+		if (vgen_send_rdx_info(ldcp) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+		}
+		break;
+
+	case VH_DONE:
+		/* reset handshake watchdog timeout */
+		if (ldcp->htid) {
+			(void) untimeout(ldcp->htid);
+			ldcp->htid = 0;
+		}
+		ldcp->hretries = 0;
+#if 0
+		vgen_print_ldcinfo(ldcp);
+#endif
+		DBG1((vnetp, "vgen_handshake: id(0x%lx) Handshake Done\n",
+		    ldcp->ldc_id));
+
+		if (ldcp->need_mcast_sync) {
+			/* need to sync multicast table with vsw */
+
+			ldcp->need_mcast_sync = B_FALSE;
+			/*
+			 * drop cblock across the send to respect lock
+			 * ordering: vgenp->lock must be taken without
+			 * cblock held.
+			 */
+			mutex_exit(&ldcp->cblock);
+
+			mutex_enter(&vgenp->lock);
+			(void) vgen_send_mcast_info(ldcp);
+			mutex_exit(&vgenp->lock);
+
+			mutex_enter(&ldcp->cblock);
+
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Check if the current handshake phase has completed successfully and
+ * return the status.
+ */
+static int
+vgen_handshake_done(vgen_ldc_t *ldcp)
+{
+	uint32_t	hphase = ldcp->hphase;
+	int 		status = 0;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+
+	switch (hphase) {
+
+	case VH_PHASE1:
+		/*
+		 * Phase1 is done, if version negotiation
+		 * completed successfully.
+		 */
+		status = ((ldcp->hstate & VER_NEGOTIATED) ==
+			VER_NEGOTIATED);
+		break;
+
+	case VH_PHASE2:
+		/*
+		 * Phase 2 is done, if attr info and dring info
+		 * have been exchanged successfully.
+		 */
+		status = (((ldcp->hstate & ATTR_INFO_EXCHANGED) ==
+			    ATTR_INFO_EXCHANGED) &&
+			    ((ldcp->hstate & DRING_INFO_EXCHANGED) ==
+			    DRING_INFO_EXCHANGED));
+		break;
+
+	case VH_PHASE3:
+		/* Phase 3 is done, if rdx msg has been exchanged */
+		status = ((ldcp->hstate & RDX_EXCHANGED) ==
+			RDX_EXCHANGED);
+		break;
+
+	default:
+		/* VH_PHASE0/VH_DONE: nothing pending, report failure */
+		break;
+	}
+
+	if (status == 0) {
+		return (VGEN_FAILURE);
+	}
+	DBG2((vnetp, "VNET_HANDSHAKE_DONE: PHASE(%d)\n", hphase));
+	return (VGEN_SUCCESS);
+}
+
+/* retry handshake on failure */
+/*
+ * retry handshake on failure: reset to VH_PHASE0, then restart from
+ * phase 1 unless the per-channel retry budget (vgen_max_hretries) is
+ * exhausted or retries are disabled (vgen_max_hretries == 0).
+ */
+static void
+vgen_handshake_retry(vgen_ldc_t *ldcp)
+{
+	/* reset handshake phase */
+	vgen_handshake_reset(ldcp);
+	if (vgen_max_hretries) {	/* handshake retry is specified */
+		if (ldcp->hretries++ < vgen_max_hretries)
+			vgen_handshake(vh_nextphase(ldcp));
+	}
+}
+
+/*
+ * Handle a version info msg from the peer or an ACK/NACK from the peer
+ * to a version info msg that we sent.
+ */
+static void
+vgen_handle_version_negotiate(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	vio_ver_msg_t	*vermsg = (vio_ver_msg_t *)tagp;
+	int		ack = 0;
+	int		failed = 0;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		idx;
+	vgen_ver_t	*versions = ldcp->vgen_versions;
+
+	DBG1((vnetp, "vgen_handle_version_negotiate: enter\n"));
+	switch (tagp->vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+
+		/*  Cache sid of peer if this is the first time */
+		if (ldcp->peer_sid == 0) {
+			DBG2((vnetp,
+			    "vgen_handle_version_negotiate: id (%lx) Caching"
+			    " peer_sid(%x)\n", ldcp->ldc_id, tagp->vio_sid));
+			ldcp->peer_sid = tagp->vio_sid;
+		}
+
+		if (ldcp->hphase != VH_PHASE1) {
+			/*
+			 * If we are not already in VH_PHASE1, reset to
+			 * pre-handshake state, and initiate handshake
+			 * to the peer too.
+			 */
+			vgen_handshake_reset(ldcp);
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+		ldcp->hstate |= VER_INFO_RCVD;
+
+		/* save peer's requested values */
+		ldcp->peer_hparams.ver_major = vermsg->ver_major;
+		ldcp->peer_hparams.ver_minor = vermsg->ver_minor;
+		ldcp->peer_hparams.dev_class = vermsg->dev_class;
+
+		if ((vermsg->dev_class != VDEV_NETWORK) &&
+		    (vermsg->dev_class != VDEV_NETWORK_SWITCH)) {
+			/* unsupported dev_class, send NACK */
+
+			tagp->vio_subtype = VIO_SUBTYPE_NACK;
+			tagp->vio_sid = ldcp->local_sid;
+			/* send reply msg back to peer */
+			(void) vgen_sendmsg(ldcp, (caddr_t)tagp,
+			    sizeof (*vermsg), B_FALSE);
+			DWARN((vnetp,
+			    "vgen_handle_version_negotiate: Version"
+			    " Negotiation Failed id (%lx)\n", ldcp->ldc_id));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		DBG2((vnetp, "vgen_handle_version_negotiate: VER_INFO_RCVD,"
+		    " id (%lx), ver(%d,%d)\n", ldcp->ldc_id,
+		    vermsg->ver_major,  vermsg->ver_minor));
+
+		idx = 0;
+
+		/*
+		 * Walk our version table (highest first) and decide how to
+		 * reply; the reply reuses vermsg in place.
+		 */
+		for (;;) {
+
+			if (vermsg->ver_major > versions[idx].ver_major) {
+
+				/* nack with next lower version */
+				tagp->vio_subtype = VIO_SUBTYPE_NACK;
+				vermsg->ver_major = versions[idx].ver_major;
+				vermsg->ver_minor = versions[idx].ver_minor;
+				break;
+			}
+
+			if (vermsg->ver_major == versions[idx].ver_major) {
+
+				/* major version match - ACK version */
+				tagp->vio_subtype = VIO_SUBTYPE_ACK;
+				ack = 1;
+
+				/*
+				 * lower minor version to the one this endpt
+				 * supports, if necessary
+				 */
+				if (vermsg->ver_minor >
+				    versions[idx].ver_minor) {
+					vermsg->ver_minor =
+						versions[idx].ver_minor;
+					ldcp->peer_hparams.ver_minor =
+						versions[idx].ver_minor;
+				}
+				break;
+			}
+
+			idx++;
+
+			if (idx == VGEN_NUM_VER) {
+
+				/* no version match - send NACK */
+				tagp->vio_subtype = VIO_SUBTYPE_NACK;
+				vermsg->ver_major = 0;
+				vermsg->ver_minor = 0;
+				failed = 1;
+				break;
+			}
+
+		}
+
+		tagp->vio_sid = ldcp->local_sid;
+
+		/* send reply msg back to peer */
+		if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*vermsg),
+		    B_FALSE) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		if (ack) {
+			ldcp->hstate |= VER_ACK_SENT;
+			DBG2((vnetp, "vgen_handle_version_negotiate:"
+			    " VER_ACK_SENT, id (%lx) ver(%d,%d) \n",
+			    ldcp->ldc_id, vermsg->ver_major,
+			    vermsg->ver_minor));
+		}
+		if (failed) {
+			/* a zero-version NACK was sent; abandon handshake */
+			DWARN((vnetp, "vgen_handle_version_negotiate:"
+			    " Version Negotiation Failed id (%lx)\n",
+			    ldcp->ldc_id));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+
+			/*  VER_ACK_SENT and VER_ACK_RCVD */
+
+			/* local and peer versions match? */
+			ASSERT((ldcp->local_hparams.ver_major ==
+				ldcp->peer_hparams.ver_major) &&
+				(ldcp->local_hparams.ver_minor ==
+				ldcp->peer_hparams.ver_minor));
+
+			/* move to the next phase */
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+
+		if (ldcp->hphase != VH_PHASE1) {
+			/*  This should not happen. */
+			DWARN((vnetp,
+			    "vgen_handle_version_negotiate:"
+			    " VER_ACK_RCVD id (%lx) Invalid Phase(%u)\n",
+			    ldcp->ldc_id, ldcp->hphase));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		/* SUCCESS - we have agreed on a version */
+		ldcp->local_hparams.ver_major = vermsg->ver_major;
+		ldcp->local_hparams.ver_minor = vermsg->ver_minor;
+		ldcp->hstate |= VER_ACK_RCVD;
+
+		DBG2((vnetp, "vgen_handle_version_negotiate:"
+		    " VER_ACK_RCVD, id (%lx) ver(%d,%d) \n",
+		    ldcp->ldc_id, vermsg->ver_major,  vermsg->ver_minor));
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+
+			/*  VER_ACK_SENT and VER_ACK_RCVD */
+
+			/* local and peer versions match? */
+			ASSERT((ldcp->local_hparams.ver_major ==
+				ldcp->peer_hparams.ver_major) &&
+				(ldcp->local_hparams.ver_minor ==
+				ldcp->peer_hparams.ver_minor));
+
+			/* move to the next phase */
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+		break;
+
+	case VIO_SUBTYPE_NACK:
+
+		if (ldcp->hphase != VH_PHASE1) {
+			/*  This should not happen.  */
+			DWARN((vnetp,
+			    "vgen_handle_version_negotiate:"
+			    " VER_NACK_RCVD id (%lx) Invalid Phase(%u)\n",
+			    ldcp->ldc_id, ldcp->hphase));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		DBG2((vnetp, "vgen_handle_version_negotiate:"
+		    " VER_NACK_RCVD id(%lx) next ver(%d,%d)\n",
+		    ldcp->ldc_id, vermsg->ver_major, vermsg->ver_minor));
+
+		/* check if version in NACK is zero */
+		if (vermsg->ver_major == 0 && vermsg->ver_minor == 0) {
+			/*
+			 * Version Negotiation has failed.
+			 */
+			DWARN((vnetp, "vgen_handle_version_negotiate:"
+			    " Version Negotiation Failed id (%lx)\n",
+			    ldcp->ldc_id));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		idx = 0;
+
+		/*
+		 * Pick a lower version from our table based on the version
+		 * the peer NACKed with, then retry the negotiation.
+		 */
+		for (;;) {
+
+			if (vermsg->ver_major > versions[idx].ver_major) {
+				/* select next lower version */
+
+				ldcp->local_hparams.ver_major =
+					versions[idx].ver_major;
+				ldcp->local_hparams.ver_minor =
+					versions[idx].ver_minor;
+				break;
+			}
+
+			if (vermsg->ver_major == versions[idx].ver_major) {
+				/* major version match */
+
+				ldcp->local_hparams.ver_major =
+					versions[idx].ver_major;
+
+				ldcp->local_hparams.ver_minor =
+					versions[idx].ver_minor;
+				break;
+			}
+
+			idx++;
+
+			if (idx == VGEN_NUM_VER) {
+				/*
+				 * no version match.
+				 * Version Negotiation has failed.
+				 */
+				DWARN((vnetp, "vgen_handle_version_negotiate:"
+				    " Version Negotiation Failed id (%lx)\n",
+				    ldcp->ldc_id));
+				vgen_handshake_reset(ldcp);
+				return;
+			}
+
+		}
+
+		if (vgen_send_version_negotiate(ldcp) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		break;
+	}
+	DBG1((vnetp, "vgen_handle_version_negotiate: exit\n"));
+}
+
+/* Check if the attributes are supported */
+/* Check if the attributes are supported */
+static int
+vgen_check_attr_info(vgen_ldc_t *ldcp, vnet_attr_msg_t *msg)
+{
+	_NOTE(ARGUNUSED(ldcp))
+
+#if 0
+	uint64_t port_macaddr;
+	port_macaddr = vgen_macaddr_strtoul((uint8_t *)
+				&(ldcp->portp->macaddr));
+#endif
+	/*
+	 * currently, we support these attr values:
+	 * mtu of ethernet, addr_type of mac, xfer_mode of
+	 * ldc shared memory, ack_freq of 0 (data is acked if
+	 * the ack bit is set in the descriptor) and the address should
+	 * match the address in the port node.
+	 * NOTE(review): the MAC-address check described above is disabled
+	 * (the #if 0 blocks); only mtu/addr_type/xfer_mode/ack_freq are
+	 * actually validated here.
+	 */
+	if ((msg->mtu != ETHERMAX) ||
+	    (msg->addr_type != ADDR_TYPE_MAC) ||
+	    (msg->xfer_mode != VIO_DRING_MODE) ||
+	    (msg->ack_freq > 64)) {
+#if 0
+	    (msg->addr != port_macaddr))
+cmn_err(CE_CONT, "vgen_check_attr_info: msg->addr(%lx), port_macaddr(%lx)\n",
+	msg->addr, port_macaddr);
+#endif
+		return (VGEN_FAILURE);
+	}
+
+	return (VGEN_SUCCESS);
+}
+
+/*
+ * Handle an attribute info msg from the peer or an ACK/NACK from the peer
+ * to an attr info msg that we sent.
+ */
+static void
+vgen_handle_attr_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	vnet_attr_msg_t *attrmsg = (vnet_attr_msg_t *)tagp;
+	void		*vnetp = LDC_TO_VNET(ldcp);
+	int		ack = 0;
+
+	DBG1((vnetp, "vgen_handle_attr_info: enter\n"));
+	/* attr messages are only valid in phase 2 of the handshake */
+	if (ldcp->hphase != VH_PHASE2) {
+		DWARN((vnetp,
+		    "vgen_handle_attr_info: Rcvd ATTR_INFO id(%lx)"
+		    " subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id,
+		    tagp->vio_subtype, ldcp->hphase));
+		vgen_handshake_reset(ldcp);
+		return;
+	}
+	switch (tagp->vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+
+		DBG2((vnetp, "vgen_handle_attr_info: ATTR_INFO_RCVD id(%lx)\n",
+		    ldcp->ldc_id));
+		ldcp->hstate |= ATTR_INFO_RCVD;
+
+		/* save peer's values */
+		ldcp->peer_hparams.mtu = attrmsg->mtu;
+		ldcp->peer_hparams.addr = attrmsg->addr;
+		ldcp->peer_hparams.addr_type = attrmsg->addr_type;
+		ldcp->peer_hparams.xfer_mode = attrmsg->xfer_mode;
+		ldcp->peer_hparams.ack_freq = attrmsg->ack_freq;
+
+		/* the reply reuses the received message buffer in place */
+		if (vgen_check_attr_info(ldcp, attrmsg) == VGEN_FAILURE) {
+			/* unsupported attr, send NACK */
+			tagp->vio_subtype = VIO_SUBTYPE_NACK;
+		} else {
+			ack = 1;
+			tagp->vio_subtype = VIO_SUBTYPE_ACK;
+		}
+		tagp->vio_sid = ldcp->local_sid;
+
+		/* send reply msg back to peer */
+		if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*attrmsg),
+		    B_FALSE) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		if (ack) {
+			ldcp->hstate |= ATTR_ACK_SENT;
+			DBG2((vnetp, "vgen_handle_attr_info:"
+			    " ATTR_ACK_SENT id(%lx)\n", ldcp->ldc_id));
+#ifdef DEBUG
+#if 0
+			vgen_print_attr_info(ldcp, VGEN_PEER);
+#endif
+#endif
+		} else {
+			/* failed */
+			DWARN((vnetp, "vgen_handle_attr_info:"
+			    " ATTR_NACK_SENT id(%lx)\n", ldcp->ldc_id));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		/* if both sides' attrs are now exchanged, advance the phase */
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+
+		ldcp->hstate |= ATTR_ACK_RCVD;
+
+		DBG2((vnetp, "vgen_handle_attr_info: ATTR_ACK_RCVD id(%lx)\n",
+		    ldcp->ldc_id));
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+		break;
+
+	case VIO_SUBTYPE_NACK:
+
+		/* peer rejected our attrs; restart the handshake */
+		DBG2((vnetp, "vgen_handle_attr_info: ATTR_NACK_RCVD id(%lx)\n",
+		    ldcp->ldc_id));
+		vgen_handshake_reset(ldcp);
+		break;
+	}
+	DBG1((vnetp, "vgen_handle_attr_info: exit\n"));
+}
+
+/* Check if the dring info msg is ok */
+/* Check if the dring info msg is ok */
+static int
+vgen_check_dring_reg(vio_dring_reg_msg_t *msg)
+{
+	/*
+	 * check if msg contents are ok: require at least 128 descriptors
+	 * (NOTE(review): hard-coded minimum — consider a named constant)
+	 * and descriptors large enough to hold a vnet_public_desc_t.
+	 */
+	if ((msg->num_descriptors < 128) || (msg->descriptor_size <
+	    sizeof (vnet_public_desc_t))) {
+		return (VGEN_FAILURE);
+	}
+	return (VGEN_SUCCESS);
+}
+
+/*
+ * Handle a descriptor ring register msg from the peer or an ACK/NACK from
+ * the peer to a dring register msg that we sent.
+ */
+static void
+vgen_handle_dring_reg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	vio_dring_reg_msg_t *msg = (vio_dring_reg_msg_t *)tagp;
+	void *vnetp = LDC_TO_VNET(ldcp);
+	ldc_mem_cookie_t dcookie;
+	int ack = 0;
+	int rv = 0;
+
+	DBG1((vnetp, "vgen_handle_dring_reg: enter\n"));
+	if (ldcp->hphase < VH_PHASE2) {
+		/* dring_info can be rcvd in any of the phases after Phase1 */
+		DWARN((vnetp,
+		    "vgen_handle_dring_reg: Rcvd DRING_INFO, id (%lx)"
+		    " Subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id,
+		    tagp->vio_subtype, ldcp->hphase));
+		vgen_handshake_reset(ldcp);
+		return;
+	}
+	switch (tagp->vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+
+		DBG2((vnetp, "vgen_handle_dring_reg: DRING_INFO_RCVD id(%lx)\n",
+		    ldcp->ldc_id));
+		ldcp->hstate |= DRING_INFO_RCVD;
+		bcopy((msg->cookie), &dcookie, sizeof (dcookie));
+
+		/* only single-cookie drings are supported */
+		ASSERT(msg->ncookies == 1);
+
+		if (vgen_check_dring_reg(msg) == VGEN_SUCCESS) {
+			/*
+			 * verified dring info msg to be ok,
+			 * now try to map the remote dring.
+			 */
+			rv = vgen_init_rxds(ldcp, msg->num_descriptors,
+			    msg->descriptor_size, &dcookie,
+			    msg->ncookies);
+			if (rv == DDI_SUCCESS) {
+				/* now we can ack the peer */
+				ack = 1;
+			}
+		}
+		if (ack == 0) {
+			/* failed, send NACK */
+			tagp->vio_subtype = VIO_SUBTYPE_NACK;
+		} else {
+			if (!(ldcp->peer_hparams.dring_ready)) {
+
+				/* save peer's dring_info values */
+				bcopy(&dcookie,
+				    &(ldcp->peer_hparams.dring_cookie),
+				    sizeof (dcookie));
+				ldcp->peer_hparams.num_desc =
+						msg->num_descriptors;
+				ldcp->peer_hparams.desc_size =
+						msg->descriptor_size;
+				ldcp->peer_hparams.num_dcookies =
+						msg->ncookies;
+
+				/*
+				 * set dring_ident for the peer: the address
+				 * of our mapped rx descriptor table serves
+				 * as a unique identifier
+				 */
+				ldcp->peer_hparams.dring_ident =
+							(uint64_t)ldcp->rxdp;
+				/* return the dring_ident in ack msg */
+				msg->dring_ident =
+							(uint64_t)ldcp->rxdp;
+
+				ldcp->peer_hparams.dring_ready = B_TRUE;
+			}
+			tagp->vio_subtype = VIO_SUBTYPE_ACK;
+		}
+		tagp->vio_sid = ldcp->local_sid;
+		/* send reply msg back to peer */
+		if (vgen_sendmsg(ldcp, (caddr_t)tagp, sizeof (*msg),
+		    B_FALSE) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		if (ack) {
+			ldcp->hstate |= DRING_ACK_SENT;
+			DBG2((vnetp, "vgen_handle_dring_reg: DRING_ACK_SENT"
+			    " id (%lx)\n", ldcp->ldc_id));
+		} else {
+			DWARN((vnetp, "vgen_handle_dring_reg: DRING_NACK_SENT"
+			    " id (%lx)\n", ldcp->ldc_id));
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+
+		ldcp->hstate |= DRING_ACK_RCVD;
+
+		DBG2((vnetp, "vgen_handle_dring_reg: DRING_ACK_RCVD"
+		    " id (%lx)\n", ldcp->ldc_id));
+
+		if (!(ldcp->local_hparams.dring_ready)) {
+			/* local dring is now ready */
+			ldcp->local_hparams.dring_ready = B_TRUE;
+
+			/* save dring_ident acked by peer */
+			ldcp->local_hparams.dring_ident =
+				msg->dring_ident;
+		}
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+
+		break;
+
+	case VIO_SUBTYPE_NACK:
+
+		/* peer could not map our dring; restart the handshake */
+		DBG2((vnetp, "vgen_handle_dring_reg: DRING_NACK_RCVD"
+		    " id (%lx)\n", ldcp->ldc_id));
+		vgen_handshake_reset(ldcp);
+		break;
+	}
+	DBG1((vnetp, "vgen_handle_dring_reg: exit\n"));
+}
+
+/*
+ * Handle a rdx info msg from the peer or an ACK/NACK
+ * from the peer to a rdx info msg that we sent.
+ */
+static void
+vgen_handle_rdx_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	DBG1((vnetp, "vgen_handle_rdx_info: enter\n"));
+	if (ldcp->hphase != VH_PHASE3) {
+		DWARN((vnetp,
+		    "vgen_handle_rdx_info: Rcvd RDX_INFO, id (%lx)"
+		    "  Subtype (%d), Invalid Phase(%u)\n", ldcp->ldc_id,
+		    tagp->vio_subtype, ldcp->hphase));
+		vgen_handshake_reset(ldcp);
+		return;
+	}
+	switch (tagp->vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+
+		DBG2((vnetp, "vgen_handle_rdx_info: RDX_INFO_RCVD id (%lx)\n",
+		    ldcp->ldc_id));
+		ldcp->hstate |= RDX_INFO_RCVD;
+
+		tagp->vio_subtype = VIO_SUBTYPE_ACK;
+		tagp->vio_sid = ldcp->local_sid;
+		/* send reply msg back to peer */
+		if (vgen_sendmsg(ldcp, (caddr_t)tagp,
+		    sizeof (vio_rdx_msg_t), B_FALSE) != VGEN_SUCCESS) {
+			vgen_handshake_reset(ldcp);
+			return;
+		}
+
+		ldcp->hstate |= RDX_ACK_SENT;
+		DBG2((vnetp, "vgen_handle_rdx_info: RDX_ACK_SENT id (%lx)\n",
+		    ldcp->ldc_id));
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+
+		ldcp->hstate |= RDX_ACK_RCVD;
+
+		DBG2((vnetp, "vgen_handle_rdx_info: RDX_ACK_RCVD id (%lx)\n",
+		    ldcp->ldc_id));
+
+		if (vgen_handshake_done(ldcp) == VGEN_SUCCESS) {
+			vgen_handshake(vh_nextphase(ldcp));
+		}
+		break;
+
+	case VIO_SUBTYPE_NACK:
+
+		DBG2((vnetp, "vgen_handle_rdx_info: RDX_NACK_RCVD id (%lx)\n",
+		    ldcp->ldc_id));
+		vgen_handshake_reset(ldcp);
+		break;
+	}
+	DBG1((vnetp, "vgen_handle_rdx_info: exit\n"));
+}
+
+/* Handle ACK/NACK from vsw to a set multicast msg that we sent */
+static void
+vgen_handle_mcast_info(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+	vgen_t *vgenp = LDC_TO_VGEN(ldcp);
+	vnet_mcast_msg_t *msgp = (vnet_mcast_msg_t *)tagp;
+	struct ether_addr *addrp;
+	int count;
+	int i;
+
+	DBG1((vnetp, "vgen_handle_mcast_info: enter\n"));
+	switch (tagp->vio_subtype) {
+
+	case VIO_SUBTYPE_INFO:
+
+		/* vnet shouldn't recv set mcast msg, only vsw handles it */
+		DWARN((vnetp,
+		    "vgen_handle_mcast_info: rcvd SET_MCAST_INFO id (%lx)\n",
+		    ldcp->ldc_id));
+		break;
+
+	case VIO_SUBTYPE_ACK:
+
+		/* success adding/removing multicast addr */
+		DBG2((vnetp,
+		    "vgen_handle_mcast_info: rcvd SET_MCAST_ACK id (%lx)\n",
+		    ldcp->ldc_id));
+		break;
+
+	case VIO_SUBTYPE_NACK:
+
+		DWARN((vnetp,
+		    "vgen_handle_mcast_info: rcvd SET_MCAST_NACK id (%lx)\n",
+		    ldcp->ldc_id));
+		if (!(msgp->set)) {
+			/* multicast remove request failed */
+			break;
+		}
+
+		/* multicast add request failed */
+		for (count = 0; count < msgp->count; count++) {
+			addrp = &(msgp->mca[count]);
+
+			/* delete address from the table */
+			for (i = 0; i < vgenp->mccount; i++) {
+				if (ether_cmp(addrp,
+				    &(vgenp->mctab[i])) == 0) {
+					if (vgenp->mccount > 1) {
+						vgenp->mctab[i] =
+						vgenp->mctab[vgenp->mccount-1];
+					}
+					vgenp->mccount--;
+					break;
+				}
+			}
+		}
+		break;
+
+	}
+	DBG1((vnetp, "vgen_handle_mcast_info: exit\n"));
+}
+
+/* handler for control messages received from the peer ldc end-point */
+static void
+vgen_handle_ctrlmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	DBG1((vnetp, "vgen_handle_ctrlmsg: enter\n"));
+	switch (tagp->vio_subtype_env) {
+
+	case VIO_VER_INFO:
+		vgen_handle_version_negotiate(ldcp, tagp);
+		break;
+
+	case VIO_ATTR_INFO:
+		vgen_handle_attr_info(ldcp, tagp);
+		break;
+
+	case VIO_DRING_REG:
+		vgen_handle_dring_reg(ldcp, tagp);
+		break;
+
+	case VIO_RDX:
+		vgen_handle_rdx_info(ldcp, tagp);
+		break;
+
+	case VNET_MCAST_INFO:
+		vgen_handle_mcast_info(ldcp, tagp);
+		break;
+
+	}
+	DBG1((vnetp, "vgen_handle_ctrlmsg: exit\n"));
+}
+
+/* handler for data messages received from the peer ldc end-point */
+static void
+vgen_handle_datamsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
+	mblk_t **headp, mblk_t **tailp)
+{
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	DBG1((vnetp, "vgen_handle_datamsg: enter\n"));
+
+	if (ldcp->hphase != VH_DONE)
+		return;
+	switch (tagp->vio_subtype_env) {
+	case VIO_DRING_DATA:
+		vgen_handle_dring_data(ldcp, tagp, headp, tailp);
+		break;
+	default:
+		break;
+	}
+
+	DBG1((vnetp, "vgen_handle_datamsg: exit\n"));
+}
+
+static void
+vgen_handle_dring_data(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp,
+	mblk_t **headp, mblk_t **tailp)
+{
+	vio_dring_msg_t *dringmsg;
+	vnet_public_desc_t *rxdp;
+	vnet_public_desc_t *txdp;
+	vio_dring_entry_hdr_t *hdrp;
+	vgen_stats_t *statsp;
+	struct ether_header *ehp;
+	mblk_t *mp = NULL;
+	mblk_t *bp = NULL;
+	mblk_t *bpt = NULL;
+	size_t nbytes;
+	size_t nread;
+	uint64_t off = 0;
+	uint32_t start;
+	uint32_t end;
+	uint32_t datalen;
+	uint32_t ncookies;
+	uint32_t sync_start;
+	uint32_t sync_end;
+	uint32_t rxi;
+	uint32_t txi;
+	int rv;
+	boolean_t rxd_err = B_FALSE;
+	boolean_t sync_done = B_FALSE;
+#ifdef VGEN_HANDLE_LOST_PKTS
+	int n;
+#endif
+#ifdef VGEN_REXMIT
+	uint64_t seqnum;
+	vgen_private_desc_t *tbufp;
+#endif
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	dringmsg = (vio_dring_msg_t *)tagp;
+	start = dringmsg->start_idx;
+	end = dringmsg->end_idx;
+	statsp = ldcp->statsp;
+
+	DBG1((vnetp, "vgen_handle_dring_data: enter\n"));
+	switch (tagp->vio_subtype) {
+
+	case VIO_SUBTYPE_INFO:
+		/*
+		 * received a data msg, which contains the start and end
+		 * indeces of the descriptors within the rx ring holding data,
+		 * the seq_num of data packet corresponding to the start index,
+		 * and the dring_ident.
+		 * We can now read the contents of each of these descriptors
+		 * and gather data from it.
+		 */
+		DBG2((vnetp,
+		    "vgen_handle_dring_data: INFO: start(%d), end(%d)\n",
+		    start, end));
+
+		/* validate rx start and end indeces */
+		if (!(CHECK_RXI(start, ldcp)) || !(CHECK_RXI(end, ldcp))) {
+			/* drop the message if invalid index */
+			break;
+		}
+
+		/* validate dring_ident */
+		if (dringmsg->dring_ident != ldcp->peer_hparams.dring_ident) {
+			/* invalid dring_ident, drop the msg */
+			break;
+		}
+#ifdef DEBUG
+		if (vgen_trigger_rxlost) {
+			/* drop this msg to simulate lost pkts for debugging */
+			vgen_trigger_rxlost = 0;
+			break;
+		}
+#endif
+
+#ifdef	VGEN_HANDLE_LOST_PKTS
+
+		/* receive start index doesn't match expected index */
+		if (ldcp->next_rxi != start) {
+
+			DWARN((vnetp, "vgen_handle_dring_data: id(%lx) "
+			    "next_rxi(%d) != start(%d)\n",
+			    ldcp->ldc_id, ldcp->next_rxi, start));
+
+			/* calculate the number of pkts lost */
+			if (start >= ldcp->next_rxi) {
+				n = start - ldcp->next_rxi;
+			} else  {
+				n = ldcp->num_rxds - (ldcp->next_rxi - start);
+			}
+
+			/*
+			 * Starting sequence number of the received packets
+			 * is less than the next sequence number that
+			 * is expected:
+			 *
+			 * drop the message and the corresponding packets.
+			 */
+			if (ldcp->next_rxseq > dringmsg->seq_num) {
+				DWARN((vnetp, "vgen_handle_dring_data: id(%lx) "
+				    "dropping pkts, expected rxseq(0x%lx) "
+				    "> recvd(0x%lx)\n",
+				    ldcp->ldc_id, ldcp->next_rxseq,
+				    dringmsg->seq_num));
+				/*
+				 * duplicate/multiple retransmissions from
+				 * sender?? drop this msg.
+				 */
+				break;
+			}
+
+			/*
+			 * Starting sequence number of the received packets
+			 * is greater than the next expected sequence number
+			 *
+			 * send a NACK back to the peer to indicate lost
+			 * packets.
+			 */
+			if (dringmsg->seq_num > ldcp->next_rxseq) {
+				statsp->rx_lost_pkts += n;
+				tagp->vio_subtype = VIO_SUBTYPE_NACK;
+				tagp->vio_sid = ldcp->local_sid;
+				/* indicate the range of lost descriptors */
+				dringmsg->start_idx = ldcp->next_rxi;
+				rxi = start;
+				DECR_RXI(rxi, ldcp);
+				dringmsg->end_idx = rxi;
+				/* dring ident is left unchanged */
+				if (vgen_sendmsg(ldcp, (caddr_t)tagp,
+				    sizeof (*dringmsg), B_FALSE)) {
+					DWARN((vnetp,
+					    "vgen_handle_dring_data: id(%lx) "
+					    "vgen_sendmsg failed, "
+					    "stype: NACK\n", ldcp->ldc_id));
+				}
+#ifdef VGEN_REXMIT
+				/*
+				 * stop further processing until peer
+				 * retransmits with the right index and seqnum.
+				 */
+				break;
+#else	/* VGEN_REXMIT */
+				/*
+				 * treat this range of descrs/pkts as dropped
+				 * and set the new expected values for next_rxi
+				 * and next_rxseq. continue(below) to process
+				 * from the new start index.
+				 */
+				ldcp->next_rxi = start;
+				ldcp->next_rxseq += n;
+#endif	/* VGEN_REXMIT */
+
+			} else if (dringmsg->seq_num == ldcp->next_rxseq) {
+				/*
+				 * expected and starting seqnums match, but
+				 * the descriptor indeces don't?
+				 *
+				 * restart handshake with peer.
+				 */
+				DWARN((vnetp,
+				    "vgen_handle_dring_data: id(%lx) "
+				    "next_rxseq(0x%lx) == seq_num(0x%lx)\n",
+				    ldcp->ldc_id, ldcp->next_rxseq,
+				    dringmsg->seq_num));
+
+#if 0
+				vgen_handshake_retry(ldcp);
+				break;
+#endif
+
+			}
+
+		} else {
+			/* expected and start dring indeces match */
+
+			if (dringmsg->seq_num != ldcp->next_rxseq) {
+
+				/* seqnums don't match */
+
+				DWARN((vnetp,
+				    "vgen_handle_dring_data: id(%lx) "
+				    "next_rxseq(0x%lx) != seq_num(0x%lx)\n",
+				    ldcp->ldc_id, ldcp->next_rxseq,
+				    dringmsg->seq_num));
+
+#if 0
+				vgen_handshake_retry(ldcp);
+				break;
+#endif
+			}
+		}
+
+#endif	/* VGEN_HANDLE_LOST_PKTS */
+
+		/*
+		 * Start processing the descriptor range, specified
+		 * in the dring data msg.
+		 */
+		if (ldc_mem_dring_acquire(ldcp->rx_dhandle, start, end)) {
+			DWARN((vnetp, "vgen_handle_dring_data: "
+			    "id(%lx), ldc_mem_dring_acquire() failed\n",
+			    ldcp->ldc_id));
+			statsp->ierrors++;
+		}
+		rxi = start;
+		sync_start = start;
+		do {
+			/* recv packets from 'start' to 'end' */
+
+			rxdp = &(ldcp->rxdp[rxi]);
+			hdrp = &rxdp->hdr;
+
+			datalen = rxdp->nbytes;
+			ncookies = rxdp->ncookies;
+			if ((datalen < ETHERMIN) ||
+			    (ncookies == 0) ||
+			    (ncookies > (uint64_t)MAX_COOKIES) ||
+			    (hdrp->dstate != VIO_DESC_READY)) {
+				rxd_err = B_TRUE;
+			} else {
+				/*
+				 * The data buffer returned by allocb(9F) is
+				 * 8byte aligned. We allocate extra 8 bytes to
+				 * ensure size is multiple of 8 bytes for
+				 * ldc_mem_copy().
+				 */
+				mp = allocb(datalen + 8, BPRI_MED);
+				nbytes = (datalen + 7) & ~7;
+			}
+			if ((rxd_err) || (mp == NULL)) {
+				/*
+				 * rxd_err or allocb() failure,
+				 * drop this packet, get next.
+				 */
+				if (rxd_err) {
+					statsp->ierrors++;
+					rxd_err = B_FALSE;
+				} else {
+					statsp->rx_allocb_fail++;
+				}
+
+				/* set descriptor done bit */
+				hdrp->dstate = VIO_DESC_DONE;
+
+				if (hdrp->ack) {
+					/*
+					 * sender needs ack for this packet.
+					 * sync pkts upto this index and
+					 * send the ack to the peer.
+					 */
+					sync_end = rxi;
+					(void) ldc_mem_dring_release(
+					    ldcp->rx_dhandle, sync_start,
+					    sync_end);
+					tagp->vio_subtype = VIO_SUBTYPE_ACK;
+					tagp->vio_sid = ldcp->local_sid;
+					dringmsg = (vio_dring_msg_t *)tagp;
+					dringmsg->start_idx = sync_start;
+					dringmsg->end_idx = sync_end;
+					if (vgen_sendmsg(ldcp, (caddr_t)tagp,
+					    sizeof (*dringmsg), B_FALSE)) {
+						DWARN((vnetp,
+						    "vgen_handle_dring_data: "
+						    "id(%lx) vgen_sendmsg "
+						    "failed, stype: ACK\n",
+						    ldcp->ldc_id));
+					}
+					/* save new sync index start */
+					if (sync_end != end) {
+						INCR_RXI(sync_end, ldcp);
+						sync_start = sync_end;
+					} else
+						sync_done = B_TRUE;
+				}
+				goto vgen_next_rxi;
+			}
+
+			nread = nbytes;
+			rv = ldc_mem_copy(ldcp->ldc_handle,
+			    (caddr_t)mp->b_rptr, off, &nread,
+			    rxdp->memcookie, ncookies, LDC_COPY_IN);
+
+			/* set done bit irrespective of rv of ldc_mem_copy() */
+			hdrp->dstate = VIO_DESC_DONE;
+
+			if (hdrp->ack) {
+				/*
+				 * sender needs ack for this packet.
+				 * sync pkts upto this index and
+				 * send the ack to the peer.
+				 */
+				sync_end = rxi;
+				(void) ldc_mem_dring_release(ldcp->rx_dhandle,
+				    sync_start, sync_end);
+				tagp->vio_subtype = VIO_SUBTYPE_ACK;
+				tagp->vio_sid = ldcp->local_sid;
+				dringmsg = (vio_dring_msg_t *)tagp;
+				dringmsg->start_idx = sync_start;
+				dringmsg->end_idx = sync_end;
+				if (vgen_sendmsg(ldcp, (caddr_t)tagp,
+				    sizeof (*dringmsg), B_FALSE)) {
+					DWARN((vnetp,
+					    "vgen_handle_dring_data: id(%lx) "
+					    "vgen_sendmsg failed stype: ACK\n",
+					    ldcp->ldc_id));
+				}
+				/* save new sync index start */
+				if (sync_end != end) {
+					INCR_RXI(sync_end, ldcp);
+					sync_start = sync_end;
+				} else
+					sync_done = B_TRUE;
+			}
+			/* if ldc_mem_copy() failed */
+			if (rv) {
+				DWARN((vnetp,
+				    "vgen_handle_dring_data: id(%lx) "
+				    "ldc_mem_copy failed\n", ldcp->ldc_id));
+				statsp->ierrors++;
+				freemsg(mp);
+				goto vgen_next_rxi;
+			}
+			if (nread != nbytes) {
+				DWARN((vnetp,
+				    "vgen_handle_dring_data: id(%lx) "
+				    "ldc_mem_copy nread(%lx), nbytes(%lx)\n",
+				    ldcp->ldc_id, nread, nbytes));
+				statsp->ierrors++;
+				freemsg(mp);
+				goto vgen_next_rxi;
+			}
+
+			/* point to the actual end of data */
+			mp->b_wptr = mp->b_rptr + datalen;
+
+			/* update stats */
+			statsp->ipackets++;
+			statsp->rbytes += datalen;
+			ehp = (struct ether_header *)mp->b_rptr;
+			if (IS_BROADCAST(ehp))
+				statsp->brdcstrcv++;
+			else if (IS_MULTICAST(ehp))
+				statsp->multircv++;
+
+			/* build a chain of received packets */
+			if (bp == NULL) {
+				/* first pkt */
+				bp = mp;
+				bpt = bp;
+				bpt->b_next = NULL;
+			} else {
+				mp->b_next = NULL;
+				bpt->b_next = mp;
+				bpt = mp;
+			}
+
+vgen_next_rxi:		if (rxi == end) {
+				break;
+			}
+			/* increment recv index */
+			INCR_RXI(rxi, ldcp);
+
+		_NOTE(CONSTCOND)
+		} while (1);
+
+		if (!sync_done) {
+			/* sync remote descriptor range */
+			sync_end = rxi;
+			(void) ldc_mem_dring_release(ldcp->rx_dhandle,
+			    sync_start, sync_end);
+			DBG2((vnetp,
+			    "vgen_handle_dring_data: not sending ACK\n"));
+		}
+
+		/* save new recv index */
+		INCR_RXI(rxi, ldcp);
+		ldcp->next_rxi = rxi;
+		ldcp->next_rxseq += ((end >= start) ?
+			((end - start) + 1) : (start - end));
+
+		/* try to reclaim transmit descrs also */
+		vgen_reclaim(ldcp);
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		/*
+		 * received an ack corresponding to a specific descriptor for
+		 * which we had set the ACK bit in the descriptor (during
+		 * transmit). This enables us to reclaim descriptors.
+		 */
+		DBG2((vnetp,
+		    "vgen_handle_dring_data: ACK:  start(%d), end(%d)\n",
+		    start, end));
+
+		/* validate start and end indeces in the tx ack msg */
+		if (!(CHECK_TXI(start, ldcp)) || !(CHECK_TXI(end, ldcp))) {
+			/* drop the message if invalid index */
+			break;
+		}
+		/* validate dring_ident */
+		if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
+			/* invalid dring_ident, drop the msg */
+			break;
+		}
+		statsp->dring_data_acks++;
+		vgen_reclaim(ldcp);
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		/*
+		 * peer sent a NACK msg to indicate lost packets.
+		 * The start and end correspond to the range of descriptors
+		 * for which the peer didn't receive a dring data msg and so
+		 * didn't receive the corresponding data.
+		 */
+		DWARN((vnetp,
+		    "vgen_handle_dring_data: NACK:  start(%d), end(%d)\n",
+		    start, end));
+
+		/* validate start and end indeces in the tx nack msg */
+		if (!(CHECK_TXI(start, ldcp)) || !(CHECK_TXI(end, ldcp))) {
+			/* drop the message if invalid index */
+			break;
+		}
+		/* validate dring_ident */
+		if (dringmsg->dring_ident != ldcp->local_hparams.dring_ident) {
+			/* invalid dring_ident, drop the msg */
+			break;
+		}
+		mutex_enter(&ldcp->txlock);
+		mutex_enter(&ldcp->tclock);
+
+		if (ldcp->next_tbufp == ldcp->cur_tbufp) {
+			/* no busy descriptors, bogus nack ? */
+			mutex_exit(&ldcp->tclock);
+			mutex_exit(&ldcp->txlock);
+			break;
+		}
+
+#ifdef VGEN_REXMIT
+		/* send a new dring data msg including the lost descrs */
+		end = ldcp->next_tbufp - ldcp->tbufp;
+		DECR_TXI(end, ldcp);
+		seqnum = ldcp->tbufp[start].seqnum;
+		/* no need to increment ldcp->next_txseq as this is rexmit */
+		rv = vgen_send_dring_data(ldcp, start, end, seqnum);
+		if (rv != 0) {
+			/*
+			 * vgen_send_dring_data() error: drop all packets
+			 * in this descr range
+			 */
+			DWARN((vnetp,
+			    "vgen_handle_dring_data: "
+			    "vgen_send_dring_data failed :"
+			    "id(%lx) rv(%d)\n", ldcp->ldc_id, rv));
+			for (txi = start; txi <= end; ) {
+				tbufp = &(ldcp->tbufp[txi]);
+				txdp = tbufp->descp;
+				hdrp = &txdp->hdr;
+				(void) ldc_mem_unbind_handle(tbufp->memhandle);
+				freemsg(tbufp->mp);
+				tbufp->flags = VGEN_PRIV_DESC_FREE;
+				hdrp->dstate = VIO_DESC_FREE;
+				hdrp->ack = B_FALSE;
+				statsp->oerrors++;
+			}
+
+			/* update next pointer */
+			ldcp->next_tbufp = &(ldcp->tbufp[start]);
+			ldcp->next_txseq = seqnum;
+			ldcp->next_txi = start;
+		}
+		DBG2((vnetp,
+		    "vgen_handle_dring_data: rexmit: start(%d) end(%d)\n",
+		    start, end));
+#else	/* VGEN_REXMIT */
+		/* we just mark the descrs as done so they can be reclaimed */
+		for (txi = start; txi <= end; ) {
+			txdp = &(ldcp->txdp[txi]);
+			hdrp = &txdp->hdr;
+			if (hdrp->dstate == VIO_DESC_READY)
+				hdrp->dstate = VIO_DESC_DONE;
+			INCR_TXI(txi, ldcp);
+		}
+#endif	/* VGEN_REXMIT */
+		mutex_exit(&ldcp->tclock);
+		mutex_exit(&ldcp->txlock);
+
+		vgen_reclaim(ldcp);
+
+		break;
+	}
+
+	DBG1((vnetp, "vgen_handle_dring_data: exit\n"));
+	*headp = bp;
+	*tailp = bpt;
+}
+
+static void
+vgen_reclaim(vgen_ldc_t *ldcp)
+{
+	if (mutex_tryenter(&ldcp->tclock) == 0)
+		return;			/* already in progress */
+	vgen_reclaim_dring(ldcp);
+	ldcp->reclaim_lbolt = ddi_get_lbolt();
+	mutex_exit(&ldcp->tclock);
+}
+
+/*
+ * transmit reclaim function. starting from the current reclaim index
+ * look for descriptors marked DONE and reclaim the descriptor and the
+ * corresponding buffers (tbuf).
+ */
+static void
+vgen_reclaim_dring(vgen_ldc_t *ldcp)
+{
+	vnet_public_desc_t *txdp;
+	vgen_private_desc_t *tbufp;
+	vio_dring_entry_hdr_t	*hdrp;
+#ifdef VGEN_USE_MAC_TX_UPDATE
+	vgen_t	*vgenp = (vgen_t *)ldcp->vgenp;
+#endif
+
+#ifdef DEBUG
+	if (vgen_trigger_txtimeout)
+		return;
+#endif
+
+	tbufp = ldcp->cur_tbufp;
+	txdp = tbufp->descp;
+	hdrp = &txdp->hdr;
+
+	while ((hdrp->dstate == VIO_DESC_DONE) &&
+	    (tbufp != ldcp->next_tbufp)) {
+		(void) ldc_mem_unbind_handle(tbufp->memhandle);
+		freemsg(tbufp->mp);
+		tbufp->mp = NULL;
+		tbufp->flags = VGEN_PRIV_DESC_FREE;
+		hdrp->dstate = VIO_DESC_FREE;
+		hdrp->ack = B_FALSE;
+
+		tbufp = NEXTTBUF(ldcp, tbufp);
+		txdp = tbufp->descp;
+		hdrp = &txdp->hdr;
+	}
+
+	ldcp->cur_tbufp = tbufp;
+
+	/*
+	 * Check if mac layer should be notified to restart transmissions
+	 */
+	if (ldcp->need_resched) {
+		ldcp->need_resched = B_FALSE;
+#ifdef VGEN_USE_MAC_TX_UPDATE
+		mac_tx_update(vgenp->vnetmacp);
+#endif
+	}
+}
+
+/* return the number of pending transmits for the channel */
+static int
+vgen_num_txpending(vgen_ldc_t *ldcp)
+{
+	int n;
+
+	if (ldcp->next_tbufp >= ldcp->cur_tbufp) {
+		n = ldcp->next_tbufp - ldcp->cur_tbufp;
+	} else  {
+		/* cur_tbufp > next_tbufp */
+		n = ldcp->num_txds - (ldcp->cur_tbufp - ldcp->next_tbufp);
+	}
+
+	return (n);
+}
+
+/* determine if the transmit descriptor ring is full */
+static int
+vgen_tx_dring_full(vgen_ldc_t *ldcp)
+{
+	vgen_private_desc_t	*tbufp;
+	vgen_private_desc_t	*ntbufp;
+
+	tbufp = ldcp->next_tbufp;
+	ntbufp = NEXTTBUF(ldcp, tbufp);
+	if (ntbufp == ldcp->cur_tbufp) { /* out of tbufs/txds */
+#if 0
+		void *vnetp = LDC_TO_VNET(ldcp);
+		DWARN((vnetp, "vgen_tx_dring_full: id(%lx)\n",
+		    ldcp->ldc_id));
+#endif
+		return (VGEN_SUCCESS);
+	}
+	return (VGEN_FAILURE);
+}
+
+/* determine if timeout condition has occured */
+static int
+vgen_ldc_txtimeout(vgen_ldc_t *ldcp)
+{
+	if (((ddi_get_lbolt() - ldcp->reclaim_lbolt) >
+	    drv_usectohz(vnet_ldcwd_txtimeout * 1000)) &&
+	    (vnet_ldcwd_txtimeout) &&
+	    (vgen_tx_dring_full(ldcp) == VGEN_SUCCESS)) {
+#if 0
+		void *vnetp = LDC_TO_VNET(ldcp);
+		DWARN((vnetp, "vgen_ldc_txtimeout: id(%lx)\n",
+		    ldcp->ldc_id));
+#endif
+		return (VGEN_SUCCESS);
+	} else {
+		return (VGEN_FAILURE);
+	}
+}
+
+/* transmit watchdog timeout handler */
+static void
+vgen_ldc_watchdog(void *arg)
+{
+	vgen_ldc_t *ldcp;
+	void *vnetp;
+	int rv;
+
+	ldcp = (vgen_ldc_t *)arg;
+	vnetp = LDC_TO_VNET(ldcp);
+
+	rv = vgen_ldc_txtimeout(ldcp);
+	if (rv == VGEN_SUCCESS) {
+		DWARN((vnetp,
+		    "vgen_ldc_watchdog: transmit timeout ldcid(%lx)\n",
+		    ldcp->ldc_id));
+#ifdef DEBUG
+		if (vgen_trigger_txtimeout) {
+			/* tx timeout triggered for debugging */
+			vgen_trigger_txtimeout = 0;
+		}
+#endif
+		mutex_enter(&ldcp->cblock);
+		vgen_handshake_retry(ldcp);
+		mutex_exit(&ldcp->cblock);
+		if (ldcp->need_resched) {
+			ldcp->need_resched = B_FALSE;
+#ifdef VGEN_USE_MAC_TX_UPDATE
+			mac_tx_update(ldcp->vgenp->vnetmacp);
+#endif
+		}
+	}
+
+	ldcp->wd_tid = timeout(vgen_ldc_watchdog, (caddr_t)ldcp,
+	    drv_usectohz(vnet_ldcwd_interval * 1000));
+}
+
+/* based on mcopymsg() */
+static void
+vgen_copymsg(mblk_t *mp, void *bufp)
+{
+	caddr_t	dest = bufp;
+	mblk_t	*bp;
+	size_t	n;
+
+	for (bp = mp; bp != NULL; bp = bp->b_cont) {
+		n = MBLKL(bp);
+		bcopy(bp->b_rptr, dest, n);
+		dest += n;
+	}
+}
+
+static int
+vgen_setup_kstats(vgen_ldc_t *ldcp)
+{
+	vgen_t *vgenp;
+	struct kstat *ksp;
+	vgen_stats_t *statsp;
+	vgen_kstats_t *ldckp;
+	int instance;
+	size_t size;
+	char name[MAXNAMELEN];
+
+	vgenp = LDC_TO_VGEN(ldcp);
+	instance = ddi_get_instance(vgenp->vnetdip);
+	(void) sprintf(name, "vnetldc0x%lx", ldcp->ldc_id);
+	statsp = kmem_zalloc(sizeof (vgen_stats_t), KM_SLEEP);
+	if (statsp == NULL) {
+		return (VGEN_FAILURE);
+	}
+	size = sizeof (vgen_kstats_t) / sizeof (kstat_named_t);
+	ksp = kstat_create("vnet", instance, name, "net", KSTAT_TYPE_NAMED,
+		size, 0);
+	if (ksp == NULL) {
+		KMEM_FREE(statsp);
+		return (VGEN_FAILURE);
+	}
+
+	ldckp = (vgen_kstats_t *)ksp->ks_data;
+	kstat_named_init(&ldckp->ipackets,		"ipackets",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->ipackets64,		"ipackets64",
+		KSTAT_DATA_ULONGLONG);
+	kstat_named_init(&ldckp->ierrors,		"ierrors",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->opackets,		"opackets",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->opackets64,		"opackets64",
+		KSTAT_DATA_ULONGLONG);
+	kstat_named_init(&ldckp->oerrors,		"oerrors",
+		KSTAT_DATA_ULONG);
+
+
+	/* MIB II kstat variables */
+	kstat_named_init(&ldckp->rbytes,		"rbytes",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->rbytes64,		"rbytes64",
+		KSTAT_DATA_ULONGLONG);
+	kstat_named_init(&ldckp->obytes,		"obytes",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->obytes64,		"obytes64",
+		KSTAT_DATA_ULONGLONG);
+	kstat_named_init(&ldckp->multircv,		"multircv",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->multixmt,		"multixmt",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->brdcstrcv,		"brdcstrcv",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->brdcstxmt,		"brdcstxmt",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->norcvbuf,		"norcvbuf",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->noxmtbuf,		"noxmtbuf",
+		KSTAT_DATA_ULONG);
+
+	/* Tx stats */
+	kstat_named_init(&ldckp->tx_no_desc,		"tx_no_desc",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->tx_allocb_fail,	"tx_allocb_fail",
+		KSTAT_DATA_ULONG);
+
+	/* Rx stats */
+	kstat_named_init(&ldckp->rx_no_desc,		"rx_no_desc",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->rx_allocb_fail,	"rx_allocb_fail",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->rx_lost_pkts,		"rx_lost_pkts",
+		KSTAT_DATA_ULONG);
+
+	/* Interrupt stats */
+	kstat_named_init(&ldckp->callbacks,		"callbacks",
+		KSTAT_DATA_ULONG);
+	kstat_named_init(&ldckp->dring_data_acks,	"dring_data_acks",
+		KSTAT_DATA_ULONG);
+
+	ksp->ks_update = vgen_kstat_update;
+	ksp->ks_private = (void *)ldcp;
+	kstat_install(ksp);
+
+	ldcp->ksp = ksp;
+	ldcp->statsp = statsp;
+	return (VGEN_SUCCESS);
+}
+
+static void
+vgen_destroy_kstats(vgen_ldc_t *ldcp)
+{
+	if (ldcp->ksp)
+		kstat_delete(ldcp->ksp);
+	KMEM_FREE(ldcp->statsp);
+}
+
+static int
+vgen_kstat_update(kstat_t *ksp, int rw)
+{
+	vgen_ldc_t *ldcp;
+	vgen_stats_t *statsp;
+	vgen_kstats_t *ldckp;
+
+	ldcp = (vgen_ldc_t *)ksp->ks_private;
+	statsp = ldcp->statsp;
+	ldckp = (vgen_kstats_t *)ksp->ks_data;
+
+	if (rw == KSTAT_READ) {
+		ldckp->ipackets.value.ul	= (uint32_t)statsp->ipackets;
+		ldckp->ipackets64.value.ull	= statsp->ipackets;
+		ldckp->ierrors.value.ul		= statsp->ierrors;
+		ldckp->opackets.value.ul	= (uint32_t)statsp->opackets;
+		ldckp->opackets64.value.ull	= statsp->opackets;
+		ldckp->oerrors.value.ul		= statsp->oerrors;
+
+		/*
+		 * MIB II kstat variables
+		 */
+		ldckp->rbytes.value.ul		= (uint32_t)statsp->rbytes;
+		ldckp->rbytes64.value.ull	= statsp->rbytes;
+		ldckp->obytes.value.ul		= (uint32_t)statsp->obytes;
+		ldckp->obytes64.value.ull	= statsp->obytes;
+		ldckp->multircv.value.ul	= statsp->multircv;
+		ldckp->multixmt.value.ul	= statsp->multixmt;
+		ldckp->brdcstrcv.value.ul	= statsp->brdcstrcv;
+		ldckp->brdcstxmt.value.ul	= statsp->brdcstxmt;
+		ldckp->norcvbuf.value.ul	= statsp->norcvbuf;
+		ldckp->noxmtbuf.value.ul	= statsp->noxmtbuf;
+
+		ldckp->tx_no_desc.value.ul	= statsp->tx_no_desc;
+		ldckp->tx_allocb_fail.value.ul	= statsp->tx_allocb_fail;
+
+		ldckp->rx_no_desc.value.ul	= statsp->rx_no_desc;
+		ldckp->rx_allocb_fail.value.ul	= statsp->rx_allocb_fail;
+		ldckp->rx_lost_pkts.value.ul	= statsp->rx_lost_pkts;
+
+		ldckp->callbacks.value.ul	= statsp->callbacks;
+		ldckp->dring_data_acks.value.ul	= statsp->dring_data_acks;
+	} else {
+		statsp->ipackets	= ldckp->ipackets64.value.ull;
+		statsp->ierrors		= ldckp->ierrors.value.ul;
+		statsp->opackets	= ldckp->opackets64.value.ull;
+		statsp->oerrors		= ldckp->oerrors.value.ul;
+
+		/*
+		 * MIB II kstat variables
+		 */
+		statsp->rbytes		= ldckp->rbytes64.value.ull;
+		statsp->obytes		= ldckp->obytes64.value.ull;
+		statsp->multircv	= ldckp->multircv.value.ul;
+		statsp->multixmt	= ldckp->multixmt.value.ul;
+		statsp->brdcstrcv	= ldckp->brdcstrcv.value.ul;
+		statsp->brdcstxmt	= ldckp->brdcstxmt.value.ul;
+		statsp->norcvbuf	= ldckp->norcvbuf.value.ul;
+		statsp->noxmtbuf	= ldckp->noxmtbuf.value.ul;
+
+		statsp->tx_no_desc	= ldckp->tx_no_desc.value.ul;
+		statsp->tx_allocb_fail	= ldckp->tx_allocb_fail.value.ul;
+
+		statsp->rx_no_desc	= ldckp->rx_no_desc.value.ul;
+		statsp->rx_allocb_fail	= ldckp->rx_allocb_fail.value.ul;
+		statsp->rx_lost_pkts	= ldckp->rx_lost_pkts.value.ul;
+
+		statsp->callbacks	= ldckp->callbacks.value.ul;
+		statsp->dring_data_acks	= ldckp->dring_data_acks.value.ul;
+	}
+
+	return (VGEN_SUCCESS);
+}
+
+/* handler for error messages received from the peer ldc end-point */
+static void
+vgen_handle_errmsg(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	_NOTE(ARGUNUSED(ldcp, tagp))
+}
+
+/* Check if the session id in the received message is valid */
+static int
+vgen_check_sid(vgen_ldc_t *ldcp, vio_msg_tag_t *tagp)
+{
+	if (tagp->vio_sid != ldcp->peer_sid) {
+		void *vnetp = LDC_TO_VNET(ldcp);
+		DWARN((vnetp,
+		    "sid mismatch: expected(%x), rcvd(%x)\n",
+		    ldcp->peer_sid, tagp->vio_sid));
+		return (VGEN_FAILURE);
+	}
+	else
+		return (VGEN_SUCCESS);
+}
+
+/* convert mac address from string to uint64_t */
+static uint64_t
+vgen_macaddr_strtoul(const uint8_t *macaddr)
+{
+	uint64_t val = 0;
+	int i;
+
+#if 0
+	for (i = ETHERADDRL - 1; i >= 0; i--) {
+#endif
+	for (i = 0; i < ETHERADDRL; i++) {
+		val <<= 8;
+		val |= macaddr[i];
+	}
+
+#if 0
+	cmn_err(CE_CONT, "vgen_macaddr_strtoul: str(%x:%x:%x:%x:%x:%x)\n",
+		macaddr[0], macaddr[1], macaddr[2],
+		macaddr[3], macaddr[4], macaddr[5]);
+	cmn_err(CE_CONT, "vgen_macaddr_strtoul: val(0x%lx)\n", val);
+#endif
+	return (val);
+}
+
+/* convert mac address from uint64_t to string */
+static int
+vgen_macaddr_ultostr(uint64_t val, uint8_t *macaddr)
+{
+	int i;
+	uint64_t value;
+
+	value = val;
+#if 0
+	for (i = 0; i < ETHERADDRL; i++) {
+#endif
+	for (i = ETHERADDRL - 1; i >= 0; i--) {
+		macaddr[i] = value & 0xFF;
+		value >>= 8;
+	}
+#if 0
+	cmn_err(CE_CONT, "vgen_macaddr_ultostr: val(0x%lx)\n", val);
+	cmn_err(CE_CONT, "vgen_macaddr_ultostr: str(%x:%x:%x:%x:%x:%x)\n",
+		macaddr[0], macaddr[1], macaddr[2],
+		macaddr[3], macaddr[4], macaddr[5]);
+#endif
+	return (VGEN_SUCCESS);
+}
+
+static caddr_t
+vgen_print_ethaddr(uint8_t *a, char *ebuf)
+{
+	(void) sprintf(ebuf,
+		"%x:%x:%x:%x:%x:%x", a[0], a[1], a[2], a[3], a[4], a[5]);
+	return (ebuf);
+}
+
+/* Handshake watchdog timeout handler */
+static void
+vgen_hwatchdog(void *arg)
+{
+	vgen_ldc_t *ldcp = (vgen_ldc_t *)arg;
+	void *vnetp = LDC_TO_VNET(ldcp);
+
+	DWARN((vnetp,
+	    "vgen_hwatchdog: handshake timeout ldc(%lx) phase(%x) state(%x)\n",
+	    ldcp->ldc_id, ldcp->hphase, ldcp->hstate));
+
+	mutex_enter(&ldcp->cblock);
+	ldcp->htid = 0;
+	vgen_handshake_retry(ldcp);
+	mutex_exit(&ldcp->cblock);
+}
+
+static void
+vgen_print_attr_info(vgen_ldc_t *ldcp, int endpoint)
+{
+	vgen_hparams_t *hp;
+	char ep[8];
+	uint8_t addr[6];
+	char	ea[6];
+
+	if (endpoint == VGEN_LOCAL) {
+		hp = &ldcp->local_hparams;
+		(void) sprintf(ep, "Local");
+	} else {
+		hp = &ldcp->peer_hparams;
+		(void) sprintf(ep, "Peer");
+	}
+	(void) vgen_macaddr_ultostr(hp->addr, addr);
+	cmn_err(CE_CONT, "attr_info: %s: \n", ep);
+	cmn_err(CE_CONT, "\tMTU: %lx, addr: %s\n", hp->mtu,
+				vgen_print_ethaddr(addr, ea));
+	cmn_err(CE_CONT, "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n",
+		hp->addr_type, hp->xfer_mode, hp->ack_freq);
+}
+
+static void
+vgen_print_hparams(vgen_hparams_t *hp)
+{
+	uint8_t	addr[6];
+	char	ea[6];
+	ldc_mem_cookie_t *dc;
+
+	cmn_err(CE_CONT, "version_info:\n");
+	cmn_err(CE_CONT,
+	    "\tver_major: %d, ver_minor: %d, dev_class: %d\n",
+	    hp->ver_major, hp->ver_minor, hp->dev_class);
+
+	(void) vgen_macaddr_ultostr(hp->addr, addr);
+	cmn_err(CE_CONT, "attr_info:\n");
+	cmn_err(CE_CONT, "\tMTU: %lx, addr: %s\n", hp->mtu,
+	    vgen_print_ethaddr(addr, ea));
+	cmn_err(CE_CONT,
+	    "\taddr_type: %x, xfer_mode: %x, ack_freq: %x\n",
+	    hp->addr_type, hp->xfer_mode, hp->ack_freq);
+
+	dc = &hp->dring_cookie;
+	cmn_err(CE_CONT, "dring_info:\n");
+	cmn_err(CE_CONT,
+	    "\tlength: %d, dsize: %d\n", hp->num_desc, hp->desc_size);
+	cmn_err(CE_CONT,
+	    "\tldc_addr: 0x%lx, ldc_size: %ld\n",
+	    dc->addr, dc->size);
+	cmn_err(CE_CONT, "\tdring_ident: 0x%lx\n", hp->dring_ident);
+}
+
+static void
+vgen_print_ldcinfo(vgen_ldc_t *ldcp)
+{
+	vgen_hparams_t *hp;
+
+	cmn_err(CE_CONT, "Channel Information:\n");
+	cmn_err(CE_CONT,
+	    "\tldc_id: 0x%lx, ldc_status: 0x%x\n",
+	    ldcp->ldc_id, ldcp->ldc_status);
+	cmn_err(CE_CONT,
+	    "\tlocal_sid: 0x%x, peer_sid: 0x%x\n",
+	    ldcp->local_sid, ldcp->peer_sid);
+	cmn_err(CE_CONT,
+	    "\thphase: 0x%x, hstate: 0x%x\n",
+	    ldcp->hphase, ldcp->hstate);
+
+	cmn_err(CE_CONT, "Local handshake params:\n");
+	hp = &ldcp->local_hparams;
+	vgen_print_hparams(hp);
+
+	cmn_err(CE_CONT, "Peer handshake params:\n");
+	hp = &ldcp->peer_hparams;
+	vgen_print_hparams(hp);
+}
--- a/usr/src/uts/sun4v/io/vnex.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/io/vnex.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -96,7 +95,8 @@
 	{"loop", 	PIL_3},
 	{"sunmc", 	PIL_3},
 	{"sunvts", 	PIL_3},
-	{"explorer", 	PIL_3}
+	{"explorer", 	PIL_3},
+	{"ncp", 	PIL_8}
 };
 
 #define	VNEX_MAX_DEVS	(sizeof (vnex_name_to_pil) /	\
@@ -423,6 +423,20 @@
 	return (DDI_SUCCESS);
 }
 
+int
+vnex_ino_to_inum(dev_info_t *dip, uint32_t ino)
+{
+	vnex_id_t		*vid_p;
+	ddi_intr_handle_impl_t	*hdlp;
+
+	if ((vid_p = vnex_locate_id(dip, ino)) == NULL)
+		return (-1);
+	else if ((hdlp = vid_p->vid_ddi_hdlp) == NULL)
+		return (-1);
+	else
+		return (hdlp->ih_inum);
+}
+
 static int
 vnex_add_intr(dev_info_t *dip, dev_info_t *rdip,
     ddi_intr_handle_impl_t *hdlp)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/io/vsw.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,6959 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/debug.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/user.h>
+#include <sys/stropts.h>
+#include <sys/stream.h>
+#include <sys/strlog.h>
+#include <sys/strsubr.h>
+#include <sys/cmn_err.h>
+#include <sys/cpu.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ksynch.h>
+#include <sys/stat.h>
+#include <sys/kstat.h>
+#include <sys/vtrace.h>
+#include <sys/strsun.h>
+#include <sys/dlpi.h>
+#include <sys/ethernet.h>
+#include <net/if.h>
+#include <sys/varargs.h>
+#include <sys/machsystm.h>
+#include <sys/modctl.h>
+#include <sys/modhash.h>
+#include <sys/mac.h>
+#include <sys/taskq.h>
+#include <sys/note.h>
+#include <sys/mach_descrip.h>
+#include <sys/mac.h>
+#include <sys/mdeg.h>
+#include <sys/ldc.h>
+#include <sys/vsw_fdb.h>
+#include <sys/vsw.h>
+#include <sys/vio_mailbox.h>
+#include <sys/vnet_mailbox.h>
+#include <sys/vnet_common.h>
+
+/*
+ * Function prototypes.
+ */
+static	int vsw_attach(dev_info_t *, ddi_attach_cmd_t);
+static	int vsw_detach(dev_info_t *, ddi_detach_cmd_t);
+static	int vsw_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
+static	void vsw_get_md_properties(vsw_t *vswp);
+static	int vsw_setup_layer2(vsw_t *);
+static	int vsw_setup_layer3(vsw_t *);
+
+/* MAC layer routines */
+static	int vsw_mac_attach(vsw_t *vswp);
+static	void vsw_mac_detach(vsw_t *vswp);
+static void vsw_notify_cb(void *, mac_notify_type_t);
+static void vsw_rx_cb(void *, mac_resource_handle_t, mblk_t *);
+static mblk_t *vsw_tx_msg(vsw_t *, mblk_t *);
+static int vsw_mac_register(vsw_t *);
+static int vsw_mac_unregister(vsw_t *);
+static uint64_t vsw_m_stat(void *arg, enum mac_stat);
+static void vsw_m_stop(void *arg);
+static int vsw_m_start(void *arg);
+static int vsw_m_unicst(void *arg, const uint8_t *);
+static int vsw_m_multicst(void *arg, boolean_t, const uint8_t *);
+static int vsw_m_promisc(void *arg, boolean_t);
+static mblk_t *vsw_m_tx(void *arg, mblk_t *);
+static void vsw_m_resources(void *arg);
+static void vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp);
+
+/* MDEG routines */
+static	void vsw_mdeg_register(vsw_t *vswp);
+static	void vsw_mdeg_unregister(vsw_t *vswp);
+static	int vsw_mdeg_cb(void *cb_argp, mdeg_result_t *);
+
+/* Port add/deletion routines */
+static	int vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node);
+static	int vsw_port_attach(vsw_t *vswp, int p_instance,
+	uint64_t *ldcids, int nids, struct ether_addr *macaddr);
+static	int vsw_detach_ports(vsw_t *vswp);
+static	int vsw_port_detach(vsw_t *vswp, int p_instance);
+static	int vsw_port_delete(vsw_port_t *port);
+static	int vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id);
+static	int vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id);
+static	int vsw_init_ldcs(vsw_port_t *port);
+static	int vsw_uninit_ldcs(vsw_port_t *port);
+static	int vsw_ldc_init(vsw_ldc_t *ldcp);
+static	int vsw_ldc_uninit(vsw_ldc_t *ldcp);
+static	int vsw_drain_ldcs(vsw_port_t *port);
+static	int vsw_drain_port_taskq(vsw_port_t *port);
+static	void vsw_marker_task(void *);
+static	vsw_port_t *vsw_lookup_port(vsw_t *vswp, int p_instance);
+static	int vsw_plist_del_node(vsw_t *, vsw_port_t *port);
+
+/* Interrupt routines */
+static	uint_t vsw_ldc_cb(uint64_t cb, caddr_t arg);
+
+/* Handshake routines */
+static	void vsw_restart_handshake(vsw_ldc_t *);
+static	int vsw_check_flag(vsw_ldc_t *, int, uint64_t);
+static	void vsw_next_milestone(vsw_ldc_t *);
+static	int vsw_supported_version(vio_ver_msg_t *);
+
+/* Data processing routines */
+static void vsw_process_pkt(void *);
+static void vsw_dispatch_ctrl_task(vsw_ldc_t *, void *, vio_msg_tag_t);
+static void vsw_process_ctrl_pkt(void *);
+static void vsw_process_ctrl_ver_pkt(vsw_ldc_t *, void *);
+static void vsw_process_ctrl_attr_pkt(vsw_ldc_t *, void *);
+static void vsw_process_ctrl_mcst_pkt(vsw_ldc_t *, void *);
+static void vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *, void *);
+static void vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *, void *);
+static void vsw_process_ctrl_rdx_pkt(vsw_ldc_t *, void *);
+static void vsw_process_data_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
+static void vsw_process_data_dring_pkt(vsw_ldc_t *, void *);
+static void vsw_process_data_raw_pkt(vsw_ldc_t *, void *);
+static void vsw_process_data_ibnd_pkt(vsw_ldc_t *, void *);
+static void vsw_process_err_pkt(vsw_ldc_t *, void *, vio_msg_tag_t);
+
+/* Switching/data transmit routines */
+static	void vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
+	    vsw_port_t *port, mac_resource_handle_t);
+static	void vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
+	    vsw_port_t *port, mac_resource_handle_t);
+static	int vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller,
+	    vsw_port_t *port);
+static	int vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller,
+	    vsw_port_t *port);
+static	int vsw_portsend(vsw_port_t *, mblk_t *);
+static	int vsw_dringsend(vsw_ldc_t *, mblk_t *);
+static	int vsw_descrsend(vsw_ldc_t *, mblk_t *);
+
+/* Packet creation routines */
+static void vsw_send_ver(vsw_ldc_t *);
+static void vsw_send_attr(vsw_ldc_t *);
+static vio_dring_reg_msg_t *vsw_create_dring_info_pkt(vsw_ldc_t *);
+static void vsw_send_dring_info(vsw_ldc_t *);
+static void vsw_send_rdx(vsw_ldc_t *);
+
+static void vsw_send_msg(vsw_ldc_t *, void *, int);
+
+/* Forwarding database (FDB) routines */
+static	int vsw_add_fdb(vsw_t *vswp, vsw_port_t *port);
+static	int vsw_del_fdb(vsw_t *vswp, vsw_port_t *port);
+static	vsw_port_t *vsw_lookup_fdb(vsw_t *vswp, struct ether_header *);
+static	int vsw_add_rem_mcst(vnet_mcast_msg_t *, vsw_port_t *);
+static	int vsw_add_mcst(vsw_t *, uint8_t, uint64_t, void *);
+static	int vsw_del_mcst(vsw_t *, uint8_t, uint64_t, void *);
+static	void vsw_del_addr(uint8_t, void *, uint64_t);
+static	void vsw_del_mcst_port(vsw_port_t *);
+static	void vsw_del_mcst_vsw(vsw_t *);
+
+/* Dring routines */
+static dring_info_t *vsw_create_dring(vsw_ldc_t *);
+static void vsw_create_privring(vsw_ldc_t *);
+static int vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp);
+static int vsw_dring_find_free_desc(dring_info_t *, vsw_private_desc_t **,
+    int *);
+static void vsw_dring_priv2pub(vsw_private_desc_t *);
+static dring_info_t *vsw_ident2dring(lane_t *, uint64_t);
+
+static void vsw_set_lane_attr(vsw_t *, lane_t *);
+static int vsw_check_attr(vnet_attr_msg_t *, vsw_port_t *);
+static int vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg);
+static int vsw_mem_cookie_match(ldc_mem_cookie_t *, ldc_mem_cookie_t *);
+static int vsw_check_dring_info(vio_dring_reg_msg_t *);
+
+/* Misc support routines */
+static	caddr_t vsw_print_ethaddr(uint8_t *addr, char *ebuf);
+
+static void vsw_free_lane_resources(vsw_ldc_t *, uint64_t);
+static int vsw_free_ring(dring_info_t *);
+
+/* Debugging routines */
+static void dump_flags(uint64_t);
+static void display_state(void);
+static void display_lane(lane_t *);
+static void display_ring(dring_info_t *);
+
+int	vsw_num_handshakes = 3;		/* # of handshake attempts */
+int	vsw_wretries = 100;		/* # of write attempts */
+
+/*
+ * mode specific frame switching function
+ */
+void		(*vsw_switch_frame)(vsw_t *, mblk_t *, int, vsw_port_t *,
+			mac_resource_handle_t);
+
+static	struct	cb_ops	vsw_cb_ops = {
+	nulldev,			/* cb_open */
+	nulldev,			/* cb_close */
+	nodev,				/* cb_strategy */
+	nodev,				/* cb_print */
+	nodev,				/* cb_dump */
+	nodev,				/* cb_read */
+	nodev,				/* cb_write */
+	nodev,				/* cb_ioctl */
+	nodev,				/* cb_devmap */
+	nodev,				/* cb_mmap */
+	nodev,				/* cb_segmap */
+	nochpoll,			/* cb_chpoll */
+	ddi_prop_op,			/* cb_prop_op */
+	NULL,				/* cb_stream */
+	D_MP,				/* cb_flag */
+	CB_REV,				/* rev */
+	nodev,				/* int (*cb_aread)() */
+	nodev				/* int (*cb_awrite)() */
+};
+
+static	struct	dev_ops	vsw_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* devo_refcnt */
+	vsw_getinfo,		/* devo_getinfo */
+	nulldev,		/* devo_identify */
+	nulldev,		/* devo_probe */
+	vsw_attach,		/* devo_attach */
+	vsw_detach,		/* devo_detach */
+	nodev,			/* devo_reset */
+	&vsw_cb_ops,		/* devo_cb_ops */
+	(struct bus_ops *)NULL,	/* devo_bus_ops */
+	ddi_power		/* devo_power */
+};
+
+extern	struct	mod_ops	mod_driverops;
+static struct modldrv vswmodldrv = {
+	&mod_driverops,
+	"sun4v Virtual Switch Driver %I%",
+	&vsw_ops,
+};
+
+#define	LDC_ENTER_LOCK(ldcp)	\
+				mutex_enter(&((ldcp)->ldc_cblock));\
+				mutex_enter(&((ldcp)->ldc_txlock));
+#define	LDC_EXIT_LOCK(ldcp)	\
+				mutex_exit(&((ldcp)->ldc_txlock));\
+				mutex_exit(&((ldcp)->ldc_cblock));
+
+/* Driver soft state ptr  */
+static void	*vsw_state;
+
+/*
+ * Linked list of "vsw_t" structures - one per instance.
+ */
+vsw_t		*vsw_head = NULL;
+krwlock_t	vsw_rw;
+
+/*
+ * Property names
+ */
+static char vdev_propname[] = "virtual-device";
+static char vsw_propname[] = "virtual-network-switch";
+static char physdev_propname[] = "vsw-phys-dev";
+static char smode_propname[] = "vsw-switch-mode";
+static char macaddr_propname[] = "local-mac-address";
+static char remaddr_propname[] = "remote-mac-address";
+static char ldcids_propname[] = "ldc-ids";
+static char chan_propname[] = "channel-endpoint";
+static char id_propname[] = "id";
+static char reg_propname[] = "reg";
+
+/* supported versions */
+static	ver_sup_t	vsw_versions[] = { {1, 0} };
+
+/*
+ * Matching criteria passed to the MDEG to register interest
+ * in changes to 'virtual-device-port' nodes identified by their
+ * 'id' property.
+ */
+static md_prop_match_t vport_prop_match[] = {
+	{ MDET_PROP_VAL,    "id"   },
+	{ MDET_LIST_END,    NULL    }
+};
+
+static mdeg_node_match_t vport_match = { "virtual-device-port",
+						vport_prop_match };
+
+/*
+ * Specification of an MD node passed to the MDEG to filter any
+ * 'vport' nodes that do not belong to the specified node. This
+ * template is copied for each vsw instance and filled in with
+ * the appropriate 'cfg-handle' value before being passed to the MDEG.
+ */
+static mdeg_prop_spec_t vsw_prop_template[] = {
+	{ MDET_PROP_STR,    "name",		vsw_propname },
+	{ MDET_PROP_VAL,    "cfg-handle",	NULL	},
+	{ MDET_LIST_END,    NULL,		NULL	}
+};
+
+#define	VSW_SET_MDEG_PROP_INST(specp, val)	(specp)[1].ps_val = (val);
+
+/*
+ * Print debug messages - set to 0x1f to enable all msgs
+ * or 0x0 to turn all off.
+ */
+int vswdbg = 0x0;
+
+/*
+ * debug levels:
+ * 0x01:	Function entry/exit tracing
+ * 0x02:	Internal function messages
+ * 0x04:	Verbose internal messages
+ * 0x08:	Warning messages
+ * 0x10:	Error messages
+ */
+
+static void
+vswdebug(vsw_t *vswp, const char *fmt, ...)
+{
+	char buf[512];
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) vsprintf(buf, fmt, ap);
+	va_end(ap);
+
+	if (vswp == NULL)
+		cmn_err(CE_CONT, "%s\n", buf);
+	else
+		cmn_err(CE_CONT, "vsw%d: %s\n", vswp->instance, buf);
+}
+
+/*
+ * For the moment the state dump routines have their own
+ * private flag.
+ */
+#define	DUMP_STATE	0
+
+#if DUMP_STATE
+
+#define	DUMP_TAG(tag) \
+{			\
+	D1(NULL, "DUMP_TAG: type 0x%llx", (tag).vio_msgtype); \
+	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag).vio_subtype);	\
+	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag).vio_subtype_env);	\
+}
+
+#define	DUMP_TAG_PTR(tag) \
+{			\
+	D1(NULL, "DUMP_TAG: type 0x%llx", (tag)->vio_msgtype); \
+	D1(NULL, "DUMP_TAG: stype 0x%llx", (tag)->vio_subtype);	\
+	D1(NULL, "DUMP_TAG: senv 0x%llx", (tag)->vio_subtype_env);	\
+}
+
+#define	DUMP_FLAGS(flags) dump_flags(flags);
+#define	DISPLAY_STATE()	display_state()
+
+#else
+
+#define	DUMP_TAG(tag)
+#define	DUMP_TAG_PTR(tag)
+#define	DUMP_FLAGS(state)
+#define	DISPLAY_STATE()
+
+#endif	/* DUMP_STATE */
+
+#ifdef DEBUG
+
+#define	D1		\
+if (vswdbg & 0x01)	\
+	vswdebug
+
+#define	D2		\
+if (vswdbg & 0x02)	\
+	vswdebug
+
+#define	D3		\
+if (vswdbg & 0x04)	\
+	vswdebug
+
+#define	DWARN		\
+if (vswdbg & 0x08)	\
+	vswdebug
+
+#define	DERR		\
+if (vswdbg & 0x10)	\
+	vswdebug
+
+#else
+
+#define	DERR		if (0)	vswdebug
+#define	DWARN		if (0)	vswdebug
+#define	D1		if (0)	vswdebug
+#define	D2		if (0)	vswdebug
+#define	D3		if (0)	vswdebug
+
+#endif	/* DEBUG */
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	&vswmodldrv,
+	NULL
+};
+
+int
+_init(void)
+{
+	int status;
+
+	rw_init(&vsw_rw, NULL, RW_DRIVER, NULL);
+
+	status = ddi_soft_state_init(&vsw_state, sizeof (vsw_t), 1);
+	if (status != 0) {
+		return (status);
+	}
+
+	mac_init_ops(&vsw_ops, "vsw");
+	status = mod_install(&modlinkage);
+	if (status != 0) {
+		ddi_soft_state_fini(&vsw_state);
+	}
+	return (status);
+}
+
+int
+_fini(void)
+{
+	int status;
+
+	status = mod_remove(&modlinkage);
+	if (status != 0)
+		return (status);
+	mac_fini_ops(&vsw_ops);
+	ddi_soft_state_fini(&vsw_state);
+
+	rw_destroy(&vsw_rw);
+
+	return (status);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+static int
+vsw_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+	vsw_t		*vswp;
+	int		smode, instance, i;
+	char		hashname[MAXNAMELEN];
+	char		qname[TASKQ_NAMELEN];
+	int		rv = 1;
+	enum		{ PROG_init = 0x0, PROG_if_lock = 0x1,
+				PROG_fdb = 0x2, PROG_mfdb = 0x4,
+				PROG_report_dev = 0x8, PROG_plist = 0x10,
+				PROG_taskq = 0x20}
+			progress;
+
+	progress = PROG_init;
+
+	switch (cmd) {
+	case DDI_ATTACH:
+		break;
+	case DDI_RESUME:
+		/* nothing to do for this non-device */
+		return (DDI_SUCCESS);
+	case DDI_PM_RESUME:
+	default:
+		return (DDI_FAILURE);
+	}
+
+	instance = ddi_get_instance(dip);
+	if (ddi_soft_state_zalloc(vsw_state, instance) != DDI_SUCCESS) {
+		DERR(NULL, "vsw%d: ddi_soft_state_zalloc failed", instance);
+		return (DDI_FAILURE);
+	}
+	vswp = ddi_get_soft_state(vsw_state, instance);
+
+	if (vswp == NULL) {
+		DERR(NULL, "vsw%d: ddi_get_soft_state failed", instance);
+		goto vsw_attach_fail;
+	}
+
+	vswp->dip = dip;
+	vswp->instance = instance;
+	ddi_set_driver_private(dip, (caddr_t)vswp);
+
+	rw_init(&vswp->if_lockrw, NULL, RW_DRIVER, NULL);
+
+	progress |= PROG_if_lock;
+
+	/*
+	 * User specifies (via MD) an array of switching modes in
+	 * decreasing order of preference. Default mode is always
+	 * layer 2 (mac switching), so init array with that value.
+	 */
+	vswp->smode_idx = 0;
+	for (i = 0; i < NUM_SMODES; i++)
+		vswp->smode[i] = VSW_LAYER2;
+
+	/*
+	 * Get the various properties such as physical device name
+	 * (vsw-phys-dev), switch mode etc from the MD.
+	 */
+	vsw_get_md_properties(vswp);
+
+	/* setup the unicast forwarding database  */
+	(void) snprintf(hashname, MAXNAMELEN, "vsw_unicst_table-%d",
+							vswp->instance);
+	D2(vswp, "creating unicast hash table (%s)...", hashname);
+	vswp->fdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
+		mod_hash_null_valdtor, sizeof (void *));
+
+	progress |= PROG_fdb;
+
+	/* setup the multicast forwarding database */
+	(void) snprintf(hashname, MAXNAMELEN, "vsw_mcst_table-%d",
+							vswp->instance);
+	D2(vswp, "creating multicast hash table %s)...", hashname);
+	rw_init(&vswp->mfdbrw, NULL, RW_DRIVER, NULL);
+	vswp->mfdb = mod_hash_create_ptrhash(hashname, VSW_NCHAINS,
+			mod_hash_null_valdtor, sizeof (void *));
+
+	progress |= PROG_mfdb;
+
+	/*
+	 * create lock protecting list of multicast addresses
+	 * which could come via m_multicst() entry point when plumbed.
+	 */
+	mutex_init(&vswp->mca_lock, NULL, MUTEX_DRIVER, NULL);
+	vswp->mcap = NULL;
+
+	ddi_report_dev(vswp->dip);
+
+	progress |= PROG_report_dev;
+
+	WRITE_ENTER(&vsw_rw);
+	vswp->next = vsw_head;
+	vsw_head = vswp;
+	RW_EXIT(&vsw_rw);
+
+	/* setup the port list */
+	rw_init(&vswp->plist.lockrw, NULL, RW_DRIVER, NULL);
+	vswp->plist.head = NULL;
+
+	progress |= PROG_plist;
+
+	/*
+	 * Create the taskq which will process all the VIO
+	 * control messages.
+	 */
+	(void) snprintf(qname, TASKQ_NAMELEN, "vsw_taskq%d", vswp->instance);
+	if ((vswp->taskq_p = ddi_taskq_create(vswp->dip, qname, 1,
+					TASKQ_DEFAULTPRI, 0)) == NULL) {
+		cmn_err(CE_WARN, "Unable to create task queue");
+		goto vsw_attach_fail;
+	}
+
+	progress |= PROG_taskq;
+
+	/* select best switching mode */
+	for (i = 0; i < NUM_SMODES; i++) {
+		smode = vswp->smode[i];
+		switch (smode) {
+		case VSW_LAYER2:
+			rv = vsw_setup_layer2(vswp);
+			break;
+
+		case VSW_LAYER2_PROMISC:
+			rv = vsw_setup_layer2(vswp);
+			break;
+
+		case VSW_LAYER3:
+			rv = vsw_setup_layer3(vswp);
+			break;
+
+		default:
+			DERR(vswp, "unknown switch mode");
+			break;
+		}
+
+		if (rv == 0) {
+			vswp->smode_idx = i;
+			break;
+		}
+	}
+
+	if (rv == 1) {
+		cmn_err(CE_WARN, "Unable to setup switching mode");
+		goto vsw_attach_fail;
+	}
+
+	D2(vswp, "Operating in mode %d", vswp->smode[vswp->smode_idx]);
+
+	/*
+	 * Register with the MAC layer as a network device so
+	 * we can be plumbed if desired.
+	 *
+	 * Do this in both layer 2 and layer 3 mode.
+	 */
+	vswp->if_state &= ~VSW_IF_UP;
+	vswp->if_macp = NULL;
+	vswp->if_mrh = NULL;
+	if (vswp->mdprops & VSW_MD_MACADDR) {
+		if (vsw_mac_register(vswp) != 0) {
+			cmn_err(CE_WARN, "Unable to register as provider "
+				" with MAC layer, continuing with attach");
+		}
+	}
+
+	/*
+	 * Now we have everything setup, register for MD change
+	 * events.
+	 */
+	vsw_mdeg_register(vswp);
+
+	return (DDI_SUCCESS);
+
+vsw_attach_fail:
+	DERR(NULL, "vsw_attach: failed");
+
+	if (progress & PROG_taskq)
+		ddi_taskq_destroy(vswp->taskq_p);
+
+	if (progress & PROG_plist)
+		rw_destroy(&vswp->plist.lockrw);
+
+	if (progress & PROG_report_dev) {
+		ddi_remove_minor_node(dip, NULL);
+		mutex_destroy(&vswp->mca_lock);
+	}
+
+	if (progress & PROG_mfdb) {
+		mod_hash_destroy_hash(vswp->mfdb);
+		vswp->mfdb = NULL;
+		rw_destroy(&vswp->mfdbrw);
+	}
+
+	if (progress & PROG_fdb) {
+		mod_hash_destroy_hash(vswp->fdb);
+		vswp->fdb = NULL;
+	}
+
+	if (progress & PROG_if_lock)
+		rw_destroy(&vswp->if_lockrw);
+
+	ddi_soft_state_free(vsw_state, instance);
+	return (DDI_FAILURE);
+}
+
+static int
+vsw_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	vsw_t	**vswpp, *vswp;
+	int 	instance;
+
+	instance = ddi_get_instance(dip);
+	vswp = ddi_get_soft_state(vsw_state, instance);
+
+	if (vswp == NULL) {
+		return (DDI_FAILURE);
+	}
+
+	switch (cmd) {
+	case DDI_DETACH:
+		break;
+	case DDI_SUSPEND:
+	case DDI_PM_SUSPEND:
+	default:
+		return (DDI_FAILURE);
+	}
+
+	D2(vswp, "detaching instance %d", instance);
+
+	if (vswp->mdprops & VSW_MD_MACADDR) {
+		if (vsw_mac_unregister(vswp) != 0) {
+			cmn_err(CE_WARN, "Unable to detach from MAC layer");
+			return (DDI_FAILURE);
+		}
+	}
+	rw_destroy(&vswp->if_lockrw);
+
+	vsw_mdeg_unregister(vswp);
+
+	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
+		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) {
+		vsw_mac_detach(vswp);
+	}
+
+	if (vsw_detach_ports(vswp) != 0) {
+		cmn_err(CE_WARN, "Unable to detach ports");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Remove this instance from any entries it may be on in
+	 * the hash table by using the list of addresses maintained
+	 * in the vsw_t structure.
+	 */
+	vsw_del_mcst_vsw(vswp);
+
+	vswp->mcap = NULL;
+	mutex_destroy(&vswp->mca_lock);
+
+	/*
+	 * By now any pending tasks have finished and the underlying
+	 * ldc's have been destroyed, so it's safe to delete the control
+	 * message taskq.
+	 */
+	if (vswp->taskq_p != NULL)
+		ddi_taskq_destroy(vswp->taskq_p);
+
+	/*
+	 * At this stage all the data pointers in the hash table
+	 * should be NULL, as all the ports have been removed and will
+	 * have deleted themselves from the port lists which the data
+	 * pointers point to. Hence we can destroy the table using the
+	 * default destructors.
+	 */
+	D2(vswp, "vsw_detach: destroying hash tables..");
+	mod_hash_destroy_hash(vswp->fdb);
+	vswp->fdb = NULL;
+
+	WRITE_ENTER(&vswp->mfdbrw);
+	mod_hash_destroy_hash(vswp->mfdb);
+	vswp->mfdb = NULL;
+	RW_EXIT(&vswp->mfdbrw);
+	rw_destroy(&vswp->mfdbrw);
+
+	ddi_remove_minor_node(dip, NULL);
+
+	rw_destroy(&vswp->plist.lockrw);
+	WRITE_ENTER(&vsw_rw);
+	for (vswpp = &vsw_head; *vswpp; vswpp = &(*vswpp)->next) {
+		if (*vswpp == vswp) {
+			*vswpp = vswp->next;
+			break;
+		}
+	}
+	RW_EXIT(&vsw_rw);
+	ddi_soft_state_free(vsw_state, instance);
+
+	return (DDI_SUCCESS);
+}
+
+static int
+vsw_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	_NOTE(ARGUNUSED(dip))
+
+	vsw_t	*vswp = NULL;
+	dev_t	dev = (dev_t)arg;
+	int	instance;
+
+	instance = getminor(dev);
+
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		if ((vswp = ddi_get_soft_state(vsw_state, instance)) == NULL) {
+			*result = NULL;
+			return (DDI_FAILURE);
+		}
+		*result = vswp->dip;
+		return (DDI_SUCCESS);
+
+	case DDI_INFO_DEVT2INSTANCE:
+		*result = (void *)(uintptr_t)instance;
+		return (DDI_SUCCESS);
+
+	default:
+		*result = NULL;
+		return (DDI_FAILURE);
+	}
+}
+
+/*
+ * Get the properties from our MD node.
+ */
+static void
+vsw_get_md_properties(vsw_t *vswp)
+{
+	md_t		*mdp = NULL;
+	int		num_nodes = 0;
+	int		len = 0, listsz = 0;
+	int		num_vdev = 0;
+	int		i, idx;
+	boolean_t	found_node = B_FALSE;
+	char		*smode = NULL;
+	char		*curr_mode = NULL;
+	char		*physname = NULL;
+	char		*node_name = NULL;
+	char		*dev;
+	uint64_t 	macaddr = 0;
+	uint64_t	md_inst, obp_inst;
+	mde_cookie_t	*listp = NULL;
+	mde_cookie_t	rootnode;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/*
+	 * Further down we compare the obp 'reg' property to the
+	 * 'cfg-handle' property in the vsw MD node to determine
+	 * if the node refers to this particular instance. So if
+	 * we can't read the obp value then there is no point
+	 * in proceeding further.
+	 */
+	if (ddi_prop_exists(DDI_DEV_T_ANY, vswp->dip,
+			DDI_PROP_DONTPASS, reg_propname) != 1) {
+		cmn_err(CE_WARN, "Unable to read %s property "
+			"from OBP device node", reg_propname);
+		return;
+	}
+
+	obp_inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
+		DDI_PROP_DONTPASS, reg_propname, 0);
+
+	D2(vswp, "%s: obp_inst 0x%llx", __func__, obp_inst);
+
+	if ((mdp = md_get_handle()) == NULL) {
+		DERR(vswp, "%s: unable to init MD", __func__);
+		return;
+	}
+
+	if ((num_nodes = md_node_count(mdp)) <= 0) {
+		DERR(vswp, "%s: invalid number of  nodes found %d",
+			__func__, num_nodes);
+		(void) md_fini_handle(mdp);
+		return;
+	}
+
+	D2(vswp, "%s: %d nodes in total in MD", __func__, num_nodes);
+
+	/* allocate enough space for node list */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+
+	rootnode = md_root_node(mdp);
+
+	/* Get the list of virtual devices */
+	num_vdev = md_scan_dag(mdp, rootnode,
+		md_find_name(mdp, vdev_propname),
+		md_find_name(mdp, "fwd"), listp);
+
+	if (num_vdev <= 0) {
+		DERR(vswp, "%s: didn't find any virtual-device nodes in MD",
+			__func__);
+		goto md_prop_exit;
+	}
+
+	D2(vswp, "%s: %d virtual-device nodes found", __func__, num_vdev);
+
+	/* Look for the virtual switch nodes in the list */
+	for (idx = 0; idx < num_vdev; idx++) {
+		if (md_get_prop_str(mdp, listp[idx],
+				"name", &node_name) != 0) {
+			DERR(vswp, "%s: unable to get node name", __func__);
+			continue;
+
+		}
+
+		if (strcmp(node_name, vsw_propname) == 0) {
+			/* Virtual switch node */
+			if (md_get_prop_val(mdp, listp[idx],
+				"cfg-handle", &md_inst) != 0) {
+				DERR(vswp, "%s: unable to get cfg-handle from"
+					" node %d", __func__, idx);
+				goto md_prop_exit;
+			} else if (md_inst == obp_inst) {
+				D2(vswp, "%s: found matching node (%d)"
+					" 0x%llx == 0x%llx", __func__, idx,
+					md_inst, obp_inst);
+				found_node = B_TRUE;
+				break;
+			}
+		}
+	}
+
+	if (!found_node) {
+		DWARN(vswp, "%s: couldn't find correct vsw node", __func__);
+		goto md_prop_exit;
+	}
+
+	/*
+	 * Now, having found the correct node, get the various properties.
+	 */
+
+	if (md_get_prop_data(mdp, listp[idx], physdev_propname,
+				(uint8_t **)(&physname), &len) != 0) {
+		cmn_err(CE_WARN, "%s: unable to get name(s) of physical "
+			"device(s) from MD", __func__);
+	} else if ((strlen(physname) + 1) > LIFNAMSIZ) {
+		cmn_err(CE_WARN, "%s is too long a device name", physname);
+	} else {
+		(void) strncpy(vswp->physname, physname, strlen(physname) + 1);
+		vswp->mdprops |= VSW_MD_PHYSNAME;
+		D2(vswp, "%s: using first device specified (%s)",
+			__func__, vswp->physname);
+	}
+
+
+#ifdef DEBUG
+	/*
+	 * As a temporary measure to aid testing we check to see if there
+	 * is a vsw.conf file present. If there is we use the value of the
+	 * vsw_physname property in the file as the name of the physical
+	 * device, overriding the value from the MD.
+	 *
+	 * There may be multiple devices listed, but for the moment
+	 * we just use the first one.
+	 */
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, vswp->dip, 0,
+		"vsw_physname", &dev) == DDI_PROP_SUCCESS) {
+		if ((strlen(dev) + 1) > LIFNAMSIZ) {
+			cmn_err(CE_WARN, "%s is too long a device name", dev);
+		} else {
+			cmn_err(CE_NOTE, "%s: using device name (%s) from "
+				"config file", __func__, dev);
+
+			(void) strncpy(vswp->physname, dev, strlen(dev) + 1);
+			vswp->mdprops |= VSW_MD_PHYSNAME;
+		}
+
+		ddi_prop_free(dev);
+
+	}
+#endif
+
+	/* local mac address */
+	if (md_get_prop_val(mdp, listp[idx],
+			macaddr_propname, &macaddr) != 0) {
+		cmn_err(CE_WARN, "%s: unable to get local MAC address",
+								__func__);
+	} else {
+		READ_ENTER(&vswp->if_lockrw);
+		for (i = ETHERADDRL - 1; i >= 0; i--) {
+			vswp->if_addr.ether_addr_octet[i] = macaddr & 0xFF;
+			macaddr >>= 8;
+		}
+		RW_EXIT(&vswp->if_lockrw);
+		vswp->mdprops |= VSW_MD_MACADDR;
+	}
+
+	/*
+	 * Get the switch-mode property. The modes are listed in
+	 * decreasing order of preference, i.e. preferred mode is
+	 * first item in list.
+	 */
+	len = 0;
+	if (md_get_prop_data(mdp, listp[idx], smode_propname,
+				(uint8_t **)(&smode), &len) != 0) {
+		/*
+		 * Unable to get switch-mode property, so just use
+		 * default values which vswp->smode[] array has already
+		 * been pre-populated with, namely layer2.
+		 */
+		cmn_err(CE_WARN, "%s: unable to get switch mode property, "
+			"defaulting to layer 2 mode", __func__);
+	} else {
+		i = 0;
+		curr_mode = smode;
+		/*
+		 * Modes of operation:
+		 * 'switched'	 - layer 2 switching, underlying HW in
+		 *			non-promiscuous mode.
+		 * 'promiscuous' - layer 2 switching, underlying HW in
+		 *			promiscuous mode.
+		 * 'routed'	 - layer 3 (i.e. IP) routing, underlying HW
+		 *			in non-promiscuous mode.
+		 */
+		while ((curr_mode < (smode + len)) && (i < NUM_SMODES)) {
+			D2(vswp, "%s: curr_mode = [%s]", __func__, curr_mode);
+			if (strcmp(curr_mode, "switched") == 0)
+				vswp->smode[i] = VSW_LAYER2;
+			else if (strcmp(curr_mode, "promiscuous") == 0)
+				vswp->smode[i] = VSW_LAYER2_PROMISC;
+			else if (strcmp(curr_mode, "routed") == 0)
+				vswp->smode[i] = VSW_LAYER3;
+			else {
+				DERR(vswp, "%s: unknown mode %s",
+					__func__, curr_mode);
+				/* default to layer 2 */
+				vswp->smode[i] = VSW_LAYER2;
+			}
+			curr_mode += strlen(curr_mode) + 1;
+			i++;
+		}
+
+		vswp->mdprops |= VSW_MD_SMODE;
+	}
+
+md_prop_exit:
+	(void) md_fini_handle(mdp);
+
+	kmem_free(listp, listsz);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+static int
+vsw_setup_layer2(vsw_t *vswp)
+{
+	int		rv = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	vsw_switch_frame = vsw_switch_l2_frame;
+
+	/*
+	 * Attempt to link into the MAC layer so we can get
+	 * and send packets out over the physical adapter.
+	 */
+	if (vswp->mdprops & VSW_MD_PHYSNAME) {
+		if (vsw_mac_attach(vswp) != 0) {
+			/*
+			 * Registration with the MAC layer has failed,
+			 * so return 1 so that can fall back to next
+			 * preferred switching method.
+			 */
+			cmn_err(CE_WARN, "!unable to join as MAC layer "
+				"client, continuing with attach");
+			rv = 1;
+		}
+	} else {
+		/* No physical device name found in MD */
+		DERR(vswp, "%s: no physical device name specified", __func__);
+		rv = 1;
+	}
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (rv);
+}
+
+static int
+vsw_setup_layer3(vsw_t *vswp)
+{
+	D1(vswp, "%s: enter", __func__);
+
+	D2(vswp, "%s: operating in layer 3 mode", __func__);
+	vsw_switch_frame = vsw_switch_l3_frame;
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Link into the MAC layer to gain access to the services provided by
+ * the underlying physical device driver (which should also have
+ * registered with the MAC layer).
+ *
+ * Only when in layer 2 mode.
+ */
+static int
+vsw_mac_attach(vsw_t *vswp)
+{
+	D1(vswp, "vsw_mac_attach: enter");
+
+	vswp->mh = NULL;
+	vswp->mrh = NULL;
+	vswp->mnh = NULL;
+
+	ASSERT(vswp->mdprops & VSW_MD_PHYSNAME);
+
+	if ((mac_open(vswp->physname, 0, &vswp->mh)) != 0) {
+		cmn_err(CE_WARN, "mac_open %s failed", vswp->physname);
+		goto mac_fail_exit;
+	}
+
+	D2(vswp, "vsw_mac_attach: using device %s", vswp->physname);
+
+	/* register for changes in the interface */
+	vswp->mnh = mac_notify_add(vswp->mh, vsw_notify_cb, (void *)vswp);
+
+	/* register our rx callback function */
+	vswp->mrh = mac_rx_add(vswp->mh, vsw_rx_cb, (void *)vswp);
+
+	/* get the MAC tx fn */
+	vswp->txinfo = mac_tx_get(vswp->mh);
+
+	/* start the interface */
+	if (mac_start(vswp->mh) != 0) {
+		cmn_err(CE_WARN, "could not start mac interface");
+		goto mac_fail_exit;
+	}
+
+	/* get and store original promisc setting */
+	vswp->init_promisc = mac_promisc_get(vswp->mh, MAC_DEVPROMISC);
+
+	/*
+	 * FUTURE: When we have the ability to set multiple unicast
+	 * mac address then we won't have to set the device into
+	 * promisc mode, but for the moment it's the only way we
+	 * can see pkts that logical domains we are serving are
+	 * interested in.
+	 */
+	if ((vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC) &&
+					(vswp->init_promisc == B_FALSE)) {
+		DERR(vswp, "vsw_mac_attach: enabling promisc mode..");
+
+		if (mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC) != 0) {
+			DERR(vswp, "vsw_mac_attach: unable to set device"
+				" into promiscuous mode");
+			goto mac_fail_exit;
+		}
+	}
+
+	D1(vswp, "vsw_mac_attach: exit");
+	return (0);
+
+mac_fail_exit:
+	if (vswp->mh != NULL) {
+		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
+		if (vswp->mrh != NULL)
+			mac_rx_remove(vswp->mh, vswp->mrh);
+
+		if (vswp->mnh != NULL)
+			mac_notify_remove(vswp->mh, vswp->mnh);
+
+		mac_close(vswp->mh);
+	}
+
+	vswp->mrh = NULL;
+	vswp->mnh = NULL;
+	vswp->mh = NULL;
+	vswp->txinfo = NULL;
+
+	D1(vswp, "vsw_mac_attach: fail exit");
+	return (1);
+}
+
+/*
+ * Undo vsw_mac_attach(): restore the device's original promiscuous
+ * setting, unregister the rx/notify callbacks, close the MAC handle
+ * and reset all MAC-related fields in the vswp. Safe to call even if
+ * the attach failed part way (mh may be NULL).
+ */
+static void
+vsw_mac_detach(vsw_t *vswp)
+{
+	D1(vswp, "vsw_mac_detach: enter");
+
+	if (vswp->mh != NULL) {
+		/* restore promisc to original setting */
+		mac_promisc_set(vswp->mh, vswp->init_promisc, MAC_DEVPROMISC);
+		if (vswp->mrh != NULL)
+			mac_rx_remove(vswp->mh, vswp->mrh);
+
+		if (vswp->mnh != NULL)
+			mac_notify_remove(vswp->mh, vswp->mnh);
+
+		mac_close(vswp->mh);
+	}
+
+	vswp->mrh = NULL;
+	vswp->mnh = NULL;
+	vswp->mh = NULL;
+	vswp->txinfo = NULL;
+
+	D1(vswp, "vsw_mac_detach: exit");
+}
+
+/*
+ * Get notified of changes to the interface.
+ *
+ * For the moment we brute force the interface back
+ * into promisc mode if it is unset (e.g. by snoop).
+ * When we have the ability to set multiple mac addresses,
+ * we will need to see if this is necessary.
+ *
+ * Registered via mac_notify_add() in vsw_mac_attach(); arg is the vswp.
+ */
+static void
+vsw_notify_cb(void *arg, mac_notify_type_t type)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	switch (type) {
+	case MAC_NOTE_PROMISC:
+		/* tx routine may have changed along with promisc state */
+		vswp->txinfo = mac_tx_get(vswp->mh);
+		if (mac_promisc_get(vswp->mh, MAC_DEVPROMISC) == B_TRUE) {
+			D2(vswp, "%s: still in PROMISC mode", __func__);
+		} else {
+			D2(vswp, "%s: now in NON-PROMISC mode", __func__);
+			D2(vswp, "...re-enabling");
+			mac_promisc_set(vswp->mh, B_TRUE, MAC_DEVPROMISC);
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * receive callback routine. Invoked by MAC layer when there
+ * are pkts being passed up from physical device.
+ *
+ * PERF: It may be more efficient when the card is in promisc
+ * mode to check the dest address of the pkts here (against
+ * the FDB) rather than checking later. Needs to be investigated.
+ */
+static void
+vsw_rx_cb(void *arg, mac_resource_handle_t mrh, mblk_t *mp)
+{
+	_NOTE(ARGUNUSED(mrh))
+
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	ASSERT(vswp != NULL);
+
+	D1(vswp, "vsw_rx_cb: enter");
+
+	/* switch the chain of packets received */
+	vsw_switch_frame(vswp, mp, VSW_PHYSDEV, NULL, NULL);
+
+	D1(vswp, "vsw_rx_cb: exit");
+}
+
+/*
+ * Send a message out over the physical device via the MAC layer.
+ *
+ * Returns any mblks that it was unable to transmit.
+ */
+static mblk_t *
+vsw_tx_msg(vsw_t *vswp, mblk_t *mp)
+{
+	const mac_txinfo_t	*mtp;
+	mblk_t			*rest;
+
+	/* no device attached: hand the whole chain back untouched */
+	if (vswp->mh == NULL) {
+		DERR(vswp, "vsw_tx_msg: dropping pkts: no tx routine avail");
+		return (mp);
+	}
+
+	/*
+	 * Walk the chain one mblk at a time, handing each to the
+	 * device's tx routine. If the device refuses one, re-link
+	 * the remainder of the chain behind it and return the lot.
+	 */
+	while (mp != NULL) {
+		rest = mp->b_next;
+		mp->b_next = NULL;
+
+		mtp = vswp->txinfo;
+		mp = mtp->mt_fn(mtp->mt_arg, mp);
+		if (mp != NULL) {
+			/* untransmitted: reattach the unsent remainder */
+			mp->b_next = rest;
+			break;
+		}
+
+		mp = rest;
+	}
+
+	return (mp);
+}
+
+/*
+ * Register with the MAC layer as a network device, so we
+ * can be plumbed if necessary.
+ *
+ * Allocates and populates the legacy mac_t registration structure
+ * (stored in vswp->if_macp, freed by vsw_mac_unregister()) and
+ * returns the result of mac_register() (0 on success).
+ */
+static int
+vsw_mac_register(vsw_t *vswp)
+{
+	mac_t		*macp = NULL;
+	mac_info_t	*mip = NULL;
+	int		rv = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	macp = kmem_zalloc(sizeof (mac_t), KM_SLEEP);
+
+	/*
+	 * Setup the m_info fields.
+	 */
+	mip = &(macp->m_info);
+	mip->mi_media = DL_ETHER;
+	mip->mi_sdu_min = 0;
+	mip->mi_sdu_max = ETHERMTU;
+	mip->mi_cksum = 0;
+	mip->mi_poll = DL_CAPAB_POLL;
+
+	mip->mi_addr_length = ETHERADDRL;
+	bcopy(&etherbroadcastaddr, mip->mi_brdcst_addr, ETHERADDRL);
+
+	/* snapshot our current unicast address under the i/f lock */
+	READ_ENTER(&vswp->if_lockrw);
+	bcopy(&vswp->if_addr, mip->mi_unicst_addr, ETHERADDRL);
+	RW_EXIT(&vswp->if_lockrw);
+
+	MAC_STAT_MIB(mip->mi_stat);
+	MAC_STAT_ETHER(mip->mi_stat);
+
+	/* entry points */
+	macp->m_stat = vsw_m_stat;
+	macp->m_stop = vsw_m_stop;
+	macp->m_start = vsw_m_start;
+	macp->m_unicst = vsw_m_unicst;
+	macp->m_multicst = vsw_m_multicst;
+	macp->m_promisc = vsw_m_promisc;
+	macp->m_tx = vsw_m_tx;
+	macp->m_resources = vsw_m_resources;
+	macp->m_ioctl = vsw_m_ioctl;
+
+	macp->m_port = 0;
+	macp->m_dip = vswp->dip;
+	macp->m_ident = MAC_IDENT;
+	macp->m_driver = vswp;
+
+	vswp->if_macp = macp;
+
+	/* register */
+	rv = mac_register(macp);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (rv);
+}
+
+/*
+ * Unregister this instance from the MAC layer and free the mac_t
+ * allocated by vsw_mac_register(). On success the interface is marked
+ * down. Returns 0 on success or the mac_unregister() error (in which
+ * case if_macp is left intact so the entry points remain valid).
+ */
+static int
+vsw_mac_unregister(vsw_t *vswp)
+{
+	int		rv = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&vswp->if_lockrw);
+
+	if (vswp->if_macp != NULL) {
+		rv = mac_unregister(vswp->if_macp);
+		if (rv != 0) {
+			DWARN(vswp, "%s: unable to unregister from MAC "
+				"framework", __func__);
+
+			RW_EXIT(&vswp->if_lockrw);
+			D1(vswp, "%s: fail exit", __func__);
+			return (rv);
+		}
+
+		/* mark i/f as down and promisc off */
+		vswp->if_state &= ~VSW_IF_UP;
+
+		kmem_free(vswp->if_macp, sizeof (mac_t));
+		vswp->if_macp = NULL;
+	}
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (rv);
+}
+
+/*
+ * MAC m_stat entry point: pass statistics requests straight through to
+ * the underlying physical device. Returns 0 when no physical device is
+ * attached or the device does not support the requested statistic.
+ */
+static uint64_t
+vsw_m_stat(void *arg, enum mac_stat stat)
+{
+	vsw_t			*vswp = (vsw_t *)arg;
+	const mac_info_t	*mip;
+
+	D1(vswp, "%s: enter", __func__);
+
+	if (vswp->mh != NULL)
+		mip = mac_info(vswp->mh);
+	else
+		return (0);
+
+	/* stat not supported by the underlying device */
+	if (!mip->mi_stat[stat])
+		return (0);
+
+	/* return stats from underlying device */
+	return (mac_stat_get(vswp->mh, stat));
+
+}
+
+/*
+ * MAC m_stop entry point: mark our network interface as down.
+ * Note the exit D1 reads if_state outside the lock (debug only).
+ */
+static void
+vsw_m_stop(void *arg)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&vswp->if_lockrw);
+	vswp->if_state &= ~VSW_IF_UP;
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
+}
+
+/*
+ * MAC m_start entry point: mark our network interface as up.
+ * Always succeeds.
+ */
+static int
+vsw_m_start(void *arg)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&vswp->if_lockrw);
+	vswp->if_state |= VSW_IF_UP;
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit (state = %d)", __func__, vswp->if_state);
+	return (0);
+}
+
+/*
+ * Change the local interface address.
+ *
+ * MAC m_unicst entry point: records the new unicast address in
+ * vswp->if_addr under the i/f lock. Always succeeds.
+ */
+static int
+vsw_m_unicst(void *arg, const uint8_t *macaddr)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&vswp->if_lockrw);
+	ether_copy(macaddr, &vswp->if_addr);
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * MAC m_multicst entry point: add or remove a multicast address for
+ * the local interface.
+ *
+ * The address is tracked in three places which must stay in sync:
+ * the multicast hash table (vsw_add_mcst/vsw_del_mcst), the mcap
+ * list hanging off the vswp, and (best effort) the underlying
+ * physical device.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_m_multicst(void *arg, boolean_t add, const uint8_t *mca)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+	mcst_addr_t	*mcst_p = NULL;
+	uint64_t	addr = 0x0;
+	int		i;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/*
+	 * Convert address into form that can be used
+	 * as hash table key.
+	 */
+	for (i = 0; i < ETHERADDRL; i++) {
+		addr = (addr << 8) | mca[i];
+	}
+
+	D2(vswp, "%s: addr = 0x%llx", __func__, addr);
+
+	if (add) {
+		D2(vswp, "%s: adding multicast", __func__);
+		if (vsw_add_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
+			/*
+			 * Update the list of multicast addresses
+			 * contained within the vsw_t structure to
+			 * include this new one.
+			 */
+			mcst_p = kmem_zalloc(sizeof (mcst_addr_t), KM_NOSLEEP);
+			if (mcst_p == NULL) {
+				DERR(vswp, "%s unable to alloc mem", __func__);
+				/*
+				 * Back out the hash table entry added
+				 * above, otherwise the hash table and the
+				 * mcap list fall out of sync and a later
+				 * remove would only half succeed.
+				 */
+				(void) vsw_del_mcst(vswp,
+				    VSW_LOCALDEV, addr, NULL);
+				return (1);
+			}
+			mcst_p->addr = addr;
+
+			mutex_enter(&vswp->mca_lock);
+			mcst_p->nextp = vswp->mcap;
+			vswp->mcap = mcst_p;
+			mutex_exit(&vswp->mca_lock);
+
+			/*
+			 * Call into the underlying driver to program the
+			 * address into HW.
+			 *
+			 * Note:
+			 * Can safely ignore the return value as the card
+			 * will for the moment always be in promisc mode.
+			 * When we can program multiple MAC addresses into the
+			 * HW then we will need to care about the return
+			 * value here.
+			 */
+			if (vswp->mh != NULL)
+				(void) mac_multicst_add(vswp->mh, mca);
+		}
+	} else {
+		D2(vswp, "%s: removing multicast", __func__);
+		/*
+		 * Remove the address from the hash table..
+		 */
+		if (vsw_del_mcst(vswp, VSW_LOCALDEV, addr, NULL) == 0) {
+
+			/*
+			 * ..and then from the list maintained in the
+			 * vsw_t structure.
+			 */
+			vsw_del_addr(VSW_LOCALDEV, vswp, addr);
+
+			if (vswp->mh != NULL)
+				(void) mac_multicst_remove(vswp->mh, mca);
+		}
+	}
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * MAC m_promisc entry point: record the requested promiscuity of the
+ * local interface in if_state. Always succeeds.
+ */
+static int
+vsw_m_promisc(void *arg, boolean_t on)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&vswp->if_lockrw);
+	if (on)
+		vswp->if_state |= VSW_IF_PROMISC;
+	else
+		vswp->if_state &= ~VSW_IF_PROMISC;
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * MAC m_tx entry point: packets originating from the local interface
+ * are handed to the switching function. Always consumes the chain
+ * (returns NULL), i.e. no flow control back to the stack.
+ */
+static mblk_t *
+vsw_m_tx(void *arg, mblk_t *mp)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	vsw_switch_frame(vswp, mp, VSW_LOCALDEV, NULL, NULL);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (NULL);
+}
+
+/*
+ * MAC m_resources entry point: advertise a single rx fifo (with
+ * interrupt blanking disabled) to the MAC layer and record the
+ * returned resource handle in if_mrh.
+ */
+static void
+vsw_m_resources(void *arg)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+	mac_rx_fifo_t	mrf;
+
+	D1(vswp, "%s: enter", __func__);
+
+	mrf.mrf_type = MAC_RX_FIFO;
+	mrf.mrf_blank = NULL;
+	mrf.mrf_arg = (void *)vswp;
+	mrf.mrf_normal_blank_time = 0;
+	mrf.mrf_normal_pkt_count = 0;
+
+	WRITE_ENTER(&vswp->if_lockrw);
+	vswp->if_mrh = mac_resource_add(vswp->if_macp, (mac_resource_t *)&mrf);
+	RW_EXIT(&vswp->if_lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * MAC m_ioctl entry point: no ioctls are supported, so every request
+ * is nak'd with ENOTSUP.
+ */
+static void
+vsw_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
+{
+	vsw_t		*vswp = (vsw_t *)arg;
+
+	D1(vswp, "%s: enter", __func__);
+
+	miocnak(q, mp, 0, ENOTSUP);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * Register for machine description (MD) updates.
+ *
+ * Builds a per-instance copy of the global property spec template,
+ * keyed by this instance's "reg" property, and registers it with the
+ * MD event generator so vsw_mdeg_cb() is invoked on matching MD
+ * changes. On success ownership of the allocated spec structures
+ * passes to the vswp (freed in vsw_mdeg_unregister()); on any failure
+ * everything is freed here and registration is silently skipped.
+ */
+static void
+vsw_mdeg_register(vsw_t *vswp)
+{
+	mdeg_prop_spec_t	*pspecp;
+	mdeg_node_spec_t	*inst_specp;
+	mdeg_handle_t		mdeg_hdl;
+	size_t			templatesz;
+	int			inst, rv;
+
+	D1(vswp, "%s: enter", __func__);
+
+	inst = ddi_prop_get_int(DDI_DEV_T_ANY, vswp->dip,
+		DDI_PROP_DONTPASS, reg_propname, -1);
+	if (inst == -1) {
+		DERR(vswp, "%s: unable to get %s property",
+						__func__, reg_propname);
+		return;
+	}
+
+	D2(vswp, "%s: instance %d registering with mdeg", __func__, inst);
+
+	/*
+	 * Allocate and initialize a per-instance copy
+	 * of the global property spec array that will
+	 * uniquely identify this vsw instance.
+	 */
+	templatesz = sizeof (vsw_prop_template);
+	pspecp = kmem_zalloc(templatesz, KM_SLEEP);
+
+	bcopy(vsw_prop_template, pspecp, templatesz);
+
+	VSW_SET_MDEG_PROP_INST(pspecp, inst);
+
+	/* initialize the complete prop spec structure */
+	inst_specp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
+	inst_specp->namep = "virtual-device";
+	inst_specp->specp = pspecp;
+
+	/* perform the registration */
+	rv = mdeg_register(inst_specp, &vport_match, vsw_mdeg_cb,
+	    (void *)vswp, &mdeg_hdl);
+
+	if (rv != MDEG_SUCCESS) {
+		DERR(vswp, "%s: mdeg_register failed (%d)\n", __func__, rv);
+		kmem_free(inst_specp, sizeof (mdeg_node_spec_t));
+		kmem_free(pspecp, templatesz);
+		return;
+	}
+
+	/* save off data that will be needed later */
+	vswp->inst_spec = inst_specp;
+	vswp->mdeg_hdl = mdeg_hdl;
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * Unregister from the MD event generator and release the registration
+ * data allocated by vsw_mdeg_register().
+ *
+ * Must tolerate a failed/skipped registration: vsw_mdeg_register()
+ * leaves inst_spec NULL in that case, so inst_spec is checked before
+ * it is dereferenced (the original code read inst_spec->specp first,
+ * which would panic on a NULL inst_spec).
+ */
+static void
+vsw_mdeg_unregister(vsw_t *vswp)
+{
+	D1(vswp, "vsw_mdeg_unregister: enter");
+
+	(void) mdeg_unregister(vswp->mdeg_hdl);
+
+	if (vswp->inst_spec != NULL) {
+		if (vswp->inst_spec->specp != NULL) {
+			(void) kmem_free(vswp->inst_spec->specp,
+				sizeof (vsw_prop_template));
+			vswp->inst_spec->specp = NULL;
+		}
+
+		(void) kmem_free(vswp->inst_spec,
+			sizeof (mdeg_node_spec_t));
+		vswp->inst_spec = NULL;
+	}
+
+	D1(vswp, "vsw_mdeg_unregister: exit");
+}
+
+/*
+ * MD event generator callback: invoked when the set of port nodes
+ * matching our registration changes. Attaches every node in the
+ * "added" list and detaches every node in the "removed" list
+ * (identified by its id property). Per-node failures are logged and
+ * skipped, so the callback itself only fails on a NULL result.
+ */
+static int
+vsw_mdeg_cb(void *cb_argp, mdeg_result_t *resp)
+{
+	vsw_t		*vswp;
+	int		idx;
+	md_t		*mdp;
+	mde_cookie_t	node;
+	uint64_t	inst;
+
+	if (resp == NULL)
+		return (MDEG_FAILURE);
+
+	vswp = (vsw_t *)cb_argp;
+
+	D1(vswp, "%s: added %d : removed %d : matched %d",
+		__func__, resp->added.nelem, resp->removed.nelem,
+		resp->match_prev.nelem);
+
+	/* process added ports */
+	for (idx = 0; idx < resp->added.nelem; idx++) {
+		mdp = resp->added.mdp;
+		node = resp->added.mdep[idx];
+
+		D2(vswp, "%s: adding node(%d) 0x%lx", __func__, idx, node);
+
+		if (vsw_port_add(vswp, mdp, &node) != 0) {
+			cmn_err(CE_WARN, "Unable to add new port (0x%lx)",
+					node);
+		}
+	}
+
+	/* process removed ports */
+	for (idx = 0; idx < resp->removed.nelem; idx++) {
+		mdp = resp->removed.mdp;
+		node = resp->removed.mdep[idx];
+
+		if (md_get_prop_val(mdp, node, id_propname, &inst)) {
+			DERR(vswp, "%s: prop(%s) not found port(%d)",
+				__func__, id_propname, idx);
+			continue;
+		}
+
+		D2(vswp, "%s: removing node(%d) 0x%lx", __func__, idx, node);
+
+		if (vsw_port_detach(vswp, inst) != 0) {
+			cmn_err(CE_WARN, "Unable to remove port %ld", inst);
+		}
+	}
+
+	/*
+	 * Currently no support for updating already active ports.
+	 * So, ignore the match_curr and match_priv arrays for now.
+	 */
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (MDEG_SUCCESS);
+}
+
+/*
+ * Add a new port to the system.
+ *
+ * Reads the port's id, channel id (from the first channel endpoint
+ * node found under the port node) and remote mac address out of the
+ * MD, then attaches the port via vsw_port_attach().
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+vsw_port_add(vsw_t *vswp, md_t *mdp, mde_cookie_t *node)
+{
+	uint64_t		ldc_id;
+	uint8_t			*addrp;
+	int			i, addrsz;
+	int			num_nodes = 0, nchan = 0;
+	int			listsz = 0;
+	mde_cookie_t		*listp = NULL;
+	struct ether_addr	ea;
+	uint64_t		macaddr;
+	uint64_t		inst = 0;
+	vsw_port_t		*port;
+
+	if (md_get_prop_val(mdp, *node, id_propname, &inst)) {
+		DWARN(vswp, "%s: prop(%s) not found", __func__,
+			id_propname);
+		return (1);
+	}
+
+	/*
+	 * Find the channel endpoint node(s) (which should be under this
+	 * port node) which contain the channel id(s).
+	 */
+	if ((num_nodes = md_node_count(mdp)) <= 0) {
+		DERR(vswp, "%s: invalid number of nodes found (%d)",
+			__func__, num_nodes);
+		return (1);
+	}
+
+	/* allocate enough space for node list */
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+
+	nchan = md_scan_dag(mdp, *node,
+		md_find_name(mdp, chan_propname),
+		md_find_name(mdp, "fwd"), listp);
+
+	if (nchan <= 0) {
+		DWARN(vswp, "%s: no %s nodes found", __func__, chan_propname);
+		kmem_free(listp, listsz);
+		return (1);
+	}
+
+	D2(vswp, "%s: %d %s nodes found", __func__, nchan, chan_propname);
+
+	/* use property from first node found */
+	if (md_get_prop_val(mdp, listp[0], id_propname, &ldc_id)) {
+		DWARN(vswp, "%s: prop(%s) not found\n", __func__,
+			id_propname);
+		kmem_free(listp, listsz);
+		return (1);
+	}
+
+	/* don't need list any more */
+	kmem_free(listp, listsz);
+
+	D2(vswp, "%s: ldc_id 0x%llx", __func__, ldc_id);
+
+	/* read mac-address property */
+	if (md_get_prop_data(mdp, *node, remaddr_propname,
+					&addrp, &addrsz)) {
+		DWARN(vswp, "%s: prop(%s) not found",
+				__func__, remaddr_propname);
+		return (1);
+	}
+
+	if (addrsz < ETHERADDRL) {
+		DWARN(vswp, "%s: invalid address size", __func__);
+		return (1);
+	}
+
+	/*
+	 * NOTE(review): this reads 8 bytes from addrp although only
+	 * addrsz >= ETHERADDRL (6) has been checked, and assumes addrp
+	 * is 8-byte aligned — presumably guaranteed by the MD encoding
+	 * of value properties; confirm against the MD layout.
+	 */
+	macaddr = *((uint64_t *)addrp);
+	D2(vswp, "%s: remote mac address 0x%llx", __func__, macaddr);
+
+	/* unpack the low-order 6 bytes, network order, into ea */
+	for (i = ETHERADDRL - 1; i >= 0; i--) {
+		ea.ether_addr_octet[i] = macaddr & 0xFF;
+		macaddr >>= 8;
+	}
+
+	if (vsw_port_attach(vswp, (int)inst, &ldc_id, 1, &ea) != 0) {
+		DERR(vswp, "%s: failed to attach port", __func__);
+		return (1);
+	}
+
+	port = vsw_lookup_port(vswp, (int)inst);
+
+	/* just successfuly created the port, so it should exist */
+	ASSERT(port != NULL);
+
+	return (0);
+}
+
+/*
+ * Attach the specified port.
+ *
+ * Allocates and initialises a vsw_port_t, attaches up to
+ * VSW_PORT_MAX_LDCS of the supplied channel ids under it, creates the
+ * port's FDB entry, links the port into the instance's port list and
+ * finally brings up its channels.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_port_attach(vsw_t *vswp, int p_instance, uint64_t *ldcids, int nids,
+struct ether_addr *macaddr)
+{
+	vsw_port_list_t		*plist = &vswp->plist;
+	vsw_port_t		*port, **prev_port;
+	int			i;
+
+	D1(vswp, "%s: enter : port %d", __func__, p_instance);
+
+	/* port already exists? */
+	READ_ENTER(&plist->lockrw);
+	for (port = plist->head; port != NULL; port = port->p_next) {
+		if (port->p_instance == p_instance) {
+			DWARN(vswp, "%s: port instance %d already attached",
+				__func__, p_instance);
+			RW_EXIT(&plist->lockrw);
+			return (1);
+		}
+	}
+	RW_EXIT(&plist->lockrw);
+
+	port = kmem_zalloc(sizeof (vsw_port_t), KM_SLEEP);
+	port->p_vswp = vswp;
+	port->p_instance = p_instance;
+	port->p_ldclist.num_ldcs = 0;
+	port->p_ldclist.head = NULL;
+
+	rw_init(&port->p_ldclist.lockrw, NULL, RW_DRIVER, NULL);
+
+	mutex_init(&port->tx_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&port->mca_lock, NULL, MUTEX_DRIVER, NULL);
+
+	mutex_init(&port->ref_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&port->ref_cv, NULL, CV_DRIVER, NULL);
+
+	mutex_init(&port->state_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&port->state_cv, NULL, CV_DRIVER, NULL);
+	port->state = VSW_PORT_INIT;
+
+	/* silently clamp to the per-port channel limit */
+	if (nids > VSW_PORT_MAX_LDCS) {
+		D2(vswp, "%s: using first of %d ldc ids",
+			__func__, nids);
+		nids = VSW_PORT_MAX_LDCS;
+	}
+
+	D2(vswp, "%s: %d nids", __func__, nids);
+	for (i = 0; i < nids; i++) {
+		D2(vswp, "%s: ldcid (%llx)", __func__, (uint64_t)ldcids[i]);
+		if (vsw_ldc_attach(port, (uint64_t)ldcids[i]) != 0) {
+			DERR(vswp, "%s: ldc_attach failed", __func__);
+
+			/*
+			 * NOTE(review): channels attached in earlier
+			 * iterations are not detached before the port is
+			 * freed — fine while nids is always 1 (see
+			 * vsw_port_add), but a leak if that ever changes.
+			 */
+			rw_destroy(&port->p_ldclist.lockrw);
+
+			cv_destroy(&port->ref_cv);
+			mutex_destroy(&port->ref_lock);
+
+			cv_destroy(&port->state_cv);
+			mutex_destroy(&port->state_lock);
+
+			mutex_destroy(&port->tx_lock);
+			mutex_destroy(&port->mca_lock);
+			kmem_free(port, sizeof (vsw_port_t));
+			return (1);
+		}
+	}
+
+	ether_copy(macaddr, &port->p_macaddr);
+
+	WRITE_ENTER(&plist->lockrw);
+
+	/* create the fdb entry for this port/mac address */
+	(void) vsw_add_fdb(vswp, port);
+
+	/* link it into the list of ports for this vsw instance */
+	prev_port = (vsw_port_t **)(&plist->head);
+	port->p_next = *prev_port;
+	*prev_port = port;
+	plist->num_ports++;
+	RW_EXIT(&plist->lockrw);
+
+	/*
+	 * Initialise the port and any ldc's under it.
+	 */
+	(void) vsw_init_ldcs(port);
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Detach the specified port.
+ *
+ * Unlinks the port from the instance's port list (under the list
+ * write lock), removes its FDB entry and multicast addresses, then
+ * deletes the port itself via vsw_port_delete().
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_port_detach(vsw_t *vswp, int p_instance)
+{
+	vsw_port_t	*port = NULL;
+	vsw_port_list_t	*plist = &vswp->plist;
+
+	D1(vswp, "%s: enter: port id %d", __func__, p_instance);
+
+	WRITE_ENTER(&plist->lockrw);
+
+	if ((port = vsw_lookup_port(vswp, p_instance)) == NULL) {
+		RW_EXIT(&plist->lockrw);
+		return (1);
+	}
+
+	if (vsw_plist_del_node(vswp, port)) {
+		RW_EXIT(&plist->lockrw);
+		return (1);
+	}
+
+	/* Remove the fdb entry for this port/mac address */
+	(void) vsw_del_fdb(vswp, port);
+
+	/* Remove any multicast addresses.. */
+	vsw_del_mcst_port(port);
+
+	/*
+	 * No longer need to hold lock on port list now that we
+	 * have unlinked the target port from the list.
+	 */
+	RW_EXIT(&plist->lockrw);
+
+	if (vsw_port_delete(port)) {
+		return (1);
+	}
+
+	D1(vswp, "%s: exit: p_instance(%d)", __func__, p_instance);
+	return (0);
+}
+
+/*
+ * Detach all active ports.
+ *
+ * Repeatedly unlinks the head of the port list and deletes it; the
+ * list lock is dropped around each vsw_port_delete() call (which can
+ * block) and re-taken before examining the head again.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_detach_ports(vsw_t *vswp)
+{
+	vsw_port_list_t 	*plist = &vswp->plist;
+	vsw_port_t		*port = NULL;
+
+	D1(vswp, "%s: enter", __func__);
+
+	WRITE_ENTER(&plist->lockrw);
+
+	while ((port = plist->head) != NULL) {
+		if (vsw_plist_del_node(vswp, port)) {
+			DERR(vswp, "%s: Error deleting port %d"
+				" from port list", __func__,
+				port->p_instance);
+			RW_EXIT(&plist->lockrw);
+			return (1);
+		}
+
+		/* Remove the fdb entry for this port/mac address */
+		(void) vsw_del_fdb(vswp, port);
+
+		/* Remove any multicast addresses.. */
+		vsw_del_mcst_port(port);
+
+		/*
+		 * No longer need to hold the lock on the port list
+		 * now that we have unlinked the target port from the
+		 * list.
+		 */
+		RW_EXIT(&plist->lockrw);
+		if (vsw_port_delete(port)) {
+			DERR(vswp, "%s: Error deleting port %d",
+				__func__, port->p_instance);
+			return (1);
+		}
+		WRITE_ENTER(&plist->lockrw);
+	}
+	RW_EXIT(&plist->lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Delete the specified port.
+ *
+ * Quiesces the port's channels, waits for outstanding taskq work,
+ * port references and callbacks to drain, detaches every channel,
+ * and finally frees the port's synchronisation objects and memory.
+ * The port must already have been unlinked from the port list.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_port_delete(vsw_port_t *port)
+{
+	vsw_ldc_list_t 		*ldcl;
+	vsw_t			*vswp = port->p_vswp;
+
+	D1(vswp, "%s: enter : port id %d", __func__, port->p_instance);
+
+	(void) vsw_uninit_ldcs(port);
+
+	/*
+	 * Wait for any pending ctrl msg tasks which reference this
+	 * port to finish.
+	 */
+	if (vsw_drain_port_taskq(port))
+		return (1);
+
+	/*
+	 * Wait for port reference count to hit zero.
+	 */
+	mutex_enter(&port->ref_lock);
+	while (port->ref_cnt != 0)
+		cv_wait(&port->ref_cv, &port->ref_lock);
+	mutex_exit(&port->ref_lock);
+
+	/*
+	 * Wait for any active callbacks to finish
+	 */
+	if (vsw_drain_ldcs(port))
+		return (1);
+
+	/*
+	 * Detach channels one at a time from the head of the list;
+	 * vsw_ldc_detach() is responsible for unlinking the element
+	 * and decrementing num_ldcs. (The original code had a stray
+	 * ';' after the '{' here — removed, it was a no-op empty
+	 * statement.)
+	 */
+	ldcl = &port->p_ldclist;
+	WRITE_ENTER(&ldcl->lockrw);
+	while (ldcl->num_ldcs > 0) {
+		if (vsw_ldc_detach(port, ldcl->head->ldc_id) != 0) {
+			cmn_err(CE_WARN, "unable to detach ldc %ld",
+					ldcl->head->ldc_id);
+			RW_EXIT(&ldcl->lockrw);
+			return (1);
+		}
+	}
+	RW_EXIT(&ldcl->lockrw);
+
+	rw_destroy(&port->p_ldclist.lockrw);
+
+	mutex_destroy(&port->mca_lock);
+	mutex_destroy(&port->tx_lock);
+	cv_destroy(&port->ref_cv);
+	mutex_destroy(&port->ref_lock);
+
+	cv_destroy(&port->state_cv);
+	mutex_destroy(&port->state_lock);
+
+	kmem_free(port, sizeof (vsw_port_t));
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Attach a logical domain channel (ldc) under a specified port.
+ *
+ * Allocates and initialises a vsw_ldc_t, initialises the LDC,
+ * registers vsw_ldc_cb() as its callback and links the new channel
+ * onto the head of the port's channel list.
+ *
+ * Returns 0 on success, 1 on failure (all failure paths fully
+ * release the partially constructed channel — the original code
+ * leaked ldcp and its mutexes/cv when ldc_status() failed).
+ */
+static int
+vsw_ldc_attach(vsw_port_t *port, uint64_t ldc_id)
+{
+	vsw_t 		*vswp = port->p_vswp;
+	vsw_ldc_list_t *ldcl = &port->p_ldclist;
+	vsw_ldc_t 	*ldcp = NULL;
+	ldc_attr_t 	attr;
+	ldc_status_t	istatus;
+	int 		status = DDI_FAILURE;
+
+	D1(vswp, "%s: enter", __func__);
+
+	ldcp = kmem_zalloc(sizeof (vsw_ldc_t), KM_NOSLEEP);
+	if (ldcp == NULL) {
+		DERR(vswp, "%s: kmem_zalloc failed", __func__);
+		return (1);
+	}
+	ldcp->ldc_id = ldc_id;
+
+	mutex_init(&ldcp->ldc_txlock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->ldc_cblock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&ldcp->drain_cv_lock, NULL, MUTEX_DRIVER, NULL);
+	cv_init(&ldcp->drain_cv, NULL, CV_DRIVER, NULL);
+
+	/* required for handshake with peer */
+	ldcp->local_session = (uint64_t)ddi_get_lbolt();
+	ldcp->peer_session = 0;
+	ldcp->session_status = 0;
+
+	mutex_init(&ldcp->hss_lock, NULL, MUTEX_DRIVER, NULL);
+	ldcp->hss_id = 1;	/* Initial handshake session id */
+
+	/* only set for outbound lane, inbound set by peer */
+	vsw_set_lane_attr(vswp, &ldcp->lane_out);
+
+	attr.devclass = LDC_DEV_NT_SVC;
+	attr.instance = ddi_get_instance(vswp->dip);
+	attr.mode = LDC_MODE_UNRELIABLE;
+	attr.qlen = VSW_LDC_QLEN;
+	status = ldc_init(ldc_id, &attr, &ldcp->ldc_handle);
+	if (status != 0) {
+		DERR(vswp, "%s(%lld): ldc_init failed, rv (%d)",
+		    __func__, ldc_id, status);
+		mutex_destroy(&ldcp->ldc_txlock);
+		mutex_destroy(&ldcp->ldc_cblock);
+		cv_destroy(&ldcp->drain_cv);
+		mutex_destroy(&ldcp->drain_cv_lock);
+		mutex_destroy(&ldcp->hss_lock);
+		kmem_free(ldcp, sizeof (vsw_ldc_t));
+		return (1);
+	}
+
+	status = ldc_reg_callback(ldcp->ldc_handle, vsw_ldc_cb, (caddr_t)ldcp);
+	if (status != 0) {
+		DERR(vswp, "%s(%lld): ldc_reg_callback failed, rv (%d)",
+		    __func__, ldc_id, status);
+		mutex_destroy(&ldcp->ldc_txlock);
+		mutex_destroy(&ldcp->ldc_cblock);
+		cv_destroy(&ldcp->drain_cv);
+		mutex_destroy(&ldcp->drain_cv_lock);
+		mutex_destroy(&ldcp->hss_lock);
+		(void) ldc_fini(ldcp->ldc_handle);
+		kmem_free(ldcp, sizeof (vsw_ldc_t));
+		return (1);
+	}
+
+
+	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
+		DERR(vswp, "%s: ldc_status failed", __func__);
+		/*
+		 * Release everything acquired so far (the original
+		 * code simply returned here, leaking ldcp, its
+		 * mutexes/cv and the initialised channel).
+		 */
+		mutex_destroy(&ldcp->ldc_txlock);
+		mutex_destroy(&ldcp->ldc_cblock);
+		cv_destroy(&ldcp->drain_cv);
+		mutex_destroy(&ldcp->drain_cv_lock);
+		mutex_destroy(&ldcp->hss_lock);
+		(void) ldc_unreg_callback(ldcp->ldc_handle);
+		(void) ldc_fini(ldcp->ldc_handle);
+		kmem_free(ldcp, sizeof (vsw_ldc_t));
+		return (1);
+	}
+
+	ldcp->ldc_status = istatus;
+	ldcp->ldc_port = port;
+	ldcp->ldc_vswp = vswp;
+
+	/* link it into the list of channels for this port */
+	WRITE_ENTER(&ldcl->lockrw);
+	ldcp->ldc_next = ldcl->head;
+	ldcl->head = ldcp;
+	ldcl->num_ldcs++;
+	RW_EXIT(&ldcl->lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Detach a logical domain channel (ldc) belonging to a
+ * particular port.
+ *
+ * Caller must hold the channel list write lock. On success the
+ * channel is closed, unlinked from the port's channel list and freed.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+vsw_ldc_detach(vsw_port_t *port, uint64_t ldc_id)
+{
+	vsw_t 		*vswp = port->p_vswp;
+	vsw_ldc_t 	*ldcp;
+	vsw_ldc_t 	**prevp;
+	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
+	int 		rv;
+
+	/*
+	 * Walk the list via a pointer to the link field so the matching
+	 * element can be unlinked in place. The original code never
+	 * updated ldcl->head (it only assigned ldcp->ldc_next to a
+	 * local), so the freed element stayed reachable from the list —
+	 * a use-after-free for the next caller (e.g. vsw_port_delete()'s
+	 * detach loop, which always detaches the current head).
+	 */
+	for (prevp = &ldcl->head; (ldcp = *prevp) != NULL;
+	    prevp = &ldcp->ldc_next) {
+		if (ldcp->ldc_id == ldc_id) {
+			break;
+		}
+	}
+
+	/* specified ldc id not found */
+	if (ldcp == NULL) {
+		DERR(vswp, "%s: ldcp = NULL", __func__);
+		return (1);
+	}
+
+	D2(vswp, "%s: detaching channel %lld", __func__, ldcp->ldc_id);
+
+	/*
+	 * Before we can close the channel we must release any mapped
+	 * resources (e.g. drings).
+	 */
+	vsw_free_lane_resources(ldcp, INBOUND);
+	vsw_free_lane_resources(ldcp, OUTBOUND);
+
+	/*
+	 * If the close fails we are in serious trouble, as won't
+	 * be able to delete the parent port.
+	 */
+	if ((rv = ldc_close(ldcp->ldc_handle)) != 0) {
+		DERR(vswp, "%s: error %d closing channel %lld",
+			__func__, rv, ldcp->ldc_id);
+		return (1);
+	}
+
+	(void) ldc_fini(ldcp->ldc_handle);
+
+	ldcp->ldc_status = LDC_INIT;
+	ldcp->ldc_handle = NULL;
+	ldcp->ldc_vswp = NULL;
+	mutex_destroy(&ldcp->ldc_txlock);
+	mutex_destroy(&ldcp->ldc_cblock);
+	cv_destroy(&ldcp->drain_cv);
+	mutex_destroy(&ldcp->drain_cv_lock);
+	mutex_destroy(&ldcp->hss_lock);
+
+	/* unlink it from the list and free it */
+	*prevp = ldcp->ldc_next;
+	ldcl->num_ldcs--;
+	kmem_free(ldcp, sizeof (vsw_ldc_t));
+
+	return (0);
+}
+
+/*
+ * Open and attempt to bring up the channel. Note that channel
+ * can only be brought up if peer has also opened channel.
+ *
+ * Returns 0 if can open and bring up channel, otherwise
+ * returns 1.
+ */
+static int
+vsw_ldc_init(vsw_ldc_t *ldcp)
+{
+	vsw_t 		*vswp = ldcp->ldc_vswp;
+	ldc_status_t	istatus = 0;
+	int		rv;
+
+	D1(vswp, "%s: enter", __func__);
+
+	LDC_ENTER_LOCK(ldcp);
+
+	/* don't start at 0 in case clients don't like that */
+	ldcp->next_ident = 1;
+
+	rv = ldc_open(ldcp->ldc_handle);
+	if (rv != 0) {
+		DERR(vswp, "%s: ldc_open failed: id(%lld) rv(%d)",
+		    __func__, ldcp->ldc_id, rv);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+	}
+
+	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
+		DERR(vswp, "%s: unable to get status", __func__);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+
+	} else if (istatus != LDC_OPEN && istatus != LDC_READY) {
+		DERR(vswp, "%s: id (%lld) status(%d) is not OPEN/READY",
+		    __func__, ldcp->ldc_id, istatus);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+	}
+
+	ldcp->ldc_status = istatus;
+	rv = ldc_up(ldcp->ldc_handle);
+	if (rv != 0) {
+		/*
+		 * Not a fatal error for ldc_up() to fail, as peer
+		 * end point may simply not be ready yet.
+		 */
+		D2(vswp, "%s: ldc_up err id(%lld) rv(%d)", __func__,
+			ldcp->ldc_id, rv);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+	}
+
+	/*
+	 * ldc_up() call is non-blocking so need to explicitly
+	 * check channel status to see if in fact the channel
+	 * is UP.
+	 */
+	if (ldc_status(ldcp->ldc_handle, &istatus) != 0) {
+		DERR(vswp, "%s: unable to get status", __func__);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+
+	} else if (istatus != LDC_UP) {
+		/* not fatal: peer may bring the channel up later */
+		DERR(vswp, "%s: id(%lld) status(%d) is not UP",
+		    __func__, ldcp->ldc_id, istatus);
+	} else {
+		ldcp->ldc_status = istatus;
+	}
+
+	LDC_EXIT_LOCK(ldcp);
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * disable callbacks on the channel
+ *
+ * Puts the channel's callback into DISABLE mode and resets our cached
+ * status to LDC_INIT. Returns 0 on success, 1 if the mode change fails.
+ */
+static int
+vsw_ldc_uninit(vsw_ldc_t *ldcp)
+{
+	vsw_t	*vswp = ldcp->ldc_vswp;
+	int	rv;
+
+	D1(vswp, "vsw_ldc_uninit: enter: id(%lx)\n", ldcp->ldc_id);
+
+	LDC_ENTER_LOCK(ldcp);
+
+	rv = ldc_set_cb_mode(ldcp->ldc_handle, LDC_CB_DISABLE);
+	if (rv != 0) {
+		DERR(vswp, "vsw_ldc_uninit(%lld): error disabling "
+			"interrupts (rv = %d)\n", ldcp->ldc_id, rv);
+		LDC_EXIT_LOCK(ldcp);
+		return (1);
+	}
+
+	ldcp->ldc_status = LDC_INIT;
+
+	LDC_EXIT_LOCK(ldcp);
+
+	D1(vswp, "vsw_ldc_uninit: exit: id(%lx)", ldcp->ldc_id);
+
+	return (0);
+}
+
+/*
+ * Bring up every channel hanging off the given port.
+ * Per-channel failures are ignored; always returns 0.
+ */
+static int
+vsw_init_ldcs(vsw_port_t *port)
+{
+	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
+	vsw_ldc_t	*ldcp;
+
+	READ_ENTER(&ldcl->lockrw);
+	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next)
+		(void) vsw_ldc_init(ldcp);
+	RW_EXIT(&ldcl->lockrw);
+
+	return (0);
+}
+
+/*
+ * Quiesce every channel hanging off the given port by disabling
+ * its callbacks. Per-channel failures are ignored; always returns 0.
+ */
+static int
+vsw_uninit_ldcs(vsw_port_t *port)
+{
+	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
+	vsw_ldc_t	*ldcp;
+
+	D1(NULL, "vsw_uninit_ldcs: enter\n");
+
+	READ_ENTER(&ldcl->lockrw);
+	for (ldcp = ldcl->head; ldcp != NULL; ldcp = ldcp->ldc_next)
+		(void) vsw_ldc_uninit(ldcp);
+	RW_EXIT(&ldcl->lockrw);
+
+	D1(NULL, "vsw_uninit_ldcs: exit\n");
+
+	return (0);
+}
+
+/*
+ * Wait until the callback(s) associated with the ldcs under the specified
+ * port have completed.
+ *
+ * Prior to this function being invoked each channel under this port
+ * should have been quiesced via ldc_set_cb_mode(DISABLE).
+ *
+ * A short explaination of what we are doing below..
+ *
+ * The simplest approach would be to have a reference counter in
+ * the ldc structure which is increment/decremented by the callbacks as
+ * they use the channel. The drain function could then simply disable any
+ * further callbacks and do a cv_wait for the ref to hit zero. Unfortunately
+ * there is a tiny window here - before the callback is able to get the lock
+ * on the channel it is interrupted and this function gets to execute. It
+ * sees that the ref count is zero and believes its free to delete the
+ * associated data structures.
+ *
+ * We get around this by taking advantage of the fact that before the ldc
+ * framework invokes a callback it sets a flag to indicate that there is a
+ * callback active (or about to become active). If when we attempt to
+ * unregister a callback when this active flag is set then the unregister
+ * will fail with EWOULDBLOCK.
+ *
+ * If the unregister fails we do a cv_timedwait. We will either be signaled
+ * by the callback as it is exiting (note we have to wait a short period to
+ * allow the callback to return fully to the ldc framework and it to clear
+ * the active flag), or by the timer expiring. In either case we again attempt
+ * the unregister. We repeat this until we can succesfully unregister the
+ * callback.
+ *
+ * The reason we use a cv_timedwait rather than a simple cv_wait is to catch
+ * the case where the callback has finished but the ldc framework has not yet
+ * cleared the active flag. In this case we would never get a cv_signal.
+ *
+ * Always returns 0.
+ */
+static int
+vsw_drain_ldcs(vsw_port_t *port)
+{
+	vsw_ldc_list_t	*ldcl = &port->p_ldclist;
+	vsw_ldc_t	*ldcp;
+	vsw_t		*vswp = port->p_vswp;
+
+	D1(vswp, "%s: enter", __func__);
+
+	READ_ENTER(&ldcl->lockrw);
+
+	ldcp = ldcl->head;
+
+	for (; ldcp  != NULL; ldcp = ldcp->ldc_next) {
+		/*
+		 * If we can unregister the channel callback then we
+		 * know that there is no callback either running or
+		 * scheduled to run for this channel so move on to next
+		 * channel in the list.
+		 */
+		mutex_enter(&ldcp->drain_cv_lock);
+
+		/* prompt active callbacks to quit */
+		ldcp->drain_state = VSW_LDC_DRAINING;
+
+		if ((ldc_unreg_callback(ldcp->ldc_handle)) == 0) {
+			D2(vswp, "%s: unreg callback for chan %ld", __func__,
+				ldcp->ldc_id);
+			mutex_exit(&ldcp->drain_cv_lock);
+			continue;
+		} else {
+			/*
+			 * If we end up here we know that either 1) a callback
+			 * is currently executing, 2) is about to start (i.e.
+			 * the ldc framework has set the active flag but
+			 * has not actually invoked the callback yet, or 3)
+			 * has finished and has returned to the ldc framework
+			 * but the ldc framework has not yet cleared the
+			 * active bit.
+			 *
+			 * Wait for it to finish (re-checking roughly once
+			 * a second: lbolt + hz).
+			 */
+			while (ldc_unreg_callback(ldcp->ldc_handle)
+								== EWOULDBLOCK)
+				(void) cv_timedwait(&ldcp->drain_cv,
+					&ldcp->drain_cv_lock, lbolt + hz);
+
+			mutex_exit(&ldcp->drain_cv_lock);
+			D2(vswp, "%s: unreg callback for chan %ld after "
+				"timeout", __func__, ldcp->ldc_id);
+		}
+	}
+	RW_EXIT(&ldcl->lockrw);
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Block until every task already queued against this port has run.
+ *
+ * Each channel under the port must already have been quiesced via
+ * ldc_set_cb_mode(DISABLE).  A marker task is appended to the taskq;
+ * once it executes (flipping the port state to VSW_PORT_DETACHABLE)
+ * no older task can still reference the port.
+ *
+ * Returns 0 on success, 1 if the marker could not be dispatched.
+ */
+static int
+vsw_drain_port_taskq(vsw_port_t *port)
+{
+	vsw_t		*vswp = port->p_vswp;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/*
+	 * Flag the port as detaching before queueing the marker so no
+	 * further tasks referencing it are dispatched behind the marker.
+	 */
+	mutex_enter(&port->state_lock);
+	port->state = VSW_PORT_DETACHING;
+
+	if ((vswp->taskq_p != NULL) &&
+		(ddi_taskq_dispatch(vswp->taskq_p, vsw_marker_task,
+			port, DDI_NOSLEEP) == DDI_SUCCESS)) {
+		/* Sleep until vsw_marker_task() signals completion. */
+		while (port->state != VSW_PORT_DETACHABLE)
+			cv_wait(&port->state_cv, &port->state_lock);
+
+		mutex_exit(&port->state_lock);
+
+		D1(vswp, "%s: exit", __func__);
+
+		return (0);
+	}
+
+	DERR(vswp, "%s: unable to dispatch marker task",
+		__func__);
+	mutex_exit(&port->state_lock);
+	return (1);
+}
+
+/*
+ * Taskq marker routine.  Because taskq entries run in dispatch order,
+ * by the time this executes every task queued ahead of it for this
+ * port has completed, so the port can be flagged as detachable and
+ * the thread blocked in vsw_drain_port_taskq() woken up.
+ */
+static void
+vsw_marker_task(void *arg)
+{
+	vsw_port_t	*portp = arg;
+	vsw_t		*vswp = portp->p_vswp;
+
+	D1(vswp, "%s: enter", __func__);
+
+	mutex_enter(&portp->state_lock);
+
+	/*
+	 * No further tasks referencing this port remain on the queue,
+	 * so it is now safe for the detach path to proceed.
+	 */
+	portp->state = VSW_PORT_DETACHABLE;
+	cv_signal(&portp->state_cv);
+
+	mutex_exit(&portp->state_lock);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * Scan the switch's port list for the port with the given instance
+ * number.  Returns the matching port, or NULL if none exists.
+ */
+static vsw_port_t *
+vsw_lookup_port(vsw_t *vswp, int p_instance)
+{
+	vsw_port_t	*portp = vswp->plist.head;
+
+	while (portp != NULL) {
+		if (portp->p_instance == p_instance) {
+			D2(vswp, "vsw_lookup_port: found p_instance\n");
+			break;
+		}
+		portp = portp->p_next;
+	}
+
+	/* NULL here means we ran off the end without a match */
+	return (portp);
+}
+
+/*
+ * Search for and remove the specified port from the port
+ * list. Returns 0 if able to locate and remove port, otherwise
+ * returns 1.
+ */
+static int
+vsw_plist_del_node(vsw_t *vswp, vsw_port_t *port)
+{
+	vsw_port_list_t *plist = &vswp->plist;
+	vsw_port_t	*curr_p, *prev_p;
+
+	if (plist->head == NULL)
+		return (1);
+
+	curr_p = prev_p = plist->head;
+
+	while (curr_p != NULL) {
+		if (curr_p == port) {
+			/* unlink: head of list vs. interior node */
+			if (prev_p == curr_p) {
+				plist->head = curr_p->p_next;
+			} else {
+				prev_p->p_next = curr_p->p_next;
+			}
+			plist->num_ports--;
+			return (0);
+		}
+		prev_p = curr_p;
+		curr_p = curr_p->p_next;
+	}
+
+	/*
+	 * Port was not on the list.  Previously this path fell through
+	 * and returned 0, contradicting the documented contract above.
+	 */
+	return (1);
+}
+
+/*
+ * Interrupt handler for ldc messages.
+ *
+ * Invoked by the ldc framework with a bitmask of events (UP, READ,
+ * RESET, DOWN) for the channel identified by arg.  On UP the handshake
+ * is (re)started; on READ any queued packets are processed; RESET/DOWN
+ * simply refresh the cached channel status.  Before returning, any
+ * drain thread waiting in vsw_drain_ldcs() is signalled.
+ *
+ * Always returns LDC_SUCCESS.
+ */
+static uint_t
+vsw_ldc_cb(uint64_t event, caddr_t arg)
+{
+	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
+	vsw_t 		*vswp = ldcp->ldc_vswp;
+	ldc_status_t	lstatus;
+	int		rv;
+
+	D1(vswp, "%s: enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
+
+	mutex_enter(&ldcp->ldc_cblock);
+
+	/* ignore events arriving before the channel is fully set up */
+	if ((ldcp->ldc_status == LDC_INIT) || (ldcp->ldc_handle == NULL)) {
+		mutex_exit(&ldcp->ldc_cblock);
+		return (LDC_SUCCESS);
+	}
+
+	if (event & LDC_EVT_UP) {
+		/*
+		 * Channel has come up, get the state and then start
+		 * the handshake.
+		 */
+		rv = ldc_status(ldcp->ldc_handle, &lstatus);
+		if (rv != 0) {
+			cmn_err(CE_WARN, "Unable to read channel state");
+		}
+		ldcp->ldc_status = lstatus;
+
+		D2(vswp, "%s: id(%ld) event(%llx) UP:  status(%ld)",
+			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
+
+		vsw_restart_handshake(ldcp);
+
+		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
+	}
+
+	if (event & LDC_EVT_READ) {
+		/*
+		 * Data available for reading.
+		 */
+		/* fixed format: "id(ld)" was missing the '%' specifier */
+		D2(vswp, "%s: id(%ld) event(%llx) data READ",
+				__func__, ldcp->ldc_id, event);
+
+		vsw_process_pkt(ldcp);
+
+		ASSERT((event & (LDC_EVT_RESET | LDC_EVT_DOWN)) == 0);
+
+		goto vsw_cb_exit;
+	}
+
+	if (event & LDC_EVT_RESET) {
+		rv = ldc_status(ldcp->ldc_handle, &lstatus);
+		if (rv != 0) {
+			cmn_err(CE_WARN, "Unable to read channel state");
+		} else {
+			ldcp->ldc_status = lstatus;
+		}
+		D2(vswp, "%s: id(%ld) event(%llx) RESET:  status (%ld)",
+			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
+	}
+
+	if (event & LDC_EVT_DOWN) {
+		rv = ldc_status(ldcp->ldc_handle, &lstatus);
+		if (rv != 0) {
+			cmn_err(CE_WARN, "Unable to read channel state");
+		} else {
+			ldcp->ldc_status = lstatus;
+		}
+
+		D2(vswp, "%s: id(%ld) event(%llx) DOWN:  status (%ld)",
+			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
+
+	}
+
+	/*
+	 * Catch either LDC_EVT_WRITE which we don't support or any
+	 * unknown event.
+	 */
+	if (event & ~(LDC_EVT_UP | LDC_EVT_RESET
+					| LDC_EVT_DOWN | LDC_EVT_READ)) {
+
+		DERR(vswp, "%s: id(%ld) Unexpected event=(%llx) status(%ld)",
+			__func__, ldcp->ldc_id, event, ldcp->ldc_status);
+	}
+
+vsw_cb_exit:
+	mutex_exit(&ldcp->ldc_cblock);
+
+	/*
+	 * Let the drain function know we are finishing if it
+	 * is waiting.
+	 */
+	mutex_enter(&ldcp->drain_cv_lock);
+	if (ldcp->drain_state == VSW_LDC_DRAINING)
+		cv_signal(&ldcp->drain_cv);
+	mutex_exit(&ldcp->drain_cv_lock);
+
+	return (LDC_SUCCESS);
+}
+
+/*
+ * (Re)start a handshake with our peer by sending them
+ * our version info.
+ *
+ * Tears down all lane state/resources for both directions, invalidates
+ * any queued tasks belonging to the old handshake session (by bumping
+ * hss_id), and then — unless the retry budget is exhausted — sends a
+ * fresh version proposal.
+ */
+static void
+vsw_restart_handshake(vsw_ldc_t *ldcp)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	vsw_port_t	*port;
+	vsw_ldc_list_t	*ldcl;
+
+	D1(vswp, "vsw_restart_handshake: enter");
+
+	port = ldcp->ldc_port;
+	ldcl = &port->p_ldclist;
+
+	WRITE_ENTER(&ldcl->lockrw);
+
+	D2(vswp, "%s: in 0x%llx : out 0x%llx", __func__,
+		ldcp->lane_in.lstate, ldcp->lane_out.lstate);
+
+	vsw_free_lane_resources(ldcp, INBOUND);
+	vsw_free_lane_resources(ldcp, OUTBOUND);
+	RW_EXIT(&ldcl->lockrw);
+
+	/*
+	 * NOTE(review): the lane state words are cleared after lockrw
+	 * has been dropped — confirm no reader relies on seeing the
+	 * clear under the write lock.
+	 */
+	ldcp->lane_in.lstate = 0;
+	ldcp->lane_out.lstate = 0;
+
+	/*
+	 * Remove parent port from any multicast groups
+	 * it may have registered with. Client must resend
+	 * multicast add command after handshake completes.
+	 */
+	(void) vsw_del_fdb(vswp, port);
+
+	vsw_del_mcst_port(port);
+
+	/* back to the very start of the handshake state machine */
+	ldcp->hphase = VSW_MILESTONE0;
+
+	ldcp->peer_session = 0;
+	ldcp->session_status = 0;
+
+	/*
+	 * We now increment the transaction group id. This allows
+	 * us to identify and discard any tasks which are still pending
+	 * on the taskq and refer to the handshake session we are about
+	 * to restart. These stale messages no longer have any real
+	 * meaning.
+	 */
+	mutex_enter(&ldcp->hss_lock);
+	ldcp->hss_id++;
+	mutex_exit(&ldcp->hss_lock);
+
+	/* give up (silently, beyond the warning) after too many retries */
+	if (ldcp->hcnt++ > vsw_num_handshakes) {
+		cmn_err(CE_WARN, "exceeded number of permitted "
+			"handshake attempts (%d) on channel %ld",
+			ldcp->hcnt, ldcp->ldc_id);
+		return;
+	}
+
+	vsw_send_ver(ldcp);
+
+	D1(vswp, "vsw_restart_handshake: exit");
+}
+
+/*
+ * Returns 0 if it is legal for the event signified by flag to have
+ * occurred at the time it did. Otherwise returns 1.
+ *
+ * dir selects which lane's state word is validated and updated
+ * (INBOUND -> lane_in, otherwise lane_out).  For ACK/NACK events the
+ * matching *_INFO_SENT bit is also cleared, since the outstanding INFO
+ * has now been answered.  Any protocol violation restarts the
+ * handshake before returning 1.
+ */
+int
+vsw_check_flag(vsw_ldc_t *ldcp, int dir, uint64_t flag)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	uint64_t	state;
+	uint64_t	phase;
+
+	if (dir == INBOUND)
+		state = ldcp->lane_in.lstate;
+	else
+		state = ldcp->lane_out.lstate;
+
+	phase = ldcp->hphase;
+
+	/* each case checks the event against the current phase/state */
+	switch (flag) {
+	case VSW_VER_INFO_RECV:
+		if (phase > VSW_MILESTONE0) {
+			DERR(vswp, "vsw_check_flag (%d): VER_INFO_RECV"
+				" when in state %d\n", ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		}
+		break;
+
+	case VSW_VER_ACK_RECV:
+	case VSW_VER_NACK_RECV:
+		if (!(state & VSW_VER_INFO_SENT)) {
+			DERR(vswp, "vsw_check_flag (%d): spurious VER_ACK"
+				" or VER_NACK when in state %d\n",
+				ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		} else
+			state &= ~VSW_VER_INFO_SENT;
+		break;
+
+	case VSW_ATTR_INFO_RECV:
+		if ((phase < VSW_MILESTONE1) || (phase >= VSW_MILESTONE2)) {
+			DERR(vswp, "vsw_check_flag (%d): ATTR_INFO_RECV"
+				" when in state %d\n", ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		}
+		break;
+
+	case VSW_ATTR_ACK_RECV:
+	case VSW_ATTR_NACK_RECV:
+		if (!(state & VSW_ATTR_INFO_SENT)) {
+			DERR(vswp, "vsw_check_flag (%d): spurious ATTR_ACK"
+				" or ATTR_NACK when in state %d\n",
+				ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		} else
+			state &= ~VSW_ATTR_INFO_SENT;
+		break;
+
+	case VSW_DRING_INFO_RECV:
+		if (phase < VSW_MILESTONE1) {
+			DERR(vswp, "vsw_check_flag (%d): DRING_INFO_RECV"
+				" when in state %d\n", ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		}
+		break;
+
+	case VSW_DRING_ACK_RECV:
+	case VSW_DRING_NACK_RECV:
+		if (!(state & VSW_DRING_INFO_SENT)) {
+			DERR(vswp, "vsw_check_flag (%d): spurious DRING_ACK"
+				" or DRING_NACK when in state %d\n",
+				ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		} else
+			state &= ~VSW_DRING_INFO_SENT;
+		break;
+
+	case VSW_RDX_INFO_RECV:
+		if (phase < VSW_MILESTONE3) {
+			DERR(vswp, "vsw_check_flag (%d): RDX_INFO_RECV"
+				" when in state %d\n", ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		}
+		break;
+
+	case VSW_RDX_ACK_RECV:
+	case VSW_RDX_NACK_RECV:
+		if (!(state & VSW_RDX_INFO_SENT)) {
+			DERR(vswp, "vsw_check_flag (%d): spurious RDX_ACK"
+				" or RDX_NACK when in state %d\n",
+				ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		} else
+			state &= ~VSW_RDX_INFO_SENT;
+		break;
+
+	case VSW_MCST_INFO_RECV:
+		if (phase < VSW_MILESTONE3) {
+			DERR(vswp, "vsw_check_flag (%d): VSW_MCST_INFO_RECV"
+				" when in state %d\n", ldcp->ldc_id, phase);
+			vsw_restart_handshake(ldcp);
+			return (1);
+		}
+		break;
+
+	default:
+		DERR(vswp, "vsw_check_flag (%lld): unknown flag (%llx)",
+				ldcp->ldc_id, flag);
+		return (1);
+	}
+
+	/* event was legal; write back the (possibly updated) lane state */
+	if (dir == INBOUND)
+		ldcp->lane_in.lstate = state;
+	else
+		ldcp->lane_out.lstate = state;
+
+	D1(vswp, "vsw_check_flag (chan %lld): exit", ldcp->ldc_id);
+
+	return (0);
+}
+
+/*
+ * Advance the handshake state machine for this channel if the
+ * conditions required to leave the current milestone have been met,
+ * triggering the next exchange (attr info, dring info, RDX) as the
+ * phase changes.  MILESTONE4 means the handshake is complete.
+ */
+void
+vsw_next_milestone(vsw_ldc_t *ldcp)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "%s (chan %lld): enter (phase %ld)", __func__,
+		ldcp->ldc_id, ldcp->hphase);
+
+	DUMP_FLAGS(ldcp->lane_in.lstate);
+	DUMP_FLAGS(ldcp->lane_out.lstate);
+
+	switch (ldcp->hphase) {
+
+	case VSW_MILESTONE0:
+		/*
+		 * If we haven't started to handshake with our peer,
+		 * start to do so now.
+		 */
+		if (ldcp->lane_out.lstate == 0) {
+			D2(vswp, "%s: (chan %lld) starting handshake "
+				"with peer", __func__, ldcp->ldc_id);
+			vsw_restart_handshake(ldcp);
+		}
+
+		/*
+		 * Only way to pass this milestone is to have successfully
+		 * negotiated version info.
+		 */
+		if ((ldcp->lane_in.lstate & VSW_VER_ACK_SENT) &&
+			(ldcp->lane_out.lstate & VSW_VER_ACK_RECV)) {
+
+			D2(vswp, "%s: (chan %lld) leaving milestone 0",
+				__func__, ldcp->ldc_id);
+
+			/*
+			 * Next milestone is passed when attribute
+			 * information has been successfully exchanged.
+			 */
+			ldcp->hphase = VSW_MILESTONE1;
+			vsw_send_attr(ldcp);
+
+		}
+		break;
+
+	case VSW_MILESTONE1:
+		/*
+		 * Only way to pass this milestone is to have successfully
+		 * negotiated attribute information.
+		 */
+		if (ldcp->lane_in.lstate & VSW_ATTR_ACK_SENT) {
+
+			ldcp->hphase = VSW_MILESTONE2;
+
+			/*
+			 * If the peer device has said it wishes to
+			 * use descriptor rings then we send it our ring
+			 * info, otherwise we just set up a private ring
+			 * with which we use an internal buffer.
+			 */
+			if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE)
+				vsw_send_dring_info(ldcp);
+		}
+		break;
+
+
+	case VSW_MILESTONE2:
+		/*
+		 * If peer has indicated in its attribute message that
+		 * it wishes to use descriptor rings then the only way
+		 * to pass this milestone is for us to have received
+		 * valid dring info.
+		 *
+		 * If peer is not using descriptor rings then just fall
+		 * through.
+		 */
+		if ((ldcp->lane_in.xfer_mode == VIO_DRING_MODE) &&
+			(!(ldcp->lane_in.lstate & VSW_DRING_ACK_SENT)))
+			break;
+
+		D2(vswp, "%s: (chan %lld) leaving milestone 2",
+				__func__, ldcp->ldc_id);
+
+		ldcp->hphase = VSW_MILESTONE3;
+		vsw_send_rdx(ldcp);
+		break;
+
+	case VSW_MILESTONE3:
+		/*
+		 * Pass this milestone when all parameters have been
+		 * successfully exchanged and RDX sent in both directions.
+		 *
+		 * Mark outbound lane as available to transmit data.
+		 */
+		if ((ldcp->lane_in.lstate & VSW_RDX_ACK_SENT) &&
+			(ldcp->lane_out.lstate & VSW_RDX_ACK_RECV)) {
+
+			D2(vswp, "%s: (chan %lld) leaving milestone 3",
+				__func__, ldcp->ldc_id);
+			D2(vswp, "%s: ** handshake complete **", __func__);
+			ldcp->lane_out.lstate |= VSW_LANE_ACTIVE;
+			ldcp->hphase = VSW_MILESTONE4;
+			/* reset handshake retry counter on success */
+			ldcp->hcnt = 0;
+			DISPLAY_STATE();
+		}
+		break;
+
+	case VSW_MILESTONE4:
+		/* handshake already complete; nothing further to do */
+		D2(vswp, "%s: (chan %lld) in milestone 4", __func__,
+							ldcp->ldc_id);
+		break;
+
+	default:
+		DERR(vswp, "%s: (chan %lld) Unknown Phase %x", __func__,
+			ldcp->ldc_id, ldcp->hphase);
+	}
+
+	D1(vswp, "%s (chan %lld): exit (phase %ld)", __func__, ldcp->ldc_id,
+		ldcp->hphase);
+}
+
+/*
+ * Check if major version is supported.
+ *
+ * Returns 0 if a matching major number is found in vsw_versions[],
+ * clamping the minor field down to our maximum if necessary.
+ *
+ * Returns 1 if the major number cannot be matched exactly; the minor
+ * field is clamped against the next lower supported entry, or both
+ * fields are zeroed if no lower entry exists.
+ */
+static int
+vsw_supported_version(vio_ver_msg_t *vp)
+{
+	int	i;
+
+	D1(NULL, "vsw_supported_version: enter");
+
+	/*
+	 * Walk the table until we hit the first entry whose major
+	 * number is <= the requested one (table is ordered from
+	 * highest to lowest major).
+	 */
+	for (i = 0; i < VSW_NUM_VER; i++) {
+		if (vsw_versions[i].ver_major > vp->ver_major)
+			continue;
+
+		/* clamp the minor number to the highest we support */
+		if (vp->ver_minor > vsw_versions[i].ver_minor) {
+			D2(NULL, "%s: adjusting minor value"
+				" from %d to %d", __func__,
+				vp->ver_minor,
+				vsw_versions[i].ver_minor);
+			vp->ver_minor = vsw_versions[i].ver_minor;
+		}
+
+		/* exact major match -> supported; lower match -> NACK case */
+		return ((vsw_versions[i].ver_major == vp->ver_major) ? 0 : 1);
+	}
+
+	/* No match was possible, zero out fields */
+	vp->ver_major = 0;
+	vp->ver_minor = 0;
+
+	D1(NULL, "vsw_supported_version: exit");
+
+	return (1);
+}
+
+/*
+ * Main routine for processing messages received over LDC.
+ *
+ * Drains the channel: reads packets until ldc_read() returns an error
+ * or no more data, dispatching each by its VIO message-tag type
+ * (CTRL via taskq, DATA and ERR in-line).
+ */
+static void
+vsw_process_pkt(void *arg)
+{
+	vsw_ldc_t	*ldcp = (vsw_ldc_t  *)arg;
+	vsw_t 		*vswp = ldcp->ldc_vswp;
+	size_t		msglen;
+	vio_msg_tag_t	tag;
+	def_msg_t	dmsg;
+	int 		rv = 0;
+
+	D1(vswp, "%s enter: ldcid (%lld)\n", __func__, ldcp->ldc_id);
+
+	/*
+	 * If channel is up read messages until channel is empty.
+	 */
+	do {
+		msglen = sizeof (dmsg);
+		rv = ldc_read(ldcp->ldc_handle, (caddr_t)&dmsg, &msglen);
+
+		if (rv != 0) {
+			DERR(vswp, "%s :ldc_read err id(%lld) rv(%d) "
+				"len(%d)\n", __func__, ldcp->ldc_id,
+							rv, msglen);
+			break;
+		}
+
+		/* a zero-length read means the channel is empty */
+		if (msglen == 0) {
+			D2(vswp, "%s: ldc_read id(%lld) NODATA", __func__,
+			    ldcp->ldc_id);
+			break;
+		}
+
+		D2(vswp, "%s: ldc_read id(%lld): msglen(%d)", __func__,
+		    ldcp->ldc_id, msglen);
+
+		/*
+		 * Figure out what sort of packet we have gotten by
+		 * examining the msg tag, and then switch it appropriately.
+		 */
+		bcopy(&dmsg, &tag, sizeof (vio_msg_tag_t));
+
+		switch (tag.vio_msgtype) {
+		case VIO_TYPE_CTRL:
+			vsw_dispatch_ctrl_task(ldcp, &dmsg, tag);
+			break;
+		case VIO_TYPE_DATA:
+			vsw_process_data_pkt(ldcp, &dmsg, tag);
+			break;
+		case VIO_TYPE_ERR:
+			vsw_process_err_pkt(ldcp, &dmsg, tag);
+			break;
+		default:
+			/*
+			 * Fixed format-string bug: "id(%lx)\n" was
+			 * previously passed as a vararg rather than
+			 * being part of the format, so the conversion
+			 * specifiers did not match their arguments.
+			 */
+			DERR(vswp, "%s: Unknown tag(%lx) id(%lx)\n",
+				__func__, tag.vio_msgtype, ldcp->ldc_id);
+			break;
+		}
+	} while (msglen);
+
+	D1(vswp, "%s exit: ldcid (%lld)\n", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Dispatch a task to process a VIO control message.
+ *
+ * RDX ACKs are handled in-line (a legitimate data packet may follow
+ * immediately); everything else is copied into a vsw_ctrl_task_t,
+ * stamped with the current handshake session id, and queued for
+ * vsw_process_ctrl_pkt() — unless the port is being detached, in
+ * which case the message is dropped.
+ */
+static void
+vsw_dispatch_ctrl_task(vsw_ldc_t *ldcp, void *cpkt, vio_msg_tag_t tag)
+{
+	vsw_ctrl_task_t		*ctaskp = NULL;
+	vsw_port_t		*port = ldcp->ldc_port;
+	vsw_t			*vswp = port->p_vswp;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/*
+	 * We need to handle RDX ACK messages in-band as once they
+	 * are exchanged it is possible that we will get an
+	 * immediate (legitimate) data packet.
+	 */
+	if ((tag.vio_subtype_env == VIO_RDX) &&
+		(tag.vio_subtype == VIO_SUBTYPE_ACK)) {
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_ACK_RECV))
+			return;
+
+		ldcp->lane_out.lstate |= VSW_RDX_ACK_RECV;
+		vsw_next_milestone(ldcp);
+		D2(vswp, "%s (%ld) handling RDX_ACK in place", __func__,
+			ldcp->ldc_id);
+		return;
+	}
+
+	ctaskp = kmem_alloc(sizeof (vsw_ctrl_task_t), KM_NOSLEEP);
+
+	if (ctaskp == NULL) {
+		DERR(vswp, "%s: unable to alloc space for ctrl"
+			" msg", __func__);
+		vsw_restart_handshake(ldcp);
+		return;
+	}
+
+	ctaskp->ldcp = ldcp;
+	bcopy((def_msg_t *)cpkt, &ctaskp->pktp, sizeof (def_msg_t));
+	/* stamp with session id so stale tasks can be discarded later */
+	mutex_enter(&ldcp->hss_lock);
+	ctaskp->hss_id = ldcp->hss_id;
+	mutex_exit(&ldcp->hss_lock);
+
+	/*
+	 * Dispatch task to processing taskq if port is not in
+	 * the process of being detached.
+	 */
+	mutex_enter(&port->state_lock);
+	if (port->state == VSW_PORT_INIT) {
+		if ((vswp->taskq_p == NULL) ||
+			(ddi_taskq_dispatch(vswp->taskq_p,
+			vsw_process_ctrl_pkt, ctaskp, DDI_NOSLEEP)
+							!= DDI_SUCCESS)) {
+			DERR(vswp, "%s: unable to dispatch task to taskq",
+				__func__);
+			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
+			mutex_exit(&port->state_lock);
+			vsw_restart_handshake(ldcp);
+			return;
+		}
+	} else {
+		DWARN(vswp, "%s: port %d detaching, not dispatching "
+			"task", __func__, port->p_instance);
+		/*
+		 * Task was never handed to the taskq, so free it here;
+		 * previously this path leaked the allocation.
+		 */
+		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
+	}
+
+	mutex_exit(&port->state_lock);
+
+	D2(vswp, "%s: dispatched task to taskq for chan %d", __func__,
+			ldcp->ldc_id);
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * Process a VIO ctrl message. Invoked from taskq.
+ *
+ * Discards packets stamped with an old handshake session id, validates
+ * the peer session id, then dispatches on the vio_subtype_env to the
+ * appropriate control-message handler.  The task structure is freed on
+ * every exit path.
+ */
+static void
+vsw_process_ctrl_pkt(void *arg)
+{
+	vsw_ctrl_task_t	*ctaskp = (vsw_ctrl_task_t *)arg;
+	vsw_ldc_t	*ldcp = ctaskp->ldcp;
+	vsw_t 		*vswp = ldcp->ldc_vswp;
+	vio_msg_tag_t	tag;
+	uint16_t	env;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	bcopy(&ctaskp->pktp, &tag, sizeof (vio_msg_tag_t));
+	env = tag.vio_subtype_env;
+
+	/* stale pkt check */
+	mutex_enter(&ldcp->hss_lock);
+	if (ctaskp->hss_id < ldcp->hss_id) {
+		DWARN(vswp, "%s: discarding stale packet belonging to"
+			" earlier (%ld) handshake session", __func__,
+			ctaskp->hss_id);
+		mutex_exit(&ldcp->hss_lock);
+		/* previously leaked here: nobody else frees the task */
+		kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
+		return;
+	}
+	mutex_exit(&ldcp->hss_lock);
+
+	/* session id check */
+	if (ldcp->session_status & VSW_PEER_SESSION) {
+		if (ldcp->peer_session != tag.vio_sid) {
+			DERR(vswp, "%s (chan %d): invalid session id (%llx)",
+				__func__, ldcp->ldc_id, tag.vio_sid);
+			kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
+			vsw_restart_handshake(ldcp);
+			return;
+		}
+	}
+
+	/*
+	 * Switch on vio_subtype envelope, then let lower routines
+	 * decide if its an INFO, ACK or NACK packet.
+	 */
+	switch (env) {
+	case VIO_VER_INFO:
+		vsw_process_ctrl_ver_pkt(ldcp, &ctaskp->pktp);
+		break;
+	case VIO_DRING_REG:
+		vsw_process_ctrl_dring_reg_pkt(ldcp, &ctaskp->pktp);
+		break;
+	case VIO_DRING_UNREG:
+		vsw_process_ctrl_dring_unreg_pkt(ldcp, &ctaskp->pktp);
+		break;
+	case VIO_ATTR_INFO:
+		vsw_process_ctrl_attr_pkt(ldcp, &ctaskp->pktp);
+		break;
+	case VNET_MCAST_INFO:
+		vsw_process_ctrl_mcst_pkt(ldcp, &ctaskp->pktp);
+		break;
+	case VIO_RDX:
+		vsw_process_ctrl_rdx_pkt(ldcp, &ctaskp->pktp);
+		break;
+	default:
+		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
+							__func__, env);
+	}
+
+	kmem_free(ctaskp, sizeof (vsw_ctrl_task_t));
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Version negotiation. We can end up here either because our peer
+ * has responded to a handshake message we have sent it, or our peer
+ * has initiated a handshake with us. If its the former then can only
+ * be ACK or NACK, if its the latter can only be INFO.
+ *
+ * If its an ACK we move to the next stage of the handshake, namely
+ * attribute exchange. If its a NACK we see if we can specify another
+ * version, if we can't we stop.
+ *
+ * If it is an INFO we reset all params associated with communication
+ * in that direction over this channel (remember connection is
+ * essentially 2 independent simplex channels).
+ */
+void
+vsw_process_ctrl_ver_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vio_ver_msg_t	*ver_pkt;
+	vsw_t 		*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * We know this is a ctrl/version packet so
+	 * cast it into the correct structure.
+	 */
+	ver_pkt = (vio_ver_msg_t *)pkt;
+
+	switch (ver_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "vsw_process_ctrl_ver_pkt: VIO_SUBTYPE_INFO\n");
+
+		/*
+		 * Record the session id, which we will use from now
+		 * until we see another VER_INFO msg. Even then the
+		 * session id in most cases will be unchanged, except
+		 * if channel was reset.
+		 */
+		if ((ldcp->session_status & VSW_PEER_SESSION) &&
+			(ldcp->peer_session != ver_pkt->tag.vio_sid)) {
+			DERR(vswp, "%s: updating session id for chan %lld "
+				"from %llx to %llx", __func__, ldcp->ldc_id,
+				ldcp->peer_session, ver_pkt->tag.vio_sid);
+		}
+
+		ldcp->peer_session = ver_pkt->tag.vio_sid;
+		ldcp->session_status |= VSW_PEER_SESSION;
+
+		/* Legal message at this time ? */
+		if (vsw_check_flag(ldcp, INBOUND, VSW_VER_INFO_RECV))
+			return;
+
+		/*
+		 * First check the device class. Currently only expect
+		 * to be talking to a network device. In the future may
+		 * also talk to another switch.
+		 */
+		if (ver_pkt->dev_class != VDEV_NETWORK) {
+			DERR(vswp, "%s: illegal device class %d", __func__,
+				ver_pkt->dev_class);
+
+			/* NACK: reuse the packet, flipping the subtype */
+			ver_pkt->tag.vio_sid = ldcp->local_session;
+			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
+
+			vsw_send_msg(ldcp, (void *)ver_pkt,
+					sizeof (vio_ver_msg_t));
+
+			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
+			vsw_next_milestone(ldcp);
+			return;
+		} else {
+			ldcp->dev_class = ver_pkt->dev_class;
+		}
+
+		/*
+		 * Now check the version.
+		 */
+		if (vsw_supported_version(ver_pkt) == 0) {
+			/*
+			 * Support this major version and possibly
+			 * adjusted minor version.
+			 */
+
+			D2(vswp, "%s: accepted ver %d:%d", __func__,
+				ver_pkt->ver_major, ver_pkt->ver_minor);
+
+			/* Store accepted values */
+			ldcp->lane_in.ver_major = ver_pkt->ver_major;
+			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
+
+			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+
+			ldcp->lane_in.lstate |= VSW_VER_ACK_SENT;
+		} else {
+			/*
+			 * NACK back with the next lower major/minor
+			 * pairing we support (if we don't support any more
+			 * versions then they will be set to zero).
+			 */
+
+			D2(vswp, "%s: replying with ver %d:%d", __func__,
+				ver_pkt->ver_major, ver_pkt->ver_minor);
+
+			/* Store updated values */
+			ldcp->lane_in.ver_major = ver_pkt->ver_major;
+			ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
+
+			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			ldcp->lane_in.lstate |= VSW_VER_NACK_SENT;
+		}
+
+		DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
+		ver_pkt->tag.vio_sid = ldcp->local_session;
+		vsw_send_msg(ldcp, (void *)ver_pkt, sizeof (vio_ver_msg_t));
+
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		D2(vswp, "%s: VIO_SUBTYPE_ACK\n", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_ACK_RECV))
+			return;
+
+		/* Store updated values */
+		/*
+		 * NOTE(review): this stores into lane_in, but the ACK
+		 * answers a proposal WE sent on the outbound lane; the
+		 * NACK case below updates lane_out — confirm whether
+		 * lane_out was intended here.
+		 */
+		ldcp->lane_in.ver_major = ver_pkt->ver_major;
+		ldcp->lane_in.ver_minor = ver_pkt->ver_minor;
+
+
+		ldcp->lane_out.lstate |= VSW_VER_ACK_RECV;
+		vsw_next_milestone(ldcp);
+
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		D2(vswp, "%s: VIO_SUBTYPE_NACK\n", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_VER_NACK_RECV))
+			return;
+
+		/*
+		 * If our peer sent us a NACK with the ver fields set to
+		 * zero then there is nothing more we can do. Otherwise see
+		 * if we support either the version suggested, or a lesser
+		 * one.
+		 */
+		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
+			DERR(vswp, "%s: peer unable to negotiate any "
+				"further.", __func__);
+			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
+			vsw_next_milestone(ldcp);
+			return;
+		}
+
+		/*
+		 * Check to see if we support this major version or
+		 * a lower one. If we don't then maj/min will be set
+		 * to zero.
+		 */
+		(void) vsw_supported_version(ver_pkt);
+		if ((ver_pkt->ver_major == 0) && (ver_pkt->ver_minor == 0)) {
+			/* Nothing more we can do */
+			DERR(vswp, "%s: version negotiation failed.\n",
+								__func__);
+			ldcp->lane_out.lstate |= VSW_VER_NACK_RECV;
+			vsw_next_milestone(ldcp);
+		} else {
+			/* found a supported major version */
+			ldcp->lane_out.ver_major = ver_pkt->ver_major;
+			ldcp->lane_out.ver_minor = ver_pkt->ver_minor;
+
+			D2(vswp, "%s: resending with updated values (%x, %x)",
+				__func__, ver_pkt->ver_major,
+				ver_pkt->ver_minor);
+
+			ldcp->lane_out.lstate |= VSW_VER_INFO_SENT;
+			ver_pkt->tag.vio_sid = ldcp->local_session;
+			ver_pkt->tag.vio_subtype = VIO_SUBTYPE_INFO;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)ver_pkt);
+
+			vsw_send_msg(ldcp, (void *)ver_pkt,
+					sizeof (vio_ver_msg_t));
+
+			vsw_next_milestone(ldcp);
+
+		}
+		break;
+
+	default:
+		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
+			ver_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Process an attribute packet. We can end up here either because our peer
+ * has ACK/NACK'ed back to an earlier ATTR msg we had sent it, or our
+ * peer has sent us an attribute INFO message
+ *
+ * If its an ACK we then move to the next stage of the handshake which
+ * is to send our descriptor ring info to our peer. If its a NACK then
+ * there is nothing more we can (currently) do.
+ *
+ * If we get a valid/acceptable INFO packet (and we have already negotiated
+ * a version) we ACK back and set channel state to ATTR_RECV, otherwise we
+ * NACK back and reset channel state to INACTIVE.
+ *
+ * FUTURE: in time we will probably negotiate over attributes, but for
+ * the moment unacceptable attributes are regarded as a fatal error.
+ *
+ */
+void
+vsw_process_ctrl_attr_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vnet_attr_msg_t		*attr_pkt;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	vsw_port_t		*port = ldcp->ldc_port;
+	uint64_t		macaddr = 0;
+	int			i;
+
+	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * We know this is a ctrl/attr packet so
+	 * cast it into the correct structure.
+	 */
+	attr_pkt = (vnet_attr_msg_t *)pkt;
+
+	switch (attr_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+
+		if (vsw_check_flag(ldcp, INBOUND, VSW_ATTR_INFO_RECV))
+			return;
+
+		/*
+		 * If the attributes are unacceptable then we NACK back.
+		 */
+		if (vsw_check_attr(attr_pkt, ldcp->ldc_port)) {
+
+			DERR(vswp, "%s (chan %d): invalid attributes",
+				__func__, ldcp->ldc_id);
+
+			vsw_free_lane_resources(ldcp, INBOUND);
+
+			/* NACK: reuse the packet, flipping the subtype */
+			attr_pkt->tag.vio_sid = ldcp->local_session;
+			attr_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
+			ldcp->lane_in.lstate |= VSW_ATTR_NACK_SENT;
+			vsw_send_msg(ldcp, (void *)attr_pkt,
+					sizeof (vnet_attr_msg_t));
+
+			vsw_next_milestone(ldcp);
+			return;
+		}
+
+		/*
+		 * Otherwise store attributes for this lane and update
+		 * lane state.
+		 */
+		ldcp->lane_in.mtu = attr_pkt->mtu;
+		ldcp->lane_in.addr = attr_pkt->addr;
+		ldcp->lane_in.addr_type = attr_pkt->addr_type;
+		ldcp->lane_in.xfer_mode = attr_pkt->xfer_mode;
+		ldcp->lane_in.ack_freq = attr_pkt->ack_freq;
+
+		/* unpack the 64-bit address, low byte last, into octets */
+		macaddr = ldcp->lane_in.addr;
+		for (i = ETHERADDRL - 1; i >= 0; i--) {
+			port->p_macaddr.ether_addr_octet[i] = macaddr & 0xFF;
+			macaddr >>= 8;
+		}
+
+		/* create the fdb entry for this port/mac address */
+		(void) vsw_add_fdb(vswp, port);
+
+		/* setup device specific xmit routines */
+		mutex_enter(&port->tx_lock);
+		if (ldcp->lane_in.xfer_mode == VIO_DRING_MODE) {
+			D2(vswp, "%s: mode = VIO_DRING_MODE", __func__);
+			port->transmit = vsw_dringsend;
+		} else if (ldcp->lane_in.xfer_mode == VIO_DESC_MODE) {
+			D2(vswp, "%s: mode = VIO_DESC_MODE", __func__);
+			vsw_create_privring(ldcp);
+			port->transmit = vsw_descrsend;
+		}
+		mutex_exit(&port->tx_lock);
+
+		attr_pkt->tag.vio_sid = ldcp->local_session;
+		attr_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+
+		DUMP_TAG_PTR((vio_msg_tag_t *)attr_pkt);
+
+		ldcp->lane_in.lstate |= VSW_ATTR_ACK_SENT;
+
+		vsw_send_msg(ldcp, (void *)attr_pkt,
+					sizeof (vnet_attr_msg_t));
+
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_ACK_RECV))
+			return;
+
+		ldcp->lane_out.lstate |= VSW_ATTR_ACK_RECV;
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_ATTR_NACK_RECV))
+			return;
+
+		/* peer rejected our attributes; handshake stalls here */
+		ldcp->lane_out.lstate |= VSW_ATTR_NACK_RECV;
+		vsw_next_milestone(ldcp);
+		break;
+
+	default:
+		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
+			attr_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Process a dring info packet. We can end up here either because our peer
+ * has ACK/NACK'ed back to an earlier DRING msg we had sent it, or our
+ * peer has sent us a dring INFO message.
+ *
+ * If we get a valid/acceptable INFO packet (and we have already negotiated
+ * a version) we ACK back and update the lane state, otherwise we NACK back.
+ *
+ * FUTURE: nothing to stop client from sending us info on multiple dring's
+ * but for the moment we will just use the first one we are given.
+ *
+ */
+void
+vsw_process_ctrl_dring_reg_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vio_dring_reg_msg_t	*dring_pkt;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	ldc_mem_info_t		minfo;
+	dring_info_t		*dp, *dbp;
+	int			dring_found = 0;
+
+	/*
+	 * We know this is a ctrl/dring packet so
+	 * cast it into the correct structure.
+	 */
+	dring_pkt = (vio_dring_reg_msg_t *)pkt;
+
+	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
+
+	switch (dring_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+
+		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
+			return;
+
+		/*
+		 * If the dring params are unacceptable then we NACK back.
+		 */
+		if (vsw_check_dring_info(dring_pkt)) {
+
+			DERR(vswp, "%s (%lld): invalid dring info",
+				__func__, ldcp->ldc_id);
+
+			vsw_free_lane_resources(ldcp, INBOUND);
+
+			dring_pkt->tag.vio_sid = ldcp->local_session;
+			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
+
+			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
+
+			vsw_send_msg(ldcp, (void *)dring_pkt,
+					sizeof (vio_dring_reg_msg_t));
+
+			vsw_next_milestone(ldcp);
+			return;
+		}
+
+		/*
+		 * Otherwise, attempt to map in the dring using the
+		 * cookie. If that succeeds we send back a unique dring
+		 * identifier that the sending side will use in future
+		 * to refer to this descriptor ring.
+		 */
+		dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
+
+		dp->num_descriptors = dring_pkt->num_descriptors;
+		dp->descriptor_size = dring_pkt->descriptor_size;
+		dp->options = dring_pkt->options;
+		dp->ncookies = dring_pkt->ncookies;
+
+		/*
+		 * Note: should only get one cookie. Enforced in
+		 * the ldc layer.
+		 */
+		bcopy(&dring_pkt->cookie[0], &dp->cookie[0],
+			sizeof (ldc_mem_cookie_t));
+
+		D2(vswp, "%s: num_desc %ld : desc_size %ld", __func__,
+			dp->num_descriptors, dp->descriptor_size);
+		D2(vswp, "%s: options 0x%lx: ncookies %ld", __func__,
+			dp->options, dp->ncookies);
+
+		if ((ldc_mem_dring_map(ldcp->ldc_handle, &dp->cookie[0],
+			dp->ncookies, dp->num_descriptors,
+			dp->descriptor_size, LDC_SHADOW_MAP,
+			&(dp->handle))) != 0) {
+
+			DERR(vswp, "%s: dring_map failed\n", __func__);
+
+			kmem_free(dp, sizeof (dring_info_t));
+			vsw_free_lane_resources(ldcp, INBOUND);
+
+			dring_pkt->tag.vio_sid = ldcp->local_session;
+			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
+
+			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
+			vsw_send_msg(ldcp, (void *)dring_pkt,
+				sizeof (vio_dring_reg_msg_t));
+
+			vsw_next_milestone(ldcp);
+			return;
+		}
+
+		if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
+
+			DERR(vswp, "%s: dring_addr failed\n", __func__);
+
+			kmem_free(dp, sizeof (dring_info_t));
+			vsw_free_lane_resources(ldcp, INBOUND);
+
+			dring_pkt->tag.vio_sid = ldcp->local_session;
+			dring_pkt->tag.vio_subtype = VIO_SUBTYPE_NACK;
+
+			DUMP_TAG_PTR((vio_msg_tag_t *)dring_pkt);
+
+			ldcp->lane_in.lstate |= VSW_DRING_NACK_SENT;
+			vsw_send_msg(ldcp, (void *)dring_pkt,
+				sizeof (vio_dring_reg_msg_t));
+
+			vsw_next_milestone(ldcp);
+			return;
+		} else {
+			/* store the address of the pub part of ring */
+			dp->pub_addr = minfo.vaddr;
+		}
+
+		/* no private section as we are importing */
+		dp->priv_addr = NULL;
+
+		/*
+		 * Using simple mono increasing int for ident at
+		 * the moment.
+		 */
+		dp->ident = ldcp->next_ident;
+		ldcp->next_ident++;
+
+		dp->end_idx = 0;
+		dp->next = NULL;
+
+		/*
+		 * Link it onto the end of the list of drings
+		 * for this lane.
+		 */
+		if (ldcp->lane_in.dringp == NULL) {
+			D2(vswp, "%s: adding first INBOUND dring", __func__);
+			ldcp->lane_in.dringp = dp;
+		} else {
+			dbp = ldcp->lane_in.dringp;
+
+			while (dbp->next != NULL)
+				dbp = dbp->next;
+
+			dbp->next = dp;
+		}
+
+		/* acknowledge it */
+		dring_pkt->tag.vio_sid = ldcp->local_session;
+		dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+		dring_pkt->dring_ident = dp->ident;
+
+		vsw_send_msg(ldcp, (void *)dring_pkt,
+				sizeof (vio_dring_reg_msg_t));
+
+		ldcp->lane_in.lstate |= VSW_DRING_ACK_SENT;
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_ACK_RECV))
+			return;
+
+		/*
+		 * Peer is acknowledging our dring info and will have
+		 * sent us a dring identifier which we will use to
+		 * refer to this ring w.r.t. our peer.
+		 */
+		dp = ldcp->lane_out.dringp;
+		if (dp != NULL) {
+			/*
+			 * Find the ring this ident should be associated
+			 * with.
+			 */
+			if (vsw_dring_match(dp, dring_pkt)) {
+				dring_found = 1;
+
+			} else while (dp != NULL) {
+				if (vsw_dring_match(dp, dring_pkt)) {
+					dring_found = 1;
+					break;
+				}
+				dp = dp->next;
+			}
+
+			if (dring_found == 0) {
+				DERR(NULL, "%s: unrecognised ring cookie",
+					__func__);
+				vsw_restart_handshake(ldcp);
+				return;
+			}
+
+		} else {
+			DERR(vswp, "%s: DRING ACK received but no drings "
+				"allocated", __func__);
+			vsw_restart_handshake(ldcp);
+			return;
+		}
+
+		/* store ident */
+		dp->ident = dring_pkt->dring_ident;
+		ldcp->lane_out.lstate |= VSW_DRING_ACK_RECV;
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_DRING_NACK_RECV))
+			return;
+
+		ldcp->lane_out.lstate |= VSW_DRING_NACK_RECV;
+		vsw_next_milestone(ldcp);
+		break;
+
+	default:
+		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
+			dring_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Process a request from peer to unregister a dring.
+ *
+ * Unregistration is not supported yet: whatever the subtype
+ * (INFO/ACK/NACK or unknown), the only response is to restart
+ * the handshake with the peer.
+ */
+void
+vsw_process_ctrl_dring_unreg_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	vio_dring_unreg_msg_t	*dring_pkt = (vio_dring_unreg_msg_t *)pkt;
+	uint16_t		subtype = dring_pkt->tag.vio_subtype;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	if (subtype == VIO_SUBTYPE_INFO) {
+		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+		DWARN(vswp, "%s: restarting handshake..", __func__);
+	} else if (subtype == VIO_SUBTYPE_ACK) {
+		D2(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
+		DWARN(vswp, "%s: restarting handshake..", __func__);
+	} else if (subtype == VIO_SUBTYPE_NACK) {
+		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+		DWARN(vswp, "%s: restarting handshake..", __func__);
+	} else {
+		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
+			subtype);
+	}
+
+	/* every path above ends the same way: force a new handshake */
+	vsw_restart_handshake(ldcp);
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+#define	SND_MCST_NACK(ldcp, pkt) \
+	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
+	pkt->tag.vio_sid = ldcp->local_session; \
+	vsw_send_msg(ldcp, (void *)pkt, sizeof (vnet_mcast_msg_t));
+
+/*
+ * Process a multicast request from a vnet.
+ *
+ * Vnet's specify a multicast address that they are interested in. This
+ * address is used as a key into the hash table which forms the multicast
+ * forwarding database (mFDB).
+ *
+ * The table keys are the multicast addresses, while the table entries
+ * are pointers to lists of ports which wish to receive packets for the
+ * specified multicast address.
+ *
+ * When a multicast packet is being switched we use the address as a key
+ * into the hash table, and then walk the appropriate port list forwarding
+ * the pkt to each port in turn.
+ *
+ * If a vnet is no longer interested in a particular multicast grouping
+ * we simply find the correct location in the hash table and then delete
+ * the relevant port from the port list.
+ *
+ * To deal with the case whereby a port is being deleted without first
+ * removing itself from the lists in the hash table, we maintain a list
+ * of multicast addresses the port has registered an interest in, within
+ * the port structure itself. We then simply walk that list of addresses
+ * using them as keys into the hash table and remove the port from the
+ * appropriate lists.
+ */
+static void
+vsw_process_ctrl_mcst_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vnet_mcast_msg_t	*mcst_pkt;
+	vsw_port_t		*port = ldcp->ldc_port;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	int			i;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * We know this is a ctrl/mcast packet so
+	 * cast it into the correct structure.
+	 */
+	mcst_pkt = (vnet_mcast_msg_t *)pkt;
+
+	switch (mcst_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+
+		/*
+		 * Check if in correct state to receive a multicast
+		 * message (i.e. handshake complete). If not reset
+		 * the handshake.
+		 */
+		if (vsw_check_flag(ldcp, INBOUND, VSW_MCST_INFO_RECV))
+			return;
+
+		/*
+		 * Before attempting to add or remove address check
+		 * that they are valid multicast addresses.
+		 * If not, then NACK back.
+		 */
+		for (i = 0; i < mcst_pkt->count; i++) {
+			/* multicast bit is the LSB of the first octet */
+			if ((mcst_pkt->mca[i].ether_addr_octet[0] & 01) != 1) {
+				DERR(vswp, "%s: invalid multicast address",
+								__func__);
+				SND_MCST_NACK(ldcp, mcst_pkt);
+				return;
+			}
+		}
+
+		/*
+		 * Now add/remove the addresses. If this fails we
+		 * NACK back.
+		 */
+		if (vsw_add_rem_mcst(mcst_pkt, port) != 0) {
+			SND_MCST_NACK(ldcp, mcst_pkt);
+			return;
+		}
+
+		/* success: turn the message into an ACK and echo it back */
+		mcst_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+		mcst_pkt->tag.vio_sid = ldcp->local_session;
+
+		DUMP_TAG_PTR((vio_msg_tag_t *)mcst_pkt);
+
+		vsw_send_msg(ldcp, (void *)mcst_pkt,
+					sizeof (vnet_mcast_msg_t));
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		DWARN(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
+
+		/*
+		 * We shouldn't ever get a multicast ACK message as
+		 * at the moment we never request multicast addresses
+		 * to be set on some other device. This may change in
+		 * the future if we have cascading switches.
+		 */
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_ACK_RECV))
+			return;
+
+		/* Do nothing */
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		DWARN(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+
+		/*
+		 * We shouldn't get a multicast NACK packet for the
+		 * same reasons as we shouldn't get a ACK packet.
+		 */
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_MCST_NACK_RECV))
+			return;
+
+		/* Do nothing */
+		break;
+
+	default:
+		DERR(vswp, "%s: unknown vio_subtype %x\n", __func__,
+			mcst_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Process an RDX (ready to exchange data) control message.
+ *
+ * An INFO from the peer is ACK'ed and advances the handshake.
+ * RDX ACKs are handled in-band by the callback handler, so seeing
+ * one here is an error and forces a handshake restart.
+ */
+static void
+vsw_process_ctrl_rdx_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vio_rdx_msg_t	*rdx_pkt;
+	vsw_t		*vswp = ldcp->ldc_vswp;
+
+	/*
+	 * We know this is a ctrl/rdx packet so
+	 * cast it into the correct structure.
+	 */
+	rdx_pkt = (vio_rdx_msg_t *)pkt;
+
+	D1(vswp, "%s(%lld) enter", __func__, ldcp->ldc_id);
+
+	switch (rdx_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+
+		if (vsw_check_flag(ldcp, INBOUND, VSW_RDX_INFO_RECV))
+			return;
+
+		/* echo the message back as an ACK with our session id */
+		rdx_pkt->tag.vio_sid = ldcp->local_session;
+		rdx_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+
+		DUMP_TAG_PTR((vio_msg_tag_t *)rdx_pkt);
+
+		ldcp->lane_in.lstate |= VSW_RDX_ACK_SENT;
+
+		vsw_send_msg(ldcp, (void *)rdx_pkt,
+				sizeof (vio_rdx_msg_t));
+
+		vsw_next_milestone(ldcp);
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		/*
+		 * Should be handled in-band by callback handler.
+		 */
+		DERR(vswp, "%s: Unexpected VIO_SUBTYPE_ACK", __func__);
+		vsw_restart_handshake(ldcp);
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		D2(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+
+		if (vsw_check_flag(ldcp, OUTBOUND, VSW_RDX_NACK_RECV))
+			return;
+
+		ldcp->lane_out.lstate |= VSW_RDX_NACK_RECV;
+		vsw_next_milestone(ldcp);
+		break;
+
+	default:
+		DERR(vswp, "%s: Unknown vio_subtype %x\n", __func__,
+			rdx_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Top-level dispatcher for data messages.
+ *
+ * Validates the peer session id (once established) and that the
+ * handshake has fully completed (MILESTONE4), then hands the packet
+ * to the handler matching its vio_subtype envelope.  Any violation
+ * forces a handshake restart.
+ */
+static void
+vsw_process_data_pkt(vsw_ldc_t *ldcp, void *dpkt, vio_msg_tag_t tag)
+{
+	uint16_t	env = tag.vio_subtype_env;
+	vsw_t		*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * Once a peer session has been established, every data
+	 * message must carry the expected session id.
+	 */
+	if ((ldcp->session_status & VSW_PEER_SESSION) &&
+	    (ldcp->peer_session != tag.vio_sid)) {
+		DERR(vswp, "%s (chan %d): invalid session id (%llx)",
+			__func__, ldcp->ldc_id, tag.vio_sid);
+		vsw_restart_handshake(ldcp);
+		return;
+	}
+
+	/*
+	 * It is an error for us to be getting data packets
+	 * before the handshake has completed.
+	 */
+	if (ldcp->hphase != VSW_MILESTONE4) {
+		DERR(vswp, "%s: got data packet before handshake complete "
+			"hphase %d (%x: %x)", __func__, ldcp->hphase,
+			ldcp->lane_in.lstate, ldcp->lane_out.lstate);
+		DUMP_FLAGS(ldcp->lane_in.lstate);
+		DUMP_FLAGS(ldcp->lane_out.lstate);
+		vsw_restart_handshake(ldcp);
+		return;
+	}
+
+	/*
+	 * Dispatch on the vio_subtype envelope; the lower routines
+	 * sort out INFO vs ACK vs NACK themselves.
+	 */
+	switch (env) {
+	case VIO_DRING_DATA:
+		vsw_process_data_dring_pkt(ldcp, dpkt);
+		break;
+	case VIO_PKT_DATA:
+		vsw_process_data_raw_pkt(ldcp, dpkt);
+		break;
+	case VIO_DESC_DATA:
+		vsw_process_data_ibnd_pkt(ldcp, dpkt);
+		break;
+	default:
+		DERR(vswp, "%s : unknown vio_subtype_env (%x)\n",
+							__func__, env);
+		break;
+	}
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+#define	SND_DRING_NACK(ldcp, pkt) \
+	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
+	pkt->tag.vio_sid = ldcp->local_session; \
+	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_dring_msg_t));
+
+/*
+ * Process a dring data message (VIO_DRING_DATA).
+ *
+ * INFO: the peer has descriptors ready between start_idx and end_idx
+ * in a previously registered dring.  Acquire (sync) that range, copy
+ * each READY descriptor's data into a fresh mblk, mark the descriptor
+ * DONE, ACK back where requested, and finally hand the whole mblk
+ * chain to vsw_switch_frame() for switching.
+ *
+ * ACK: the peer has finished with descriptors we exported; verify they
+ * are DONE and recycle them (clear data, mark FREE).
+ *
+ * NACK: fatal for data traffic — restart the handshake.
+ */
+static void
+vsw_process_data_dring_pkt(vsw_ldc_t *ldcp, void *dpkt)
+{
+	vio_dring_msg_t		*dring_pkt;
+	vnet_public_desc_t	*pub_addr = NULL;
+	vsw_private_desc_t	*priv_addr = NULL;
+	dring_info_t		*dp = NULL;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	mblk_t			*mp = NULL;
+	mblk_t			*bp = NULL;	/* head of rcv'd pkt chain */
+	mblk_t			*bpt = NULL;	/* tail of rcv'd pkt chain */
+	size_t			nbytes = 0;
+	size_t			off = 0;
+	uint64_t		ncookies = 0;
+	uint64_t		chain = 0;
+	uint64_t		j, len, num;
+	uint32_t		start, end, datalen;
+	int			i, last_sync, rv;
+	boolean_t		ack_needed = B_FALSE;
+	boolean_t		sync_needed = B_TRUE;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * We know this is a data/dring packet so
+	 * cast it into the correct structure.
+	 */
+	dring_pkt = (vio_dring_msg_t *)dpkt;
+
+	/*
+	 * Switch on the vio_subtype. If its INFO then we need to
+	 * process the data. If its an ACK we need to make sure
+	 * it makes sense (i.e did we send an earlier data/info),
+	 * and if its a NACK then we maybe attempt a retry.
+	 */
+	switch (dring_pkt->tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D2(vswp, "%s(%lld): VIO_SUBTYPE_INFO", __func__, ldcp->ldc_id);
+
+		if ((dp = vsw_ident2dring(&ldcp->lane_in,
+				dring_pkt->dring_ident)) == NULL) {
+
+			DERR(vswp, "%s(%lld): unable to find dring from "
+				"ident 0x%llx", __func__, ldcp->ldc_id,
+				dring_pkt->dring_ident);
+
+			SND_DRING_NACK(ldcp, dring_pkt);
+			return;
+		}
+
+		start = end = 0;
+		start = dring_pkt->start_idx;
+		end = dring_pkt->end_idx;
+
+		D3(vswp, "%s(%lld): start index %ld : end %ld\n",
+			__func__, ldcp->ldc_id, start, end);
+
+		/*
+		 * basic sanity check
+		 *
+		 * NOTE(review): valid indices are 0..len-1, so this
+		 * arguably should be 'end >= len' — end == len slips
+		 * through.  Left as-is pending confirmation.
+		 */
+		len = dp->num_descriptors;
+		if (end > len) {
+			DERR(vswp, "%s(%lld): endpoint %lld outside ring"
+				" length %lld", __func__, ldcp->ldc_id,
+				end, len);
+
+			SND_DRING_NACK(ldcp, dring_pkt);
+			return;
+		}
+
+		/* sync data */
+		if ((rv = ldc_mem_dring_acquire(dp->handle,
+						start, end)) != 0) {
+			DERR(vswp, "%s(%lld): unable to acquire dring : err %d",
+				__func__, ldcp->ldc_id, rv);
+			return;
+		}
+
+		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
+
+		j = num = 0;
+
+		/* calculate # descriptors taking into a/c wrap around */
+		num = end >= start ? end - start + 1: (len - start + 1) + end;
+
+		last_sync = start;
+
+		for (i = start; j < num; i = (i + 1) % len, j++) {
+			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
+
+			/*
+			 * Data is padded to align on 8 byte boundary,
+			 * datalen is actual data length, i.e. minus that
+			 * padding.
+			 */
+			datalen = pub_addr->nbytes;
+
+			/*
+			 * Does peer wish us to ACK when we have finished
+			 * with this descriptor ?
+			 */
+			if (pub_addr->hdr.ack)
+				ack_needed = B_TRUE;
+
+			D2(vswp, "%s(%lld): processing desc %lld at pos"
+				" 0x%llx : dstate 0x%lx : datalen 0x%lx",
+				__func__, ldcp->ldc_id, i, pub_addr,
+				pub_addr->hdr.dstate, datalen);
+
+			/*
+			 * XXXX : Is it a fatal error to be told to
+			 * process a packet when the READY bit is not
+			 * set ?
+			 */
+			if (pub_addr->hdr.dstate != VIO_DESC_READY) {
+				DERR(vswp, "%s(%d): descriptor %lld at pos "
+				" 0x%llx not READY (0x%lx)", __func__,
+				ldcp->ldc_id, i, pub_addr,
+				pub_addr->hdr.dstate);
+
+				SND_DRING_NACK(ldcp, dring_pkt);
+				(void) ldc_mem_dring_release(dp->handle,
+					start, end);
+				/* don't leak pkts already pulled in */
+				if (bp != NULL)
+					freemsgchain(bp);
+				return;
+			}
+
+			/*
+			 * Mark that we are starting to process descriptor.
+			 */
+			pub_addr->hdr.dstate = VIO_DESC_ACCEPTED;
+
+			/*
+			 * allocb(9F) returns an aligned data block. We
+			 * need to ensure that we ask ldc for an aligned
+			 * number of bytes also.
+			 *
+			 * NOTE(review): the copy below requests nbytes
+			 * (datalen rounded up to 8) into a buffer sized
+			 * allocb(datalen) — up to 7 bytes more than asked
+			 * for.  Presumably relies on allocb's internal
+			 * rounding; confirm or allocate nbytes instead.
+			 */
+			nbytes = datalen;
+			if (nbytes & 0x7) {
+				off = 8 - (nbytes & 0x7);
+				nbytes += off;
+			}
+			mp = allocb(datalen, BPRI_MED);
+			if (mp == NULL) {
+				DERR(vswp, "%s(%lld): allocb failed",
+					__func__, ldcp->ldc_id);
+				(void) ldc_mem_dring_release(dp->handle,
+					start, end);
+				/* don't leak pkts already pulled in */
+				if (bp != NULL)
+					freemsgchain(bp);
+				return;
+			}
+
+			ncookies = pub_addr->ncookies;
+			rv = ldc_mem_copy(ldcp->ldc_handle,
+				(caddr_t)mp->b_rptr, 0, &nbytes,
+				pub_addr->memcookie, ncookies,
+				LDC_COPY_IN);
+
+			if (rv != 0) {
+				DERR(vswp, "%s(%d): unable to copy in "
+					"data from %d cookies", __func__,
+					ldcp->ldc_id, ncookies);
+				freemsg(mp);
+				(void) ldc_mem_dring_release(dp->handle,
+					start, end);
+				/* don't leak pkts already pulled in */
+				if (bp != NULL)
+					freemsgchain(bp);
+				return;
+			} else {
+				D2(vswp, "%s(%d): copied in %ld bytes"
+					" using %d cookies", __func__,
+					ldcp->ldc_id, nbytes, ncookies);
+			}
+
+			/* point to the actual end of data */
+			mp->b_wptr = mp->b_rptr + datalen;
+
+			/* build a chain of received packets */
+			if (bp == NULL) {
+				/* first pkt */
+				bp = mp;
+				bp->b_next = bp->b_prev = NULL;
+				bpt = bp;
+				chain = 1;
+			} else {
+				mp->b_next = NULL;
+				mp->b_prev = bpt;
+				bpt->b_next = mp;
+				bpt = mp;
+				chain++;
+			}
+
+			/* mark we are finished with this descriptor */
+			pub_addr->hdr.dstate = VIO_DESC_DONE;
+
+			/*
+			 * Send an ACK back to peer if requested, and sync
+			 * the rings up to this point so the remote side sees
+			 * the descriptor flag in a consistent state.
+			 */
+			if (ack_needed) {
+				if ((rv = ldc_mem_dring_release(
+					dp->handle, last_sync, i)) != 0) {
+					DERR(vswp, "%s(%lld): unable to sync"
+						" from %d to %d", __func__,
+						ldcp->ldc_id, last_sync, i);
+				}
+
+				ack_needed = B_FALSE;
+
+				if (i == end)
+					sync_needed = B_FALSE;
+				else
+					sync_needed = B_TRUE;
+
+				last_sync = (i + 1) % len;
+
+				dring_pkt->tag.vio_subtype = VIO_SUBTYPE_ACK;
+				dring_pkt->tag.vio_sid = ldcp->local_session;
+				vsw_send_msg(ldcp, (void *)dring_pkt,
+					sizeof (vio_dring_msg_t));
+			}
+		}
+
+		if (sync_needed) {
+			if ((rv = ldc_mem_dring_release(dp->handle,
+					last_sync, end)) != 0) {
+				DERR(vswp, "%s(%lld): unable to sync"
+					" from %d to %d", __func__,
+					ldcp->ldc_id, last_sync, end);
+			}
+		}
+
+		/* send the chain of packets to be switched */
+		D3(vswp, "%s(%lld): switching chain of %d msgs", __func__,
+			ldcp->ldc_id, chain);
+		vsw_switch_frame(vswp, bp, VSW_VNETPORT,
+					ldcp->ldc_port, NULL);
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		D2(vswp, "%s(%lld): VIO_SUBTYPE_ACK", __func__, ldcp->ldc_id);
+		/*
+		 * Verify that the relevant descriptors are all
+		 * marked as DONE
+		 */
+		if ((dp = vsw_ident2dring(&ldcp->lane_out,
+			dring_pkt->dring_ident)) == NULL) {
+			DERR(vswp, "%s: unknown ident in ACK", __func__);
+			return;
+		}
+
+		pub_addr = (vnet_public_desc_t *)dp->pub_addr;
+		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
+
+		start = end = 0;
+		start = dring_pkt->start_idx;
+		end = dring_pkt->end_idx;
+		len = dp->num_descriptors;
+
+
+		j = num = 0;
+		/* calculate # descriptors taking into a/c wrap around */
+		num = end >= start ? end - start + 1: (len - start + 1) + end;
+
+		D2(vswp, "%s(%lld): start index %ld : end %ld : num %ld\n",
+			__func__, ldcp->ldc_id, start, end, num);
+
+		for (i = start; j < num; i = (i + 1) % len, j++) {
+			pub_addr = (vnet_public_desc_t *)dp->pub_addr + i;
+			priv_addr = (vsw_private_desc_t *)dp->priv_addr + i;
+
+			if (pub_addr->hdr.dstate != VIO_DESC_DONE) {
+				DERR(vswp, "%s: descriptor %lld at pos "
+					" 0x%llx not DONE (0x%lx)\n", __func__,
+					i, pub_addr, pub_addr->hdr.dstate);
+				return;
+			} else {
+				/* clear all the fields */
+				bzero(priv_addr->datap, priv_addr->datalen);
+				priv_addr->datalen = 0;
+
+				pub_addr->hdr.dstate = VIO_DESC_FREE;
+				pub_addr->hdr.ack = 0;
+				priv_addr->dstate = VIO_DESC_FREE;
+
+				D3(vswp, "clearing descp %d : pub state "
+					"0x%llx : priv state 0x%llx", i,
+					pub_addr->hdr.dstate,
+					priv_addr->dstate);
+			}
+		}
+
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		DWARN(vswp, "%s(%lld): VIO_SUBTYPE_NACK",
+						__func__, ldcp->ldc_id);
+		/*
+		 * Something is badly wrong if we are getting NACK's
+		 * for our data pkts. So reset the channel.
+		 */
+		vsw_restart_handshake(ldcp);
+
+		break;
+
+	default:
+		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
+			ldcp->ldc_id, dring_pkt->tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * VIO_PKT_DATA (a.k.a raw data mode )
+ *
+ * Note - currently not supported. Do nothing.
+ */
+static void
+vsw_process_data_raw_pkt(vsw_ldc_t *ldcp, void *dpkt)
+{
+	_NOTE(ARGUNUSED(dpkt))
+
+	D1(NULL, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
+
+	/* unimplemented: log the event and silently drop the packet */
+	DERR(NULL, "%s (%lld): currently  not supported",
+						__func__, ldcp->ldc_id);
+
+	D1(NULL, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
+}
+
+#define	SND_IBND_DESC_NACK(ldcp, pkt) \
+	pkt->tag.vio_subtype = VIO_SUBTYPE_NACK; \
+	pkt->tag.vio_sid = ldcp->local_session; \
+	vsw_send_msg(ldcp, (void *)pkt, sizeof (vio_ibnd_desc_t));
+
+/*
+ * Process an in-band descriptor message (most likely from
+ * OBP).
+ *
+ * INFO: copy the data described by the message's cookies into a fresh
+ * mblk, ACK the message, and hand the frame to vsw_switch_frame().
+ * ACK/NACK: recycle the private descriptor (indexed by desc_handle)
+ * that the original outbound message was built from.
+ */
+static void
+vsw_process_data_ibnd_pkt(vsw_ldc_t *ldcp, void *pkt)
+{
+	vio_ibnd_desc_t		*ibnd_desc;
+	dring_info_t		*dp = NULL;
+	vsw_private_desc_t	*priv_addr = NULL;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	mblk_t			*mp = NULL;
+	size_t			nbytes = 0;
+	size_t			off = 0;
+	uint64_t		idx = 0;
+	uint32_t		datalen = 0;
+	uint64_t		ncookies = 0;
+	int			rv;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	ibnd_desc = (vio_ibnd_desc_t *)pkt;
+
+	switch (ibnd_desc->hdr.tag.vio_subtype) {
+	case VIO_SUBTYPE_INFO:
+		D1(vswp, "%s: VIO_SUBTYPE_INFO", __func__);
+
+		/*
+		 * NOTE(review): this checks the VSW_DRING_INFO_RECV flag
+		 * for an in-band data message — confirm the flag reuse
+		 * is intentional.
+		 */
+		if (vsw_check_flag(ldcp, INBOUND, VSW_DRING_INFO_RECV))
+			return;
+
+		/*
+		 * Data is padded to align on a 8 byte boundary,
+		 * nbytes is actual data length, i.e. minus that
+		 * padding.
+		 */
+		datalen = ibnd_desc->nbytes;
+
+		D2(vswp, "%s(%lld): processing inband desc : "
+			": datalen 0x%lx", __func__, ldcp->ldc_id, datalen);
+
+		ncookies = ibnd_desc->ncookies;
+
+		/*
+		 * allocb(9F) returns an aligned data block. We
+		 * need to ensure that we ask ldc for an aligned
+		 * number of bytes also.
+		 *
+		 * NOTE(review): the copy below requests nbytes (datalen
+		 * rounded up to 8) into a buffer sized allocb(datalen) —
+		 * up to 7 bytes more than asked for.  Presumably relies
+		 * on allocb's internal rounding; confirm.
+		 */
+		nbytes = datalen;
+		if (nbytes & 0x7) {
+			off = 8 - (nbytes & 0x7);
+			nbytes += off;
+		}
+
+		mp = allocb(datalen, BPRI_MED);
+		if (mp == NULL) {
+			DERR(vswp, "%s(%lld): allocb failed",
+					__func__, ldcp->ldc_id);
+			return;
+		}
+
+		rv = ldc_mem_copy(ldcp->ldc_handle, (caddr_t)mp->b_rptr,
+			0, &nbytes, ibnd_desc->memcookie, (uint64_t)ncookies,
+			LDC_COPY_IN);
+
+		if (rv != 0) {
+			DERR(vswp, "%s(%d): unable to copy in data from "
+				"%d cookie(s)", __func__,
+				ldcp->ldc_id, ncookies);
+			freemsg(mp);
+			return;
+		} else {
+			D2(vswp, "%s(%d): copied in %ld bytes using %d "
+				"cookies", __func__, ldcp->ldc_id, nbytes,
+				ncookies);
+		}
+
+		/* point to the actual end of data */
+		mp->b_wptr = mp->b_rptr + datalen;
+
+		/*
+		 * We ACK back every in-band descriptor message we process
+		 */
+		ibnd_desc->hdr.tag.vio_subtype = VIO_SUBTYPE_ACK;
+		ibnd_desc->hdr.tag.vio_sid = ldcp->local_session;
+		vsw_send_msg(ldcp, (void *)ibnd_desc,
+				sizeof (vio_ibnd_desc_t));
+
+		/* send the packet to be switched */
+		vsw_switch_frame(vswp, mp, VSW_VNETPORT,
+					ldcp->ldc_port, NULL);
+
+		break;
+
+	case VIO_SUBTYPE_ACK:
+		D1(vswp, "%s: VIO_SUBTYPE_ACK", __func__);
+
+		/* Verify the ACK is valid */
+		idx = ibnd_desc->hdr.desc_handle;
+
+		if (idx >= VSW_RING_NUM_EL) {
+			cmn_err(CE_WARN, "%s: corrupted ACK received "
+				"(idx %ld)", __func__, idx);
+			return;
+		}
+
+		if ((dp = ldcp->lane_out.dringp) == NULL) {
+			DERR(vswp, "%s: no dring found", __func__);
+			return;
+		}
+
+		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
+
+		/* move to correct location in ring */
+		priv_addr += idx;
+
+		/*
+		 * When we sent the in-band message to our peer we
+		 * marked the copy in our private ring as READY. We now
+		 * check that the descriptor we are being ACK'ed for is in
+		 * fact READY, i.e. it is one we have shared with our peer.
+		 */
+		if (priv_addr->dstate != VIO_DESC_READY) {
+			cmn_err(CE_WARN, "%s: (%ld) desc at index %ld not "
+				"READY (0x%lx)", __func__, ldcp->ldc_id, idx,
+				priv_addr->dstate);
+			cmn_err(CE_CONT, "%s: bound %d: ncookies %ld\n",
+				__func__, priv_addr->bound,
+				priv_addr->ncookies);
+			cmn_err(CE_CONT, "datalen %ld\n", priv_addr->datalen);
+			return;
+		} else {
+			D2(vswp, "%s: (%lld) freeing descp at %lld", __func__,
+				ldcp->ldc_id, idx);
+
+			/* release resources associated with sent msg */
+			bzero(priv_addr->datap, priv_addr->datalen);
+			priv_addr->datalen = 0;
+			priv_addr->dstate = VIO_DESC_FREE;
+		}
+		break;
+
+	case VIO_SUBTYPE_NACK:
+		DERR(vswp, "%s: VIO_SUBTYPE_NACK", __func__);
+
+		/*
+		 * We should only get a NACK if our peer doesn't like
+		 * something about a message we have sent it. If this
+		 * happens we just release the resources associated with
+		 * the message. (We are relying on higher layers to decide
+		 * whether or not to resend.
+		 */
+
+		/* limit check */
+		idx = ibnd_desc->hdr.desc_handle;
+
+		if (idx >= VSW_RING_NUM_EL) {
+			DERR(vswp, "%s: corrupted NACK received (idx %lld)",
+				__func__, idx);
+			return;
+		}
+
+		if ((dp = ldcp->lane_out.dringp) == NULL) {
+			DERR(vswp, "%s: no dring found", __func__);
+			return;
+		}
+
+		priv_addr = (vsw_private_desc_t *)dp->priv_addr;
+
+		/* move to correct location in ring */
+		priv_addr += idx;
+
+		/* release resources associated with sent msg */
+		bzero(priv_addr->datap, priv_addr->datalen);
+		priv_addr->datalen = 0;
+		priv_addr->dstate = VIO_DESC_FREE;
+
+		break;
+
+	default:
+		DERR(vswp, "%s(%lld): Unknown vio_subtype %x\n", __func__,
+			ldcp->ldc_id, ibnd_desc->hdr.tag.vio_subtype);
+	}
+
+	D1(vswp, "%s(%lld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Process an error message.
+ *
+ * Placeholder: error vio_subtypes are not yet defined, so the
+ * message is only logged at debug level and otherwise ignored.
+ */
+static void
+vsw_process_err_pkt(vsw_ldc_t *ldcp, void *epkt, vio_msg_tag_t tag)
+{
+	_NOTE(ARGUNUSED(epkt))
+
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	uint16_t	env = tag.vio_subtype_env;
+
+	D1(vswp, "%s (%lld): enter\n", __func__, ldcp->ldc_id);
+
+	/*
+	 * Error vio_subtypes have yet to be defined. So for
+	 * the moment we can't do anything.
+	 */
+	D2(vswp, "%s: (%x) vio_subtype env", __func__, env);
+
+	D1(vswp, "%s (%lld): exit\n", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Switch the given ethernet frame when operating in layer 2 mode.
+ *
+ * vswp: pointer to the vsw instance
+ * mp: pointer to chain of ethernet frame(s) to be switched
+ * caller: identifies the source of this frame as:
+ * 		1. VSW_VNETPORT - a vsw port (connected to a vnet).
+ *		2. VSW_PHYSDEV - the physical ethernet device
+ *		3. VSW_LOCALDEV - vsw configured as a virtual interface
+ * arg: argument provided by the caller.
+ *		1. for VNETPORT - pointer to the corresponding vsw_port_t.
+ *		2. for PHYSDEV - NULL
+ *		3. for LOCALDEV - pointer to to this vsw_t(self)
+ */
+void
+vsw_switch_l2_frame(vsw_t *vswp, mblk_t *mp, int caller,
+			vsw_port_t *arg, mac_resource_handle_t mrh)
+{
+	struct ether_header	*ehp;
+	vsw_port_t		*port = NULL;
+	mblk_t			*bp, *ret_m;
+	mblk_t			*nmp = NULL;
+	vsw_port_list_t		*plist = &vswp->plist;
+
+	D1(vswp, "%s: enter (caller %d)", __func__, caller);
+
+	/*
+	 * PERF: rather than breaking up the chain here, scan it
+	 * to find all mblks heading to same destination and then
+	 * pass that sub-chain to the lower transmit functions.
+	 */
+
+	/* process the chain of packets */
+	bp = mp;
+	while (bp) {
+		/* detach the head frame from the chain before switching it */
+		mp = bp;
+		bp = bp->b_next;
+		mp->b_next = mp->b_prev = NULL;
+		ehp = (struct ether_header *)mp->b_rptr;
+
+		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
+			__func__, MBLKSIZE(mp), MBLKL(mp));
+
+		/* frame addressed to the vsw interface's own MAC ? */
+		READ_ENTER(&vswp->if_lockrw);
+		if (ether_cmp(&ehp->ether_dhost, &vswp->if_addr) == 0) {
+			/*
+			 * If destination is VSW_LOCALDEV (vsw as an eth
+			 * interface) and if the device is up & running,
+			 * send the packet up the stack on this host.
+			 * If the virtual interface is down, drop the packet.
+			 */
+			if (caller != VSW_LOCALDEV) {
+				if (vswp->if_state & VSW_IF_UP) {
+					RW_EXIT(&vswp->if_lockrw);
+					mac_rx(vswp->if_macp, mrh, mp);
+				} else {
+					RW_EXIT(&vswp->if_lockrw);
+					/* Interface down, drop pkt */
+					freemsg(mp);
+				}
+			} else {
+				/* pkt from ourselves addressed to ourselves */
+				RW_EXIT(&vswp->if_lockrw);
+				freemsg(mp);
+			}
+			continue;
+		}
+		RW_EXIT(&vswp->if_lockrw);
+
+		READ_ENTER(&plist->lockrw);
+		port = vsw_lookup_fdb(vswp, ehp);
+		if (port) {
+			/*
+			 * Mark the port as in-use.
+			 * The ref count lets the port-delete thread wait
+			 * (on ref_cv below) until we are done with the port,
+			 * so the port list lock can be dropped here.
+			 */
+			mutex_enter(&port->ref_lock);
+			port->ref_cnt++;
+			mutex_exit(&port->ref_lock);
+			RW_EXIT(&plist->lockrw);
+
+			/*
+			 * If plumbed and in promisc mode then copy msg
+			 * and send up the stack.
+			 */
+			READ_ENTER(&vswp->if_lockrw);
+			if (VSW_U_P(vswp->if_state)) {
+				RW_EXIT(&vswp->if_lockrw);
+				/* best effort: a failed copy is ignored */
+				nmp = copymsg(mp);
+				if (nmp)
+					mac_rx(vswp->if_macp, mrh, nmp);
+			} else {
+				RW_EXIT(&vswp->if_lockrw);
+			}
+
+			/*
+			 * If the destination is in FDB, the packet
+			 * should be forwarded to the correponding
+			 * vsw_port (connected to a vnet device -
+			 * VSW_VNETPORT)
+			 */
+			(void) vsw_portsend(port, mp);
+
+			/*
+			 * Decrement use count in port and check if
+			 * should wake delete thread.
+			 */
+			mutex_enter(&port->ref_lock);
+			port->ref_cnt--;
+			if (port->ref_cnt == 0)
+				cv_signal(&port->ref_cv);
+			mutex_exit(&port->ref_lock);
+		} else {
+			RW_EXIT(&plist->lockrw);
+			/*
+			 * Destination not in FDB.
+			 *
+			 * If the destination is broadcast or
+			 * multicast forward the packet to all
+			 * (VNETPORTs, PHYSDEV, LOCALDEV),
+			 * except the caller.
+			 */
+			if (IS_BROADCAST(ehp)) {
+				D3(vswp, "%s: BROADCAST pkt", __func__);
+				(void) vsw_forward_all(vswp, mp,
+								caller, arg);
+			} else if (IS_MULTICAST(ehp)) {
+				D3(vswp, "%s: MULTICAST pkt", __func__);
+				(void) vsw_forward_grp(vswp, mp,
+							caller, arg);
+			} else {
+				/*
+				 * If the destination is unicast, and came
+				 * from either a logical network device or
+				 * the switch itself when it is plumbed, then
+				 * send it out on the physical device and also
+				 * up the stack if the logical interface is
+				 * in promiscious mode.
+				 *
+				 * NOTE:  The assumption here is that if we
+				 * cannot find the destination in our fdb, its
+				 * a unicast address, and came from either a
+				 * vnet or down the stack (when plumbed) it
+				 * must be destinded for an ethernet device
+				 * outside our ldoms.
+				 */
+				if (caller == VSW_VNETPORT) {
+					READ_ENTER(&vswp->if_lockrw);
+					if (VSW_U_P(vswp->if_state)) {
+						RW_EXIT(&vswp->if_lockrw);
+						nmp = copymsg(mp);
+						if (nmp)
+							mac_rx(vswp->if_macp,
+								mrh, nmp);
+					} else {
+						RW_EXIT(&vswp->if_lockrw);
+					}
+					if ((ret_m = vsw_tx_msg(vswp, mp))
+								!= NULL) {
+						DERR(vswp, "%s: drop mblks to "
+							"phys dev", __func__);
+						freemsg(ret_m);
+					}
+
+				} else if (caller == VSW_PHYSDEV) {
+					/*
+					 * Pkt seen because card in promisc
+					 * mode. Send up stack if plumbed in
+					 * promisc mode, else drop it.
+					 */
+					READ_ENTER(&vswp->if_lockrw);
+					if (VSW_U_P(vswp->if_state)) {
+						RW_EXIT(&vswp->if_lockrw);
+						mac_rx(vswp->if_macp, mrh, mp);
+					} else {
+						RW_EXIT(&vswp->if_lockrw);
+						freemsg(mp);
+					}
+
+				} else if (caller == VSW_LOCALDEV) {
+					/*
+					 * Pkt came down the stack, send out
+					 * over physical device.
+					 */
+					if ((ret_m = vsw_tx_msg(vswp, mp))
+								!= NULL) {
+						DERR(vswp, "%s: drop mblks to "
+							"phys dev", __func__);
+						freemsg(ret_m);
+					}
+				}
+			}
+		}
+	}
+	D1(vswp, "%s: exit\n", __func__);
+}
+
+/*
+ * Switch ethernet frame when in layer 3 mode (i.e. using IP
+ * layer to do the routing).
+ *
+ * There is a large amount of overlap between this function and
+ * vsw_switch_l2_frame. At some stage we need to revisit and refactor
+ * both these functions.
+ */
+void
+vsw_switch_l3_frame(vsw_t *vswp, mblk_t *mp, int caller,
+			vsw_port_t *arg, mac_resource_handle_t mrh)
+{
+	struct ether_header	*ehp;
+	vsw_port_t		*port = NULL;
+	mblk_t			*bp = NULL;
+	vsw_port_list_t		*plist = &vswp->plist;
+
+	D1(vswp, "%s: enter (caller %d)", __func__, caller);
+
+	/*
+	 * In layer 3 mode should only ever be switching packets
+	 * between IP layer and vnet devices. So make sure thats
+	 * who is invoking us.
+	 */
+	if ((caller != VSW_LOCALDEV) && (caller != VSW_VNETPORT)) {
+		DERR(vswp, "%s: unexpected caller (%d)", __func__, caller);
+		freemsgchain(mp);
+		return;
+	}
+
+	/* process the chain of packets */
+	bp = mp;
+	while (bp) {
+		/* detach the head frame from the chain before switching it */
+		mp = bp;
+		bp = bp->b_next;
+		mp->b_next = mp->b_prev = NULL;
+		ehp = (struct ether_header *)mp->b_rptr;
+
+		D2(vswp, "%s: mblk data buffer %lld : actual data size %lld",
+			__func__, MBLKSIZE(mp), MBLKL(mp));
+
+		READ_ENTER(&plist->lockrw);
+		port = vsw_lookup_fdb(vswp, ehp);
+		if (port) {
+			/*
+			 * Mark port as in-use.
+			 * The ref count lets the port-delete thread wait
+			 * (on ref_cv below) until we are done with the port,
+			 * so the port list lock can be dropped here.
+			 */
+			mutex_enter(&port->ref_lock);
+			port->ref_cnt++;
+			mutex_exit(&port->ref_lock);
+			RW_EXIT(&plist->lockrw);
+
+			D2(vswp, "%s: sending to target port", __func__);
+			(void) vsw_portsend(port, mp);
+
+			/*
+			 * Finished with port so decrement ref count and
+			 * check if should wake delete thread.
+			 */
+			mutex_enter(&port->ref_lock);
+			port->ref_cnt--;
+			if (port->ref_cnt == 0)
+				cv_signal(&port->ref_cv);
+			mutex_exit(&port->ref_lock);
+		} else {
+			RW_EXIT(&plist->lockrw);
+			/*
+			 * Destination not in FDB
+			 *
+			 * If the destination is broadcast or
+			 * multicast forward the packet to all
+			 * (VNETPORTs, PHYSDEV, LOCALDEV),
+			 * except the caller.
+			 */
+			if (IS_BROADCAST(ehp)) {
+				D2(vswp, "%s: BROADCAST pkt", __func__);
+				(void) vsw_forward_all(vswp, mp,
+								caller, arg);
+			} else if (IS_MULTICAST(ehp)) {
+				D2(vswp, "%s: MULTICAST pkt", __func__);
+				(void) vsw_forward_grp(vswp, mp,
+							caller, arg);
+			} else {
+				/*
+				 * Unicast pkt from vnet that we don't have
+				 * an FDB entry for, so must be destinded for
+				 * the outside world. Attempt to send up to the
+				 * IP layer to allow it to deal with it.
+				 */
+				if (caller == VSW_VNETPORT) {
+					READ_ENTER(&vswp->if_lockrw);
+					if (vswp->if_state & VSW_IF_UP) {
+						RW_EXIT(&vswp->if_lockrw);
+						D2(vswp, "%s: sending up",
+							__func__);
+						mac_rx(vswp->if_macp, mrh, mp);
+					} else {
+						RW_EXIT(&vswp->if_lockrw);
+						/* Interface down, drop pkt */
+						D2(vswp, "%s I/F down",
+								__func__);
+						freemsg(mp);
+					}
+				}
+			}
+		}
+	}
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * Forward the ethernet frame to all ports (VNETPORTs, PHYSDEV, LOCALDEV),
+ * except the caller (port on which frame arrived).
+ *
+ * Each destination gets its own dup/copy of the frame; the original
+ * mblk is always freed before returning. Always returns 0.
+ */
+static int
+vsw_forward_all(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
+{
+	vsw_port_list_t	*plist = &vswp->plist;
+	vsw_port_t	*portp;
+	mblk_t		*nmp = NULL;
+	mblk_t		*ret_m = NULL;
+	int		skip_port = 0;
+
+	D1(vswp, "vsw_forward_all: enter\n");
+
+	/*
+	 * Broadcast message from inside ldoms so send to outside
+	 * world if in either of layer 2 modes.
+	 */
+	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
+		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
+		((caller == VSW_LOCALDEV) || (caller == VSW_VNETPORT))) {
+
+		/* dup so the original can still go to other destinations */
+		nmp = dupmsg(mp);
+		if (nmp) {
+			/* non-NULL return is pkt(s) that could not be sent */
+			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
+				DERR(vswp, "%s: dropping pkt(s) "
+				"consisting of %ld bytes of data for"
+				" physical device", __func__, MBLKL(ret_m));
+			freemsg(ret_m);
+			}
+		}
+	}
+
+	/* only skip a port when the caller is itself a vnet port */
+	if (caller == VSW_VNETPORT)
+		skip_port = 1;
+
+	/*
+	 * Broadcast message from other vnet (layer 2 or 3) or outside
+	 * world (layer 2 only), send up stack if plumbed.
+	 */
+	if ((caller == VSW_PHYSDEV) || (caller == VSW_VNETPORT)) {
+		READ_ENTER(&vswp->if_lockrw);
+		if (vswp->if_state & VSW_IF_UP) {
+			RW_EXIT(&vswp->if_lockrw);
+			nmp = copymsg(mp);
+			if (nmp)
+				mac_rx(vswp->if_macp, vswp->if_mrh, nmp);
+		} else {
+			RW_EXIT(&vswp->if_lockrw);
+		}
+	}
+
+	/* send it to all VNETPORTs */
+	READ_ENTER(&plist->lockrw);
+	for (portp = plist->head; portp != NULL; portp = portp->p_next) {
+		D2(vswp, "vsw_forward_all: port %d", portp->p_instance);
+		/*
+		 * Caution ! - don't reorder these two checks as arg
+		 * will be NULL if the caller is PHYSDEV. skip_port is
+		 * only set if caller is VNETPORT.
+		 */
+		if ((skip_port) && (portp == arg))
+			continue;
+		else {
+			nmp = dupmsg(mp);
+			if (nmp) {
+				/* vsw_portsend consumes nmp */
+				(void) vsw_portsend(portp, nmp);
+			} else {
+				DERR(vswp, "vsw_forward_all: nmp NULL");
+			}
+		}
+	}
+	RW_EXIT(&plist->lockrw);
+
+	/* all destinations received copies; original no longer needed */
+	freemsg(mp);
+
+	D1(vswp, "vsw_forward_all: exit\n");
+	return (0);
+}
+
+/*
+ * Forward pkts to any devices or interfaces which have registered
+ * an interest in them (i.e. multicast groups).
+ *
+ * Each interested party receives its own dup/copy; the original mblk
+ * is always freed before returning. Always returns 0.
+ */
+static int
+vsw_forward_grp(vsw_t *vswp, mblk_t *mp, int caller, vsw_port_t *arg)
+{
+	struct ether_header	*ehp = (struct ether_header *)mp->b_rptr;
+	mfdb_ent_t		*entp = NULL;
+	mfdb_ent_t		*tpp = NULL;
+	vsw_port_t 		*port;
+	uint64_t		key = 0;
+	mblk_t			*nmp = NULL;
+	mblk_t			*ret_m = NULL;
+	boolean_t		check_if = B_TRUE;
+
+	/*
+	 * Convert address to hash table key
+	 */
+	KEY_HASH(key, ehp->ether_dhost);
+
+	D1(vswp, "%s: key 0x%llx", __func__, key);
+
+	/*
+	 * If pkt came from either a vnet or down the stack (if we are
+	 * plumbed) and we are in layer 2 mode, then we send the pkt out
+	 * over the physical adapter, and then check to see if any other
+	 * vnets are interested in it.
+	 */
+	if (((vswp->smode[vswp->smode_idx] == VSW_LAYER2) ||
+		(vswp->smode[vswp->smode_idx] == VSW_LAYER2_PROMISC)) &&
+		((caller == VSW_VNETPORT) || (caller == VSW_LOCALDEV))) {
+		nmp = dupmsg(mp);
+		if (nmp) {
+			/* non-NULL return is pkt(s) that could not be sent */
+			if ((ret_m = vsw_tx_msg(vswp, nmp)) != NULL) {
+				DERR(vswp, "%s: dropping pkt(s) "
+					"consisting of %ld bytes of "
+					"data for physical device",
+					__func__, MBLKL(ret_m));
+				freemsg(ret_m);
+			}
+		}
+	}
+
+	READ_ENTER(&vswp->mfdbrw);
+	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)key,
+				(mod_hash_val_t *)&entp) != 0) {
+		D3(vswp, "%s: no table entry found for addr 0x%llx",
+								__func__, key);
+	} else {
+		/*
+		 * Send to list of devices associated with this address...
+		 */
+		for (tpp = entp; tpp != NULL; tpp = tpp->nextp) {
+
+			/* dont send to ourselves */
+			if ((caller == VSW_VNETPORT) &&
+				(tpp->d_addr == (void *)arg)) {
+				port = (vsw_port_t *)tpp->d_addr;
+				D3(vswp, "%s: not sending to ourselves"
+					" : port %d", __func__,
+					port->p_instance);
+				continue;
+
+			} else if ((caller == VSW_LOCALDEV) &&
+				(tpp->d_type == VSW_LOCALDEV)) {
+				D3(vswp, "%s: not sending back up stack",
+					__func__);
+				continue;
+			}
+
+			if (tpp->d_type == VSW_VNETPORT) {
+				port = (vsw_port_t *)tpp->d_addr;
+				D3(vswp, "%s: sending to port %ld for "
+					" addr 0x%llx", __func__,
+					port->p_instance, key);
+
+				nmp = dupmsg(mp);
+				if (nmp)
+					(void) vsw_portsend(port, nmp);
+			} else {
+				/*
+				 * NOTE(review): if_state is read here without
+				 * taking if_lockrw, unlike the check_if path
+				 * below - confirm this is intentional.
+				 */
+				if (vswp->if_state & VSW_IF_UP) {
+					nmp = copymsg(mp);
+					if (nmp)
+						mac_rx(vswp->if_macp,
+							vswp->if_mrh, nmp);
+					/* already sent up - skip later check */
+					check_if = B_FALSE;
+					D3(vswp, "%s: sending up stack"
+						" for addr 0x%llx", __func__,
+						key);
+				}
+			}
+		}
+	}
+
+	RW_EXIT(&vswp->mfdbrw);
+
+	/*
+	 * If the pkt came from either a vnet or from physical device,
+	 * and if we havent already sent the pkt up the stack then we
+	 * check now if we can/should (i.e. the interface is plumbed
+	 * and in promisc mode).
+	 */
+	if ((check_if) &&
+		((caller == VSW_VNETPORT) || (caller == VSW_PHYSDEV))) {
+		READ_ENTER(&vswp->if_lockrw);
+		if (VSW_U_P(vswp->if_state)) {
+			RW_EXIT(&vswp->if_lockrw);
+			D3(vswp, "%s: (caller %d) finally sending up stack"
+				" for addr 0x%llx", __func__, caller, key);
+			nmp = copymsg(mp);
+			if (nmp)
+				mac_rx(vswp->if_macp, vswp->if_mrh, nmp);
+		} else {
+			RW_EXIT(&vswp->if_lockrw);
+		}
+	}
+
+	/* all interested parties received copies */
+	freemsg(mp);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Transmit the given packet over the first (and currently only)
+ * channel associated with the port. The transmit routine consumes
+ * the mblk. Returns 0 on success, non-zero otherwise.
+ */
+static int
+vsw_portsend(vsw_port_t *port, mblk_t *mp)
+{
+	vsw_ldc_list_t	*list = &port->p_ldclist;
+	vsw_ldc_t	*chanp;
+	int		rv = 0;
+
+	READ_ENTER(&list->lockrw);
+
+	/* only a single channel per port is supported at present */
+	chanp = list->head;
+	if (chanp == NULL) {
+		DERR(port->p_vswp, "vsw_portsend: no ldc: dropping packet\n");
+		freemsg(mp);
+		RW_EXIT(&list->lockrw);
+		return (1);
+	}
+
+	/*
+	 * Hand the packet to the port's transmit routine (if any),
+	 * which frees the mblk when done; otherwise drop it here.
+	 */
+	mutex_enter(&port->tx_lock);
+	if (port->transmit == NULL)
+		freemsg(mp);
+	else
+		rv = (*port->transmit)(chanp, mp);
+	mutex_exit(&port->tx_lock);
+
+	RW_EXIT(&list->lockrw);
+
+	return (rv);
+}
+
+/*
+ * Send packet out via descriptor ring to a logical device.
+ *
+ * Consumes mp in all cases. Returns LDC_TX_SUCCESS on success,
+ * LDC_TX_FAILURE or LDC_TX_NORESOURCES otherwise.
+ */
+static int
+vsw_dringsend(vsw_ldc_t *ldcp, mblk_t *mp)
+{
+	vio_dring_msg_t		dring_pkt;
+	dring_info_t		*dp = NULL;
+	vsw_private_desc_t	*priv_desc = NULL;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	mblk_t			*bp;
+	size_t			n, size;
+	caddr_t			bufp;
+	int			idx;
+	int			status = LDC_TX_SUCCESS;
+
+	D1(vswp, "%s(%lld): enter\n", __func__, ldcp->ldc_id);
+
+	/* TODO: make test a macro */
+	/* channel must be up with an active outbound lane to transmit */
+	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
+		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
+		DWARN(vswp, "%s(%lld) status(%d) lstate(0x%llx), dropping "
+			"packet\n", __func__, ldcp->ldc_id, ldcp->ldc_status,
+			ldcp->lane_out.lstate);
+		freemsg(mp);
+		return (LDC_TX_FAILURE);
+	}
+
+	/*
+	 * Note - using first ring only, this may change
+	 * in the future.
+	 */
+	if ((dp = ldcp->lane_out.dringp) == NULL) {
+		DERR(vswp, "%s(%lld): no dring for outbound lane on"
+			" channel %d", __func__, ldcp->ldc_id, ldcp->ldc_id);
+		freemsg(mp);
+		return (LDC_TX_FAILURE);
+	}
+
+	/* dlock held until after the dring msg is sent (see seq_num note) */
+	mutex_enter(&dp->dlock);
+
+	/* frames larger than ETHERMAX cannot be queued */
+	size = msgsize(mp);
+	if (size > (size_t)ETHERMAX) {
+		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
+		    ldcp->ldc_id, size);
+		status = LDC_TX_FAILURE;
+		goto vsw_dringsend_free_exit;
+	}
+
+	/*
+	 * Find a free descriptor
+	 *
+	 * Note: for the moment we are assuming that we will only
+	 * have one dring going from the switch to each of its
+	 * peers. This may change in the future.
+	 */
+	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
+		DERR(vswp, "%s(%lld): no descriptor available for ring "
+			"at 0x%llx", __func__, ldcp->ldc_id, dp);
+
+		/* nothing more we can do */
+		status = LDC_TX_NORESOURCES;
+		goto vsw_dringsend_free_exit;
+	} else {
+		D2(vswp, "%s(%lld): free private descriptor found at pos "
+			"%ld addr 0x%llx\n", __func__, ldcp->ldc_id, idx,
+			priv_desc);
+	}
+
+	/* copy data into the descriptor */
+	/* flatten the mblk chain into the descriptor's data buffer */
+	bufp = priv_desc->datap;
+	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
+		n = MBLKL(bp);
+		bcopy(bp->b_rptr, bufp, n);
+		bufp += n;
+	}
+
+	/* pad short frames up to the ethernet minimum length */
+	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
+	priv_desc->dstate = VIO_DESC_READY;
+
+	/*
+	 * Copy relevant sections of private descriptor
+	 * to public section
+	 */
+	vsw_dring_priv2pub(priv_desc);
+
+	/*
+	 * Send a vio_dring_msg to peer to prompt them to read
+	 * the updated descriptor ring.
+	 */
+	dring_pkt.tag.vio_msgtype = VIO_TYPE_DATA;
+	dring_pkt.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	dring_pkt.tag.vio_subtype_env = VIO_DRING_DATA;
+	dring_pkt.tag.vio_sid = ldcp->local_session;
+
+	/* Note - for now using first ring */
+	dring_pkt.dring_ident = dp->ident;
+
+	/*
+	 * Access to the seq_num is implicitly protected by the
+	 * fact that we have only one dring associated with the
+	 * lane currently and we hold the associated dring lock.
+	 */
+	dring_pkt.seq_num = ldcp->lane_out.seq_num++;
+
+	/* Note - only updating single descrip at time at the moment */
+	dring_pkt.start_idx = idx;
+	dring_pkt.end_idx = idx;
+
+	D3(vswp, "%s(%lld): dring 0x%llx : ident 0x%llx\n", __func__,
+		ldcp->ldc_id, dp, dring_pkt.dring_ident);
+	D3(vswp, "%s(%lld): start %lld : end %lld : seq %lld\n", __func__,
+		ldcp->ldc_id, dring_pkt.start_idx, dring_pkt.end_idx,
+		dring_pkt.seq_num);
+
+	vsw_send_msg(ldcp, (void *)&dring_pkt, sizeof (vio_dring_msg_t));
+
+vsw_dringsend_free_exit:
+
+	mutex_exit(&dp->dlock);
+
+	/* free the message block */
+	/* data (if any) was copied into the descriptor above */
+	freemsg(mp);
+
+	D1(vswp, "%s(%lld): exit\n", __func__, ldcp->ldc_id);
+	return (status);
+}
+
+/*
+ * Send an in-band descriptor message over ldc.
+ *
+ * Consumes mp in all cases. Returns LDC_TX_SUCCESS on success,
+ * LDC_TX_FAILURE or LDC_TX_NORESOURCES otherwise.
+ */
+static int
+vsw_descrsend(vsw_ldc_t *ldcp, mblk_t *mp)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	vio_ibnd_desc_t		ibnd_msg;
+	vsw_private_desc_t	*priv_desc = NULL;
+	dring_info_t		*dp = NULL;
+	size_t			n, size = 0;
+	caddr_t			bufp;
+	mblk_t			*bp;
+	int			idx, i;
+	int			status = LDC_TX_SUCCESS;
+	/* rate-limits the "no descriptor" error message (see below) */
+	static int		warn_msg = 1;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	ASSERT(mp != NULL);
+
+	/* channel must be up with an active outbound lane to transmit */
+	if ((!(ldcp->lane_out.lstate & VSW_LANE_ACTIVE)) ||
+		(ldcp->ldc_status != LDC_UP) || (ldcp->ldc_handle == NULL)) {
+		DERR(vswp, "%s(%lld) status(%d) state (0x%llx), dropping pkt",
+			__func__, ldcp->ldc_id, ldcp->ldc_status,
+			ldcp->lane_out.lstate);
+		freemsg(mp);
+		return (LDC_TX_FAILURE);
+	}
+
+	/*
+	 * only expect single dring to exist, which we use
+	 * as an internal buffer, rather than a transfer channel.
+	 */
+	if ((dp = ldcp->lane_out.dringp) == NULL) {
+		DERR(vswp, "%s(%lld): no dring for outbound lane",
+			__func__, ldcp->ldc_id);
+		DERR(vswp, "%s(%lld) status(%d) state (0x%llx)",
+			__func__, ldcp->ldc_id, ldcp->ldc_status,
+			ldcp->lane_out.lstate);
+		freemsg(mp);
+		return (LDC_TX_FAILURE);
+	}
+
+	mutex_enter(&dp->dlock);
+
+	/* frames larger than ETHERMAX cannot be queued */
+	size = msgsize(mp);
+	if (size > (size_t)ETHERMAX) {
+		DERR(vswp, "%s(%lld) invalid size (%ld)\n", __func__,
+		    ldcp->ldc_id, size);
+		status = LDC_TX_FAILURE;
+		goto vsw_descrsend_free_exit;
+	}
+
+	/*
+	 * Find a free descriptor in our buffer ring
+	 */
+	if (vsw_dring_find_free_desc(dp, &priv_desc, &idx) != 0) {
+		/* warn once, then stay quiet until a send succeeds again */
+		if (warn_msg) {
+			DERR(vswp, "%s(%lld): no descriptor available for ring "
+			"at 0x%llx", __func__, ldcp->ldc_id, dp);
+			warn_msg = 0;
+		}
+
+		/* nothing more we can do */
+		status = LDC_TX_NORESOURCES;
+		goto vsw_descrsend_free_exit;
+	} else {
+		D2(vswp, "%s(%lld): free private descriptor found at pos "
+			"%ld addr 0x%x\n", __func__, ldcp->ldc_id, idx,
+			priv_desc);
+		warn_msg = 1;
+	}
+
+	/* copy data into the descriptor */
+	/* flatten the mblk chain into the descriptor's data buffer */
+	bufp = priv_desc->datap;
+	for (bp = mp, n = 0; bp != NULL; bp = bp->b_cont) {
+		n = MBLKL(bp);
+		bcopy(bp->b_rptr, bufp, n);
+		bufp += n;
+	}
+
+	/* pad short frames up to the ethernet minimum length */
+	priv_desc->datalen = (size < (size_t)ETHERMIN) ? ETHERMIN : size;
+	priv_desc->dstate = VIO_DESC_READY;
+
+	/* create and send the in-band descp msg */
+	ibnd_msg.hdr.tag.vio_msgtype = VIO_TYPE_DATA;
+	ibnd_msg.hdr.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	ibnd_msg.hdr.tag.vio_subtype_env = VIO_DESC_DATA;
+	ibnd_msg.hdr.tag.vio_sid = ldcp->local_session;
+
+	/*
+	 * Access to the seq_num is implicitly protected by the
+	 * fact that we have only one dring associated with the
+	 * lane currently and we hold the associated dring lock.
+	 */
+	ibnd_msg.hdr.seq_num = ldcp->lane_out.seq_num++;
+
+	/*
+	 * Copy the mem cookies describing the data from the
+	 * private region of the descriptor ring into the inband
+	 * descriptor.
+	 */
+	for (i = 0; i < priv_desc->ncookies; i++) {
+		bcopy(&priv_desc->memcookie[i], &ibnd_msg.memcookie[i],
+			sizeof (ldc_mem_cookie_t));
+	}
+
+	/* descriptor index doubles as its handle */
+	ibnd_msg.hdr.desc_handle = idx;
+	ibnd_msg.ncookies = priv_desc->ncookies;
+	ibnd_msg.nbytes = size;
+
+	vsw_send_msg(ldcp, (void *)&ibnd_msg, sizeof (vio_ibnd_desc_t));
+
+vsw_descrsend_free_exit:
+
+	mutex_exit(&dp->dlock);
+
+	/* free the allocated message blocks */
+	freemsg(mp);
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+	return (status);
+}
+
+/*
+ * Send a version negotiation message (our highest supported version)
+ * to the peer over the given channel, recording what was offered in
+ * the outbound lane state so the ack/nack can be matched up.
+ */
+static void
+vsw_send_ver(vsw_ldc_t *ldcp)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	lane_t		*outp = &ldcp->lane_out;
+	vio_ver_msg_t	msg;
+
+	D1(vswp, "%s enter", __func__);
+
+	/* control envelope */
+	msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+	msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	msg.tag.vio_subtype_env = VIO_VER_INFO;
+	msg.tag.vio_sid = ldcp->local_session;
+
+	/* offer our preferred (first) version, as a network switch */
+	msg.ver_major = vsw_versions[0].ver_major;
+	msg.ver_minor = vsw_versions[0].ver_minor;
+	msg.dev_class = VDEV_NETWORK_SWITCH;
+
+	/* remember what we offered */
+	outp->lstate |= VSW_VER_INFO_SENT;
+	outp->ver_major = msg.ver_major;
+	outp->ver_minor = msg.ver_minor;
+
+	DUMP_TAG(msg.tag);
+
+	vsw_send_msg(ldcp, &msg, sizeof (vio_ver_msg_t));
+
+	D1(vswp, "%s (%d): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Send an attribute info message (mtu, address type, transfer mode,
+ * ack frequency and our MAC address) for the outbound lane to the peer.
+ */
+static void
+vsw_send_attr(vsw_ldc_t *ldcp)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	lane_t			*lp = &ldcp->lane_out;
+	vnet_attr_msg_t		attr_msg;
+
+	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
+
+	/*
+	 * Subtype is set to INFO by default
+	 */
+	attr_msg.tag.vio_msgtype = VIO_TYPE_CTRL;
+	attr_msg.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	attr_msg.tag.vio_subtype_env = VIO_ATTR_INFO;
+	attr_msg.tag.vio_sid = ldcp->local_session;
+
+	/* payload copied from default settings for lane */
+	attr_msg.mtu = lp->mtu;
+	attr_msg.addr_type = lp->addr_type;
+	attr_msg.xfer_mode = lp->xfer_mode;
+	/*
+	 * NOTE(review): ack_freq is copied from lp->xfer_mode, which
+	 * looks like a copy/paste error - presumably it should come
+	 * from the lane's ack frequency. Left unchanged pending
+	 * confirmation that lane_t carries such a field.
+	 */
+	attr_msg.ack_freq = lp->xfer_mode;
+
+	/* our MAC address, read under the interface lock */
+	READ_ENTER(&vswp->if_lockrw);
+	bcopy(&(vswp->if_addr), &(attr_msg.addr), ETHERADDRL);
+	RW_EXIT(&vswp->if_lockrw);
+
+	ldcp->lane_out.lstate |= VSW_ATTR_INFO_SENT;
+
+	DUMP_TAG(attr_msg.tag);
+
+	vsw_send_msg(ldcp, &attr_msg, sizeof (vnet_attr_msg_t));
+
+	/* fixed: exit trace previously said "enter" */
+	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Create dring info msg (which also results in the creation of
+ * a dring).
+ *
+ * Returns a kmem_zalloc'd message (caller must free) or NULL if
+ * the dring could not be created.
+ */
+static vio_dring_reg_msg_t *
+vsw_create_dring_info_pkt(vsw_ldc_t *ldcp)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	dring_info_t		*dringp;
+	vio_dring_reg_msg_t	*msgp;
+
+	D1(vswp, "vsw_create_dring_info_pkt enter\n");
+
+	/*
+	 * If we can't create a dring, obviously no point sending
+	 * a message.
+	 */
+	dringp = vsw_create_dring(ldcp);
+	if (dringp == NULL)
+		return (NULL);
+
+	msgp = kmem_zalloc(sizeof (vio_dring_reg_msg_t), KM_SLEEP);
+
+	/* control envelope */
+	msgp->tag.vio_msgtype = VIO_TYPE_CTRL;
+	msgp->tag.vio_subtype = VIO_SUBTYPE_INFO;
+	msgp->tag.vio_subtype_env = VIO_DRING_REG;
+	msgp->tag.vio_sid = ldcp->local_session;
+
+	/* payload describing the ring just created */
+	msgp->num_descriptors = dringp->num_descriptors;
+	msgp->descriptor_size = dringp->descriptor_size;
+	msgp->options = dringp->options;
+	msgp->ncookies = dringp->ncookies;
+	bcopy(&dringp->cookie[0], &msgp->cookie[0],
+		sizeof (ldc_mem_cookie_t));
+
+	msgp->dring_ident = 0;
+
+	D1(vswp, "vsw_create_dring_info_pkt exit\n");
+
+	return (msgp);
+}
+
+/*
+ * Build and send a dring registration message to the peer, then
+ * free the message.
+ */
+static void
+vsw_send_dring_info(vsw_ldc_t *ldcp)
+{
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	vio_dring_reg_msg_t	*msgp;
+
+	D1(vswp, "%s: (%ld) enter", __func__, ldcp->ldc_id);
+
+	msgp = vsw_create_dring_info_pkt(ldcp);
+	if (msgp == NULL) {
+		cmn_err(CE_WARN, "vsw_send_dring_info: error creating msg");
+		return;
+	}
+
+	/* record that registration info is outstanding */
+	ldcp->lane_out.lstate |= VSW_DRING_INFO_SENT;
+
+	DUMP_TAG_PTR((vio_msg_tag_t *)msgp);
+
+	vsw_send_msg(ldcp, msgp, sizeof (vio_dring_reg_msg_t));
+
+	kmem_free(msgp, sizeof (vio_dring_reg_msg_t));
+
+	D1(vswp, "%s: (%ld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Send an RDX control message to the peer and note it in the
+ * outbound lane state.
+ */
+static void
+vsw_send_rdx(vsw_ldc_t *ldcp)
+{
+	vsw_t		*vswp = ldcp->ldc_vswp;
+	vio_rdx_msg_t	rdx;
+
+	D1(vswp, "%s (%ld) enter", __func__, ldcp->ldc_id);
+
+	/* control envelope - RDX carries no payload */
+	rdx.tag.vio_sid = ldcp->local_session;
+	rdx.tag.vio_msgtype = VIO_TYPE_CTRL;
+	rdx.tag.vio_subtype = VIO_SUBTYPE_INFO;
+	rdx.tag.vio_subtype_env = VIO_RDX;
+
+	ldcp->lane_out.lstate |= VSW_RDX_INFO_SENT;
+
+	DUMP_TAG(rdx.tag);
+
+	vsw_send_msg(ldcp, &rdx, sizeof (vio_rdx_msg_t));
+
+	D1(vswp, "%s (%ld) exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Generic routine to send message out over ldc channel.
+ *
+ * The write is retried (up to vsw_wretries attempts) while the
+ * channel reports EWOULDBLOCK. Other failures, or a short write,
+ * are logged but not reported to the caller.
+ *
+ * Fix: the retry loop previously decremented the global
+ * vsw_wretries itself, permanently consuming the retry budget
+ * across calls (and racing between channels). Use a local copy.
+ */
+static void
+vsw_send_msg(vsw_ldc_t *ldcp, void *msgp, int size)
+{
+	int		rv;
+	int		retries = vsw_wretries;
+	size_t		msglen = size;
+	vio_msg_tag_t	*tag = (vio_msg_tag_t *)msgp;
+	vsw_t		*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "vsw_send_msg (%lld) enter : sending %d bytes",
+			ldcp->ldc_id, size);
+
+	D2(vswp, "send_msg: type 0x%llx", tag->vio_msgtype);
+	D2(vswp, "send_msg: stype 0x%llx", tag->vio_subtype);
+	D2(vswp, "send_msg: senv 0x%llx", tag->vio_subtype_env);
+
+	mutex_enter(&ldcp->ldc_txlock);
+	do {
+		/* ldc_write() updates msglen, so reset it each attempt */
+		msglen = size;
+		rv = ldc_write(ldcp->ldc_handle, (caddr_t)msgp, &msglen);
+	} while (rv == EWOULDBLOCK && --retries > 0);
+
+	mutex_exit(&ldcp->ldc_txlock);
+
+	if ((rv != 0) || (msglen != size)) {
+		DERR(vswp, "vsw_send_msg:ldc_write failed: chan(%lld) "
+			"rv(%d) size (%d) msglen(%d)\n", ldcp->ldc_id,
+			rv, size, msglen);
+	}
+
+	D1(vswp, "vsw_send_msg (%lld) exit : sent %d bytes",
+			ldcp->ldc_id, msglen);
+}
+
+/*
+ * Add an entry into FDB, for the given mac address and port_id.
+ * Returns 0 on success, 1 on failure.
+ *
+ * Lock protecting FDB must be held by calling process.
+ */
+static int
+vsw_add_fdb(vsw_t *vswp, vsw_port_t *port)
+{
+	uint64_t	key = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/* fold the port's MAC address into the hash key */
+	KEY_HASH(key, port->p_macaddr);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, key);
+
+	/*
+	 * mod_hash rejects duplicate keys, so re-adding the same
+	 * address fails here rather than overwriting the entry.
+	 */
+	if (mod_hash_insert(vswp->fdb, (mod_hash_key_t)key,
+				(mod_hash_val_t)port) != 0) {
+		DERR(vswp, "%s: unable to add entry into fdb.", __func__);
+		return (1);
+	}
+
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Remove an entry from FDB.
+ * Returns 0 on success, 1 on failure.
+ *
+ * Note: mod_hash_destroy() is called unconditionally, so a missing
+ * entry is silently ignored and 0 is still returned.
+ */
+static int
+vsw_del_fdb(vsw_t *vswp, vsw_port_t *port)
+{
+	uint64_t	addr = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/* fold the port's MAC address into the hash key */
+	KEY_HASH(addr, port->p_macaddr);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, addr);
+
+	/* fixed: key was previously cast to mod_hash_val_t */
+	(void) mod_hash_destroy(vswp->fdb, (mod_hash_key_t)addr);
+
+	/* fixed: exit trace previously said "enter" */
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Search fdb for a given mac address.
+ * Returns pointer to the entry if found, else returns NULL.
+ */
+static vsw_port_t *
+vsw_lookup_fdb(vsw_t *vswp, struct ether_header *ehp)
+{
+	vsw_port_t	*portp = NULL;
+	uint64_t	addr = 0;
+
+	D1(vswp, "%s: enter", __func__);
+
+	/* hash the destination MAC to form the table key */
+	KEY_HASH(addr, ehp->ether_dhost);
+
+	D2(vswp, "%s: key = 0x%llx", __func__, addr);
+
+	if (mod_hash_find(vswp->fdb, (mod_hash_key_t)addr,
+				(mod_hash_val_t *)&portp) != 0)
+		return (NULL);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (portp);
+}
+
+/*
+ * Add or remove multicast address(es).
+ *
+ * Returns 0 on success, 1 on failure.
+ *
+ * Note: processing stops at the first failing address; any
+ * remaining addresses in the packet are not processed.
+ */
+static int
+vsw_add_rem_mcst(vnet_mcast_msg_t *mcst_pkt, vsw_port_t *port)
+{
+	mcst_addr_t		*mcst_p = NULL;
+	vsw_t			*vswp = port->p_vswp;
+	uint64_t		addr = 0x0;
+	int			i;
+
+	D1(vswp, "%s: enter", __func__);
+
+	D2(vswp, "%s: %d addresses", __func__, mcst_pkt->count);
+
+	for (i = 0; i < mcst_pkt->count; i++) {
+		/*
+		 * Convert address into form that can be used
+		 * as hash table key.
+		 */
+		KEY_HASH(addr, mcst_pkt->mca[i]);
+
+		/*
+		 * Add or delete the specified address/port combination.
+		 */
+		if (mcst_pkt->set == 0x1) {
+			D3(vswp, "%s: adding multicast address 0x%llx for "
+				"port %ld", __func__, addr, port->p_instance);
+			if (vsw_add_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
+				/*
+				 * Update the list of multicast
+				 * addresses contained within the
+				 * port structure to include this new
+				 * one.
+				 */
+				mcst_p = kmem_alloc(sizeof (mcst_addr_t),
+								KM_NOSLEEP);
+				if (mcst_p == NULL) {
+					/*
+					 * NOTE(review): at this point the
+					 * address has already been added to
+					 * the mFDB by vsw_add_mcst() but is
+					 * not on the port's list - confirm
+					 * whether cleanup is needed here.
+					 */
+					DERR(vswp, "%s: unable to alloc mem",
+						__func__);
+					return (1);
+				}
+
+				mcst_p->nextp = NULL;
+				mcst_p->addr = addr;
+
+				/* link new entry at head of port's list */
+				mutex_enter(&port->mca_lock);
+				mcst_p->nextp = port->mcap;
+				port->mcap = mcst_p;
+				mutex_exit(&port->mca_lock);
+
+				/*
+				 * Program the address into HW. If the addr
+				 * has already been programmed then the MAC
+				 * just increments a ref counter (which is
+				 * used when the address is being deleted)
+				 *
+				 * Note:
+				 * For the moment we dont care if this
+				 * succeeds because the card must be in
+				 * promics mode. When we have the ability
+				 * to program multiple unicst address into
+				 * the card then we will need to check this
+				 * return value.
+				 */
+				if (vswp->mh != NULL)
+					(void) mac_multicst_add(vswp->mh,
+						(uchar_t *)&mcst_pkt->mca[i]);
+
+			} else {
+				DERR(vswp, "%s: error adding multicast "
+					"address 0x%llx for port %ld",
+					__func__, addr, port->p_instance);
+				return (1);
+			}
+		} else {
+			/*
+			 * Delete an entry from the multicast hash
+			 * table and update the address list
+			 * appropriately.
+			 */
+			if (vsw_del_mcst(vswp, VSW_VNETPORT, addr, port) == 0) {
+				D3(vswp, "%s: deleting multicast address "
+					"0x%llx for port %ld", __func__, addr,
+					port->p_instance);
+
+				/* drop it from the port's own list too */
+				vsw_del_addr(VSW_VNETPORT, port, addr);
+
+				/*
+				 * Remove the address from HW. The address
+				 * will actually only be removed once the ref
+				 * count within the MAC layer has dropped to
+				 * zero. I.e. we can safely call this fn even
+				 * if other ports are interested in this
+				 * address.
+				 */
+				if (vswp->mh != NULL)
+					(void) mac_multicst_remove(vswp->mh,
+						(uchar_t *)&mcst_pkt->mca[i]);
+
+			} else {
+				DERR(vswp, "%s: error deleting multicast "
+					"addr 0x%llx for port %ld",
+					__func__, addr, port->p_instance);
+				return (1);
+			}
+		}
+	}
+	D1(vswp, "%s: exit", __func__);
+	return (0);
+}
+
+/*
+ * Add a new multicast entry.
+ *
+ * Search hash table based on address. If match found then
+ * update associated val (which is chain of ports), otherwise
+ * create new key/val (addr/port) pair and insert into table.
+ *
+ * Returns 0 on success, 1 on failure (hash insert failed, or the
+ * device/port was already registered for this address).
+ */
+static int
+vsw_add_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
+{
+	int		dup = 0;
+	int		rv = 0;
+	mfdb_ent_t	*ment = NULL;
+	mfdb_ent_t	*tmp_ent = NULL;
+	mfdb_ent_t	*new_ent = NULL;
+	void		*tgt = NULL;
+
+	if (devtype == VSW_VNETPORT) {
+		/*
+		 * Being invoked from a vnet.
+		 */
+		ASSERT(arg != NULL);
+		tgt = arg;
+		D2(NULL, "%s: port %d : address 0x%llx", __func__,
+			((vsw_port_t *)arg)->p_instance, addr);
+	} else {
+		/*
+		 * We are being invoked via the m_multicst mac entry
+		 * point.
+		 */
+		D2(NULL, "%s: address 0x%llx", __func__, addr);
+		tgt = (void *)vswp;
+	}
+
+	WRITE_ENTER(&vswp->mfdbrw);
+	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
+				(mod_hash_val_t *)&ment) != 0) {
+
+		/* address not currently in table */
+		ment = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
+		ment->d_addr = (void *)tgt;
+		ment->d_type = devtype;
+		ment->nextp = NULL;
+
+		if (mod_hash_insert(vswp->mfdb, (mod_hash_key_t)addr,
+			(mod_hash_val_t)ment) != 0) {
+			DERR(vswp, "%s: hash table insertion failed", __func__);
+			kmem_free(ment, sizeof (mfdb_ent_t));
+			rv = 1;
+		} else {
+			D2(vswp, "%s: added initial entry for 0x%llx to "
+				"table", __func__, addr);
+		}
+	} else {
+		/*
+		 * Address in table. Check to see if specified port
+		 * is already associated with the address. If not add
+		 * it now.
+		 */
+		tmp_ent = ment;
+		while (tmp_ent != NULL) {
+			if (tmp_ent->d_addr == (void *)tgt) {
+				if (devtype == VSW_VNETPORT) {
+					DERR(vswp, "%s: duplicate port entry "
+						"found for portid %ld and key "
+						"0x%llx", __func__,
+						((vsw_port_t *)arg)->p_instance,
+						addr);
+				} else {
+					DERR(vswp, "%s: duplicate entry found"
+						"for key 0x%llx",
+						__func__, addr);
+				}
+				/* duplicates are reported as failure */
+				rv = 1;
+				dup = 1;
+				break;
+			}
+			tmp_ent = tmp_ent->nextp;
+		}
+
+		/*
+		 * Port not on list so add it to end now.
+		 */
+		if (0 == dup) {
+			D2(vswp, "%s: added entry for 0x%llx to table",
+				__func__, addr);
+			new_ent = kmem_alloc(sizeof (mfdb_ent_t), KM_SLEEP);
+			new_ent->d_addr = (void *)tgt;
+			new_ent->d_type = devtype;
+			new_ent->nextp = NULL;
+
+			/* walk to the tail of the chain and append */
+			tmp_ent = ment;
+			while (tmp_ent->nextp != NULL)
+				tmp_ent = tmp_ent->nextp;
+
+			tmp_ent->nextp = new_ent;
+		}
+	}
+
+	RW_EXIT(&vswp->mfdbrw);
+	return (rv);
+}
+
+/*
+ * Remove a multicast entry from the hashtable.
+ *
+ * Search hash table based on address. If match found, scan
+ * list of ports associated with address. If specified port
+ * found remove it from list.
+ *
+ * Returns 0 on success (including when the target was not on the
+ * chain), 1 if the address is not in the table at all.
+ */
+static int
+vsw_del_mcst(vsw_t *vswp, uint8_t devtype, uint64_t addr, void *arg)
+{
+	mfdb_ent_t	*ment = NULL;
+	mfdb_ent_t	*curr_p, *prev_p;
+	void		*tgt = NULL;
+
+	D1(vswp, "%s: enter", __func__);
+
+	if (devtype == VSW_VNETPORT) {
+		tgt = (vsw_port_t *)arg;
+		D2(vswp, "%s: removing port %d from mFDB for address"
+			" 0x%llx", __func__, ((vsw_port_t *)tgt)->p_instance,
+			addr);
+	} else {
+		D2(vswp, "%s: removing entry", __func__);
+		tgt = (void *)vswp;
+	}
+
+	WRITE_ENTER(&vswp->mfdbrw);
+	if (mod_hash_find(vswp->mfdb, (mod_hash_key_t)addr,
+				(mod_hash_val_t *)&ment) != 0) {
+		D2(vswp, "%s: address 0x%llx not in table", __func__, addr);
+		RW_EXIT(&vswp->mfdbrw);
+		return (1);
+	}
+
+	prev_p = curr_p = ment;
+
+	while (curr_p != NULL) {
+		if (curr_p->d_addr == (void *)tgt) {
+			if (devtype == VSW_VNETPORT) {
+				D2(vswp, "%s: port %d found", __func__,
+					((vsw_port_t *)tgt)->p_instance);
+			} else {
+				D2(vswp, "%s: instance found", __func__);
+			}
+
+			if (prev_p == curr_p) {
+				/*
+				 * head of list, if no other element is in
+				 * list then destroy this entry, otherwise
+				 * just replace it with updated value.
+				 */
+				ment = curr_p->nextp;
+				kmem_free(curr_p, sizeof (mfdb_ent_t));
+				if (ment == NULL) {
+					/*
+					 * NOTE(review): addr is cast to
+					 * mod_hash_val_t here although it is
+					 * the key - should presumably be
+					 * mod_hash_key_t; confirm.
+					 */
+					(void) mod_hash_destroy(vswp->mfdb,
+							(mod_hash_val_t)addr);
+				} else {
+					(void) mod_hash_replace(vswp->mfdb,
+							(mod_hash_key_t)addr,
+							(mod_hash_val_t)ment);
+				}
+			} else {
+				/*
+				 * Not head of list, no need to do
+				 * replacement, just adjust list pointers.
+				 */
+				prev_p->nextp = curr_p->nextp;
+				kmem_free(curr_p, sizeof (mfdb_ent_t));
+			}
+			break;
+		}
+
+		prev_p = curr_p;
+		curr_p = curr_p->nextp;
+	}
+
+	RW_EXIT(&vswp->mfdbrw);
+
+	D1(vswp, "%s: exit", __func__);
+
+	return (0);
+}
+
+/*
+ * Port is being deleted, but has registered an interest in one
+ * or more multicast groups. Using the list of addresses maintained
+ * within the port structure find the appropriate entry in the hash
+ * table and remove this port from the list of interested ports.
+ */
+static void
+vsw_del_mcst_port(vsw_port_t *port)
+{
+	vsw_t		*vswp = port->p_vswp;
+	mcst_addr_t	*entry;
+
+	D1(vswp, "%s: enter", __func__);
+
+	mutex_enter(&port->mca_lock);
+	/* drain the port's list, dropping each address from the mFDB */
+	for (entry = port->mcap; entry != NULL; entry = port->mcap) {
+		(void) vsw_del_mcst(vswp, VSW_VNETPORT, entry->addr, port);
+		port->mcap = entry->nextp;
+		kmem_free(entry, sizeof (mcst_addr_t));
+	}
+	mutex_exit(&port->mca_lock);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+/*
+ * This vsw instance is detaching, but has registered an interest in one
+ * or more multicast groups. Using the list of addresses maintained
+ * within the vsw structure find the appropriate entry in the hash
+ * table and remove this instance from the list of interested ports.
+ */
+static void
+vsw_del_mcst_vsw(vsw_t *vswp)
+{
+	mcst_addr_t	*entry;
+
+	D1(vswp, "%s: enter", __func__);
+
+	mutex_enter(&vswp->mca_lock);
+
+	/* drain the instance list, dropping each address from the mFDB */
+	for (entry = vswp->mcap; entry != NULL; entry = vswp->mcap) {
+		DERR(vswp, "%s: deleting addr 0x%llx",
+			__func__, entry->addr);
+		(void) vsw_del_mcst(vswp, VSW_LOCALDEV, entry->addr, NULL);
+
+		vswp->mcap = entry->nextp;
+		kmem_free(entry, sizeof (mcst_addr_t));
+	}
+
+	vswp->mcap = NULL;
+	mutex_exit(&vswp->mca_lock);
+
+	D1(vswp, "%s: exit", __func__);
+}
+
+
+/*
+ * Remove the specified address from the list of address maintained
+ * in this port node.
+ */
+static void
+vsw_del_addr(uint8_t devtype, void *arg, uint64_t addr)
+{
+	vsw_t		*vswp = NULL;
+	vsw_port_t	*port = NULL;
+	mcst_addr_t	*prev_p = NULL;
+	mcst_addr_t	*curr_p = NULL;
+
+	D1(NULL, "%s: enter : devtype %d : addr 0x%llx",
+		__func__, devtype, addr);
+
+	if (devtype == VSW_VNETPORT) {
+		port = (vsw_port_t *)arg;
+		mutex_enter(&port->mca_lock);
+		prev_p = curr_p = port->mcap;
+	} else {
+		vswp = (vsw_t *)arg;
+		mutex_enter(&vswp->mca_lock);
+		prev_p = curr_p = vswp->mcap;
+	}
+
+	while (curr_p != NULL) {
+		if (curr_p->addr == addr) {
+			D2(NULL, "%s: address found", __func__);
+			/* match found */
+			if (prev_p == curr_p) {
+				/* list head */
+				if (devtype == VSW_VNETPORT)
+					port->mcap = curr_p->nextp;
+				else
+					vswp->mcap = curr_p->nextp;
+			} else {
+				prev_p->nextp = curr_p->nextp;
+			}
+			kmem_free(curr_p, sizeof (mcst_addr_t));
+			break;
+		} else {
+			prev_p = curr_p;
+			curr_p = curr_p->nextp;
+		}
+	}
+
+	if (devtype == VSW_VNETPORT)
+		mutex_exit(&port->mca_lock);
+	else
+		mutex_exit(&vswp->mca_lock);
+
+	D1(NULL, "%s: exit", __func__);
+}
+
+/*
+ * Creates a descriptor ring (dring) and links it into the
+ * link of outbound drings for this channel.
+ *
+ * Returns NULL if creation failed.
+ */
+static dring_info_t *
+vsw_create_dring(vsw_ldc_t *ldcp)
+{
+	vsw_private_desc_t	*priv_addr = NULL;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	ldc_mem_info_t		minfo;
+	dring_info_t		*dp, *tp;
+	int			i;
+
+	dp = (dring_info_t *)kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
+
+	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
+
+	/* create public section of ring */
+	if ((ldc_mem_dring_create(VSW_RING_NUM_EL,
+			VSW_PUB_SIZE, &dp->handle)) != 0) {
+
+		DERR(vswp, "vsw_create_dring(%lld): ldc dring create "
+			"failed", ldcp->ldc_id);
+		goto create_fail_exit;
+	}
+
+	ASSERT(dp->handle != NULL);
+
+	/*
+	 * Get the base address of the public section of the ring.
+	 */
+	if ((ldc_mem_dring_info(dp->handle, &minfo)) != 0) {
+		DERR(vswp, "vsw_create_dring(%lld): dring info failed\n",
+			ldcp->ldc_id);
+		goto dring_fail_exit;
+	} else {
+		ASSERT(minfo.vaddr != 0);
+		dp->pub_addr = minfo.vaddr;
+	}
+
+	dp->num_descriptors = VSW_RING_NUM_EL;
+	dp->descriptor_size = VSW_PUB_SIZE;
+	dp->options = VIO_TX_DRING;
+	dp->ncookies = 1;	/* guaranteed by ldc */
+
+	/*
+	 * create private portion of ring
+	 */
+	dp->priv_addr = (vsw_private_desc_t *)kmem_zalloc(
+		(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL), KM_SLEEP);
+
+	if (vsw_setup_ring(ldcp, dp)) {
+		DERR(vswp, "%s: unable to setup ring", __func__);
+		goto dring_fail_exit;
+	}
+
+	/* haven't used any descriptors yet */
+	dp->end_idx = 0;
+
+	/* bind dring to the channel */
+	if ((ldc_mem_dring_bind(ldcp->ldc_handle, dp->handle,
+		LDC_SHADOW_MAP, LDC_MEM_RW,
+		&dp->cookie[0], &dp->ncookies)) != 0) {
+		DERR(vswp, "vsw_create_dring: unable to bind to channel "
+			"%lld", ldcp->ldc_id);
+		goto dring_fail_exit;
+	}
+
+	/*
+	 * Only ever create rings for outgoing lane. Link it onto
+	 * end of list.
+	 */
+	if (ldcp->lane_out.dringp == NULL) {
+		D2(vswp, "vsw_create_dring: adding first outbound ring");
+		ldcp->lane_out.dringp = dp;
+	} else {
+		tp = ldcp->lane_out.dringp;
+		while (tp->next != NULL)
+			tp = tp->next;
+
+		tp->next = dp;
+	}
+
+	return (dp);
+
+dring_fail_exit:
+	(void) ldc_mem_dring_destroy(dp->handle);
+
+create_fail_exit:
+	if (dp->priv_addr != NULL) {
+		priv_addr = dp->priv_addr;
+		for (i = 0; i < VSW_RING_NUM_EL; i++) {
+			if (priv_addr->memhandle != NULL)
+				(void) ldc_mem_free_handle(
+						priv_addr->memhandle);
+			priv_addr++;
+		}
+		kmem_free(dp->priv_addr,
+			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
+	}
+	mutex_destroy(&dp->dlock);
+
+	kmem_free(dp, sizeof (dring_info_t));
+	return (NULL);
+}
+
+/*
+ * Create a ring consisting of just a private portion and link
+ * it into the list of rings for the outbound lane.
+ *
+ * These type of rings are used primarily for temporary data
+ * storage (i.e. as data buffers).
+ */
+void
+vsw_create_privring(vsw_ldc_t *ldcp)
+{
+	dring_info_t		*dp, *tp;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+
+	D1(vswp, "%s(%lld): enter", __func__, ldcp->ldc_id);
+
+	dp = kmem_zalloc(sizeof (dring_info_t), KM_SLEEP);
+
+	mutex_init(&dp->dlock, NULL, MUTEX_DRIVER, NULL);
+
+	/* no public section */
+	dp->pub_addr = NULL;
+
+	dp->priv_addr = kmem_zalloc((sizeof (vsw_private_desc_t) *
+					VSW_RING_NUM_EL), KM_SLEEP);
+
+	if (vsw_setup_ring(ldcp, dp)) {
+		DERR(vswp, "%s: setup of ring failed", __func__);
+		kmem_free(dp->priv_addr,
+			(sizeof (vsw_private_desc_t) * VSW_RING_NUM_EL));
+		mutex_destroy(&dp->dlock);
+		kmem_free(dp, sizeof (dring_info_t));
+		return;
+	}
+
+	/* haven't used any descriptors yet */
+	dp->end_idx = 0;
+
+	/*
+	 * Only ever create rings for outgoing lane. Link it onto
+	 * end of list.
+	 */
+	if (ldcp->lane_out.dringp == NULL) {
+		D2(vswp, "%s: adding first outbound privring", __func__);
+		ldcp->lane_out.dringp = dp;
+	} else {
+		tp = ldcp->lane_out.dringp;
+		while (tp->next != NULL)
+			tp = tp->next;
+
+		tp->next = dp;
+	}
+
+	D1(vswp, "%s(%lld): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Setup the descriptors in the dring. Returns 0 on success, 1 on
+ * failure.
+ */
+int
+vsw_setup_ring(vsw_ldc_t *ldcp, dring_info_t *dp)
+{
+	vnet_public_desc_t	*pub_addr = NULL;
+	vsw_private_desc_t	*priv_addr = NULL;
+	vsw_t			*vswp = ldcp->ldc_vswp;
+	uint64_t		*tmpp;
+	uint64_t		offset = 0;
+	uint32_t		ncookies = 0;
+	static char		*name = "vsw_setup_ring";
+	int			i, j, rv;
+
+	/* note - public section may be null */
+	priv_addr = dp->priv_addr;
+	pub_addr = dp->pub_addr;
+
+	/*
+	 * Allocate the region of memory which will be used to hold
+	 * the data the descriptors will refer to.
+	 */
+	dp->data_sz = (VSW_RING_NUM_EL * VSW_RING_EL_DATA_SZ);
+	dp->data_addr = kmem_alloc(dp->data_sz, KM_SLEEP);
+
+	D2(vswp, "%s: allocated %lld bytes at 0x%llx\n", name,
+		dp->data_sz, dp->data_addr);
+
+	tmpp = (uint64_t *)dp->data_addr;
+	offset = VSW_RING_EL_DATA_SZ / sizeof (tmpp);
+
+	/*
+	 * Initialise some of the private and public (if they exist)
+	 * descriptor fields.
+	 */
+	for (i = 0; i < VSW_RING_NUM_EL; i++) {
+		if ((ldc_mem_alloc_handle(ldcp->ldc_handle,
+			&priv_addr->memhandle)) != 0) {
+			DERR(vswp, "%s: alloc mem handle failed", name);
+			goto setup_ring_cleanup;
+		}
+
+		priv_addr->datap = (void *)tmpp;
+
+		rv = ldc_mem_bind_handle(priv_addr->memhandle,
+			(caddr_t)priv_addr->datap, VSW_RING_EL_DATA_SZ,
+			LDC_SHADOW_MAP, LDC_MEM_R|LDC_MEM_W,
+			&(priv_addr->memcookie[0]), &ncookies);
+		if (rv != 0) {
+			DERR(vswp, "%s(%lld): ldc_mem_bind_handle failed "
+				"(rv %d)", name, ldcp->ldc_id, rv);
+			goto setup_ring_cleanup;
+		}
+		priv_addr->bound = 1;
+
+		D2(vswp, "%s: %d: memcookie 0 : addr 0x%llx : size 0x%llx",
+			name, i, priv_addr->memcookie[0].addr,
+			priv_addr->memcookie[0].size);
+
+		if (ncookies >= (uint32_t)(VSW_MAX_COOKIES + 1)) {
+			DERR(vswp, "%s(%lld) ldc_mem_bind_handle returned "
+				"invalid num of cookies (%d) for size 0x%llx",
+				name, ldcp->ldc_id, ncookies,
+				VSW_RING_EL_DATA_SZ);
+
+			goto setup_ring_cleanup;
+		} else {
+			for (j = 1; j < ncookies; j++) {
+				rv = ldc_mem_nextcookie(priv_addr->memhandle,
+					&(priv_addr->memcookie[j]));
+				if (rv != 0) {
+					DERR(vswp, "%s: ldc_mem_nextcookie "
+						"failed rv (%d)", name, rv);
+					goto setup_ring_cleanup;
+				}
+				D3(vswp, "%s: memcookie %d : addr 0x%llx : "
+					"size 0x%llx", name, j,
+					priv_addr->memcookie[j].addr,
+					priv_addr->memcookie[j].size);
+			}
+
+		}
+		priv_addr->ncookies = ncookies;
+		priv_addr->dstate = VIO_DESC_FREE;
+
+		if (pub_addr != NULL) {
+
+			/* link pub and private sides */
+			priv_addr->descp = pub_addr;
+
+			pub_addr->hdr.dstate = VIO_DESC_FREE;
+			pub_addr++;
+		}
+
+		/*
+		 * move to next element in the dring and the next
+		 * position in the data buffer.
+		 */
+		priv_addr++;
+		tmpp += offset;
+	}
+
+	return (0);
+
+setup_ring_cleanup:
+	priv_addr = dp->priv_addr;
+
+	for (i = 0; i < VSW_RING_NUM_EL; i++) {
+		(void) ldc_mem_unbind_handle(priv_addr->memhandle);
+		(void) ldc_mem_free_handle(priv_addr->memhandle);
+
+		priv_addr++;
+	}
+	kmem_free(dp->data_addr, dp->data_sz);
+
+	return (1);
+}
+
+/*
+ * Searches the private section of a ring for a free descriptor,
+ * starting at the location of the last free descriptor found
+ * previously.
+ *
+ * Returns 0 if free descriptor is available, 1 otherwise.
+ *
+ * FUTURE: might need to return contiguous range of descriptors
+ * as dring info msg assumes all will be contiguous.
+ */
+static int
+vsw_dring_find_free_desc(dring_info_t *dringp,
+		vsw_private_desc_t **priv_p, int *idx)
+{
+	vsw_private_desc_t	*addr;
+	uint64_t		i;
+	uint64_t		j = 0;
+	uint64_t		start = dringp->end_idx;
+	int			num = VSW_RING_NUM_EL;
+	int			ret = 1;
+
+	D1(NULL, "%s enter\n", __func__);
+
+	addr = dringp->priv_addr;
+
+	D2(NULL, "%s: searching ring, dringp 0x%llx : start pos %lld",
+			__func__, dringp, start);
+
+	for (i = start; j < num; i = (i + 1) % num, j++) {
+		addr = (vsw_private_desc_t *)dringp->priv_addr + i;
+		D2(NULL, "%s: descriptor %lld : dstate 0x%llx\n",
+			__func__, i, addr->dstate);
+		if (addr->dstate == VIO_DESC_FREE) {
+			D2(NULL, "%s: descriptor %lld is available",
+								__func__, i);
+			*priv_p = addr;
+			*idx = i;
+			dringp->end_idx = (i + 1) % num;
+			ret = 0;
+			break;
+		}
+	}
+
+	/* ring full */
+	if (ret == 1) {
+		D2(NULL, "%s: no desp free: started at %d", __func__, start);
+	}
+
+	D1(NULL, "%s: exit\n", __func__);
+
+	return (ret);
+}
+
+/*
+ * Copy relevant fields from the private descriptor into the
+ * associated public side.
+ */
+static void
+vsw_dring_priv2pub(vsw_private_desc_t *priv)
+{
+	vnet_public_desc_t	*pub;
+	int			i;
+
+	D1(NULL, "vsw_dring_priv2pub enter\n");
+
+	pub = priv->descp;
+
+	pub->ncookies = priv->ncookies;
+	pub->nbytes = priv->datalen;
+
+	for (i = 0; i < pub->ncookies; i++) {
+		bcopy(&priv->memcookie[i], &pub->memcookie[i],
+			sizeof (ldc_mem_cookie_t));
+	}
+
+	pub->hdr.ack = 1;
+	pub->hdr.dstate = VIO_DESC_READY;
+
+	D1(NULL, "vsw_dring_priv2pub exit");
+}
+
+/*
+ * Map from a dring identifier to the ring itself. Returns
+ * pointer to ring or NULL if no match found.
+ */
+static dring_info_t *
+vsw_ident2dring(lane_t *lane, uint64_t ident)
+{
+	dring_info_t	*dp = NULL;
+
+	if ((dp = lane->dringp) == NULL) {
+		return (NULL);
+	} else {
+		if (dp->ident == ident)
+			return (dp);
+
+		while (dp != NULL) {
+			if (dp->ident == ident)
+				break;
+			dp = dp->next;
+		}
+	}
+
+	return (dp);
+}
+
+/*
+ * Set the default lane attributes. These are copied into
+ * the attr msg we send to our peer. If they are not acceptable
+ * then (currently) the handshake ends.
+ */
+static void
+vsw_set_lane_attr(vsw_t *vswp, lane_t *lp)
+{
+	bzero(lp, sizeof (lane_t));
+
+	READ_ENTER(&vswp->if_lockrw);
+	ether_copy(&(vswp->if_addr), &(lp->addr));
+	RW_EXIT(&vswp->if_lockrw);
+
+	lp->mtu = VSW_MTU;
+	lp->addr_type = ADDR_TYPE_MAC;
+	lp->xfer_mode = VIO_DRING_MODE;
+	lp->ack_freq = 0;	/* for shared mode */
+	lp->seq_num = VNET_ISS;
+}
+
+/*
+ * Verify that the attributes are acceptable.
+ *
+ * FUTURE: If some attributes are not acceptable, change them
+ * our desired values.
+ */
+static int
+vsw_check_attr(vnet_attr_msg_t *pkt, vsw_port_t *port)
+{
+	int	ret = 0;
+
+	D1(NULL, "vsw_check_attr enter\n");
+
+	/*
+	 * Note we currently only support in-band descriptors
+	 * and descriptor rings, not packet based transfer (VIO_PKT_MODE)
+	 */
+	if ((pkt->xfer_mode != VIO_DESC_MODE) &&
+			(pkt->xfer_mode != VIO_DRING_MODE)) {
+		D2(NULL, "vsw_check_attr: unknown mode %x\n",
+			pkt->xfer_mode);
+		ret = 1;
+	}
+
+	/* Only support MAC addresses at moment. */
+	if ((pkt->addr_type != ADDR_TYPE_MAC) || (pkt->addr == 0)) {
+		D2(NULL, "vsw_check_attr: invalid addr_type %x, "
+			"or address 0x%llx\n", pkt->addr_type,
+			pkt->addr);
+		ret = 1;
+	}
+
+	/*
+	 * MAC address supplied by device should match that stored
+	 * in the vsw-port OBP node. Need to decide what to do if they
+	 * don't match, for the moment just warn but don't fail.
+	 */
+	if (bcmp(&pkt->addr, &port->p_macaddr, ETHERADDRL) != 0) {
+		DERR(NULL, "vsw_check_attr: device supplied address "
+			"0x%llx doesn't match node address 0x%llx\n",
+			pkt->addr, port->p_macaddr);
+	}
+
+	/*
+	 * Ack freq only makes sense in pkt mode, in shared
+	 * mode the ring descriptors say whether or not to
+	 * send back an ACK.
+	 */
+	if ((pkt->xfer_mode == VIO_DRING_MODE) &&
+				(pkt->ack_freq > 0)) {
+		D2(NULL, "vsw_check_attr: non zero ack freq "
+			" in SHM mode\n");
+		ret = 1;
+	}
+
+	/*
+	 * Note: for the moment we only support ETHER
+	 * frames. This may change in the future.
+	 */
+	if ((pkt->mtu > VSW_MTU) || (pkt->mtu <= 0)) {
+		D2(NULL, "vsw_check_attr: invalid MTU (0x%llx)\n",
+			pkt->mtu);
+		ret = 1;
+	}
+
+	D1(NULL, "vsw_check_attr exit\n");
+
+	return (ret);
+}
+
+/*
+ * Returns 1 if there is a problem, 0 otherwise.
+ */
+static int
+vsw_check_dring_info(vio_dring_reg_msg_t *pkt)
+{
+	_NOTE(ARGUNUSED(pkt))
+
+	int	ret = 0;
+
+	D1(NULL, "vsw_check_dring_info enter\n");
+
+	if ((pkt->num_descriptors == 0) ||
+		(pkt->descriptor_size == 0) ||
+		(pkt->ncookies != 1)) {
+		DERR(NULL, "vsw_check_dring_info: invalid dring msg");
+		ret = 1;
+	}
+
+	D1(NULL, "vsw_check_dring_info exit\n");
+
+	return (ret);
+}
+
+/*
+ * Returns 1 if two memory cookies match. Otherwise returns 0.
+ */
+static int
+vsw_mem_cookie_match(ldc_mem_cookie_t *m1, ldc_mem_cookie_t *m2)
+{
+	if ((m1->addr != m2->addr) ||
+		(m2->size != m2->size)) {
+		return (0);
+	} else {
+		return (1);
+	}
+}
+
+/*
+ * Returns 1 if ring described in reg message matches that
+ * described by dring_info structure. Otherwise returns 0.
+ */
+static int
+vsw_dring_match(dring_info_t *dp, vio_dring_reg_msg_t *msg)
+{
+	if ((msg->descriptor_size != dp->descriptor_size) ||
+		(msg->num_descriptors != dp->num_descriptors) ||
+		(msg->ncookies != dp->ncookies) ||
+		!(vsw_mem_cookie_match(&msg->cookie[0], &dp->cookie[0]))) {
+		return (0);
+	} else {
+		return (1);
+	}
+
+}
+
+static caddr_t
+vsw_print_ethaddr(uint8_t *a, char *ebuf)
+{
+	(void) sprintf(ebuf, "%x:%x:%x:%x:%x:%x",
+	    a[0], a[1], a[2], a[3], a[4], a[5]);
+	return (ebuf);
+}
+
+/*
+ * Reset and free all the resources associated with
+ * the channel.
+ */
+static void
+vsw_free_lane_resources(vsw_ldc_t *ldcp, uint64_t dir)
+{
+	dring_info_t		*dp, *dpp;
+	lane_t			*lp = NULL;
+	int			rv = 0;
+
+	ASSERT(ldcp != NULL);
+
+	D1(ldcp->ldc_vswp, "%s (%lld): enter", __func__, ldcp->ldc_id);
+
+	if (dir == INBOUND) {
+		D2(ldcp->ldc_vswp, "%s: freeing INBOUND lane"
+			" of channel %lld", __func__, ldcp->ldc_id);
+		lp = &ldcp->lane_in;
+	} else {
+		D2(ldcp->ldc_vswp, "%s: freeing OUTBOUND lane"
+			" of channel %lld", __func__, ldcp->ldc_id);
+		lp = &ldcp->lane_out;
+	}
+
+	lp->lstate = VSW_LANE_INACTIV;
+	lp->seq_num = VNET_ISS;
+	if (lp->dringp) {
+		if (dir == INBOUND) {
+			dp = lp->dringp;
+			while (dp != NULL) {
+				dpp = dp->next;
+				if (dp->handle != NULL)
+					(void) ldc_mem_dring_unmap(dp->handle);
+				kmem_free(dp, sizeof (dring_info_t));
+				dp = dpp;
+			}
+		} else {
+			/*
+			 * unbind, destroy exported dring, free dring struct
+			 */
+			dp = lp->dringp;
+			rv = vsw_free_ring(dp);
+		}
+		if (rv == 0) {
+			lp->dringp = NULL;
+		}
+	}
+
+	D1(ldcp->ldc_vswp, "%s (%lld): exit", __func__, ldcp->ldc_id);
+}
+
+/*
+ * Free ring and all associated resources.
+ */
+static int
+vsw_free_ring(dring_info_t *dp)
+{
+	vsw_private_desc_t	*paddr = NULL;
+	dring_info_t		*dpp;
+	int			i, rv = 1;
+
+	while (dp != NULL) {
+		mutex_enter(&dp->dlock);
+		dpp = dp->next;
+		if (dp->priv_addr != NULL) {
+			/*
+			 * First unbind and free the memory handles
+			 * stored in each descriptor within the ring.
+			 */
+			for (i = 0; i < VSW_RING_NUM_EL; i++) {
+				paddr = (vsw_private_desc_t *)
+						dp->priv_addr + i;
+				if (paddr->memhandle != NULL) {
+					if (paddr->bound == 1) {
+						rv = ldc_mem_unbind_handle(
+							paddr->memhandle);
+
+						if (rv != 0) {
+							DERR(NULL, "error "
+							"unbinding handle for "
+							"ring 0x%llx at pos %d",
+							dp, i);
+							mutex_exit(&dp->dlock);
+							return (rv);
+						}
+						paddr->bound = 0;
+					}
+
+					rv = ldc_mem_free_handle(
+							paddr->memhandle);
+					if (rv != 0) {
+						DERR(NULL, "error freeing "
+							"handle for ring "
+							"0x%llx at pos %d",
+							dp, i);
+						mutex_exit(&dp->dlock);
+						return (rv);
+					}
+					paddr->memhandle = NULL;
+				}
+			}
+			kmem_free(dp->priv_addr, (sizeof (vsw_private_desc_t)
+					* VSW_RING_NUM_EL));
+		}
+
+		/*
+		 * Now unbind and destroy the ring itself.
+		 */
+		if (dp->handle != NULL) {
+			(void) ldc_mem_dring_unbind(dp->handle);
+			(void) ldc_mem_dring_destroy(dp->handle);
+		}
+
+		if (dp->data_addr != NULL) {
+			kmem_free(dp->data_addr, dp->data_sz);
+		}
+
+		mutex_exit(&dp->dlock);
+		mutex_destroy(&dp->dlock);
+		kmem_free(dp, sizeof (dring_info_t));
+
+		dp = dpp;
+	}
+	return (0);
+}
+
+/*
+ * Debugging routines
+ */
+static void
+display_state(void)
+{
+	vsw_t		*vswp;
+	vsw_port_list_t	*plist;
+	vsw_port_t 	*port;
+	vsw_ldc_list_t	*ldcl;
+	vsw_ldc_t 	*ldcp;
+
+	cmn_err(CE_NOTE, "***** system state *****");
+
+	for (vswp = vsw_head; vswp; vswp = vswp->next) {
+		plist = &vswp->plist;
+		READ_ENTER(&plist->lockrw);
+		cmn_err(CE_CONT, "vsw instance %d has %d ports attached\n",
+			vswp->instance, plist->num_ports);
+
+		for (port = plist->head; port != NULL; port = port->p_next) {
+			ldcl = &port->p_ldclist;
+			cmn_err(CE_CONT, "port %d : %d ldcs attached\n",
+				port->p_instance, ldcl->num_ldcs);
+			READ_ENTER(&ldcl->lockrw);
+			ldcp = ldcl->head;
+			for (; ldcp != NULL; ldcp = ldcp->ldc_next) {
+				cmn_err(CE_CONT, "chan %lu : dev %d : "
+					"status %d : phase %u\n",
+					ldcp->ldc_id, ldcp->dev_class,
+					ldcp->ldc_status, ldcp->hphase);
+				cmn_err(CE_CONT, "chan %lu : lsession %lu : "
+					"psession %lu\n",
+					ldcp->ldc_id,
+					ldcp->local_session,
+					ldcp->peer_session);
+
+				cmn_err(CE_CONT, "Inbound lane:\n");
+				display_lane(&ldcp->lane_in);
+				cmn_err(CE_CONT, "Outbound lane:\n");
+				display_lane(&ldcp->lane_out);
+			}
+			RW_EXIT(&ldcl->lockrw);
+		}
+		RW_EXIT(&plist->lockrw);
+	}
+	cmn_err(CE_NOTE, "***** system state *****");
+}
+
+static void
+display_lane(lane_t *lp)
+{
+	dring_info_t	*drp;
+
+	cmn_err(CE_CONT, "ver 0x%x:0x%x : state %lx : mtu 0x%lx\n",
+		lp->ver_major, lp->ver_minor, lp->lstate, lp->mtu);
+	cmn_err(CE_CONT, "addr_type %d : addr 0x%lx : xmode %d\n",
+		lp->addr_type, lp->addr, lp->xfer_mode);
+	cmn_err(CE_CONT, "dringp 0x%lx\n", (uint64_t)lp->dringp);
+
+	cmn_err(CE_CONT, "Dring info:\n");
+	for (drp = lp->dringp; drp != NULL; drp = drp->next) {
+		cmn_err(CE_CONT, "\tnum_desc %u : dsize %u\n",
+			drp->num_descriptors, drp->descriptor_size);
+		cmn_err(CE_CONT, "\thandle 0x%lx\n", drp->handle);
+		cmn_err(CE_CONT, "\tpub_addr 0x%lx : priv_addr 0x%lx\n",
+			(uint64_t)drp->pub_addr, (uint64_t)drp->priv_addr);
+		cmn_err(CE_CONT, "\tident 0x%lx : end_idx %lu\n",
+			drp->ident, drp->end_idx);
+		display_ring(drp);
+	}
+}
+
+static void
+display_ring(dring_info_t *dringp)
+{
+	uint64_t		i;
+	uint64_t		priv_count = 0;
+	uint64_t		pub_count = 0;
+	vnet_public_desc_t	*pub_addr = NULL;
+	vsw_private_desc_t	*priv_addr = NULL;
+
+	for (i = 0; i < VSW_RING_NUM_EL; i++) {
+		if (dringp->pub_addr != NULL) {
+			pub_addr = (vnet_public_desc_t *)dringp->pub_addr + i;
+
+			if (pub_addr->hdr.dstate == VIO_DESC_FREE)
+				pub_count++;
+		}
+
+		if (dringp->priv_addr != NULL) {
+			priv_addr =
+				(vsw_private_desc_t *)dringp->priv_addr + i;
+
+			if (priv_addr->dstate == VIO_DESC_FREE)
+				priv_count++;
+		}
+	}
+	cmn_err(CE_CONT, "\t%lu elements: %lu priv free: %lu pub free\n",
+			i, priv_count, pub_count);
+}
+
+static void
+dump_flags(uint64_t state)
+{
+	int	i;
+
+	typedef struct flag_name {
+		int	flag_val;
+		char	*flag_name;
+	} flag_name_t;
+
+	flag_name_t	flags[] = {
+		VSW_VER_INFO_SENT, "VSW_VER_INFO_SENT",
+		VSW_VER_INFO_RECV, "VSW_VER_INFO_RECV",
+		VSW_VER_ACK_RECV, "VSW_VER_ACK_RECV",
+		VSW_VER_ACK_SENT, "VSW_VER_ACK_SENT",
+		VSW_VER_NACK_RECV, "VSW_VER_NACK_RECV",
+		VSW_VER_NACK_SENT, "VSW_VER_NACK_SENT",
+		VSW_ATTR_INFO_SENT, "VSW_ATTR_INFO_SENT",
+		VSW_ATTR_INFO_RECV, "VSW_ATTR_INFO_RECV",
+		VSW_ATTR_ACK_SENT, "VSW_ATTR_ACK_SENT",
+		VSW_ATTR_ACK_RECV, "VSW_ATTR_ACK_RECV",
+		VSW_ATTR_NACK_SENT, "VSW_ATTR_NACK_SENT",
+		VSW_ATTR_NACK_RECV, "VSW_ATTR_NACK_RECV",
+		VSW_DRING_INFO_SENT, "VSW_DRING_INFO_SENT",
+		VSW_DRING_INFO_RECV, "VSW_DRING_INFO_RECV",
+		VSW_DRING_ACK_SENT, "VSW_DRING_ACK_SENT",
+		VSW_DRING_ACK_RECV, "VSW_DRING_ACK_RECV",
+		VSW_DRING_NACK_SENT, "VSW_DRING_NACK_SENT",
+		VSW_DRING_NACK_RECV, "VSW_DRING_NACK_RECV",
+		VSW_RDX_INFO_SENT, "VSW_RDX_INFO_SENT",
+		VSW_RDX_INFO_RECV, "VSW_RDX_INFO_RECV",
+		VSW_RDX_ACK_SENT, "VSW_RDX_ACK_SENT",
+		VSW_RDX_ACK_RECV, "VSW_RDX_ACK_RECV",
+		VSW_RDX_NACK_SENT, "VSW_RDX_NACK_SENT",
+		VSW_RDX_NACK_RECV, "VSW_RDX_NACK_RECV",
+		VSW_MCST_INFO_SENT, "VSW_MCST_INFO_SENT",
+		VSW_MCST_INFO_RECV, "VSW_MCST_INFO_RECV",
+		VSW_MCST_ACK_SENT, "VSW_MCST_ACK_SENT",
+		VSW_MCST_ACK_RECV, "VSW_MCST_ACK_RECV",
+		VSW_MCST_NACK_SENT, "VSW_MCST_NACK_SENT",
+		VSW_MCST_NACK_RECV, "VSW_MCST_NACK_RECV",
+		VSW_LANE_ACTIVE, "VSW_LANE_ACTIVE"};
+
+	DERR(NULL, "DUMP_FLAGS: %llx\n", state);
+	for (i = 0; i < sizeof (flags)/sizeof (flag_name_t); i++) {
+		if (state & flags[i].flag_val)
+			DERR(NULL, "DUMP_FLAGS %s", flags[i].flag_name);
+	}
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/ldc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,96 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#ident	"%Z%%M%	%I%	%E% SMI"
+#
+# uts/sun4v/ldc/Makefile
+#
+#	This makefile drives the production of the LDC transport kernel module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= ldc
+OBJECTS		= $(LDC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(LDC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MISC_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += -v
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- a/usr/src/uts/sun4v/ml/hcall.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/ml/hcall.s	Tue May 16 16:05:21 2006 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -37,6 +38,30 @@
 #if defined(lint) || defined(__lint)
 
 /*ARGSUSED*/
+uint64_t
+hv_mach_exit(uint64_t exit_code)
+{ return (0); }
+
+uint64_t
+hv_mach_sir(void)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_cpu_start(uint64_t cpuid, uint64_t pc, uint64_t rtba, uint64_t arg)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_cpu_stop(uint64_t cpuid)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_cpu_set_rtba(uint64_t *rtba)
+{ return (0); }
+
+/*ARGSUSED*/
 int64_t
 hv_cnputchar(uint8_t ch)
 { return (0); }
@@ -61,6 +86,11 @@
 hv_mmu_map_perm_addr(void *vaddr, int ctx, uint64_t tte, int flags)
 { return (0); }
 
+/*ARGSUSED */
+uint64_t
+hv_mmu_fault_area_conf(void *raddr)
+{ return (0); }
+
 /*ARGSUSED*/
 uint64_t
 hv_mmu_unmap_perm_addr(void *vaddr, int ctx, int flags)
@@ -171,7 +201,7 @@
 uint64_t
 hv_mach_desc(uint64_t buffer_ra, uint64_t *buffer_sizep)
 { return (0); }
-	
+
 /*ARGSUSED*/	
 uint64_t
 hv_ra2pa(uint64_t ra)
@@ -182,31 +212,190 @@
 hv_hpriv(void *func, uint64_t arg1, uint64_t arg2, uint64_t arg3)
 { return (0); }
 
+/*ARGSUSED*/	
+uint64_t
+hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base, uint64_t nentries)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base, uint64_t *nentries)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_tx_get_state(uint64_t channel, 
+	uint64_t *headp, uint64_t *tailp, uint64_t *state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_tx_set_qtail(uint64_t channel, uint64_t tail)
+{ return (0); }
+
+/*ARGSUSED*/	
+uint64_t
+hv_ldc_rx_qconf(uint64_t channel, uint64_t ra_base, uint64_t nentries)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base, uint64_t *nentries)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_rx_get_state(uint64_t channel, 
+	uint64_t *headp, uint64_t *tailp, uint64_t *state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_send_msg(uint64_t channel, uint64_t msg_ra)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra, uint64_t tbl_entries)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_ldc_copy(uint64_t channel, uint64_t request, uint64_t cookie,
+	uint64_t raddr, uint64_t length, uint64_t *lengthp)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino, uint64_t *cookie)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino, uint64_t cookie)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino, int *intr_valid_state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino, int intr_valid_state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino, int *intr_state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino, int intr_state)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino, uint32_t *cpuid)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino, uint32_t cpuid)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_api_get_version(uint64_t api_group, uint64_t *majorp, uint64_t *minorp)
+{ return (0); }
+
+/*ARGSUSED*/
+uint64_t
+hv_api_set_version(uint64_t api_group, uint64_t major, uint64_t minor,
+    uint64_t *supported_minor)
+{ return (0); }
+
 #else	/* lint || __lint */
 
 	/*
-	 * %o0 - character
+	 * int hv_mach_exit(uint64_t exit_code)
+	 */
+	ENTRY(hv_mach_exit)
+	mov	HV_MACH_EXIT, %o5
+	ta	FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_mach_exit)
+
+	/*
+	 * uint64_t hv_mach_sir(void)
+	 */
+	ENTRY(hv_mach_sir)
+	mov	HV_MACH_SIR, %o5
+	ta	FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_mach_sir)
+
+	/*
+	 * hv_cpu_start(uint64_t cpuid, uint64_t pc, ui64_t rtba,
+	 *     uint64_t arg)
+	 */
+	ENTRY(hv_cpu_start)
+	mov	HV_CPU_START, %o5
+	ta	FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_cpu_start)
+
+	/*
+	 * hv_cpu_stop(uint64_t cpuid)
+	 */
+	ENTRY(hv_cpu_stop)
+	mov	HV_CPU_STOP, %o5
+	ta	FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_cpu_stop)
+
+	/*
+	 * hv_cpu_set_rtba(uint64_t *rtba)
+	 */
+	ENTRY(hv_cpu_set_rtba)
+	mov	%o0, %o2
+	ldx	[%o2], %o0
+	mov	HV_CPU_SET_RTBA, %o5
+	ta	FAST_TRAP
+	stx	%o1, [%o2]
+	retl
+	  nop
+	SET_SIZE(hv_cpu_set_rtba)
+
+	/*
+	 * int64_t hv_cnputchar(uint8_t ch)
 	 */
 	ENTRY(hv_cnputchar)
-	mov	CONS_WRITE, %o5
+	mov	CONS_PUTCHAR, %o5
 	ta	FAST_TRAP
-	tst	%o0
 	retl
-	movnz	%xcc, -1, %o0
+	  nop
 	SET_SIZE(hv_cnputchar)
 
 	/*
-	 * %o0 pointer to character buffer
-	 * return values:
-	 * 0 success
-	 * hv_errno failure
+	 * int64_t hv_cngetchar(uint8_t *ch)
 	 */
 	ENTRY(hv_cngetchar)
 	mov	%o0, %o2
-	mov	CONS_READ, %o5
+	mov	CONS_GETCHAR, %o5
 	ta	FAST_TRAP
 	brnz,a	%o0, 1f		! failure, just return error
-	mov	1, %o0
+	  nop
 
 	cmp	%o1, H_BREAK
 	be	1f
@@ -220,7 +409,7 @@
 	mov	0, %o0
 1:
 	retl
-	nop
+	  nop
 	SET_SIZE(hv_cngetchar)
 
 	ENTRY(hv_tod_get)
@@ -253,6 +442,19 @@
 	SET_SIZE(hv_mmu_map_perm_addr)
 
 	/*
+	 * hv_mmu_fault_area_conf(void *raddr)
+	 */
+	ENTRY(hv_mmu_fault_area_conf)
+	mov	%o0, %o2
+	ldx	[%o2], %o0
+	mov	MMU_SET_INFOPTR, %o5
+	ta	FAST_TRAP
+	stx	%o1, [%o2]
+	retl
+	  nop
+	SET_SIZE(hv_mmu_fault_area_conf)
+
+	/*
 	 * Unmap permanent address
 	 * arg0 vaddr (%o0)
 	 * arg1 context (%o1)
@@ -308,7 +510,7 @@
 	 * arg2 Size (%o2)
 	 */
 	ENTRY(hv_cpu_qconf)
-	mov	CPU_QCONF, %o5
+	mov	HV_CPU_QCONF, %o5
 	ta	FAST_TRAP
 	retl
 	nop
@@ -537,7 +739,7 @@
 	 * arg0 enable/ freeze (%o0)
 	 * ret0 status (%o0)
 	 * ret1 previous freeze state (%o1)
-	*/
+	 */
 	ENTRY(hv_ttrace_freeze)
 	mov	%o1, %o2
 	mov	TTRACE_FREEZE, %o5
@@ -597,4 +799,320 @@
 	nop
 	SET_SIZE(hv_hpriv)
 
+	/*
+	 * hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base,
+	 *	uint64_t nentries);
+	 */
+	ENTRY(hv_ldc_tx_qconf)
+	mov     LDC_TX_QCONF, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_ldc_tx_qconf)
+
+
+	/*
+	 * hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base,
+	 *	uint64_t *nentries);
+	 */
+	ENTRY(hv_ldc_tx_qinfo)
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov     LDC_TX_QINFO, %o5
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	retl
+	  stx   %o2, [%g2]
+	SET_SIZE(hv_ldc_tx_qinfo)
+
+
+	/*
+	 * hv_ldc_tx_get_state(uint64_t channel, 
+	 *	uint64_t *headp, uint64_t *tailp, uint64_t *state);
+	 */
+	ENTRY(hv_ldc_tx_get_state)
+	mov     LDC_TX_GET_STATE, %o5
+	mov     %o1, %g1
+	mov     %o2, %g2
+	mov     %o3, %g3
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	stx     %o2, [%g2]
+	retl
+	  stx   %o3, [%g3]
+	SET_SIZE(hv_ldc_tx_get_state)
+
+
+	/*
+	 * hv_ldc_tx_set_qtail(uint64_t channel, uint64_t tail)
+	 */
+	ENTRY(hv_ldc_tx_set_qtail)
+	mov     LDC_TX_SET_QTAIL, %o5
+	ta      FAST_TRAP
+	retl
+	SET_SIZE(hv_ldc_tx_set_qtail)
+
+	
+	/*
+	 * hv_ldc_rx_qconf(uint64_t channel, uint64_t ra_base,
+	 *	uint64_t nentries);
+	 */
+	ENTRY(hv_ldc_rx_qconf)
+	mov     LDC_RX_QCONF, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_ldc_rx_qconf)
+
+
+	/*
+	 * hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base,
+	 *	uint64_t *nentries);
+	 */
+	ENTRY(hv_ldc_rx_qinfo)
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov     LDC_RX_QINFO, %o5
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	retl
+	  stx   %o2, [%g2]
+	SET_SIZE(hv_ldc_rx_qinfo)
+
+
+	/*
+	 * hv_ldc_rx_get_state(uint64_t channel, 
+	 *	uint64_t *headp, uint64_t *tailp, uint64_t *state);
+	 */
+	ENTRY(hv_ldc_rx_get_state)
+	mov     LDC_RX_GET_STATE, %o5
+	mov     %o1, %g1
+	mov     %o2, %g2
+	mov     %o3, %g3
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	stx     %o2, [%g2]
+	retl
+	  stx   %o3, [%g3]
+	SET_SIZE(hv_ldc_rx_get_state)
+
+
+	/*
+	 * hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head)
+	 */
+	ENTRY(hv_ldc_rx_set_qhead)
+	mov     LDC_RX_SET_QHEAD, %o5
+	ta      FAST_TRAP
+	retl
+	SET_SIZE(hv_ldc_rx_set_qhead)
+
+	/*
+	 * hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra, 
+	 *		uint64_t tbl_entries)
+	 */
+	ENTRY(hv_ldc_set_map_table)
+	mov     LDC_SET_MAP_TABLE, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_ldc_set_map_table)
+
+
+	/*
+	 * hv_ldc_get_map_table(uint64_t channel, uint64_t *tbl_ra, 
+	 *		uint64_t *tbl_entries)
+	 */
+	ENTRY(hv_ldc_get_map_table)
+	mov	%o1, %g1
+	mov	%o2, %g2
+	mov     LDC_GET_MAP_TABLE, %o5
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	retl
+	  stx     %o2, [%g2]	  
+	SET_SIZE(hv_ldc_get_map_table)
+
+
+	/*
+	 * hv_ldc_copy(uint64_t channel, uint64_t request, uint64_t cookie,
+	 *		uint64_t raddr, uint64_t length, uint64_t *lengthp);
+	 */
+	ENTRY(hv_ldc_copy)
+	mov     %o5, %g1
+	mov     LDC_COPY, %o5
+	ta      FAST_TRAP
+	retl
+	  stx   %o1, [%g1]
+	SET_SIZE(hv_ldc_copy)
+
+
+	/*
+	 * hv_ldc_mapin(uint64_t channel, uint64_t cookie, uint64_t *raddr, 
+	 *		uint64_t *perm)
+	 */
+	ENTRY(hv_ldc_mapin)
+	mov	%o2, %g1
+	mov	%o3, %g2
+	mov     LDC_MAPIN, %o5
+	ta      FAST_TRAP
+	stx     %o1, [%g1]
+	retl
+	  stx     %o2, [%g2]	  
+	SET_SIZE(hv_ldc_mapin)
+
+
+	/*
+	 * hv_ldc_unmap(uint64_t raddr)
+	 */
+	ENTRY(hv_ldc_unmap)
+	mov     LDC_UNMAP, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_ldc_unmap)
+
+
+	/*
+	 * hv_ldc_revoke(uint64_t raddr)
+	 */
+	ENTRY(hv_ldc_revoke)
+	mov     LDC_REVOKE, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hv_ldc_revoke)
+
+
+	/*
+	 * hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino,
+	 *			uint64_t *cookie);
+	 */
+	ENTRY(hvldc_intr_getcookie)
+	mov	%o2, %g1
+	mov     VINTR_GET_COOKIE, %o5
+	ta      FAST_TRAP
+	retl
+	  stx   %o1, [%g1]
+	SET_SIZE(hvldc_intr_getcookie)
+
+	/*
+	 * hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino,
+	 *			uint64_t cookie);
+	 */
+	ENTRY(hvldc_intr_setcookie)
+	mov     VINTR_SET_COOKIE, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hvldc_intr_setcookie)
+
+	
+	/*
+	 * hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino,
+	 *			int *intr_valid_state);
+	 */
+	ENTRY(hvldc_intr_getvalid)
+	mov	%o2, %g1
+	mov     VINTR_GET_VALID, %o5
+	ta      FAST_TRAP
+	retl
+	  stuw   %o1, [%g1]
+	SET_SIZE(hvldc_intr_getvalid)
+
+	/*
+	 * hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino,
+	 *			int intr_valid_state);
+	 */
+	ENTRY(hvldc_intr_setvalid)
+	mov     VINTR_SET_VALID, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hvldc_intr_setvalid)
+
+	/*
+	 * hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino,
+	 *			int *intr_state);
+	 */
+	ENTRY(hvldc_intr_getstate)
+	mov	%o2, %g1
+	mov     VINTR_GET_STATE, %o5
+	ta      FAST_TRAP
+	retl
+	  stuw   %o1, [%g1]
+	SET_SIZE(hvldc_intr_getstate)
+
+	/*
+	 * hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino,
+	 *			int intr_state);
+	 */
+	ENTRY(hvldc_intr_setstate)
+	mov     VINTR_SET_STATE, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hvldc_intr_setstate)
+
+	/*
+	 * hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino,
+	 *			uint32_t *cpuid);
+	 */
+	ENTRY(hvldc_intr_gettarget)
+	mov	%o2, %g1
+	mov     VINTR_GET_TARGET, %o5
+	ta      FAST_TRAP
+	retl
+	  stuw   %o1, [%g1]
+	SET_SIZE(hvldc_intr_gettarget)
+
+	/*
+	 * hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino,
+	 *			uint32_t cpuid);
+	 */
+	ENTRY(hvldc_intr_settarget)
+	mov     VINTR_SET_TARGET, %o5
+	ta      FAST_TRAP
+	retl
+	  nop
+	SET_SIZE(hvldc_intr_settarget)
+
+	/*
+	 * hv_api_get_version(uint64_t api_group, uint64_t *majorp,
+	 *			uint64_t *minorp)
+	 *
+	 * API_GET_VERSION
+	 * arg0 API group
+	 * ret0 status
+	 * ret1 major number
+	 * ret2 minor number
+	 */
+	ENTRY(hv_api_get_version)
+	mov	%o1, %o3
+	mov	%o2, %o4
+	mov	API_GET_VERSION, %o5
+	ta	CORE_TRAP
+	stx	%o1, [%o3]
+	retl
+	  stx	%o2, [%o4]
+	SET_SIZE(hv_api_get_version)
+
+	/*
+	 * hv_api_set_version(uint64_t api_group, uint64_t major,
+	 *			uint64_t minor, uint64_t *supported_minor)
+	 *
+	 * API_SET_VERSION
+	 * arg0 API group
+	 * arg1 major number
+	 * arg2 requested minor number
+	 * ret0 status
+	 * ret1 actual minor number
+	 */
+	ENTRY(hv_api_set_version)
+	mov	%o3, %o4
+	mov	API_SET_VERSION, %o5
+	ta	CORE_TRAP
+	retl
+	  stx	%o1, [%o4]
+	SET_SIZE(hv_api_set_version)
+
 #endif	/* lint || __lint */
--- a/usr/src/uts/sun4v/ml/mach_offsets.in	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/ml/mach_offsets.in	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
 \ CDDL HEADER START
 \
 \ The contents of this file are subject to the terms of the
-\ Common Development and Distribution License, Version 1.0 only
-\ (the "License").  You may not use this file except in compliance
-\ with the License.
+\ Common Development and Distribution License (the "License").
+\ You may not use this file except in compliance with the License.
 \
 \ You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 \ or http://www.opensolaris.org/os/licensing.
@@ -19,7 +18,7 @@
 \
 \ CDDL HEADER END
 \
-\ Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+\ Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 \ Use is subject to license terms.
 \
 \ offsets.in: input file to produce assym.h using the stabs program
@@ -79,6 +78,7 @@
 
 #include <vm/hat_sfmmu.h>
 #include <sys/traptrace.h>
+#include <sys/lpad.h>
 
 machcpu
 	intrstat	MCPU_INTRSTAT
@@ -280,3 +280,16 @@
 	ptl1_g6
 	ptl1_g7
 
+lpad_data
+	magic		LPAD_MAGIC
+	inuse		LPAD_INUSE
+	mmfsa_ra	LPAD_MMFSA_RA
+	pc		LPAD_PC
+	arg		LPAD_ARG
+	nmap		LPAD_NMAP
+	map		LPAD_MAP
+
+lpad_map	LPAD_MAP_SIZE
+	flags		LPAD_MAP_FLAGS
+	va		LPAD_MAP_VA
+	tte		LPAD_MAP_TTE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/ml/mach_proc_init.s	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,211 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v processor initialization
+ *
+ * This is the kernel entry point for CPUs that enter Solaris
+ * directly from the hypervisor. i.e. without going through OBP.
+ */
+
+#if !defined(lint)
+#include "assym.h"
+#endif /* !lint */
+
+#include <sys/asm_linkage.h>
+#include <sys/hypervisor_api.h>
+#include <sys/machasi.h>
+#include <sys/machpcb.h>
+#include <sys/machlock.h>
+#include <sys/mmu.h>
+#include <sys/lpad.h>
+
+#if defined(lint)
+
+/* ARGSUSED */
+void
+mach_cpu_startup(uint64_t rabase, uint64_t memsz)
+{}
+
+#else	/* lint */
+
+	/*
+	 * %o0 - hcall specified arg (cpuid)
+	 * %i0 - real memory base
+	 * %i1 - memory size
+	 */
+	ENTRY_NP(mach_cpu_startup)
+	/*
+	 * Calculate the data pointer. The landing pad
+	 * data immediately follows the landing pad text.
+	 */
+	rd	%pc, %l0
+	add	%l0, LPAD_TEXT_SIZE, %l1	! %l1 has start of data
+
+	/*
+	 * Setup the initial state of the CPU.
+	 */
+	wrpr	%g0, 0, %tl
+	wrpr	%g0, 0, %gl
+	wrpr	%g0, MAXWIN - 2, %cansave
+	wrpr	%g0, MAXWIN - 2, %cleanwin
+	wrpr	%g0, 0, %canrestore
+	wrpr	%g0, 0, %otherwin
+	wrpr	%g0, 0, %cwp
+	wrpr	%g0, 0, %wstate
+	wr	%g0, %y
+	wrpr	%g0, PIL_MAX, %pil
+
+	set	trap_table, %g1
+	wrpr	%g1, %tba
+
+	! initialize cpuid into scratchpad register
+	mov	SCRATCHPAD_CPUID, %g1
+	stxa	%o0, [%g1]ASI_SCRATCHPAD
+	
+	! sanity check the data section
+	setx	LPAD_MAGIC_VAL, %g2, %g1
+	ldx	[%l1 + LPAD_MAGIC], %g2
+	cmp	%g1, %g2
+	bne	startup_error
+	  nop
+
+	/*
+	 * Loop through the array of TTE's, installing the
+	 * VA to RA mapping for each one.
+	 */
+	ldx	[%l1 + LPAD_NMAP], %l2		! %l2 = number of mappings
+	add	%l1, LPAD_MAP, %l3		! %l3 = the current mapping
+
+	/*
+	 * Sanity check the number of mappings.
+	 */
+	mulx	%l2, LPAD_MAP_SIZE, %g1
+	add	%l3, %g1, %g1			! %g1 = end of the array
+	add	%l1, LPAD_DATA_SIZE, %g2	! %g2 = end of data section
+	sub	%g2, %g1, %g2
+	brlz	%g2, startup_error
+	  nop
+
+0:
+	cmp	%l2, %g0
+	be	3f
+	  nop
+
+	ldx	[%l3 + LPAD_MAP_FLAGS], %l4	! %l4 = flags
+
+	/*
+	 * Generate args for the HV call
+	 */
+	ldx	[%l3 + LPAD_MAP_VA], %o0	! %o0 = virtual address
+	mov	KCONTEXT, %o1			! %o1 = context
+	ldx	[%l3 + LPAD_MAP_TTE], %o2	! %o2 = TTE
+	and	%l4, FLAG_MMUFLAGS_MASK, %o3	! %o3 = MMU flags
+
+	! check if this is a locked TTE
+	and	%l4, FLAG_LOCK_MASK, %l4
+	cmp	%l4, %g0
+	bne	1f
+	  nop
+
+	! install an unlocked entry
+	ta	MMU_MAP_ADDR
+	ba	2f
+	  nop
+1:
+	! install a locked entry
+	mov	MAP_PERM_ADDR, %o5
+	ta	FAST_TRAP
+
+2:
+	! check for errors from the hcall
+	cmp	%o0, %g0
+	bne	startup_error
+	  nop
+	
+	sub	%l2, 1, %l2			! decrement counter
+	add	%l3, LPAD_MAP_SIZE, %l3		! increment pointer
+
+	ba	0b
+	  nop
+
+3:
+	/*
+	 * Set the MMU fault status area
+	 */
+	ldx	[%l1 + LPAD_MMFSA_RA], %o0
+
+	mov	MMU_SET_INFOPTR, %o5
+	ta	FAST_TRAP
+
+	! check for errors from the hcall
+	cmp	%o0, %g0
+	bne	startup_error
+	  nop
+
+	/*
+	 * Load remaining arguments before enabling the
+	 * MMU so that the loads can be done using real
+	 * addresses.
+	 */
+	ldx	[%l1 + LPAD_PC], %l3		! %l3 = specified entry point
+	ldx	[%l1 + LPAD_ARG], %l4		! %l4 = specified argument
+	ldx	[%l1 + LPAD_INUSE], %l5		! %l5 = va of inuse mailbox
+
+	/*
+	 * Enable the MMU. On success, it returns to the
+	 * global version of the landing pad text, rather
+	 * than the text copied into the lpad buffer.
+	 */
+	mov	1, %o0				! %o0 = enable flag (1 = enable)
+	set	startup_complete, %o1		! VA of return address
+	mov	MMU_ENABLE, %o5
+	ta	FAST_TRAP
+
+	/*
+	 * On errors, just enter a spin loop until the
+	 * CPU that initiated the start recovers the CPU.
+	 */
+startup_error:
+	ba	startup_error
+	  nop
+
+	/*
+	 * Jump to the generic CPU initialization code.
+	 */
+startup_complete:
+	mov	%l4, %o0
+	jmpl	%l3, %g0
+	  stx	%g0, [%l5]			! clear the inuse mailbox
+
+	SET_SIZE(mach_cpu_startup)
+
+	.global mach_cpu_startup_end
+mach_cpu_startup_end:
+
+#endif	/* lint */
--- a/usr/src/uts/sun4v/ml/mach_subr_asm.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/ml/mach_subr_asm.s	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -74,6 +73,80 @@
 	SET_SIZE(get_mmfsa_scratchpad)
 #endif	/* lint */
 
+
+
+#if defined(lint)
+/* ARGSUSED */
+void
+cpu_intrq_unregister_powerdown(uint64_t doneflag_va)
+{}
+
+#else	/* lint */
+
+/*
+ * Called from a x-trap at tl1 must use %g1 as arg
+ * and save/restore %o0-%o5 after hypervisor calls
+ */
+
+	ENTRY(cpu_intrq_unregister_powerdown)
+
+	CPU_ADDR(%g2, %g3)
+	add %g2, CPU_MCPU, %g2 
+	/*
+	 * Save %o regs
+	 */
+	mov %o0, %g3
+	mov %o1, %g4
+	mov %o2, %g5
+	mov %o5, %g6
+
+	ldx [%g2 + MCPU_CPU_Q_BASE], %o1
+	mov INTR_CPU_Q, %o0
+	call hv_cpu_qconf
+	mov %g0, %o2
+
+	ldx [%g2 + MCPU_DEV_Q_BASE], %o1
+	mov INTR_DEV_Q, %o0
+	call hv_cpu_qconf
+	mov %g0, %o2
+
+	ldx [%g2 + MCPU_RQ_BASE], %o1
+	mov CPU_RQ, %o0
+	call hv_cpu_qconf
+	mov %g0, %o2
+
+	ldx [%g2 + MCPU_NRQ_BASE], %o1
+	mov CPU_NRQ, %o0
+	call hv_cpu_qconf
+	mov %g0, %o2
+
+	/*
+	 * set done flag to 0
+	 */
+	stub %g0, [%g1]
+
+	/*
+	 * Restore %o regs
+	 */
+	mov %g3, %o0
+	mov %g4, %o1
+	mov %g5, %o2
+	mov %g6, %o5
+
+	/*
+	 * This CPU is on its way out. Spin here
+	 * until the DR unconfigure code stops it.
+	 * Returning would put it back in the OS
+	 * where it might grab resources like locks,
+	 * causing some nastiness to occur.
+	 */
+0:
+	ba,a	0b
+
+	SET_SIZE(cpu_intrq_unregister_powerdown)
+#endif	/* lint */
+
+
 #if defined(lint)
 /* ARGSUSED */
 int
--- a/usr/src/uts/sun4v/ml/trap_table.s	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/ml/trap_table.s	Tue May 16 16:05:21 2006 -0700
@@ -151,11 +151,7 @@
 	.align	32
 #define	NOTP4	NOTP; NOTP; NOTP; NOTP
 
-/*
- * RED is for traps that use the red mode handler.
- * We should never see these either.
- */
-#define	RED	NOT
+
 /*
  * BAD is used for trap vectors we don't have a kernel
  * handler for.
@@ -824,6 +820,25 @@
 	.align	32
 
 /*
+ * We take over the rtba after we set our trap table and
+ * fault status area. The watchdog reset trap is now handled by the OS.
+ */
+#define WATCHDOG_RESET			\
+	mov	PTL1_BAD_WATCHDOG, %g1	;\
+	ba,a,pt	%xcc, .watchdog_trap	;\
+	.align	32
+
+/*
+ * RED is for traps that use the red mode handler.
+ * We should never see these either.
+ */
+#define RED			\
+	mov	PTL1_BAD_RED, %g1	;\
+	ba,a,pt	%xcc, .watchdog_trap	;\
+	.align	32
+
+	
+/*
  * MMU Trap Handlers.
  */
 
@@ -1124,7 +1139,7 @@
 	/* hardware traps */
 	NOT;				/* 000	reserved */
 	RED;				/* 001	power on reset */
-	RED;				/* 002	watchdog reset */
+	WATCHDOG_RESET;			/* 002	watchdog reset */
 	RED;				/* 003	externally initiated reset */
 	RED;				/* 004	software initiated reset */
 	RED;				/* 005	red mode exception */
@@ -2683,6 +2698,20 @@
 #endif /* TRAPTRACE */
 
 /*
+ * Handle watchdog reset trap. Enable the MMU using the MMU_ENABLE
+ * HV service, which requires the return target to be specified as a VA
+ * since we are enabling the MMU. We set the target to ptl1_panic.
+ */
+
+	.type	.watchdog_trap, #function
+.watchdog_trap:
+	mov	1, %o0
+	setx	ptl1_panic, %g2, %o1
+	mov	MMU_ENABLE, %o5
+	ta	FAST_TRAP
+	done
+	SET_SIZE(.watchdog_trap)
+/*
  * synthesize for trap(): SFAR in %g2, SFSR in %g3
  */
 	.type	.dmmu_exc_lddf_not_aligned, #function
--- a/usr/src/uts/sun4v/os/fillsysinfo.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/fillsysinfo.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -44,301 +43,122 @@
 #include <sys/cmp.h>
 #include <sys/async.h>
 #include <vm/page.h>
-
-/*
- * The OpenBoot Standalone Interface supplies the kernel with
- * implementation dependent parameters through the devinfo/property mechanism
- */
-typedef enum { XDRBOOL, XDRINT, XDRSTRING } xdrs;
-
-/*
- * structure describing properties that we are interested in querying the
- * OBP for.
- */
-struct getprop_info {
-	char	*name;
-	xdrs	type;
-	uint_t	*var;
-};
-
-/*
- * structure used to convert between a string returned by the OBP & a type
- * used within the kernel. We prefer to paramaterize rather than type.
- */
-struct convert_info {
-	char	*name;
-	uint_t	var;
-	char	*realname;
-};
-
-/*
- * structure describing nodes that we are interested in querying the OBP for
- * properties.
- */
-struct node_info {
-	char			*name;
-	int			size;
-	struct getprop_info	*prop;
-	struct getprop_info	*prop_end;
-	unsigned int		*value;
-};
-
-/*
- * macro definitions for routines that form the OBP interface
- */
-#define	NEXT			prom_nextnode
-#define	CHILD			prom_childnode
-#define	GETPROP			prom_getprop
-#define	GETPROPLEN		prom_getproplen
-
-/* 0=quiet; 1=verbose; 2=debug */
-int	debug_fillsysinfo = 0;
-#define	VPRINTF if (debug_fillsysinfo) prom_printf
+#include <vm/hat_sfmmu.h>
+#include <sys/sysmacros.h>
+#include <sys/mach_descrip.h>
+#include <sys/mdesc.h>
+#include <sys/archsystm.h>
+#include <sys/error.h>
+#include <sys/mmu.h>
+#include <sys/bitmap.h>
 
 int ncpunode;
 struct cpu_node cpunodes[NCPU];
 
-void	fill_cpu(pnode_t);
-void	plat_fill_mc(pnode_t);
-#pragma weak plat_fill_mc
+uint64_t cpu_q_entries;
+uint64_t dev_q_entries;
+uint64_t cpu_rq_entries;
+uint64_t cpu_nrq_entries;
+
+void fill_cpu(md_t *, mde_cookie_t);
+
+static uint64_t get_mmu_ctx_bits(md_t *, mde_cookie_t);
+static uint64_t get_cpu_pagesizes(md_t *, mde_cookie_t);
+static char *construct_isalist(md_t *, mde_cookie_t, char **);
+static void set_at_flags(char *, int, char **);
+static void init_md_broken(md_t *);
+static int get_l2_cache_info(md_t *, mde_cookie_t, uint64_t *, uint64_t *,
+    uint64_t *);
+static id_t get_exec_unit_mapping(md_t *, mde_cookie_t, mde_cookie_t *);
+static int find_exec_unit_id(mde_cookie_t, mde_cookie_t *);
+static void get_q_sizes(md_t *, mde_cookie_t);
+static void get_va_bits(md_t *, mde_cookie_t);
+static size_t get_ra_limit(md_t *);
 
 uint64_t	system_clock_freq;
 int		niobus = 0;
 uint_t		niommu_tsbs = 0;
 
-/*
- * Hardware watchdog support.
- */
-#define	CHOSEN_EEPROM	"eeprom"
-static pnode_t 		chosen_eeprom;
-
-/*
- * If this variable is non-zero, cpr should return "not supported" when
- * it is queried even though it would normally be supported on this platform.
- */
-int cpr_supported_override;
-
-/*
- * Some platforms may need to support CPR even in the absence of the
- * energystar-v* property (Enchilada server, for example).  If this
- * variable is non-zero, cpr should proceed even in the absence
- * of the energystar-v* property.
- */
-int cpr_platform_enable = 0;
-
-/*
- * Some nodes have functions that need to be called when they're seen.
- */
-static void	have_pci(pnode_t);
-
-static struct wkdevice {
-	char *wk_namep;
-	void (*wk_func)(pnode_t);
-	caddr_t *wk_vaddrp;
-	ushort_t wk_flags;
-#define	V_OPTIONAL	0x0000
-#define	V_MUSTHAVE	0x0001
-#define	V_MAPPED	0x0002
-#define	V_MULTI		0x0003	/* optional, may be more than one */
-} wkdevice[] = {
-	{ "pci", have_pci, NULL, V_MULTI },
-	{ 0, },
-};
-
-static void map_wellknown(pnode_t);
-
 void
 map_wellknown_devices()
 {
-	struct wkdevice *wkp;
-	phandle_t	ieeprom;
-	pnode_t	root;
-	uint_t	stick_freq;
-
-	/*
-	 * if there is a chosen eeprom, note it (for have_eeprom())
-	 */
-	if (GETPROPLEN(prom_chosennode(), CHOSEN_EEPROM) ==
-	    sizeof (phandle_t) &&
-	    GETPROP(prom_chosennode(), CHOSEN_EEPROM, (caddr_t)&ieeprom) != -1)
-		chosen_eeprom = (pnode_t)prom_decode_int(ieeprom);
+}
 
-	root = prom_nextnode((pnode_t)0);
-	/*
-	 * Get System clock frequency from root node if it exists.
-	 */
-	if (GETPROP(root, "stick-frequency", (caddr_t)&stick_freq) != -1)
-		system_clock_freq = stick_freq;
-
-	map_wellknown(NEXT((pnode_t)0));
-
-	/*
-	 * See if it worked
-	 */
-	for (wkp = wkdevice; wkp->wk_namep; ++wkp) {
-		if (wkp->wk_flags == V_MUSTHAVE) {
-			cmn_err(CE_PANIC, "map_wellknown_devices: required "
-			    "device %s not mapped", wkp->wk_namep);
-		}
-	}
-}
+#define	S_VAC_SIZE	MMU_PAGESIZE
+#define	S_VAC_SHIFT	MMU_PAGESHIFT
 
 /*
- * map_wellknown - map known devices & registers
+ * For backward compatibility we need to verify that we can handle
+ * running on platforms which shipped with missing MD properties.
  */
-static void
-map_wellknown(pnode_t curnode)
-{
-	extern int status_okay(int, char *, int);
-	char tmp_name[MAXSYSNAME];
-	static void fill_address(pnode_t, char *);
-	int sok;
+#define	ONTARIO_PLATNAME1	"SUNW,Sun-Fire-T200"
+#define	ONTARIO_PLATNAME2	"SUNW,Sun-Fire-T2000"
+#define	ERIE_PLATNAME1		"SUNW,Sun-Fire-T100"
+#define	ERIE_PLATNAME2		"SUNW,Sun-Fire-T1000"
 
-#ifdef VPRINTF
-	VPRINTF("map_wellknown(%x)\n", curnode);
-#endif /* VPRINTF */
-
-	for (curnode = CHILD(curnode); curnode; curnode = NEXT(curnode)) {
-		/*
-		 * prune subtree if status property indicating not okay
-		 */
-		sok = status_okay((int)curnode, (char *)NULL, 0);
-		if (!sok) {
-			char devtype_buf[OBP_MAXPROPNAME];
-			int size;
+void
+fill_cpu(md_t *mdp, mde_cookie_t cpuc)
+{
+	struct cpu_node *cpunode;
+	uint64_t cpuid;
+	uint64_t clk_freq;
+	char *namebuf;
+	char *namebufp;
+	int namelen;
+	uint64_t associativity = 0, linesize = 0, size = 0;
+	int status;
 
-#ifdef VPRINTF
-			VPRINTF("map_wellknown: !okay status property\n");
-#endif /* VPRINTF */
-			/*
-			 * a status property indicating bad memory will be
-			 * associated with a node which has a "device_type"
-			 * property with a value of "memory-controller"
-			 */
-			if ((size = GETPROPLEN(curnode,
-			    OBP_DEVICETYPE)) == -1)
-				continue;
-			if (size > OBP_MAXPROPNAME) {
-				cmn_err(CE_CONT, "node %x '%s' prop too "
-				    "big\n", curnode, OBP_DEVICETYPE);
-				continue;
-			}
-			if (GETPROP(curnode, OBP_DEVICETYPE,
-			    devtype_buf) == -1) {
-				cmn_err(CE_CONT, "node %x '%s' get failed\n",
-				    curnode, OBP_DEVICETYPE);
-				continue;
-			}
-			if (strcmp(devtype_buf, "memory-controller") != 0)
-				continue;
-			/*
-			 * ...else fall thru and process the node...
-			 */
-		}
-		bzero(tmp_name, MAXSYSNAME);
-		if (GETPROP(curnode, OBP_NAME, (caddr_t)tmp_name) != -1)
-			fill_address(curnode, tmp_name);
-		if (GETPROP(curnode, OBP_DEVICETYPE, tmp_name) != -1 &&
-		    strcmp(tmp_name, "cpu") == 0) {
-			fill_cpu(curnode);
+	if (md_get_prop_val(mdp, cpuc, "id", &cpuid)) {
+		return;
+	}
+
+	if (cpuid >= NCPU) {
+		cmn_err(CE_CONT, "fill_cpu: out of range cpuid %ld - "
+		    "cpu excluded from configuration", cpuid);
+
+		mutex_enter(&cpu_lock);
+
+		/*
+		 * Since the CPU cannot be used, make sure it
+		 * is in a safe place. If the firmware does not
+		 * support CPU stop, this is known to be true.
+		 * If it fails to stop for any other reason, the
+		 * system is in an inconsistent state and cannot
+		 * be allowed to continue.
+		 */
+		status = stopcpu_bycpuid(cpuid);
+
+		if ((status != 0) && (status != ENOTSUP)) {
+			cmn_err(CE_PANIC, "failed to stop cpu %lu (%d)",
+			    cpuid, status);
 		}
 
-		if (sok && (strcmp(tmp_name, "memory-controller") == 0) &&
-		    (&plat_fill_mc != NULL))
-			plat_fill_mc(curnode);
-		map_wellknown(curnode);
-	}
-}
-
-static void
-fill_address(pnode_t curnode, char *namep)
-{
-	struct wkdevice *wkp;
-	int size;
-	uint32_t vaddr;
-
-	for (wkp = wkdevice; wkp->wk_namep; ++wkp) {
-		if (strcmp(wkp->wk_namep, namep) != 0)
-			continue;
-		if (wkp->wk_flags == V_MAPPED)
-			return;
-		if (wkp->wk_vaddrp != NULL) {
-			if ((size = GETPROPLEN(curnode, OBP_ADDRESS)) == -1) {
-				cmn_err(CE_CONT, "device %s size %d\n",
-				    namep, size);
-				continue;
-			}
-			if (size != sizeof (vaddr)) {
-				cmn_err(CE_CONT, "device %s address prop too "
-				    "big\n", namep);
-				continue;
-			}
-			if (GETPROP(curnode, OBP_ADDRESS,
-			    (caddr_t)&vaddr) == -1) {
-				cmn_err(CE_CONT, "device %s not mapped\n",
-				    namep);
-				continue;
-			}
-
-			/* make into a native pointer */
-			*wkp->wk_vaddrp = (caddr_t)(uintptr_t)vaddr;
-#ifdef VPRINTF
-			VPRINTF("fill_address: %s mapped to %p\n", namep,
-			    *wkp->wk_vaddrp);
-#endif /* VPRINTF */
-		}
-		if (wkp->wk_func != NULL)
-			(*wkp->wk_func)(curnode);
-		/*
-		 * If this one is optional and there may be more than
-		 * one, don't set V_MAPPED, which would cause us to skip it
-		 * next time around
-		 */
-		if (wkp->wk_flags != V_MULTI)
-			wkp->wk_flags = V_MAPPED;
-	}
-}
-
-void
-fill_cpu(pnode_t node)
-{
-	struct cpu_node *cpunode;
-	processorid_t cpuid;
-	uint_t clk_freq;
-	char namebuf[OBP_MAXPROPNAME], unum[UNUM_NAMLEN];
-	char *namebufp;
-
-	if (GETPROP(node, "cpuid", (caddr_t)&cpuid) == -1) {
-		if (GETPROP(node, "reg", (caddr_t)&cpuid) == -1)
-			cmn_err(CE_PANIC, "reg prop not found in cpu node");
-		cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
-	}
-
-	if (cpuid < 0 || cpuid >= NCPU) {
-		cmn_err(CE_CONT, "cpu (dnode %x): out of range cpuid %d - "
-		    "cpu excluded from configuration\n", node, cpuid);
+		mutex_exit(&cpu_lock);
 		return;
 	}
 
 	cpunode = &cpunodes[cpuid];
-	cpunode->cpuid = cpuid;
+	cpunode->cpuid = (int)cpuid;
 	cpunode->device_id = cpuid;
 
-	unum[0] = '\0';
-	(void) snprintf(cpunode->fru_fmri, sizeof (cpunode->fru_fmri),
-		"%s%s", CPU_FRU_FMRI, unum);
-	(void) GETPROP(node, "compatible", namebuf);
+	if (sizeof (cpunode->fru_fmri) > strlen(CPU_FRU_FMRI))
+		(void) strcpy(cpunode->fru_fmri, CPU_FRU_FMRI);
+
+	if (md_get_prop_data(mdp, cpuc,
+	    "compatible", (uint8_t **)&namebuf, &namelen)) {
+		cmn_err(CE_PANIC, "fill_cpu: Cannot read compatible "
+		    "property");
+	}
 	namebufp = namebuf;
 	if (strncmp(namebufp, "SUNW,", 5) == 0)
 		namebufp += 5;
+	if (strlen(namebufp) > sizeof (cpunode->name))
+		cmn_err(CE_PANIC, "Compatible property too big to "
+		    "fit into the cpunode name buffer");
 	(void) strcpy(cpunode->name, namebufp);
 
-	if (GETPROP(node, "clock-frequency", (caddr_t)&clk_freq) == -1) {
-		/*
-		 * If we didn't find it in the CPU node, look in the root node.
-		 */
-		pnode_t root = prom_nextnode((pnode_t)0);
-		if (GETPROP(root, "clock-frequency", (caddr_t)&clk_freq) == -1)
+	if (md_get_prop_val(mdp, cpuc,
+	    "clock-frequency", &clk_freq)) {
 			clk_freq = 0;
 	}
 	cpunode->clock_freq = clk_freq;
@@ -352,65 +172,621 @@
 	cpunode->tick_nsec_scale = (uint_t)(((uint64_t)NANOSEC <<
 	    (32 - TICK_NSEC_SHIFT)) / cpunode->clock_freq);
 
+	/*
+	 * The nodeid is not used in sun4v at all. Setting it
+	 * to positive value to make starting of slave CPUs
+	 * code happy.
+	 */
+	cpunode->nodeid = cpuid + 1;
 
-	cpunode->nodeid = node;
+	/*
+	 * Obtain the L2 cache information from MD.
+	 * If "Cache" node exists, then set L2 cache properties
+	 * as read from MD.
+	 * If node does not exists, then set the L2 cache properties
+	 * in individual CPU module.
+	 */
+	if ((!get_l2_cache_info(mdp, cpuc,
+	    &associativity, &size, &linesize)) ||
+	    associativity == 0 || size == 0 || linesize == 0) {
+		cpu_fiximp(cpunode);
+	} else {
+		/*
+		 * Do not expect L2 cache properties to be bigger
+		 * than 32-bit quantity.
+		 */
+		cpunode->ecache_associativity = (int)associativity;
+		cpunode->ecache_size = (int)size;
+		cpunode->ecache_linesize = (int)linesize;
+	}
+
+	cpunode->ecache_setsize =
+	    cpunode->ecache_size / cpunode->ecache_associativity;
+
+		/*
+		 * Start off by assigning the cpu id as the default
+		 * mapping index.
+		 */
+
+	cpunode->exec_unit_mapping = NO_EU_MAPPING_FOUND;
+
+	if (ecache_setsize == 0)
+		ecache_setsize = cpunode->ecache_setsize;
+	if (ecache_alignsize == 0)
+		ecache_alignsize = cpunode->ecache_linesize;
+
+	ncpunode++;
+}
+
+void
+empty_cpu(int cpuid)
+{
+	bzero(&cpunodes[cpuid], sizeof (struct cpu_node));
+	ncpunode--;
+}
+
+void
+setup_exec_unit_mappings(md_t *mdp)
+{
+	uint64_t num, num_eunits;
+	mde_cookie_t cpus_node;
+	mde_cookie_t *node, *eunit;
+	int idx, i, j;
+	processorid_t cpuid;
+	char *eunit_name = broken_md_flag ? "exec_unit" : "exec-unit";
 
 	/*
-	 * Call cpu module specific code to fill in the cpu properities
+	 * Find the cpu integer exec units - and
+	 * setup the mappings appropriately.
 	 */
-	cpu_fiximp(cpunode);
+	num = md_alloc_scan_dag(mdp, md_root_node(mdp), "cpus", "fwd", &node);
+	if (num < 1)
+		cmn_err(CE_PANIC, "No cpus node in machine description");
+	if (num > 1)
+		cmn_err(CE_PANIC, "More than 1 cpus node in machine"
+		    " description");
+
+	cpus_node = node[0];
+	md_free_scan_dag(mdp, &node);
+
+	num_eunits = md_alloc_scan_dag(mdp, cpus_node, eunit_name,
+	    "fwd", &eunit);
+	if (num_eunits > 0) {
+		char *match_type = broken_md_flag ? "int" : "integer";
+
+		/* Spin through and find all the integer exec units */
+		for (i = 0; i < num_eunits; i++) {
+			char *p;
+			char *val;
+			int vallen;
+			uint64_t lcpuid;
+
+				/* ignore nodes with no type */
+			if (md_get_prop_data(mdp, eunit[i], "type",
+				(uint8_t **)&val, &vallen)) continue;
+
+			for (p = val; *p != '\0'; p += strlen(p) + 1) {
+				if (strcmp(p, match_type) == 0)
+					goto found;
+			}
+
+			continue;
+found:
+			idx = NCPU + i;
+			/*
+			 * find the cpus attached to this EU and
+			 * update their mapping indices
+			 */
+			num = md_alloc_scan_dag(mdp, eunit[i], "cpu",
+			    "back", &node);
+
+			if (num < 1)
+				cmn_err(CE_PANIC, "exec-unit node in MD"
+				    " not attached to a cpu node");
+
+			for (j = 0; j < num; j++) {
+				if (md_get_prop_val(mdp, node[j], "id",
+				    &lcpuid))
+					continue;
+				if (lcpuid >= NCPU)
+					continue;
+				cpuid = (processorid_t)lcpuid;
+				cpunodes[cpuid].exec_unit_mapping = idx;
+			}
+			md_free_scan_dag(mdp, &node);
+		}
+
+
+		md_free_scan_dag(mdp, &eunit);
+	}
 }
 
-#define	IOMMU_PER_SCHIZO	2
-
 /*
- * The first psycho must always programmed up for the system clock and error
- * handling purposes.
+ * All the common setup of sun4v CPU modules is done by this routine.
  */
-static void
-have_pci(pnode_t node)
+void
+cpu_setup_common(char **cpu_module_isa_set)
 {
-	int size;
-	uint_t portid;
-	char compatible[OBP_MAXDRVNAME];
+	extern int disable_delay_tlb_flush, delay_tlb_flush;
+	extern int mmu_exported_pagesize_mask;
+	extern int vac_size, vac_shift;
+	extern uint_t vac_mask;
+	int nocpus, i;
+	size_t ra_limit;
+	mde_cookie_t *cpulist;
+	md_t *mdp;
+
+	if ((mdp = md_get_handle()) == NULL)
+		cmn_err(CE_PANIC, "Unable to initialize machine description");
+
+	init_md_broken(mdp);
+
+	nocpus = md_alloc_scan_dag(mdp,
+	    md_root_node(mdp), "cpu", "fwd", &cpulist);
+	if (nocpus < 1) {
+		cmn_err(CE_PANIC, "cpu_common_setup: cpulist allocation "
+		    "failed or incorrect number of CPUs in MD");
+	}
+
+	if (use_page_coloring) {
+		do_pg_coloring = 1;
+		if (use_virtual_coloring) {
+			/*
+			 * XXX Sun4v cpus don't have virtual caches
+			 */
+			do_virtual_coloring = 1;
+		}
+	}
+
+	/*
+	 * Get the valid contexts, mmu page sizes mask, Q sizes and isalist/r
+	 * from the MD for the first available CPU in cpulist.
+	 */
+
+	if (nctxs == 0)
+		nctxs = (uint_t)(1 << get_mmu_ctx_bits(mdp, cpulist[0]));
+
+	if (nctxs > MAX_NCTXS)
+		nctxs = MAX_NCTXS;
+
+	/* Do not expect the MMU page sizes mask to be more than 32-bit. */
+	mmu_exported_pagesize_mask = (int)get_cpu_pagesizes(mdp, cpulist[0]);
+
+	for (i = 0; i < nocpus; i++)
+		fill_cpu(mdp, cpulist[i]);
+
+	setup_exec_unit_mappings(mdp);
+
+	vac_size = S_VAC_SIZE;
+	vac_mask = MMU_PAGEMASK & (vac_size - 1);
+	vac_shift = S_VAC_SHIFT;
+	shm_alignment = vac_size;
+	vac = 0;
+
+	/*
+	 * If MD is broken then append the passed ISA set,
+	 * otherwise trust the MD.
+	 */
 
-	size = GETPROPLEN(node, "portid");
-	if (size == -1) size = GETPROPLEN(node, "upa-portid");
-	if (size == -1)
-		return;
-	if (size > sizeof (portid))
-		cmn_err(CE_PANIC, "portid size wrong");
+	if (broken_md_flag)
+		isa_list = construct_isalist(mdp, cpulist[0],
+		    cpu_module_isa_set);
+	else
+		isa_list = construct_isalist(mdp, cpulist[0], NULL);
+
+	get_q_sizes(mdp, cpulist[0]);
+
+	get_va_bits(mdp, cpulist[0]);
+
+	/*
+	 * ra_limit is the highest real address in the machine.
+	 */
+	ra_limit = get_ra_limit(mdp);
+
+	md_free_scan_dag(mdp, &cpulist);
+
+	(void) md_fini_handle(mdp);
+
+	/*
+	 * Block stores invalidate all pages of the d$ so pagecopy
+	 * et. al. do not need virtual translations with virtual
+	 * coloring taken into consideration.
+	 */
+	pp_consistent_coloring = 0;
+
+	/*
+	 * The kpm mapping window.
+	 * kpm_size:
+	 *	The size of a single kpm range.
+	 *	The overall size will be: kpm_size * vac_colors.
+	 * kpm_vbase:
+	 *	The virtual start address of the kpm range within the kernel
+	 *	virtual address space. kpm_vbase has to be kpm_size aligned.
+	 */
 
-	if (GETPROP(node, "portid", (caddr_t)&portid) == -1)
-		if (GETPROP(node, "upa-portid", (caddr_t)&portid) == -1)
-			cmn_err(CE_PANIC, "portid not found");
+	/*
+	 * Make kpm_vbase, kpm_size aligned to kpm_size_shift.
+	 * To do this find the nearest power of 2 size that the
+	 * actual ra_limit fits within.
+	 * If it is an even power of two use that, otherwise use the
+	 * next power of two larger than ra_limit.
+	 */
+
+	ASSERT(ra_limit != 0);
+
+	kpm_size_shift = (ra_limit & (ra_limit - 1)) != 0 ?
+		highbit(ra_limit) : highbit(ra_limit) - 1;
+
+	/*
+	 * No virtual caches on sun4v so size matches size shift
+	 */
+	kpm_size = 1ul << kpm_size_shift;
 
-	niobus++;
+	if (va_bits < VA_ADDRESS_SPACE_BITS) {
+		/*
+		 * In case of VA hole
+		 * kpm_base = hole_end + 1TB
+		 * Starting 1TB beyond where VA hole ends because on Niagara
+		 * processor software must not use pages within 4GB of the
+		 * VA hole as instruction pages to avoid problems with
+		 * prefetching into the VA hole.
+		 */
+		kpm_vbase = (caddr_t)((0ull - (1ull << (va_bits - 1))) +
+		    (1ull << 40));
+	} else {		/* Number of VA bits 64 ... no VA hole */
+		kpm_vbase = (caddr_t)0x8000000000000000ull;	/* 8 EB */
+	}
 
+	/*
+	 * The traptrace code uses either %tick or %stick for
+	 * timestamping.  The sun4v requires use of %stick.
+	 */
+	traptrace_use_stick = 1;
 
 	/*
-	 * Need two physical TSBs for Schizo-compatible nodes,
-	 * one otherwise.
+	 * sun4v provides demap_all
+	 */
+	if (!disable_delay_tlb_flush)
+		delay_tlb_flush = 1;
+}
+
+/*
+ * Get the nctxs from MD. If absent panic.
+ */
+static uint64_t
+get_mmu_ctx_bits(md_t *mdp, mde_cookie_t cpu_node_cookie)
+{
+	uint64_t ctx_bits;
+
+	if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-#context-bits",
+	    &ctx_bits))
+		ctx_bits = 0;
+
+	if (ctx_bits < MIN_NCTXS_BITS || ctx_bits > MAX_NCTXS_BITS)
+		cmn_err(CE_PANIC, "Incorrect %ld number of contexts bits "
+		    "returned by MD", ctx_bits);
+
+	return (ctx_bits);
+}
+
+/*
+ * Initialize supported page sizes information.
+ * Set to 0, if the page sizes mask information is absent in MD.
+ */
+static uint64_t
+get_cpu_pagesizes(md_t *mdp, mde_cookie_t cpu_node_cookie)
+{
+	uint64_t mmu_page_size_list;
+
+	if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-page-size-list",
+	    &mmu_page_size_list))
+		mmu_page_size_list = 0;
+
+	if (mmu_page_size_list == 0 || mmu_page_size_list > MAX_PAGESIZE_MASK)
+		cmn_err(CE_PANIC, "Incorrect 0x%lx pagesize mask returned"
+		    "by MD", mmu_page_size_list);
+
+	return (mmu_page_size_list);
+}
+
+/*
+ * This routine gets the isalist information from MD and appends
+ * the CPU module ISA set if required.
+ */
+static char *
+construct_isalist(md_t *mdp, mde_cookie_t cpu_node_cookie,
+    char **cpu_module_isa_set)
+{
+	extern int at_flags;
+	char *md_isalist;
+	int md_isalen;
+	char *isabuf;
+	int isalen;
+	char **isa_set;
+	char *p, *q;
+	int cpu_module_isalen = 0, found = 0;
+
+	(void) md_get_prop_data(mdp, cpu_node_cookie,
+	    "isalist", (uint8_t **)&isabuf, &isalen);
+
+	/*
+	 * We support binaries for all the cpus that have shipped so far.
+	 * The kernel emulates instructions that are not supported by hardware.
+	 */
+	at_flags = EF_SPARC_SUN_US3 | EF_SPARC_32PLUS | EF_SPARC_SUN_US1;
+
+	/*
+	 * Construct the space separated isa_list.
+	 */
+	if (cpu_module_isa_set != NULL) {
+		for (isa_set = cpu_module_isa_set; *isa_set != NULL;
+		    isa_set++) {
+			cpu_module_isalen += strlen(*isa_set);
+			cpu_module_isalen++;	/* for space character */
+		}
+	}
+
+	/*
+	 * Allocate the buffer of MD isa buffer length + CPU module
+	 * isa buffer length.
 	 */
-	compatible[0] = '\0';
-	(void) prom_getprop(node, OBP_COMPATIBLE, compatible);
-	if (strcmp(compatible, "pci108e,8001") == 0)
-		niommu_tsbs += IOMMU_PER_SCHIZO;
-	else
-		niommu_tsbs++;
+	md_isalen = isalen + cpu_module_isalen + 2;
+	md_isalist = (char *)prom_alloc((caddr_t)0, md_isalen, 0);
+	if (md_isalist == NULL)
+		cmn_err(CE_PANIC, "construct_isalist: Allocation failed for "
+		    "md_isalist");
+
+	md_isalist[0] = '\0'; /* create an empty string to start */
+	for (p = isabuf, q = p + isalen; p < q; p += strlen(p) + 1) {
+		(void) strlcat(md_isalist, p, md_isalen);
+		(void) strcat(md_isalist, " ");
+	}
+
+	/*
+	 * Check if the isa_set is present in isalist returned by MD.
+	 * If yes, then no need to append it, if no then append it to
+	 * isalist returned by MD.
+	 */
+	if (cpu_module_isa_set != NULL) {
+		for (isa_set = cpu_module_isa_set; *isa_set != NULL;
+		    isa_set++) {
+			found = 0;
+			for (p = isabuf, q = p + isalen; p < q;
+			    p += strlen(p) + 1) {
+				if (strcmp(p, *isa_set) == 0) {
+					found = 1;
+					break;
+				}
+			}
+			if (!found) {
+				(void) strlcat(md_isalist, *isa_set, md_isalen);
+				(void) strcat(md_isalist, " ");
+			}
+		}
+	}
+
+	/* Get rid of any trailing white spaces */
+	md_isalist[strlen(md_isalist) - 1] = '\0';
+
+	return (md_isalist);
+}
+
+uint64_t
+get_ra_limit(md_t *mdp)
+{
+	mde_cookie_t *mem_list;
+	mde_cookie_t *mblock_list;
+	int i;
+	int memnodes;
+	int nmblock;
+	uint64_t base;
+	uint64_t size;
+	uint64_t ra_limit = 0, new_limit = 0;
+
+	memnodes = md_alloc_scan_dag(mdp,
+	    md_root_node(mdp), "memory", "fwd", &mem_list);
+
+	ASSERT(memnodes == 1);
+
+	nmblock = md_alloc_scan_dag(mdp,
+	    mem_list[0], "mblock", "fwd", &mblock_list);
+	if (nmblock < 1)
+		cmn_err(CE_PANIC, "cannot find mblock nodes in MD");
+
+	for (i = 0; i < nmblock; i++) {
+		if (md_get_prop_val(mdp, mblock_list[i], "base", &base))
+			cmn_err(CE_PANIC, "base property missing from MD"
+			    " mblock node");
+		if (md_get_prop_val(mdp, mblock_list[i], "size", &size))
+			cmn_err(CE_PANIC, "size property missing from MD"
+			    " mblock node");
+
+		ASSERT(size != 0);
+
+		new_limit = base + size;
+
+		if (base > new_limit)
+			cmn_err(CE_PANIC, "mblock in MD wrapped around");
+
+		if (new_limit > ra_limit)
+		    ra_limit = new_limit;
+	}
+
+	ASSERT(ra_limit != 0);
+
+	if (ra_limit > MAX_REAL_ADDRESS) {
+		cmn_err(CE_WARN, "Highest real address in MD too large"
+		    " clipping to %llx\n", MAX_REAL_ADDRESS);
+		ra_limit = MAX_REAL_ADDRESS;
+	}
+
+	md_free_scan_dag(mdp, &mblock_list);
+
+	md_free_scan_dag(mdp, &mem_list);
+
+	return (ra_limit);
+}
+
+/*
+ * This routine sets the globals for CPU and DEV mondo queue entries and
+ * resumable and non-resumable error queue entries.
+ */
+static uint64_t
+get_single_q_size(md_t *mdp, mde_cookie_t cpu_node_cookie,
+    char *qnamep, uint64_t default_entries)
+{
+	uint64_t entries;
+
+	if (md_get_prop_val(mdp, cpu_node_cookie, qnamep, &entries)) {
+		if (!broken_md_flag)
+			cmn_err(CE_PANIC, "Missing %s property in MD cpu node",
+				qnamep);
+		entries = default_entries;
+	} else {
+		entries = 1 << entries;
+	}
+	return (entries);
 }
 
 
-int
-get_cpu_pagesizes(void)
+static void
+get_q_sizes(md_t *mdp, mde_cookie_t cpu_node_cookie)
+{
+	cpu_q_entries = get_single_q_size(mdp, cpu_node_cookie,
+	    "q-cpu-mondo-#bits", DEFAULT_CPU_Q_ENTRIES);
+
+	dev_q_entries = get_single_q_size(mdp, cpu_node_cookie,
+	    "q-dev-mondo-#bits", DEFAULT_DEV_Q_ENTRIES);
+
+	cpu_rq_entries = get_single_q_size(mdp, cpu_node_cookie,
+	    "q-resumable-#bits", CPU_RQ_ENTRIES);
+
+	cpu_nrq_entries = get_single_q_size(mdp, cpu_node_cookie,
+		"q-nonresumable-#bits", CPU_NRQ_ENTRIES);
+}
+
+
+static void
+get_va_bits(md_t *mdp, mde_cookie_t cpu_node_cookie)
 {
+	uint64_t value = VA_ADDRESS_SPACE_BITS;
+
+	if (md_get_prop_val(mdp, cpu_node_cookie, "mmu-#va-bits", &value))
+		cmn_err(CE_PANIC, "mmu-#va-bits property  not found in MD");
+
+
+	if (value == 0 || value > VA_ADDRESS_SPACE_BITS)
+		cmn_err(CE_PANIC, "Incorrect number of va bits in MD");
+
+	/* Do not expect number of VA bits to be more than 32-bit quantity */
+
+	va_bits = (int)value;
+
+	/*
+	 * Correct the value for VA bits on UltraSPARC-T1 based systems
+	 * in case of broken MD.
+	 */
+	if (broken_md_flag)
+		va_bits = DEFAULT_VA_ADDRESS_SPACE_BITS;
+}
+
+/*
+ * This routine returns the L2 cache information such as -- associativity,
+ * size and linesize.
+ */
+static int
+get_l2_cache_info(md_t *mdp, mde_cookie_t cpu_node_cookie,
+	    uint64_t *associativity, uint64_t *size, uint64_t *linesize)
+{
+	mde_cookie_t *cachelist;
+	int ncaches, i;
+	uint64_t max_level;
+
+	ncaches = md_alloc_scan_dag(mdp, cpu_node_cookie, "cache",
+	    "fwd", &cachelist);
 	/*
-	 * XXXQ Get supported page sizes information from the PD
-	 * and return a bit mask indicating which page sizes are
-	 * supported.
-	 *
-	 * Return 0 when no information is available.
+	 * The "cache" node is optional in MD, therefore ncaches can be 0.
+	 */
+	if (ncaches < 1) {
+		return (0);
+	}
+
+	max_level = 0;
+	for (i = 0; i < ncaches; i++) {
+		uint64_t cache_level;
+		uint64_t local_assoc;
+		uint64_t local_size;
+		uint64_t local_lsize;
+
+		if (md_get_prop_val(mdp, cachelist[i], "level", &cache_level))
+			continue;
+
+		if (cache_level <= max_level) continue;
+
+		/* If properties are missing from this cache ignore it */
+
+		if ((md_get_prop_val(mdp, cachelist[i],
+		    "associativity", &local_assoc))) {
+			continue;
+		}
+
+		if ((md_get_prop_val(mdp, cachelist[i],
+		    "size", &local_size))) {
+			continue;
+		}
+
+		if ((md_get_prop_val(mdp, cachelist[i],
+		    "line-size", &local_lsize))) {
+			continue;
+		}
+
+		max_level = cache_level;
+		*associativity = local_assoc;
+		*size = local_size;
+		*linesize = local_lsize;
+	}
+
+	md_free_scan_dag(mdp, &cachelist);
+
+	return ((max_level > 0) ? 1 : 0);
+}
+
+/*
+ * The broken_md_flag is set to 1, if the MD doesn't have
+ * the domaining-enabled property in the platform node and the platforms
+ * are Ontario and Erie. This flag is used to workaround some of the
+ * incorrect MD properties.
+ */
+static void
+init_md_broken(md_t *mdp)
+{
+	int nrnode;
+	mde_cookie_t *platlist, rootnode;
+	char *vbuf;
+	uint64_t val = 0;
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	nrnode = md_alloc_scan_dag(mdp, md_root_node(mdp), "platform", "fwd",
+	    &platlist);
+
+	ASSERT(nrnode == 1);
+
+	if (md_get_prop_str(mdp, platlist[0], "name", &vbuf) != 0)
+		panic("platform name not found in machine description");
+
+	/*
+	 * If the domaining-enabled prop doesn't exist and the platform name is
+	 * Ontario or Erie the md is broken.
 	 */
 
-	return (0);			/* XXXQ for now return 0 as no PD */
+	if (md_get_prop_val(mdp, platlist[0], "domaining-enabled", &val) != 0 &&
+	    ((strcmp(vbuf, ONTARIO_PLATNAME1) == 0) ||
+	    (strcmp(vbuf, ONTARIO_PLATNAME2) == 0) ||
+	    (strcmp(vbuf, ERIE_PLATNAME1) == 0) ||
+	    (strcmp(vbuf, ERIE_PLATNAME2) == 0)))
+		broken_md_flag = 1;
+
+	md_free_scan_dag(mdp, &platlist);
 }
--- a/usr/src/uts/sun4v/os/hsvc.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/hsvc.c	Tue May 16 16:05:21 2006 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -653,12 +654,15 @@
  * uses hypervisor services belonging to the HSVC_GROUP_CORE API
  * group only for itself.
  *
- * Note that the HSVC_GROUP_DIAG is negotiated on behalf of
- * any driver/module using DIAG services.
+ * Rest of the API groups are currently negotiated on behalf
+ * of the pcitool, glvc support.  In future, when these drivers
+ * are modified to do the negotiation themselves, corresponding
+ * entry should be removed from the table below.
  */
 static hsvc_info_t  hsvcinfo_unix[] = {
 	{HSVC_REV_1, NULL,	HSVC_GROUP_SUN4V,	1,	0, NULL},
-	{HSVC_REV_1, NULL,	HSVC_GROUP_CORE,	1,	0, NULL},
+	{HSVC_REV_1, NULL,	HSVC_GROUP_CORE,	1,	1, NULL},
+	{HSVC_REV_1, NULL,	HSVC_GROUP_VSC,		1,	0, NULL},
 	{HSVC_REV_1, NULL,	HSVC_GROUP_DIAG,	1,	0, NULL}
 };
 
--- a/usr/src/uts/sun4v/os/intrq.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/intrq.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -34,28 +34,6 @@
 #include <sys/error.h>
 #include <sys/hypervisor_api.h>
 
-/*
- * XXX needs to be set by some algorithm that derives this
- * from the partition description
- */
-int cpu_q_entries = 128;
-int dev_q_entries = 128;
-
-/*
- * Once the partition description if finallized
- * cpu_q_entries and dev_q_entries will be set
- * and be garaunteed to be two's power multiples.
- */
-#define	INTR_CPU_Q	0x3c
-#define	INTR_DEV_Q	0x3d
-#define	INTR_REPORT_SIZE	64
-#define	INTR_CPU_Q_SIZE	(cpu_q_entries * INTR_REPORT_SIZE)
-#define	INTR_DEV_Q_SIZE	(dev_q_entries * INTR_REPORT_SIZE)
-
-/*
- * XXX -  This needs to be rewritten with prom calls to
- * let OBP know the queues are allocated
- */
 void
 cpu_intrq_register(struct cpu *cpu)
 {
@@ -72,13 +50,12 @@
 		cmn_err(CE_PANIC, "cpu%d: dev_mondo queue configuration "
 		    "failed, error %lu", cpu->cpu_id, ret);
 
-	ret = hv_cpu_qconf(CPU_RQ, mcpup->cpu_rq_base_pa, CPU_RQ_ENTRIES);
+	ret = hv_cpu_qconf(CPU_RQ, mcpup->cpu_rq_base_pa, cpu_rq_entries);
 	if (ret != H_EOK)
 		cmn_err(CE_PANIC, "cpu%d: resumable error queue configuration "
 		    "failed, error %lu", cpu->cpu_id, ret);
 
-	ret = hv_cpu_qconf(CPU_NRQ, mcpup->cpu_nrq_base_pa,
-	    CPU_NRQ_ENTRIES);
+	ret = hv_cpu_qconf(CPU_NRQ, mcpup->cpu_nrq_base_pa, cpu_nrq_entries);
 	if (ret != H_EOK)
 		cmn_err(CE_PANIC, "cpu%d: non-resumable error queue "
 		    "configuration failed, error %lu", cpu->cpu_id, ret);
@@ -89,6 +66,10 @@
 {
 	struct machcpu *mcpup = &cpu->cpu_m;
 	int cpu_list_size;
+	uint64_t cpu_q_size;
+	uint64_t dev_q_size;
+	uint64_t cpu_rq_size;
+	uint64_t cpu_nrq_size;
 
 	/*
 	 * Allocate mondo data for xcalls.
@@ -120,38 +101,109 @@
 	/*
 	 * Allocate sun4v interrupt and error queues.
 	 */
-	mcpup->cpu_q_va = contig_mem_alloc(INTR_CPU_Q_SIZE);
+	cpu_q_size = cpu_q_entries * INTR_REPORT_SIZE;
+	mcpup->cpu_q_va = contig_mem_alloc(cpu_q_size);
 	if (mcpup->cpu_q_va == NULL)
 		cmn_err(CE_PANIC, "cpu%d: cpu intrq allocation failed",
 		    cpu->cpu_id);
 	mcpup->cpu_q_base_pa = va_to_pa(mcpup->cpu_q_va);
-	mcpup->cpu_q_size =  INTR_CPU_Q_SIZE;
+	mcpup->cpu_q_size =  cpu_q_size;
 
-	mcpup->dev_q_va = contig_mem_alloc(INTR_DEV_Q_SIZE);
+	dev_q_size = dev_q_entries * INTR_REPORT_SIZE;
+	mcpup->dev_q_va = contig_mem_alloc(dev_q_size);
 	if (mcpup->dev_q_va == NULL)
 		cmn_err(CE_PANIC, "cpu%d: dev intrq allocation failed",
 		    cpu->cpu_id);
 	mcpup->dev_q_base_pa = va_to_pa(mcpup->dev_q_va);
-	mcpup->dev_q_size =  INTR_DEV_Q_SIZE;
+	mcpup->dev_q_size =  dev_q_size;
 
 	/* Allocate resumable queue and its kernel buffer */
-	mcpup->cpu_rq_va = contig_mem_alloc(2 * CPU_RQ_SIZE);
+	cpu_rq_size = cpu_rq_entries * Q_ENTRY_SIZE;
+	mcpup->cpu_rq_va = contig_mem_alloc(2 * cpu_rq_size);
 	if (mcpup->cpu_rq_va == NULL)
 		cmn_err(CE_PANIC, "cpu%d: resumable queue allocation failed",
 		    cpu->cpu_id);
 	mcpup->cpu_rq_base_pa = va_to_pa(mcpup->cpu_rq_va);
-	mcpup->cpu_rq_size = CPU_RQ_SIZE;
+	mcpup->cpu_rq_size = cpu_rq_size;
 	/* zero out the memory */
-	bzero(mcpup->cpu_rq_va, 2 * CPU_RQ_SIZE);
+	bzero(mcpup->cpu_rq_va, 2 * cpu_rq_size);
 
 	/* Allocate nonresumable queue here */
-	mcpup->cpu_nrq_va = contig_mem_alloc(2 * CPU_NRQ_SIZE);
+	cpu_nrq_size = cpu_nrq_entries * Q_ENTRY_SIZE;
+	mcpup->cpu_nrq_va = contig_mem_alloc(2 * cpu_nrq_size);
 	if (mcpup->cpu_nrq_va == NULL)
 		cmn_err(CE_PANIC, "cpu%d: nonresumable queue "
 		    "allocation failed", cpu->cpu_id);
 	mcpup->cpu_nrq_base_pa = va_to_pa(mcpup->cpu_nrq_va);
-	mcpup->cpu_nrq_size = CPU_NRQ_SIZE;
+	mcpup->cpu_nrq_size = cpu_nrq_size;
 	/* zero out the memory */
-	bzero(mcpup->cpu_nrq_va, 2 * CPU_NRQ_SIZE);
+	bzero(mcpup->cpu_nrq_va, 2 * cpu_nrq_size);
+}
+
+void
+cpu_intrq_cleanup(struct cpu *cpu)
+{
+	struct machcpu *mcpup = &cpu->cpu_m;
+	int cpu_list_size;
+	uint64_t cpu_q_size;
+	uint64_t dev_q_size;
+	uint64_t cpu_rq_size;
+	uint64_t cpu_nrq_size;
+
+	/*
+	 * Free mondo data for xcalls.
+	 */
+	if (mcpup->mondo_data) {
+		contig_mem_free(mcpup->mondo_data, INTR_REPORT_SIZE);
+		mcpup->mondo_data = NULL;
+		mcpup->mondo_data_ra = NULL;
+	}
+
+	/*
+	 *  Free percpu list of NCPU for xcalls
+	 */
+	cpu_list_size = NCPU * sizeof (uint16_t);
+	if (cpu_list_size < INTR_REPORT_SIZE)
+		cpu_list_size = INTR_REPORT_SIZE;
+
+	if (mcpup->cpu_list) {
+		contig_mem_free(mcpup->cpu_list, cpu_list_size);
+		mcpup->cpu_list = NULL;
+		mcpup->cpu_list_ra = NULL;
+	}
 
+	/*
+	 * Free sun4v interrupt and error queues.
+	 */
+	if (mcpup->cpu_q_va) {
+		cpu_q_size = cpu_q_entries * INTR_REPORT_SIZE;
+		contig_mem_free(mcpup->cpu_q_va, cpu_q_size);
+		mcpup->cpu_q_va = NULL;
+		mcpup->cpu_q_base_pa = NULL;
+		mcpup->cpu_q_size = 0;
+	}
+
+	if (mcpup->dev_q_va) {
+		dev_q_size = dev_q_entries * INTR_REPORT_SIZE;
+		contig_mem_free(mcpup->dev_q_va, dev_q_size);
+		mcpup->dev_q_va = NULL;
+		mcpup->dev_q_base_pa = NULL;
+		mcpup->dev_q_size = 0;
+	}
+
+	if (mcpup->cpu_rq_va) {
+		cpu_rq_size = cpu_rq_entries * Q_ENTRY_SIZE;
+		contig_mem_free(mcpup->cpu_rq_va, 2 * cpu_rq_size);
+		mcpup->cpu_rq_va = NULL;
+		mcpup->cpu_rq_base_pa = NULL;
+		mcpup->cpu_rq_size = 0;
+	}
+
+	if (mcpup->cpu_nrq_va) {
+		cpu_nrq_size = cpu_nrq_entries * Q_ENTRY_SIZE;
+		contig_mem_free(mcpup->cpu_nrq_va, 2 * cpu_nrq_size);
+		mcpup->cpu_nrq_va = NULL;
+		mcpup->cpu_nrq_base_pa = NULL;
+		mcpup->cpu_nrq_size = 0;
+	}
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/os/lpad.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,231 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/machsystm.h>
+#include <sys/machparam.h>
+#include <sys/cmn_err.h>
+#include <sys/cpuvar.h>
+#include <sys/note.h>
+#include <sys/hypervisor_api.h>
+#include <sys/lpad.h>
+
+typedef struct {
+	uint64_t	inuse;
+	uint64_t	buf[LPAD_SIZE / sizeof (uint64_t)];
+} lpad_t;
+
+/*
+ * A global pool of landing pad memory. Currently, CPUs are only
+ * brought into the system one at a time, so the pool is only a
+ * single landing pad. In the future, it may be desirable to bring
+ * CPUs into the systems in parallel. At that time, the size of
+ * the pool can be increased by changing the pool size constant.
+ */
+#define	LPAD_POOL_SIZE	1
+
+static lpad_t	lpad_pool[LPAD_POOL_SIZE];
+
+#ifdef DEBUG
+static int lpad_dbg = 0;
+
+#define	LPAD_DBG		if (lpad_dbg) printf
+#define	LPAD_DUMP_DATA		lpad_dump_data
+
+static void lpad_dump_data(uint64_t *lpd_start, uint64_t *lpd_end);
+
+#else /* DEBUG */
+
+#define	LPAD_DBG		_NOTE(CONSTCOND) if (0) printf
+#define	LPAD_DUMP_DATA
+#endif /* DEBUG */
+
+extern void mach_cpu_startup(uint64_t rabase, uint64_t memsize);
+extern void mach_cpu_startup_end(void);
+extern int promif_in_cif(void);
+
+static lpad_t *lpad_alloc(void);
+
+uint64_t *
+lpad_setup(int cpuid, uint64_t pc, uint64_t arg)
+{
+	lpad_t		*lpp;
+	uint64_t	textsz;
+	uint64_t	datasz;
+	lpad_data_t	*lpd;
+	lpad_map_t	*lpm;
+
+	/* external parameters */
+	extern caddr_t	textva;
+	extern caddr_t	datava;
+	extern tte_t	ktext_tte;
+	extern tte_t	kdata_tte;
+	extern caddr_t	mmu_fault_status_area;
+
+	LPAD_DBG("lpad_setup...\n");
+
+	if ((cpuid < 0) || (cpuid > NCPU)) {
+		cmn_err(CE_PANIC, "lpad_setup: invalid cpuid");
+	}
+
+	/* allocate our landing pad */
+	if ((lpp = lpad_alloc()) == NULL) {
+		cmn_err(CE_PANIC, "lpad_setup: unable to allocate lpad");
+	}
+
+	/* calculate the size of our text */
+	textsz = (uint64_t)mach_cpu_startup_end - (uint64_t)mach_cpu_startup;
+
+	LPAD_DBG("lpad textsz=%ld\n", textsz);
+
+	ASSERT(textsz <= LPAD_TEXT_SIZE);
+
+	/* copy over text section */
+	bcopy((void *)mach_cpu_startup, lpp->buf, textsz);
+
+	lpd = (lpad_data_t *)(((caddr_t)lpp->buf) + LPAD_TEXT_SIZE);
+	lpm = (lpad_map_t *)lpd->map;
+
+	ASSERT(mmu_fault_status_area);
+
+	bzero(lpd, LPAD_TEXT_SIZE);
+	lpd->magic = LPAD_MAGIC_VAL;
+	lpd->inuse = &(lpp->inuse);
+	lpd->mmfsa_ra = va_to_pa(mmu_fault_status_area) + (MMFSA_SIZE * cpuid);
+	lpd->pc = pc;
+	lpd->arg = arg;
+
+	/*
+	 * List of mappings:
+	 *
+	 *    - permanent inst/data mapping for kernel text
+	 *    - permanent data mapping for kernel data
+	 *    - non-permanent inst mapping for kernel data,
+	 *	required for landing pad text
+	 */
+	lpd->nmap = 3;
+
+	/* verify the lpad has enough room for the data */
+	datasz = sizeof (lpad_data_t);
+	datasz += (lpd->nmap - 1) * sizeof (lpad_map_t);
+
+	ASSERT(datasz <= LPAD_DATA_SIZE);
+
+	/*
+	 * Kernel Text Mapping
+	 */
+	lpm->va = (uint64_t)textva;
+	lpm->tte = ktext_tte;
+	lpm->flag_mmuflags = (MAP_ITLB | MAP_DTLB);
+	lpm->flag_perm = 1;
+	lpm++;
+
+	/*
+	 * Kernel Data Mapping
+	 */
+	lpm->va = (uint64_t)datava;
+	lpm->tte = kdata_tte;
+	lpm->flag_mmuflags = MAP_DTLB;
+	lpm->flag_perm = 1;
+	lpm++;
+
+	/*
+	 * Landing Pad Text Mapping
+	 *
+	 * Because this mapping should not be permanent,
+	 * the permanent mapping above cannot be used.
+	 */
+	lpm->va = (uint64_t)datava;
+	lpm->tte = kdata_tte;
+	lpm->flag_mmuflags = MAP_ITLB;
+	lpm->flag_perm = 0;
+	lpm++;
+
+	ASSERT(((uint64_t)lpm - (uint64_t)lpd) == datasz);
+
+	LPAD_DBG("copied %ld bytes of data into lpad\n", datasz);
+
+	LPAD_DUMP_DATA((uint64_t *)lpd, (uint64_t *)lpm);
+
+	return (lpp->buf);
+}
+
+static lpad_t *
+lpad_alloc(void)
+{
+	int	idx;
+
+	/*
+	 * No locking is required for the global lpad pool since
+	 * it should only be accessed while in the CIF which is
+	 * single threaded. If this assumption changes, locking
+	 * would be required.
+	 */
+	ASSERT(promif_in_cif());
+
+	/*
+	 * Wait until an lpad buffer becomes available.
+	 */
+	for (;;) {
+		LPAD_DBG("checking lpad pool:\n");
+
+		/* walk the lpad buffer array */
+		for (idx = 0; idx < LPAD_POOL_SIZE; idx++) {
+
+			LPAD_DBG("\tchecking lpad_pool[%d]\n", idx);
+
+			if (lpad_pool[idx].inuse == 0) {
+				LPAD_DBG("found empty lpad (%d)\n", idx);
+
+				/* mark the buffer as busy */
+				lpad_pool[idx].inuse = 1;
+
+				return (&lpad_pool[idx]);
+			}
+		}
+	}
+}
+
+#ifdef DEBUG
+static void
+lpad_dump_data(uint64_t *lpd_start, uint64_t *lpd_end)
+{
+	uint64_t	*lp;
+	uint_t		offset = 0;
+
+	if (lpad_dbg == 0)
+		return;
+
+	printf("lpad data:\n");
+
+	for (lp = lpd_start; lp < lpd_end; lp++) {
+		printf("\t0x%02x  0x%016lx\n", offset, *lp);
+		offset += sizeof (uint64_t);
+	}
+}
+#endif /* DEBUG */
--- a/usr/src/uts/sun4v/os/mach_cpu_states.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/mach_cpu_states.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -49,6 +48,8 @@
 #include <sys/dtrace.h>
 #include <sys/xc_impl.h>
 #include <sys/callb.h>
+#include <sys/mdesc.h>
+#include <sys/mach_descrip.h>
 
 /*
  * hvdump_buf_va is a pointer to the currently-configured hvdump_buf.
@@ -438,6 +439,12 @@
 		"CPU ECC error loop",		/* PTL1_BAD_ECC */
 		"unexpected error from hypervisor call", /* PTL1_BAD_HCALL */
 		"unexpected global level(%gl)", /* PTL1_BAD_GL */
+		"Watchdog Reset", 		/* PTL1_BAD_WATCHDOG */
+		"unexpected RED mode trap", 	/* PTL1_BAD_RED */
+		"return value EINVAL from hcall: "\
+		    "UNMAP_PERM_ADDR",	/* PTL1_BAD_HCALL_UNMAP_PERM_EINVAL */
+		"return value ENOMAP from hcall: "\
+		    "UNMAP_PERM_ADDR", /* PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP */
 	};
 
 	uint_t reason = pstate->ptl1_regs.ptl1_gregs[0].ptl1_g1;
@@ -559,7 +566,45 @@
 void
 cpu_init_tick_freq(void)
 {
-	sys_tick_freq = cpunodes[CPU->cpu_id].clock_freq;
+	md_t *mdp;
+	mde_cookie_t rootnode;
+	int		listsz;
+	mde_cookie_t	*listp = NULL;
+	int	num_nodes;
+	uint64_t stick_prop;
+
+	if (broken_md_flag) {
+		sys_tick_freq = cpunodes[CPU->cpu_id].clock_freq;
+		return;
+	}
+
+	if ((mdp = md_get_handle()) == NULL)
+		panic("stick_frequency property not found in MD");
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	num_nodes = md_node_count(mdp);
+
+	ASSERT(num_nodes > 0);
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = (mde_cookie_t *)prom_alloc((caddr_t)0, listsz, 0);
+
+	if (listp == NULL)
+		panic("cannot allocate list for MD properties");
+
+	num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "platform"),
+	    md_find_name(mdp, "fwd"), listp);
+
+	ASSERT(num_nodes == 1);
+
+	if (md_get_prop_val(mdp, *listp, "stick-frequency", &stick_prop) != 0)
+		panic("stick_frequency property not found in MD");
+
+	sys_tick_freq = stick_prop;
+
+	prom_free((caddr_t)listp, listsz);
+	(void) md_fini_handle(mdp);
 }
 
 int shipit(int n, uint64_t cpu_list_ra);
--- a/usr/src/uts/sun4v/os/mach_descrip.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/mach_descrip.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,13 +18,33 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+/*
+ * Kernel Machine Description (MD)
+ *
+ * The Kernel maintains a global copy of the machine description for
+ * the system. This is for use by all kernel subsystems and is exported
+ * to user applications through the 'mdesc' device driver. It is
+ * initially copied in from the Hypervisor at boot time, but can be
+ * updated dynamically on demand. The Kernel provides an interface
+ * for consumers to obtain a handle to the global MD. Consumers of the
+ * MD must use the specified interfaces. An update interface is provided
+ * for platform services to initiate an MD update on notification by a
+ * service entity.
+ *
+ * Locks
+ * The current global MD is protected by the curr_mach_descrip_lock.
+ * Each Machine description has a lock to synchronize its ref count.
+ * The Obsolete MD list is protected by the obs_list_lock.
+ */
+
 #include <sys/machsystm.h>
 #include <sys/vm.h>
 #include <sys/cpu.h>
@@ -37,32 +56,93 @@
 #include <sys/error.h>
 #include <sys/hypervisor_api.h>
 #include <sys/types.h>
-#include <sys/kstat.h>
-#ifdef MACH_DESC_DEBUG
-#include <sys/promif.h>		/* for prom_printf */
-#endif
 #include <sys/sysmacros.h>
+#include <sys/mdesc.h>
+#include <sys/mdesc_impl.h>
 #include <sys/mach_descrip.h>
+#include <sys/prom_plat.h>
+#include <sys/bootconf.h>
+#include <sys/promif.h>
+
+
+static void *mach_descrip_strt_meta_alloc(size_t size);
+static void mach_descrip_strt_meta_free(void *buf, size_t size);
+static void *mach_descrip_strt_buf_alloc(size_t size, size_t align);
+static void mach_descrip_strt_buf_free(void *buf, size_t size);
+static void *mach_descrip_buf_alloc(size_t size, size_t align);
+static void *mach_descrip_meta_alloc(size_t size);
+static uint64_t mach_descrip_find_md_gen(caddr_t ptr);
+static void init_md_params(void);
+static void init_domaining_enabled(md_t *mdp, mde_cookie_t *listp);
+
+extern struct bootops *bootops;
+
+/*
+ * Global ptr of the current generation Machine Description
+ */
+static machine_descrip_t *curr_mach_descrip;
 
 /*
- * Basic code to pull in the machine description from the Hypervisor
- * An equivalent to this should really be available from mlsetup
- * for really early info, but for the time being we are content to
- * invoke this from startup_end once the VM system has been initialised.
- * To do this we use the intrq allocator which means that
- * this function should be called after intrq_init();
- * We try and do this early enough however that it is useful to other
- * components within the kernel.
- * Also, user-level entities can grab the machine description via
- * kstat and/or the mdesc device driver.
+ * Initialized by machine_descrip_startup_init in startup.
+ * machine_descrip_init will reinitialize the structure with
+ * the vmem allocators once the vmem is available in the boot up
+ * process.
+ */
+static machine_descrip_memops_t *curr_mach_descrip_memops = NULL;
+
+static machine_descrip_memops_t startup_memops = {
+	mach_descrip_strt_buf_alloc,
+	mach_descrip_strt_buf_free,
+	mach_descrip_strt_meta_alloc,
+	mach_descrip_strt_meta_free,
+};
+
+static machine_descrip_memops_t mach_descrip_memops = {
+	mach_descrip_buf_alloc,
+	contig_mem_free,
+	mach_descrip_meta_alloc,
+	kmem_free,
+};
+
+static kmutex_t curr_mach_descrip_lock;
+/*
+ * List of obsolete Machine Descriptions
+ * Machine descriptions that have users are put on this list
+ * and freed after the last user has called md_fini_handle.
  */
+static machine_descrip_t *obs_machine_descrip_list;
 
+static kmutex_t obs_list_lock;
+
+static const char alloc_fail_msg[] =
+	"MD: cannot allocate MD buffer of size %ld bytes\n";
 
-machine_descrip_t machine_descrip;
+/*
+ * Global flag that indicates whether domaining features are
+ * available. The value is set at boot time based on the value
+ * of the 'domaining-enabled' property in the MD and the global
+ * override flag below. Updates to this variable after boot are
+ * not supported.
+ */
+uint_t domaining_enabled;
 
+/*
+ * Global override for the 'domaining_enabled' flag. If this
+ * flag is set in /etc/system, domaining features are disabled,
+ * ignoring the value of the 'domaining-enabled' property in
+ * the MD.
+ */
+uint_t force_domaining_disabled;
 
-#ifdef MACH_DESC_DEBUG
-#define	MDP(ARGS)	prom_printf ARGS
+#define	HAS_GEN(x)	(x != MDESC_INVAL_GEN)
+
+#ifdef DEBUG
+static int mach_descrip_debug = 0;
+
+#define	MDP(ARGS)	if (mach_descrip_debug) prom_printf ARGS
+#define	PRINT_LIST() 	if (mach_descrip_debug) print_obs_list()
+
+#ifdef	MACH_DESC_DEBUG
 static void
 dump_buf(uint8_t *bufp, int size)
 {
@@ -75,74 +155,711 @@
 		prom_printf("\n");
 	}
 }
+#endif /* MACH_DESC_DEBUG */
+
+static void
+print_obs_list(void)
+{
+	machine_descrip_t *lmdescp;
+	mutex_enter(&obs_list_lock);
+
+	lmdescp	= obs_machine_descrip_list;
+	prom_printf("MD_obs_list->");
+	while (lmdescp != NULL) {
+		prom_printf("g:%ld,r:%d", lmdescp->gen, lmdescp->refcnt);
+
+		lmdescp = lmdescp->next;
+		prom_printf("->");
+	}
+	prom_printf("NULL\n");
+	mutex_exit(&obs_list_lock);
+}
+
 #else
-#define	MDP(x)
-#endif
+#define	MDP(ARGS)
+#define	PRINT_LIST()
+#endif /* DEBUG */
+
+/*
+ * MD obsolete list management functions
+ */
+static machine_descrip_t *
+md_obs_list_look_up_by_gen(uint64_t gen)
+{
+	machine_descrip_t *mdescp;
+
+	mutex_enter(&obs_list_lock);
+	mdescp = obs_machine_descrip_list;
 
+	while (mdescp != NULL) {
+		if (mdescp->gen == gen) {
+			mutex_exit(&obs_list_lock);
+			return (mdescp);
+		}
+		mdescp = mdescp->next;
+	}
 
+	mutex_exit(&obs_list_lock);
+	return (mdescp);
+}
+
+static void
+md_obs_list_remove(machine_descrip_t *mdescp)
+{
+	machine_descrip_t *lmdescp;
+
+	mutex_enter(&obs_list_lock);
+
+	lmdescp	= obs_machine_descrip_list;
 
+	if (obs_machine_descrip_list == mdescp) {
+		obs_machine_descrip_list = mdescp->next;
+	} else {
+		while (lmdescp != NULL) {
+			if (lmdescp->next == mdescp) {
+				lmdescp->next = mdescp->next;
+				mdescp->next = NULL;
+				break;
+			}
+			lmdescp = lmdescp->next;
+		}
+	}
+	mutex_exit(&obs_list_lock);
+	PRINT_LIST();
+}
 
+static void
+md_obs_list_add(machine_descrip_t *mdescp)
+{
+	mutex_enter(&obs_list_lock);
+
+	mdescp->next = obs_machine_descrip_list;
+	obs_machine_descrip_list = mdescp;
+
+	mutex_exit(&obs_list_lock);
+	PRINT_LIST();
+}
+
+/*
+ * Allocate a machine_descrip meta structure and initialize it.
+ */
+static machine_descrip_t *
+new_mach_descrip(void)
+{
+	machine_descrip_t *mdescp;
 
-void
-mach_descrip_init(void)
+	mdescp = (machine_descrip_t *)(*curr_mach_descrip_memops->meta_allocp)
+	    (sizeof (machine_descrip_t));
+	if (mdescp != NULL) {
+		bzero(mdescp, sizeof (*mdescp));
+		mdescp->memops = curr_mach_descrip_memops;
+		mutex_init(&mdescp->lock, NULL, MUTEX_DRIVER, NULL);
+	}
+
+	return (mdescp);
+}
+
+/*
+ * Free a machine_descrip meta structure.
+ * Also free the MD buffer.
+ */
+static void
+destroy_machine_descrip(machine_descrip_t *mdescp)
 {
-	uint64_t md_size, ret;
+	machine_descrip_memops_t  *mdesc_memopsp;
+
+	ASSERT((mdescp != NULL));
+
+	mdesc_memopsp = mdescp->memops;
+	if (mdescp->memops == NULL)
+		panic("destroy_machine_descrip: memops NULL\n");
+
+	(*mdesc_memopsp->buf_freep)(mdescp->va, mdescp->space);
+	mutex_destroy(&mdescp->lock);
+	(*mdesc_memopsp->meta_freep)(mdescp, sizeof (*mdescp));
+}
+
+/*
+ * Call into the Hypervisor to retrieve the most recent copy of the
+ * machine description. If references to the current MD are active
+ * stow it in the obsolete MD list and update the current MD reference
+ * with the new one.
+ * The obsolete list contains one MD per generation. If the firmware
+ * doesn't support MD generation fail the call.
+ */
+int
+mach_descrip_update(void)
+{
+	uint64_t	md_size0, md_size;
+	uint64_t	md_space = 0;
+	uint64_t	hvret;
+	caddr_t		tbuf = NULL;
+	uint64_t	tbuf_pa;
+	uint64_t	tgen;
+	int		ret = 0;
 
 	MDP(("MD: Requesting buffer size\n"));
 
-	md_size = 0LL;
-	(void) hv_mach_desc((uint64_t)0, &md_size);
-	MDP(("MD: buffer size is %d\n", md_size));
+	ASSERT((curr_mach_descrip != NULL));
+
+	mutex_enter(&curr_mach_descrip_lock);
 
 	/*
-	 * Align allocated space to nearest page contig_mem_alloc_align
-	 * requires a Power of 2 alignment
+	 * If the required MD size changes between our first call
+	 * to hv_mach_desc (to find the required buf size) and the
+	 * second call (to get the actual MD), the MD was in the
+	 * process of being updated. Loop until the two sizes are
+	 * identical.
 	 */
-	machine_descrip.space = P2ROUNDUP(md_size, PAGESIZE);
-	MDP(("MD: allocated space is %d\n", machine_descrip.space));
-	machine_descrip.va = contig_mem_alloc_align(machine_descrip.space,
-	    PAGESIZE);
-	if (machine_descrip.va == NULL)
-		cmn_err(CE_PANIC, "Allocation for machine description failed");
+	do {
+		if (tbuf != NULL)
+			(*curr_mach_descrip_memops->buf_freep)(tbuf, md_space);
+
+		md_size0 = 0LL;
+		(void) hv_mach_desc((uint64_t)0, &md_size0);
+		MDP(("MD: buffer size is %ld\n", md_size0));
+
+		/*
+		 * Align allocated space to nearest page.
+		 * contig_mem_alloc_align() requires a power of 2 alignment.
+		 */
+		md_space = P2ROUNDUP(md_size0, PAGESIZE);
+		MDP(("MD: allocated space is %ld\n", md_space));
+
+		tbuf = (caddr_t)(*curr_mach_descrip_memops->buf_allocp)
+		    (md_space, PAGESIZE);
+		if (tbuf == NULL) {
+			ret = -1;
+			goto done;
+		}
 
-	MDP(("MD: allocated va = 0x%p (size 0x%llx)\n",
-		machine_descrip.va, machine_descrip.space));
-
-	machine_descrip.pa = va_to_pa(machine_descrip.va);
+		tbuf_pa =  va_to_pa(tbuf);
+		hvret = hv_mach_desc(tbuf_pa, &md_size);
+		MDP(("MD: HV return code = %ld\n", hvret));
 
-	MDP(("MD: allocated pa = 0x%llx\n", machine_descrip.pa));
+		/*
+		 * We get H_EINVAL if our buffer size is too small. In
+		 * that case stay in the loop, reallocate the buffer
+		 * and try again.
+		 */
+		if (hvret != H_EOK && hvret != H_EINVAL) {
+			MDP(("MD: Failed with code %ld from HV\n", hvret));
+			ret = -1;
+			goto done;
+		}
 
-	ret = hv_mach_desc(machine_descrip.pa, &md_size);
-	MDP(("MD: HV return code = %ld\n", ret));
+	} while (md_size0 != md_size || hvret == H_EINVAL);
+
+	tgen = mach_descrip_find_md_gen(tbuf);
 
-	if (ret != H_EOK) {
-		MDP(("MD: Failed with code %ld from HV\n", ret));
+#ifdef DEBUG
+	if (!HAS_GEN(tgen)) {
+		MDP(("MD: generation number not found\n"));
+	} else
+		MDP(("MD: generation number %ld\n", tgen));
+#endif /* DEBUG */
+
+	if (curr_mach_descrip->va != NULL) {
 
-		machine_descrip.size = 0;
-
-	} else {
-		MDP(("MD: Grabbed %d bytes from HV\n", md_size));
-#ifdef	MACH_DESC_DEBUG
-		dump_buf((uint8_t *)machine_descrip.va, md_size);
-#endif	/* MACH_DESC_DEBUG */
-
-		machine_descrip.size = md_size;
-
+		/* check for the same generation number */
+		if (HAS_GEN(tgen) && ((curr_mach_descrip->gen == tgen) &&
+		    (curr_mach_descrip->size == md_size))) {
+#ifdef DEBUG
 			/*
-			 * Allocate the kstat to get at the data
+			 * Pedantic Check for generation number. If the
+			 * generation number is the same, make sure the
+			 * MDs are really identical.
 			 */
-		machine_descrip.ksp = kstat_create("unix", 0, "machdesc",
-		    "misc",
-		    KSTAT_TYPE_RAW,
-		    (uint_t)machine_descrip.size,
-		    KSTAT_FLAG_VIRTUAL);
+			if (bcmp(curr_mach_descrip->va, tbuf, md_size) != 0) {
+				cmn_err(CE_WARN, "machine_descrip_update: MDs "
+				    "with the same generation (%ld) are not "
+				    "identical", tgen);
+				ret = -1;
+				goto done;
+			}
+#endif
+			cmn_err(CE_WARN, "machine_descrip_update: new MD has "
+			    "the same generation (%ld) as the old MD", tgen);
+			ret = 0;
+			goto done;
+		}
+
+		/* check for generations moving backwards */
+		if (HAS_GEN(tgen) && HAS_GEN(curr_mach_descrip->gen) &&
+		    (curr_mach_descrip->gen > tgen)) {
+			cmn_err(CE_WARN, "machine_descrip_update: new MD"
+			    " older generation (%ld) than current MD (%ld)",
+			    tgen, curr_mach_descrip->gen);
+			ret = -1;
+			goto done;
+		}
+
+		if (curr_mach_descrip->refcnt == 0) {
+
+			MDP(("MD: freeing old md buffer gen %ld\n",
+			    curr_mach_descrip->gen));
 
-		if (machine_descrip.ksp == NULL) {
-			cmn_err(CE_PANIC,
-			    "Failed to create kstat for machine description");
+			/* Free old space */
+			ASSERT(curr_mach_descrip->space > 0);
+
+			(*curr_mach_descrip_memops->buf_freep)
+			    (curr_mach_descrip->va, curr_mach_descrip->space);
 		} else {
-			machine_descrip.ksp->ks_data = machine_descrip.va;
-			kstat_install(machine_descrip.ksp);
+			if (!HAS_GEN(tgen)) {
+				/*
+				 * No update support if FW
+				 * doesn't have MD generation id
+				 * feature.
+				 */
+				prom_printf("WARNING: F/W does not support MD "
+				    "generation count, MD update failed\n");
+				ret = -1;
+				goto done;
+			}
+
+			MDP(("MD: adding to obs list %ld\n",
+			    curr_mach_descrip->gen));
+
+			md_obs_list_add(curr_mach_descrip);
+
+			curr_mach_descrip = new_mach_descrip();
+
+			if (curr_mach_descrip == NULL) {
+				panic("Allocation for machine description"
+				    " failed\n");
+			}
 		}
 	}
+
+	curr_mach_descrip->va = tbuf;
+	curr_mach_descrip->gen = tgen;
+	curr_mach_descrip->size = md_size;
+	curr_mach_descrip->space = md_space;
+
+#ifdef MACH_DESC_DEBUG
+	dump_buf((uint8_t *)curr_mach_descrip->va, md_size);
+#endif /* MACH_DESC_DEBUG */
+
+	mutex_exit(&curr_mach_descrip_lock);
+	return (ret);
+
+done:
+	if (tbuf != NULL)
+		(*curr_mach_descrip_memops->buf_freep)(tbuf, md_space);
+	mutex_exit(&curr_mach_descrip_lock);
+	return (ret);
 }
+
+static void *
+mach_descrip_buf_alloc(size_t size, size_t align)
+{
+	void *p;
+
+	if ((p = contig_mem_alloc_align(size, align)) == NULL)
+		cmn_err(CE_WARN, alloc_fail_msg, size);
+
+	return (p);
+}
+
+static void *
+mach_descrip_strt_meta_alloc(size_t size)
+{
+	return (BOP_ALLOC(bootops, (caddr_t)0, size, PAGESIZE));
+}
+
+static void
+mach_descrip_strt_meta_free(void *buf, size_t size)
+{
+	BOP_FREE(bootops, buf, size);
+}
+
+static void *
+mach_descrip_strt_buf_alloc(size_t size, size_t align)
+{
+	void *p = prom_alloc((caddr_t)0, size, align);
+
+	if (p == NULL)
+		prom_printf(alloc_fail_msg, size);
+
+	return (p);
+}
+
+static void
+mach_descrip_strt_buf_free(void *buf, size_t size)
+{
+	prom_free((caddr_t)buf, size);
+}
+
+static void *
+mach_descrip_meta_alloc(size_t size)
+{
+	return (kmem_alloc(size, KM_SLEEP));
+}
+
+/*
+ * Initialize the kernel's Machine Description(MD) framework
+ * early on in startup during mlsetup() so consumers
+ * can get to the MD before the VM system has been initialized.
+ *
+ * Also get the most recent version of the MD.
+ */
+void
+mach_descrip_startup_init(void)
+{
+
+	mutex_init(&curr_mach_descrip_lock, NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&obs_list_lock, NULL, MUTEX_DRIVER, NULL);
+
+	obs_machine_descrip_list = NULL;
+
+	curr_mach_descrip_memops = &startup_memops;
+
+	curr_mach_descrip = new_mach_descrip();
+	if (curr_mach_descrip == NULL)
+		panic("Allocation for machine description failed\n");
+
+	if (mach_descrip_update())
+		panic("Machine description initialization failed\n");
+
+}
+
+/*
+ * Counterpart to the above init function.  Free up resources
+ * allocated at startup by mach_descrip_startup_init().
+ * And reset machine description framework state.
+ *
+ * All consumers must have fini'ed their handles at this point.
+ */
+void
+mach_descrip_startup_fini(void)
+{
+
+	ASSERT((curr_mach_descrip != NULL));
+	ASSERT((curr_mach_descrip->refcnt == 0));
+	ASSERT((obs_machine_descrip_list == NULL));
+
+	destroy_machine_descrip(curr_mach_descrip);
+	curr_mach_descrip = NULL;
+	curr_mach_descrip_memops = NULL;
+}
+
+/*
+ * Initialize the kernel's Machine Description(MD) framework
+ * after the VM system has been initialized.
+ *
+ * Also get the most recent version of the MD.
+ * Assumes that the machine description framework is in a clean
+ * state and the machine description initialized during startup
+ * has been cleaned up and resources deallocated.
+ */
+void
+mach_descrip_init(void)
+{
+	ASSERT((curr_mach_descrip == NULL &&
+	    curr_mach_descrip_memops == NULL));
+
+	curr_mach_descrip_memops = &mach_descrip_memops;
+
+	curr_mach_descrip = new_mach_descrip();
+	if (curr_mach_descrip == NULL)
+		panic("Allocation for machine description failed\n");
+
+	if (mach_descrip_update())
+		panic("Machine description intialization failed\n");
+
+	/* read in global params */
+	init_md_params();
+}
+
+/*
+ * Client interface to get a handle to the current MD.
+ * The md_fini_handle() interface should be used to
+ * clean up the reference to the MD returned by this function.
+ */
+md_t *
+md_get_handle(void)
+{
+	md_t *mdp;
+
+	mutex_enter(&curr_mach_descrip_lock);
+
+	if (curr_mach_descrip == NULL) {
+		return (NULL);
+	}
+
+	curr_mach_descrip->refcnt++;
+	mdp = md_init_intern(curr_mach_descrip->va,
+	    curr_mach_descrip->memops->meta_allocp,
+	    curr_mach_descrip->memops->meta_freep);
+
+	mutex_exit(&curr_mach_descrip_lock);
+
+	return (mdp);
+}
+
+/*
+ * Client interface to clean up the reference to the MD returned
+ * by md_get_handle().
+ */
+int
+md_fini_handle(md_t *ptr)
+{
+	machine_descrip_t *mdescp;
+	md_impl_t *mdp;
+
+
+	mdp = (md_impl_t *)ptr;
+
+	if (mdp == NULL)
+		return (-1);
+	/*
+	 * Check if mdp is current MD gen
+	 */
+	mutex_enter(&curr_mach_descrip_lock);
+
+	if (curr_mach_descrip->gen == mdp->gen) {
+		curr_mach_descrip->refcnt--;
+		mutex_exit(&curr_mach_descrip_lock);
+		goto fini;
+	}
+	mutex_exit(&curr_mach_descrip_lock);
+
+	/*
+	 * MD is in the obsolete list
+	 */
+	mdescp = md_obs_list_look_up_by_gen(mdp->gen);
+	if (mdescp == NULL)
+		return (-1);
+
+	mutex_enter(&mdescp->lock);
+	mdescp->refcnt--;
+	if (mdescp->refcnt == 0) {
+		md_obs_list_remove(mdescp);
+		mutex_exit(&mdescp->lock);
+		destroy_machine_descrip(mdescp);
+		goto fini;
+	}
+	mutex_exit(&mdescp->lock);
+
+fini:
+	return (md_fini(ptr));
+}
+
+/*
+ * General purpose initialization function used to extract parameters
+ * from the MD during the boot process. This is called immediately after
+ * the in kernel copy of the MD has been initialized so that global
+ * flags are available to various subsystems as they get initialized.
+ */
+static void
+init_md_params(void)
+{
+	md_t		*mdp;
+	int		num_nodes;
+	mde_cookie_t	*listp;
+	int		listsz;
+
+	mdp = md_get_handle();
+	ASSERT(mdp);
+	num_nodes = md_node_count(mdp);
+	ASSERT(num_nodes >= 0);
+
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = (mde_cookie_t *)
+	    (*curr_mach_descrip_memops->meta_allocp)(listsz);
+
+	/*
+	 * Import various parameters from the MD. For now,
+	 * the only parameter of interest is whether or not
+	 * domaining features are supported.
+	 */
+	init_domaining_enabled(mdp, listp);
+
+	(*curr_mach_descrip_memops->meta_freep)(listp, listsz);
+	(void) md_fini_handle(mdp);
+}
+
+static void
+init_domaining_enabled(md_t *mdp, mde_cookie_t *listp)
+{
+	mde_cookie_t	rootnode;
+	int		num_nodes;
+	uint64_t	val = 0;
+
+	/*
+	 * If domaining has been manually disabled, always
+	 * honor that and ignore the value in the MD.
+	 */
+	if (force_domaining_disabled) {
+		domaining_enabled = 0;
+		MDP(("domaining manually disabled\n"));
+		return;
+	}
+
+	rootnode = md_root_node(mdp);
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "platform"),
+	    md_find_name(mdp, "fwd"), listp);
+
+	/* should only be one platform node */
+	ASSERT(num_nodes == 1);
+
+	if (md_get_prop_val(mdp, *listp, "domaining-enabled", &val) != 0) {
+		/*
+		 * The property is not present. This implies
+		 * that the firmware does not support domaining
+		 * features.
+		 */
+		MDP(("'domaining-enabled' property not present\n"));
+
+		domaining_enabled = 0;
+		return;
+	}
+
+	domaining_enabled = val;
+
+	MDP(("domaining_enabled = 0x%x\n", domaining_enabled));
+}
+
+/*
+ * Client interface to get a pointer to the raw MD buffer
+ * Private to kernel and mdesc driver.
+ */
+caddr_t
+md_get_md_raw(md_t *ptr)
+{
+	md_impl_t *mdp;
+
+	mdp = (md_impl_t *)ptr;
+	if (mdp ==  NULL)
+		return (NULL);
+	return (mdp->caddr);
+}
+
+/*
+ * This is called before an MD structure is initialized, so
+ * it walks the raw MD looking for the generation property.
+ */
+static uint64_t
+mach_descrip_find_md_gen(caddr_t ptr)
+{
+	md_header_t	*hdrp;
+	md_element_t	*mdep;
+	md_element_t	*rootnode = NULL;
+	md_element_t	*elem = NULL;
+	char		*namep;
+	boolean_t	done;
+	int		idx;
+
+	hdrp = (md_header_t *)ptr;
+	mdep = (md_element_t *)(ptr + MD_HEADER_SIZE);
+	namep = (char *)(ptr + MD_HEADER_SIZE + hdrp->node_blk_sz);
+
+	/*
+	 * Very basic check for alignment to avoid
+	 * bus error issues.
+	 */
+	if ((((uint64_t)ptr) & 7) != 0)
+		return (MDESC_INVAL_GEN);
+
+	if (mdtoh32(hdrp->transport_version) != MD_TRANSPORT_VERSION) {
+		return (MDESC_INVAL_GEN);
+	}
+
+	/*
+	 * Search for the root node. Perform the walk manually
+	 * since the MD structure is not set up yet.
+	 */
+	for (idx = 0, done = B_FALSE; done == B_FALSE; ) {
+
+		md_element_t *np = &(mdep[idx]);
+
+		switch (MDE_TAG(np)) {
+		case MDET_LIST_END:
+			done = B_TRUE;
+			break;
+
+		case MDET_NODE:
+			if (strcmp(namep + MDE_NAME(np), "root") == 0) {
+				/* found root node */
+				rootnode = np;
+				done = B_TRUE;
+				break;
+			}
+			idx = MDE_PROP_INDEX(np);
+			break;
+
+		default:
+			/* ignore */
+			idx++;
+		}
+	}
+
+	if (rootnode == NULL) {
+		/* root not found */
+		return (MDESC_INVAL_GEN);
+	}
+
+	/* search the rootnode for the generation property */
+	for (elem = (rootnode + 1); MDE_TAG(elem) != MDET_NODE_END; elem++) {
+
+		char *prop_name;
+
+		/* generation field is a prop_val */
+		if (MDE_TAG(elem) != MDET_PROP_VAL)
+			continue;
+
+		prop_name = namep + MDE_NAME(elem);
+
+		if (strcmp(prop_name, "md-generation#") == 0) {
+			return (MDE_PROP_VALUE(elem));
+		}
+	}
+
+	return (MDESC_INVAL_GEN);
+}
+
+/*
+ * Failed to allocate the list : Return value -1
+ * md_scan_dag API failed      : Return the result from md_scan_dag API
+ */
+int
+md_alloc_scan_dag(md_t *ptr,
+	mde_cookie_t startnode,
+	char *node_name,
+	char *dag,
+	mde_cookie_t **list)
+{
+	int res;
+	md_impl_t *mdp = (md_impl_t *)ptr;
+
+	*list = (mde_cookie_t *)mdp->allocp(sizeof (mde_cookie_t) *
+	    mdp->node_count);
+	if (*list == NULL)
+		return (-1);
+
+	res = md_scan_dag(ptr, startnode,
+	    md_find_name(ptr, node_name),
+	    md_find_name(ptr, dag), *list);
+
+	/*
+	 * If md_scan_dag API returned 0 or -1 then free the buffer
+	 * and return -1 to indicate the error from this API.
+	 */
+	if (res < 1) {
+		md_free_scan_dag(ptr, list);
+		*list = NULL;
+	}
+
+	return (res);
+}
+
+void
+md_free_scan_dag(md_t *ptr,
+	mde_cookie_t **list)
+{
+	md_impl_t *mdp = (md_impl_t *)ptr;
+
+	mdp->freep(*list, sizeof (mde_cookie_t) * mdp->node_count);
+}
--- a/usr/src/uts/sun4v/os/mach_mp_startup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/mach_mp_startup.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,6 +30,8 @@
 #include <sys/cpu_module.h>
 #include <sys/dtrace.h>
 #include <sys/cpu_sgnblk_defs.h>
+#include <sys/mdesc.h>
+#include <sys/mach_descrip.h>
 
 /*
  * Useful for disabling MP bring-up for an MP capable kernel
@@ -87,25 +89,67 @@
 	}
 }
 
-/* ARGSUSED */
 /*
- * Routine used to cleanup a CPU that has been powered off.  This will
+ * Routine used to cleanup a CPU that has been powered off. This will
  * destroy all per-cpu information related to this cpu.
  */
 int
 mp_cpu_unconfigure(int cpuid)
 {
-	return (0);
+	int retval;
+	extern void empty_cpu(int);
+	extern int cleanup_cpu_common(int);
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	retval = cleanup_cpu_common(cpuid);
+
+	empty_cpu(cpuid);
+
+	return (retval);
 }
 
-/* ARGSUSED */
+struct mp_find_cpu_arg {
+	int cpuid;		/* set by mp_cpu_configure() */
+	dev_info_t *dip;	/* set by mp_find_cpu() */
+};
+
 int
 mp_find_cpu(dev_info_t *dip, void *arg)
 {
-	return (0);
+	struct mp_find_cpu_arg *target = (struct mp_find_cpu_arg *)arg;
+	char	*type;
+	int	rv = DDI_WALK_CONTINUE;
+	int	cpuid;
+
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, "device_type", &type))
+		return (DDI_WALK_CONTINUE);
+
+	if (strcmp(type, "cpu") != 0)
+		goto out;
+
+	cpuid = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+	    DDI_PROP_DONTPASS, "reg", -1);
+
+	if (cpuid == -1) {
+		cmn_err(CE_PANIC, "reg prop not found in cpu node");
+	}
+
+	cpuid = PROM_CFGHDL_TO_CPUID(cpuid);
+
+	if (cpuid != target->cpuid)
+		goto out;
+
+	/* Found it */
+	rv = DDI_WALK_TERMINATE;
+	target->dip = dip;
+
+out:
+	ddi_prop_free(type);
+	return (rv);
 }
 
-/* ARGSUSED */
 /*
  * Routine used to setup a newly inserted CPU in preparation for starting
  * it running code.
@@ -113,5 +157,68 @@
 int
 mp_cpu_configure(int cpuid)
 {
+	extern void fill_cpu(md_t *, mde_cookie_t);
+	extern void setup_cpu_common(int);
+	extern void setup_exec_unit_mappings(md_t *);
+	md_t *mdp;
+	mde_cookie_t rootnode, cpunode = MDE_INVAL_ELEM_COOKIE;
+	int listsz, i;
+	mde_cookie_t *listp = NULL;
+	int	num_nodes;
+	uint64_t cpuid_prop;
+
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	if ((mdp = md_get_handle()) == NULL)
+		return (ENODEV);
+
+	rootnode = md_root_node(mdp);
+
+	ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE);
+
+	num_nodes = md_node_count(mdp);
+
+	ASSERT(num_nodes > 0);
+
+	listsz = num_nodes * sizeof (mde_cookie_t);
+	listp = kmem_zalloc(listsz, KM_SLEEP);
+
+	num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"),
+	    md_find_name(mdp, "fwd"), listp);
+
+	if (num_nodes < 0)
+		return (ENODEV);
+
+	for (i = 0; i < num_nodes; i++) {
+		if (md_get_prop_val(mdp, listp[i], "id", &cpuid_prop))
+			break;
+		if (cpuid_prop == (uint64_t)cpuid) {
+			cpunode = listp[i];
+			break;
+		}
+	}
+
+	if (cpunode == MDE_INVAL_ELEM_COOKIE)
+		return (ENODEV);
+
+	kmem_free(listp, listsz);
+
+	/*
+	 * Note: uses cpu_lock to protect cpunodes and ncpunodes
+	 * which will be modified inside of fill_cpu and
+	 * setup_exec_unit_mappings.
+	 */
+	fill_cpu(mdp, cpunode);
+
+	/*
+	 * Remap all the cpunodes' execunit mappings.
+	 */
+	setup_exec_unit_mappings(mdp);
+
+	(void) md_fini_handle(mdp);
+
+	setup_cpu_common(cpuid);
+
 	return (0);
 }
--- a/usr/src/uts/sun4v/os/mach_mp_states.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/mach_mp_states.c	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,14 +18,26 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+#include <sys/cpuvar.h>
 #include <sys/cpu_module.h>
+#include <sys/machsystm.h>
+#include <sys/archsystm.h>
+#include <sys/prom_plat.h>
+#include <sys/hypervisor_api.h>
+#include <sys/hsvc.h>
+
+extern uint64_t xc_tick_limit;
+extern uint64_t xc_tick_jump_limit;
+
+extern void cpu_intrq_unregister_powerdown(uint64_t doneflag_va);
 
 /*
  * set_idle_cpu is called from idle() when a CPU becomes idle.
@@ -45,3 +56,224 @@
 unset_idle_cpu(int cpun)
 {
 }
+
+/*
+ * Stop a CPU based on its cpuid, using the cpu_stop hypervisor call.
+ * Since this requires that the hypervisor force a remote CPU to stop,
+ * the assumption is made that this should take roughly the same amount
+ * of time as a CPU mondo. Consequently, the mondo timeout is used to
+ * determine when to give up waiting for the CPU to stop.
+ *
+ * Attempts to stop a CPU already in the stopped or error state will
+ * silently succeed. Zero is returned on success and a positive
+ * errno value is returned on failure.
+ */
+int
+stopcpu_bycpuid(int cpuid)
+{
+	uint64_t	loop_cnt;
+	uint64_t	state;
+	uint64_t	rv;
+	uint64_t	major = 0;
+	uint64_t	minor = 0;
+	uint64_t	cpu_stop_time_limit;
+	extern uint64_t	xc_mondo_time_limit;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	/*
+	 * Check the state of the CPU up front to see if an
+	 * attempt to stop it is even necessary.
+	 */
+	if (hv_cpu_state(cpuid, &state) != H_EOK)
+		return (EINVAL);
+
+	/* treat stopped and error state the same */
+	if (state != CPU_STATE_RUNNING) {
+		/* nothing to do */
+		return (0);
+	}
+
+	/*
+	 * The HV API to stop a CPU is only supported in
+	 * version 1.1 and later of the core group. If an
+	 * older version of the HV is in use, return not
+	 * supported.
+	 */
+	if (hsvc_version(HSVC_GROUP_CORE, &major, &minor) != 0)
+		return (EINVAL);
+
+	ASSERT(major != 0);
+
+	if ((major == 1) && (minor < 1))
+		return (ENOTSUP);
+
+	/* use the mondo timeout if it has been initialized */
+	cpu_stop_time_limit = xc_mondo_time_limit;
+
+	/*
+	 * If called early in boot before the mondo time limit
+	 * is set, use a reasonable timeout based on the
+	 * clock frequency of the current CPU.
+	 */
+	if (cpu_stop_time_limit == 0)
+		cpu_stop_time_limit = cpunodes[CPU->cpu_id].clock_freq;
+
+	/* should only fail if called too early in boot */
+	ASSERT(cpu_stop_time_limit > 0);
+
+	loop_cnt = 0;
+
+	/*
+	 * Attempt to stop the CPU, retrying if it is busy.
+	 */
+	while (loop_cnt++ < cpu_stop_time_limit) {
+
+		if ((rv = hv_cpu_stop(cpuid)) != H_EWOULDBLOCK)
+			break;
+	}
+
+	if (loop_cnt == cpu_stop_time_limit)
+		return (ETIMEDOUT);
+
+	if (rv != H_EOK)
+		return (EINVAL);
+
+	/*
+	 * Verify that the CPU has reached the stopped state.
+	 */
+	while (loop_cnt++ < cpu_stop_time_limit) {
+
+		if (hv_cpu_state(cpuid, &state) != H_EOK)
+			return (EINVAL);
+
+		/* treat stopped and error state the same */
+		if (state != CPU_STATE_RUNNING)
+			break;
+	}
+
+	return ((loop_cnt == cpu_stop_time_limit) ? ETIMEDOUT : 0);
+}
+
+/*
+ * X-trap to the target to unregister its interrupt and error queues
+ * and put it in a safe place just before the CPU is stopped. After
+ * unregistering its queues, the target CPU must not return from the
+ * trap to priv or user context. Ensure that the interrupt CPU unregister
+ * succeeded.
+ */
+void
+xt_cpu_unreg_powerdown(struct cpu *cpup)
+{
+	uint8_t volatile not_done;
+	uint64_t starttick, endtick, tick, lasttick;
+	processorid_t cpuid = cpup->cpu_id;
+
+	kpreempt_disable();
+
+	/*
+	 * Sun4v uses a queue for receiving mondos. Successful
+	 * transmission of a mondo only indicates that the mondo
+	 * has been written into the queue.
+	 *
+	 * Set the not_done flag to 1 before sending the cross
+	 * trap and wait until the other cpu resets it to 0.
+	 */
+
+	not_done = 1;
+
+	xt_one_unchecked(cpuid, (xcfunc_t *)cpu_intrq_unregister_powerdown,
+	    (uint64_t)&not_done, 0);
+
+	starttick = lasttick = gettick();
+	endtick = starttick + xc_tick_limit;
+
+	while (not_done) {
+
+		tick = gettick();
+
+		/*
+		 * If there is a big jump between the current tick
+		 * count and lasttick, we have probably hit a break
+		 * point. Adjust endtick accordingly to avoid panic.
+		 */
+		if (tick > (lasttick + xc_tick_jump_limit)) {
+			endtick += (tick - lasttick);
+		}
+
+		lasttick = tick;
+		if (tick > endtick) {
+			cmn_err(CE_CONT, "Cross trap timeout at cpu id %x\n",
+			    cpuid);
+			cmn_err(CE_WARN, "xt_intrq_unreg_powerdown: timeout");
+		}
+	}
+
+	kpreempt_enable();
+}
+
+int
+plat_cpu_poweroff(struct cpu *cp)
+{
+	int		rv = 0;
+	int		status;
+	processorid_t	cpuid = cp->cpu_id;
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	/*
+	 * Capture all CPUs (except for detaching proc) to prevent
+	 * crosscalls to the detaching proc until it has cleared its
+	 * bit in cpu_ready_set.
+	 *
+	 * The CPUs remain paused and the prom_mutex is known to be free.
+	 * This prevents the x-trap victim from blocking when doing prom
+	 * IEEE-1275 calls at a high PIL level.
+	 */
+	promsafe_pause_cpus();
+
+	/*
+	 * Quiesce interrupts on the target CPU. We do this by setting
+	 * the CPU 'not ready' (i.e. removing the CPU from cpu_ready_set)
+	 * to prevent it from receiving cross calls and cross traps. This
+	 * prevents the processor from receiving any new soft interrupts.
+	 */
+	mp_cpu_quiesce(cp);
+
+	/*
+	 * Send a cross trap to the cpu to unregister its interrupt and
+	 * error queues.
+	 */
+	xt_cpu_unreg_powerdown(cp);
+
+	cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_POWEROFF;
+
+	/* call into the Hypervisor to stop the CPU */
+	if ((status = stopcpu_bycpuid(cpuid)) != 0) {
+		rv = -1;
+	}
+
+	start_cpus();
+
+	if (rv != 0) {
+		cmn_err(CE_WARN, "failed to stop cpu %d (%d)", cpuid, status);
+		/* mark the CPU faulted so that it cannot be onlined */
+		cp->cpu_flags = CPU_OFFLINE | CPU_QUIESCED | CPU_FAULTED;
+	}
+
+	return (rv);
+}
+
+int
+plat_cpu_poweron(struct cpu *cp)
+{
+	extern void	restart_other_cpu(int);
+
+	ASSERT(MUTEX_HELD(&cpu_lock));
+
+	cp->cpu_flags &= ~CPU_POWEROFF;
+
+	restart_other_cpu(cp->cpu_id);
+
+	return (0);
+}
--- a/usr/src/uts/sun4v/os/mach_startup.c	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/os/mach_startup.c	Tue May 16 16:05:21 2006 -0700
@@ -18,6 +18,7 @@
  *
  * CDDL HEADER END
  */
+
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
@@ -36,6 +37,8 @@
 #include <sys/disp.h>
 #include <sys/hypervisor_api.h>
 #include <sys/traptrace.h>
+#include <sys/modctl.h>
+#include <sys/ldoms.h>
 
 #ifdef TRAPTRACE
 int mach_htraptrace_enable = 1;
@@ -61,7 +64,7 @@
 	mmfsa_va =
 	    mmu_fault_status_area + (MMFSA_SIZE * CPU->cpu_id);
 
-	intr_init(CPU);			/* init interrupt request free list */
+	intr_init(CPU);		/* init interrupt request free list */
 	setwstate(WSTATE_KERN);
 	set_mmfsa_scratchpad(mmfsa_va);
 	prom_set_mmfsa_traptable(&trap_table, va_to_pa(mmfsa_va));
@@ -427,3 +430,54 @@
 		ctlp->d.hpaddr_base = NULL;
 	}
 }
+
+/*
+ * Load any required machine class (sun4v) specific drivers.
+ */
+void
+load_mach_drivers(void)
+{
+	/*
+	 * We don't want to load these LDOMs-specific
+	 * modules if domaining has been disabled.  Also,
+	 * we must be able to run on non-LDOMs firmware.
+	 */
+	if (!domaining_enabled)
+		return;
+
+	/*
+	 * Load the core domain services module
+	 */
+	if (modload("misc", "ds") == -1)
+		cmn_err(CE_NOTE, "!'ds' module failed to load");
+
+	/*
+	 * Load the rest of the domain services
+	 */
+	if (modload("misc", "fault_iso") == -1)
+		cmn_err(CE_NOTE, "!'fault_iso' module failed to load");
+
+	if (modload("misc", "platsvc") == -1)
+		cmn_err(CE_NOTE, "!'platsvc' module failed to load");
+
+	if (modload("misc", "dr_cpu") == -1)
+		cmn_err(CE_NOTE, "!'dr_cpu' module failed to load");
+
+	/*
+	 * Attempt to attach any virtual device servers. These
+	 * drivers must be loaded at start of day so that they
+	 * can respond to any updates to the machine description.
+	 *
+	 * Since it is quite likely that a domain will not support
+	 * one or more of these servers, failures are ignored.
+	 */
+
+	/* virtual disk server */
+	(void) i_ddi_attach_hw_nodes("vds");
+
+	/* virtual network switch */
+	(void) i_ddi_attach_hw_nodes("vsw");
+
+	/* virtual console concentrator */
+	(void) i_ddi_attach_hw_nodes("vcc");
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/platsvc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,97 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+# This makefile drives the production of the platsvc kernel module.
+#
+# sun4v implementation architecture dependent
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE		= platsvc 
+OBJECTS		= $(PLATSVC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(PLATSVC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_MISC_DIR)/$(MODULE)
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+# Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS		+= -v
+
+#
+# Turn on doubleword alignment for 64 bit registers
+#
+CFLAGS		+= -dalign
+
+#
+# Module Dependencies
+#
+LDFLAGS		+= -dy -Nmisc/ds
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_asr.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+
+/*
+ * The Automatic System Recovery (ASR) database present in some
+ * versions of firmware is not supported on sun4v platforms.
+ * However, there is an external interface to these prom interfaces
+ * from the openprom(7D) driver. They are not documented in the
+ * man page, but they should still be handled here, just enough
+ * so the user gets a sensible error back if they stumble onto
+ * them.
+ */
+
+int
+promif_asr_list_keys_len(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+
+	ci[3] = p1275_int2cell(-1);
+
+	return (-1);
+}
+
+int
+promif_asr_list_keys(void *p)
+{
+	_NOTE(ARGUNUSED(p))
+
+	return (-1);
+}
+
+int
+promif_asr_export_len(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+
+	ci[3] = p1275_int2cell(-1);
+
+	return (-1);
+}
+
+int
+promif_asr_export(void *p)
+{
+	_NOTE(ARGUNUSED(p))
+
+	return (-1);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_cpu.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,122 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/machsystm.h>
+#include <sys/hypervisor_api.h>
+#include <sys/lpad.h>
+
+extern int (*prom_cif_handler)(void *);
+extern int cif_cpu_mp_ready;
+
+int
+promif_set_mmfsa_traptable(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	uint64_t	rtba;
+	caddr_t		tba;
+	uint64_t	mmfsa_ra;
+	int		rv, ret;
+
+	ASSERT(ci[1] == 2);
+
+	/*
+	 * We use the same trap table for the rtba as well.
+	 */
+	rtba = va_to_pa(p1275_cell2ptr(ci[3]));
+
+	/*
+	 * if cif_cpu_mp_ready is not set the prom is still
+	 * setting the mmfsa and trap table. Set the rtba
+	 * after the prom cif call.
+	 */
+	if (!cif_cpu_mp_ready) {
+		ret = (*prom_cif_handler)(p);
+		if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK)
+			panic("hv_cpu_set_rtba failed: %d\n", rv);
+		return (ret);
+	}
+
+	tba = p1275_cell2ptr(ci[3]);
+	mmfsa_ra = (uint64_t)p1275_cell2ptr(ci[4]);
+
+	if (tba != (caddr_t)KERNELBASE)
+		return (-1);
+
+	(void) set_tba(tba);
+
+	if ((rv = hv_mmu_fault_area_conf(&mmfsa_ra)) != H_EOK) {
+		panic("hv_mmu_fault_area_conf failed: %d\n", rv);
+	}
+
+	if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK) {
+		panic("hv_cpu_set_rtba failed: %d\n", rv);
+	}
+
+	return (0);
+}
+
+int
+promif_start_cpu(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	int		cpuid;
+	caddr_t		pc;
+	int		arg;
+	uint64_t	rtba = 0;
+	int		rv;
+	uint64_t	*lpp;
+
+	ASSERT(ci[1] == 3);
+
+	cpuid = p1275_cell2int(ci[3]);
+	pc = p1275_cell2ptr(ci[4]);
+	arg = p1275_cell2int(ci[5]);
+
+	if (!cif_cpu_mp_ready)
+		return ((*prom_cif_handler)(p));
+
+	rtba = va_to_pa(&trap_table);
+
+	lpp = lpad_setup(cpuid, (uint64_t)pc, (uint64_t)arg);
+
+	ASSERT(lpp);
+
+	pc = (caddr_t)lpp;
+
+	rv = hv_cpu_start(cpuid, va_to_pa(pc), rtba, cpuid);
+
+	if (rv != H_EOK) {
+		panic("promif_start_cpu: failed to start cpu %d (%d)\n",
+		    cpuid, rv);
+	}
+
+	ci[6] = p1275_int2cell(rv);
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_emul.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,268 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/machsystm.h>
+#include <sys/lpad.h>
+#include <sys/vmsystm.h>
+#include <sys/prom_plat.h>
+#include <sys/ldoms.h>
+#include <sys/kobj.h>
+#include <sys/reboot.h>
+#include <sys/hypervisor_api.h>
+
+#ifndef _KMDB
+static processorid_t cif_cpu;
+static struct translation *cif_prom_trans;
+static size_t cif_prom_ntrans;
+
+int cif_cpu_mp_ready;
+int (*prom_cif_handler)(void *) = NULL;
+#endif
+
+#ifdef DEBUG
+uint_t cif_debug;
+#endif /* DEBUG */
+
+extern int (*cif_handler)(void *);
+
+typedef struct {
+	char		*name;
+	cif_func_t	func;
+} cif_callback_t;
+
+static cif_callback_t cb_table[] = {
+	{ "getprop",			promif_getprop		    },
+	{ "getproplen",			promif_getproplen	    },
+	{ "nextprop",			promif_nextprop		    },
+	{ "peer",			promif_nextnode		    },
+	{ "child",			promif_childnode	    },
+	{ "parent",			promif_parentnode	    },
+	{ "enter",			promif_enter_mon	    },
+	{ "exit",			promif_exit_to_mon	    },
+	{ "boot",			promif_reboot		    },
+	{ "write",			promif_write		    },
+	{ "read",			promif_read		    },
+	{ "interpret",			promif_interpret	    },
+	{ "finddevice",			promif_finddevice	    },
+	{ "instance-to-package",	promif_instance_to_package  },
+#ifndef _KMDB
+	{ "setprop",			promif_setprop		    },
+	{ "test",			promif_test		    },
+	{ "instance-to-path",		promif_instance_to_path	    },
+	{ "SUNW,power-off",		promif_power_off	    },
+	{ "SUNW,asr-list-keys-len",	promif_asr_list_keys_len    },
+	{ "SUNW,asr-list-keys",		promif_asr_list_keys	    },
+	{ "SUNW,asr-export-len",	promif_asr_export_len	    },
+	{ "SUNW,asr-export",		promif_asr_export	    },
+	{ "SUNW,set-security-key",	promif_set_security_key	    },
+	{ "SUNW,get-security-key",	promif_get_security_key	    },
+	{ "SUNW,start-cpu-by-cpuid",	promif_start_cpu	    },
+	{ "SUNW,set-trap-table",	promif_set_mmfsa_traptable  },
+	{ "SUNW,set-sun4v-api-version",	promif_set_sun4v_api_version },
+	{ "SUNW,get-sun4v-api-version",	promif_get_sun4v_api_version },
+#endif
+	{ NULL,				NULL			    }
+};
+
+cif_func_t
+promif_find_cif_callback(char *opname)
+{
+	cif_callback_t	*cb;
+
+	if (opname == NULL)
+		return (NULL);
+
+	for (cb = cb_table; cb->name; cb++) {
+		if (prom_strcmp(cb->name, opname) == 0)
+			break;
+	}
+
+	return (cb->func);
+}
+
+static int
+kern_cif_handler(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	char		*opname;
+	cif_func_t	func;
+	int		rv;
+
+	ASSERT(cif_handler == kern_cif_handler);
+
+#ifndef _KMDB
+	cif_cpu = getprocessorid();
+#endif
+
+	opname = p1275_cell2ptr(ci[0]);
+
+	/* lookup the callback for the desired operation */
+	func = promif_find_cif_callback(opname);
+
+	if (func == NULL) {
+#ifdef _KMDB
+		prom_fatal_error("sun4v unsupported CIFs\n");
+#else
+		cmn_err(CE_CONT, "!sun4v unsupported CIF: %s\n", opname);
+		return (-1);
+#endif
+	}
+
+	/* callback found, execute it */
+	rv = func(p);
+
+#ifndef _KMDB
+	cif_cpu = -1;
+#endif
+
+	return (rv);
+}
+
+#ifdef _KMDB
+
+void
+cif_init(char *pgmname, caddr_t root, ihandle_t in, ihandle_t out,
+    phandle_t pin, phandle_t pout, pnode_t chosen, pnode_t options)
+{
+	/* initialize pointer to a copy of OBP device tree */
+	promif_stree_setroot(root);
+
+	promif_set_nodes(chosen, options);
+
+	/* initialize io parameters */
+	promif_io_init(in, out, pin, pout);
+
+	/*
+	 * Switch CIF handler to the kernel.
+	 */
+	if (pgmname != NULL)
+		prom_init(pgmname, (void *)kern_cif_handler);
+	else
+		cif_handler = kern_cif_handler;
+}
+
+#else
+
+static void cache_prom_data(void);
+
+/*
+ * This function returns 1 if the current thread is executing in
+ * the CIF and 0 otherwise. This is useful information to know
+ * since code that implements CIF handlers can assume that it has
+ * gone through the kern_preprom() entry point, implying it is
+ * running single threaded, has preemption disabled, etc.
+ */
+int
+promif_in_cif(void)
+{
+	int	mycpuid = getprocessorid();
+
+	return ((cif_cpu == mycpuid) ? 1 : 0);
+}
+
+void
+cif_init(void)
+{
+	void (*kmdb_cb)(void);
+	uint64_t rtba;
+	uint64_t rv;
+
+	/*
+	 * Check if domaining is enabled. If not, do not
+	 * initialize the kernel CIF handler.
+	 */
+	if (!domaining_enabled)
+		return;
+
+	/*
+	 * Cache PROM data that is needed later, e.g. a shadow
+	 * copy of the device tree, IO mappings, etc.
+	 */
+	cache_prom_data();
+
+	/*
+	 * Prepare to take over the get/set of environmental variables.
+	 */
+	promif_prop_init();
+
+	/*
+	 * Switch CIF handler to the kernel.
+	 */
+	prom_cif_handler = cif_handler;
+
+	promif_preprom();
+	cif_handler = kern_cif_handler;
+
+	/*
+	 * Take over rtba for the boot CPU. The rtba for
+	 * all other CPUs are set as they enter the system.
+	 */
+	rtba = va_to_pa(&trap_table);
+	if ((rv = hv_cpu_set_rtba(&rtba)) != H_EOK)
+		panic("hv_cpu_set_rtba failed: %ld\n", rv);
+
+	promif_postprom();
+
+	/*
+	 * If the system has been booted with kmdb we need kmdb to
+	 * use the kernel cif handler instead of the PROM cif handler.
+	 */
+	if (boothowto & RB_KMDB) {
+		kmdb_cb = (void (*)(void))modlookup("misc/kmdbmod",
+		    "kctl_switch_promif");
+		ASSERT(kmdb_cb != NULL);
+		(*kmdb_cb)();
+	}
+}
+
+static void
+cache_prom_data(void)
+{
+	/* initialize copy of OBP device tree */
+	promif_stree_init();
+
+	/* initialize io parameters */
+	promif_io_init();
+}
+
+
+/*
+ * Platform-specific actions to be taken when all cpus are running
+ * in the OS.
+ */
+void
+cpu_mp_init(void)
+{
+	if (!domaining_enabled)
+		return;
+
+	cif_cpu_mp_ready = 1;
+}
+
+#endif	/* _KMDB */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_interp.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,42 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+
+/*
+ * There is no support for prom_interpret() once the kernel
+ * takes over the CIF, so this function just returns an error.
+ * Having this stub keeps harmless messages out of the log file
+ * that report that prom_interpret() is not supported.
+ */
+/*ARGSUSED*/
+int
+promif_interpret(void *p)
+{
+	return (-1);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_io.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,220 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/systm.h>
+#include <sys/hypervisor_api.h>
+#ifndef _KMDB
+#include <sys/kmem.h>
+#endif
+
+#define	PROM_REG_TO_UNIT_ADDR(r)	((r) & ~(0xful << 28))
+
+static pnode_t instance_to_package(ihandle_t ih);
+
+/* cached copies of IO params */
+static phandle_t pstdin;
+static phandle_t pstdout;
+
+static ihandle_t istdin;
+static ihandle_t istdout;
+
+int
+promif_instance_to_package(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	ihandle_t	ih;
+	phandle_t	ph;
+
+	ih = p1275_cell2ihandle(ci[3]);
+
+	ph = instance_to_package(ih);
+
+	ci[4] = p1275_phandle2cell(ph);
+
+	return (0);
+}
+
+int
+promif_write(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	uint_t	fd;
+	char	*buf;
+	size_t	len;
+	size_t	rlen;
+
+	ASSERT(ci[1] == 3);
+
+	fd  = p1275_cell2uint(ci[3]);
+	buf = p1275_cell2ptr(ci[4]);
+	len = p1275_cell2size(ci[5]);
+
+	/* only support stdout (console) */
+	ASSERT(fd == istdout);
+
+	for (rlen = 0; rlen < len; rlen++) {
+		while (hv_cnputchar((uint8_t)buf[rlen]) == H_EWOULDBLOCK)
+			/* try forever */;
+	}
+
+	/* return the length written */
+	ci[6] = p1275_size2cell(rlen);
+
+	return (0);
+}
+
+int
+promif_read(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	uint_t	fd;
+	char	*buf;
+	size_t	len;
+	size_t	rlen;
+
+	ASSERT(ci[1] == 3);
+
+	/* unpack arguments */
+	fd  = p1275_cell2uint(ci[3]);
+	buf = p1275_cell2ptr(ci[4]);
+	len = p1275_cell2size(ci[5]);
+
+	/* only support stdin (console) */
+	ASSERT(fd == istdin);
+
+	for (rlen = 0; rlen < len; rlen++) {
+		if (hv_cngetchar((uint8_t *)&buf[rlen]) != H_EOK)
+			break;
+	}
+
+	/* return the length read */
+	ci[6] = p1275_size2cell(rlen);
+
+	return (0);
+}
+
+static pnode_t
+instance_to_package(ihandle_t ih)
+{
+	/* only support stdin and stdout */
+	ASSERT((ih == istdin) || (ih == istdout));
+
+	if (ih == istdin)
+		return (pstdin);
+
+	if (ih == istdout)
+		return (pstdout);
+
+	return (OBP_BADNODE);
+}
+
+#ifdef _KMDB
+
+void
+promif_io_init(ihandle_t in, ihandle_t out, phandle_t pin, phandle_t pout)
+{
+	istdin = in;
+	istdout = out;
+	pstdin = pin;
+	pstdout = pout;
+}
+
+#else
+
+void
+promif_io_init(void)
+{
+	/*
+	 * Cache the mapping between the stdin and stdout
+	 * ihandles and their respective phandles.
+	 */
+	pstdin = prom_stdin_node();
+	pstdout = prom_stdout_node();
+
+	istdin = prom_stdin_ihandle();
+	istdout = prom_stdout_ihandle();
+}
+
+int
+promif_instance_to_path(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	pnode_t		node;
+	ihandle_t	ih;
+	char		*buf;
+	int		rlen;
+	char		*regval;
+	uint_t		*csaddr;
+	char		name[OBP_MAXPROPNAME];
+	char		scratch[OBP_MAXPATHLEN];
+	int		rvlen;
+
+	ih = p1275_cell2ihandle(ci[3]);
+	buf = p1275_cell2ptr(ci[4]);
+
+	ci[6] = p1275_uint2cell(0);
+
+	node = instance_to_package(ih);
+
+	*buf = '\0';
+
+	while (node != prom_rootnode()) {
+		if (prom_getprop(node, OBP_NAME, name) == -1) {
+			prom_printf("instance_to_path: no name property "
+			    "node=0x%x\n", node);
+			return (-1);
+		}
+
+		/* construct the unit address from the 'reg' property */
+		if ((rlen = prom_getproplen(node, OBP_REG)) == -1)
+			return (-1);
+
+		regval = kmem_zalloc(rlen, KM_SLEEP);
+
+		(void) prom_getprop(node, OBP_REG, regval);
+
+		csaddr = (uint_t *)regval;
+
+		(void) prom_sprintf(scratch, "/%s@%lx%s", name,
+		    PROM_REG_TO_UNIT_ADDR(*csaddr), buf);
+
+		kmem_free(regval, rlen);
+
+		(void) prom_strcpy(buf, scratch);
+
+		node = prom_parentnode(node);
+	}
+
+	rvlen = prom_strlen(buf);
+	ci[6] = p1275_uint2cell(rvlen);
+
+	return (0);
+}
+
+#endif	/* _KMDB */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_key.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,58 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+
+/*
+ * Secure WAN boot requires firmware support for storing and
+ * retrieving security keys. The user command to set these
+ * keys in firmware storage is ickey(1M). Currently, sun4v
+ * platforms do not support this functionality. However, there
+ * is an external interface to these prom interfaces from the
+ * openprom(7D) driver. They are not documented in the man page,
+ * but they should still be handled just well enough so that
+ * the user gets a sensible error back.
+ */
+
+int
+promif_set_security_key(void *p)
+{
+	_NOTE(ARGUNUSED(p))
+
+	return (-1);
+}
+
+int
+promif_get_security_key(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+
+	ci[6] = p1275_int2cell(-1);
+
+	return (-1);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_mon.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,203 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/uadmin.h>
+#include <sys/machsystm.h>
+#include <sys/hypervisor_api.h>
+
+#ifdef _KMDB
+
+extern int kmdb_dpi_get_master_cpuid(void);
+extern void kmdb_dpi_kernpanic(int cpuid);
+extern void prom_reboot(char *bootstr);
+
+#define	PIL_DECL(p)
+#define	PIL_SET7(p)
+#define	PIL_REST(p)
+
+#else
+
+extern int vx_handler(cell_t *argument_array);
+
+#define	PIL_DECL(p) int p
+#define	PIL_SET7(p) (p = spl7())
+#define	PIL_REST(p) (splx(p))
+
+#endif
+
+#define	PROMIF_ENTER	0
+#define	PROMIF_EXIT	1
+
+#define	PROMIF_ISPRINT(c)	(((c) >= ' ') && ((c) <= '~'))
+
+static void promif_mon(int mode);
+
+/*ARGSUSED*/
+int
+promif_enter_mon(void *p)
+{
+	PIL_DECL(pil);
+
+	PIL_SET7(pil);
+
+	prom_printf("\n");
+
+#ifdef _KMDB
+	promif_mon(PROMIF_ENTER);
+#else
+	idle_other_cpus();
+	promif_mon(PROMIF_ENTER);
+	resume_other_cpus();
+#endif
+
+	PIL_REST(pil);
+
+	return (0);
+}
+
+/*ARGSUSED*/
+int
+promif_exit_to_mon(void *p)
+{
+	PIL_DECL(pil);
+
+	PIL_SET7(pil);
+
+	prom_printf("Program terminated\n");
+
+	promif_mon(PROMIF_EXIT);
+
+	PIL_REST(pil);
+
+	return (0);
+}
+
+static void
+promif_mon(int mode)
+{
+	char		cmd;
+	char		*prompt;
+	boolean_t	invalid_option;
+#ifdef _KMDB
+	static char	*exit_prompt  = "r)eboot, h)alt? ";
+#else
+	char		value[ 8 ];	/* holds "true" or "false" */
+	char		*boot_msg;
+	static char	*null_msg = ".\" \"";
+	static char	*ignore_msg =
+	    "cr .\" Ignoring auto-boot? setting for this boot.\" cr";
+	static char	*exit_prompt  = "r)eboot, o)k prompt, h)alt? ";
+#endif
+	static char	*enter_prompt = "c)ontinue, s)ync, r)eboot, h)alt? ";
+
+	prompt = (mode == PROMIF_EXIT) ? exit_prompt : enter_prompt;
+
+	for (;;) {
+		prom_printf("%s", prompt);
+
+		while (hv_cngetchar((uint8_t *)&cmd) != H_EOK)
+			;
+
+		prom_printf("%c\n", cmd);
+
+		invalid_option = B_FALSE;
+
+		switch (cmd) {
+
+		case 'r':
+			prom_reboot("");
+			break;
+
+		case 'h':
+			(void) hv_mach_exit(0);
+			ASSERT(0);
+
+			break;
+
+#ifndef _KMDB
+		case 'o':
+			/*
+			 * This option gives the user an "ok" prompt after
+			 * the system reset regardless of the value of
+			 * auto-boot?  We offer this option because halt(1m)
+			 * doesn't leave the user at the ok prompt (as it
+			 * does on non-ldoms systems).  If auto-boot? is
+			 * true tell user we are overriding the setting
+			 * for this boot only.
+			 */
+			if (mode == PROMIF_EXIT) {
+				bzero(value, sizeof (value));
+				(void) promif_stree_getprop(prom_optionsnode(),
+				    "auto-boot?", value);
+				boot_msg = strcmp(value, "true") ? null_msg :
+					ignore_msg;
+				(void) promif_ldom_setprop("reboot-command",
+				    boot_msg, strlen(boot_msg) + 1);
+				(void) hv_mach_sir();
+			} else {
+				invalid_option = B_TRUE;
+			}
+			break;
+#endif
+
+		case '\r':
+			break;
+
+		case 's':
+			if (mode == PROMIF_ENTER) {
+#ifdef _KMDB
+				kmdb_dpi_kernpanic(kmdb_dpi_get_master_cpuid());
+#else
+				cell_t arg = p1275_ptr2cell("sync");
+				(void) vx_handler(&arg);
+#endif
+			} else {
+				invalid_option = B_TRUE;
+			}
+			break;
+
+		case 'c':
+			if (mode == PROMIF_ENTER) {
+				return;
+			} else {
+				invalid_option = B_TRUE;
+			}
+			break;
+
+		default:
+			invalid_option = B_TRUE;
+			break;
+		}
+
+		if (invalid_option && PROMIF_ISPRINT(cmd))
+			prom_printf("invalid option (%c)\n", cmd);
+	}
+
+	_NOTE(NOTREACHED)
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_node.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,293 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/esunddi.h>
+#include <sys/promif_impl.h>
+
+#ifdef _KMDB
+static pnode_t chosennode;
+static pnode_t optionsnode;
+#else
+static char *gettoken(char *tp, char *token);
+static pnode_t finddevice(char *path);
+#endif
+
+/*
+ * Routines for walking the PROMs devinfo tree
+ */
+
+#ifdef _KMDB
+
+void
+promif_set_nodes(pnode_t chosen, pnode_t options)
+{
+	chosennode = chosen;
+	optionsnode = options;
+}
+
+int
+promif_finddevice(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	char *path;
+
+	ASSERT(ci[1] == 1);
+
+	path = p1275_cell2ptr(ci[3]);
+
+	if (strcmp("/chosen", path) == 0) {
+		ci[4] = p1275_dnode2cell(chosennode);
+	} else if (strcmp("/options", path) == 0) {
+		ci[4] = p1275_dnode2cell(optionsnode);
+	} else {
+		/* only supports known nodes */
+		ASSERT(0);
+	}
+
+	return (0);
+}
+
+#else
+
+int
+promif_finddevice(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	node;
+
+	ASSERT(ci[1] == 1);
+
+	node = finddevice(p1275_cell2ptr(ci[3]));
+
+	ci[4] = p1275_dnode2cell(node);
+
+	return (0);
+}
+
+#endif
+
+int
+promif_nextnode(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	next;
+
+	ASSERT(ci[1] == 1);
+
+	next = promif_stree_nextnode(p1275_cell2dnode(ci[3]));
+
+	ci[4] = p1275_dnode2cell(next);
+
+	return (0);
+}
+
+int
+promif_childnode(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	child;
+
+	ASSERT(ci[1] == 1);
+
+	child = promif_stree_childnode(p1275_cell2dnode(ci[3]));
+
+	ci[4] = p1275_dnode2cell(child);
+
+	return (0);
+}
+
+int
+promif_parentnode(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	parent;
+
+	ASSERT(ci[1] == 1);
+
+	parent = promif_stree_parentnode(p1275_cell2dnode(ci[3]));
+
+	ci[4] = p1275_dnode2cell(parent);
+
+	return (0);
+}
+
+#ifndef _KMDB
+
+/*
+ * Get a token from a prom pathname, collecting everything
+ * until a non-comma, non-colon separator is found. Any
+ * options, including the ':' option separator, on the end
+ * of the token are removed.
+ */
+static char *
+gettoken(char *tp, char *token)
+{
+	char *result = token;
+
+	for (;;) {
+		tp = prom_path_gettoken(tp, token);
+		token += prom_strlen(token);
+		if ((*tp == ',') || (*tp == ':')) {
+			*token++ = *tp++;
+			*token = '\0';
+			continue;
+		}
+		break;
+	}
+
+	/* strip off any options from the token */
+	prom_strip_options(result, result);
+
+	return (tp);
+}
+
+/*
+ * Retrieve the unit address for a node by looking it up
+ * in the corresponding dip. -1 is returned if no unit
+ * address can be determined.
+ */
+static int
+get_unit_addr(pnode_t np, char *paddr)
+{
+	dev_info_t	*dip;
+	char		*addr;
+
+	if ((dip = e_ddi_nodeid_to_dip(np)) == NULL) {
+		return (-1);
+	}
+
+	if ((addr = ddi_get_name_addr(dip)) == NULL) {
+		ddi_release_devi(dip);
+		return (-1);
+	}
+
+	(void) prom_strcpy(paddr, addr);
+
+	ddi_release_devi(dip);
+
+	return (0);
+}
+
+/*
+ * Get node id of node in prom tree that path identifies
+ */
+static pnode_t
+finddevice(char *path)
+{
+	char	name[OBP_MAXPROPNAME];
+	char	addr[OBP_MAXPROPNAME];
+	char	pname[OBP_MAXPROPNAME];
+	char	paddr[OBP_MAXPROPNAME];
+	char	*tp;
+	pnode_t	np;
+	pnode_t	device;
+
+	CIF_DBG_NODE("finddevice: %s\n", path);
+
+	tp = path;
+	np = prom_rootnode();
+	device = OBP_BADNODE;
+
+	/* must be a fully specified path */
+	if (*tp++ != '/')
+		goto done;
+
+	for (;;) {
+		/* get the name from the path */
+		tp = gettoken(tp, name);
+		if (*name == '\0')
+			break;
+
+		/* get the address from the path */
+		if (*tp == '@') {
+			tp++;
+			tp = gettoken(tp, addr);
+		} else {
+			addr[0] = '\0';
+		}
+
+		CIF_DBG_NODE("looking for: %s%s%s\n", name,
+		    (*addr != '\0') ? "@" : "", addr);
+
+		if ((np = prom_childnode(np)) == OBP_NONODE)
+			break;
+
+		while (np != OBP_NONODE) {
+
+			/* get the name from the current node */
+			if (prom_getprop(np, OBP_NAME, pname) < 0)
+				goto done;
+
+			/* get the address from the current node */
+			if (get_unit_addr(np, paddr) < 0)
+				paddr[0] = '\0';
+
+			/* compare the names and addresses */
+			if ((prom_strcmp(name, pname) == 0) &&
+			    (prom_strcmp(addr, paddr) == 0)) {
+				CIF_DBG_NODE("found dev: %s%s%s (0x%x)\n",
+				    pname, (*paddr != '\0') ? "@" : "",
+				    paddr, np);
+				break;
+			} else {
+				CIF_DBG_NODE("  no match: %s%s%s vs %s%s%s\n",
+				    name, (*addr != '\0') ? "@" : "", addr,
+				    pname, (*paddr != '\0') ? "@" : "", paddr);
+			}
+			np = prom_nextnode(np);
+		}
+
+		/* path does not map to a node */
+		if (np == OBP_NONODE)
+			break;
+
+		if (*tp == '\0') {
+			/* found a matching node */
+			device = np;
+			break;
+		}
+
+		/*
+		 * Continue the loop with the
+		 * next component of the path.
+		 */
+		tp++;
+	}
+done:
+
+	if (device == OBP_BADNODE) {
+		CIF_DBG_NODE("device not found\n\n");
+	} else {
+		CIF_DBG_NODE("returning 0x%x\n\n", device);
+	}
+
+	return (device);
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_power_off.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/hypervisor_api.h>
+
+int
+promif_power_off(void *p)
+{
+	_NOTE(ARGUNUSED(p))
+
+	int	rv = 0;
+
+	rv = hv_mach_exit(0);
+
+	/* should not return */
+	ASSERT(0);
+
+	return (rv);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_prop.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,327 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/promif_impl.h>
+#include <sys/ds.h>
+#include <sys/modctl.h>
+#include <sys/ksynch.h>
+#include <sys/varconfig.h>
+
+#ifndef _KMDB
+
+#define	PROMIF_DS_TIMEOUT_SEC 15
+
+static kmutex_t promif_prop_lock;
+static kcondvar_t promif_prop_cv;
+static var_config_msg_t promif_ds_resp;
+static var_config_resp_t *cfg_rsp = &promif_ds_resp.var_config_resp;
+static int (*ds_send)();
+static int (*ds_init)();
+
+/*
+ * Domains Services interaction
+ */
+static ds_svc_hdl_t	ds_primary_handle;
+static ds_svc_hdl_t	ds_backup_handle;
+
+static ds_ver_t		vc_version[] = { { 1, 0 } };
+
+#define	VC_NVERS	(sizeof (vc_version) / sizeof (vc_version[0]))
+
+static ds_capability_t vc_primary_cap = {
+	"var-config",		/* svc_id */
+	vc_version,		/* vers */
+	VC_NVERS		/* nvers */
+};
+
+static ds_capability_t vc_backup_cap = {
+	"var-config-backup",	/* svc_id */
+	vc_version,		/* vers */
+	VC_NVERS		/* nvers */
+};
+
+static void vc_reg_handler(ds_cb_arg_t, ds_ver_t *, ds_svc_hdl_t);
+static void vc_unreg_handler(ds_cb_arg_t);
+static void vc_data_handler(ds_cb_arg_t, void *, size_t);
+
+static ds_clnt_ops_t vc_primary_ops = {
+	vc_reg_handler,		/* ds_primary_reg_cb */
+	vc_unreg_handler,	/* ds_primary_unreg_cb */
+	vc_data_handler,	/* ds_data_cb */
+	&ds_primary_handle	/* cb_arg */
+};
+
+static ds_clnt_ops_t vc_backup_ops = {
+	vc_reg_handler,		/* ds_backup_reg_cb */
+	vc_unreg_handler,	/* ds_backup_unreg_cb */
+	vc_data_handler,	/* ds_data_cb */
+	&ds_backup_handle	/* cb_arg */
+};
+
+static void
+vc_reg_handler(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
+{
+	_NOTE(ARGUNUSED(ver))
+
+	if ((ds_svc_hdl_t *)arg == &ds_primary_handle)
+		ds_primary_handle = hdl;
+	else if ((ds_svc_hdl_t *)arg == &ds_backup_handle)
+		ds_backup_handle = hdl;
+}
+
+static void
+vc_unreg_handler(ds_cb_arg_t arg)
+{
+	if ((ds_svc_hdl_t *)arg == &ds_primary_handle)
+		ds_primary_handle = DS_INVALID_HDL;
+	else if ((ds_svc_hdl_t *)arg == &ds_backup_handle)
+		ds_backup_handle = DS_INVALID_HDL;
+}
+
+static void
+vc_data_handler(ds_cb_arg_t arg, void *buf, size_t buflen)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	bcopy(buf, &promif_ds_resp, buflen);
+	mutex_enter(&promif_prop_lock);
+	cv_signal(&promif_prop_cv);
+	mutex_exit(&promif_prop_lock);
+}
+
+/*
+ * Initialize the linkage with DS (Domain Services).  We assume that
+ * the DS module has already been loaded by the platmod.
+ *
+ * The call to the DS init functions will eventually result in the
+ * invocation of our registration callback handlers, at which time DS
+ * is able to accept requests.
+ */
+static void
+promif_ds_init(void)
+{
+	static char *me = "promif_ds_init";
+	int rv;
+
+	if ((ds_init =
+	    (int (*)())modgetsymvalue("ds_cap_init", 0)) == 0) {
+		cmn_err(CE_WARN, "%s: can't find ds_cap_init", me);
+		return;
+	}
+
+	if ((ds_send =
+	    (int (*)())modgetsymvalue("ds_cap_send", 0)) == 0) {
+		cmn_err(CE_WARN, "%s: can't find ds_cap_send", me);
+		return;
+	}
+
+	if ((rv = (*ds_init)(&vc_primary_cap, &vc_primary_ops)) != 0) {
+		cmn_err(CE_NOTE,
+		    "%s: ds_cap_init failed (primary): %d", me, rv);
+	}
+
+
+	if ((rv = (*ds_init)(&vc_backup_cap, &vc_backup_ops)) != 0) {
+		cmn_err(CE_NOTE,
+		    "%s: ds_cap_init failed (backup): %d", me, rv);
+	}
+}
+
+/*
+ * Prepare for ldom variable requests.
+ */
+void
+promif_prop_init(void)
+{
+	mutex_init(&promif_prop_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&promif_prop_cv, NULL, CV_DEFAULT, NULL);
+
+	promif_ds_init();
+}
+
+
+/*
+ * Replace the current value of a property string given its name and
+ * new value.
+ */
+int
+promif_ldom_setprop(char *name, void *value, int valuelen)
+{
+	var_config_msg_t *req;
+	var_config_set_req_t *setp;
+	var_config_cmd_t cmd;
+	ds_svc_hdl_t ds_handle;
+	int rv;
+	int namelen = strlen(name);
+	int paylen = namelen + 1 + valuelen; /* valuelen includes the null */
+	static char *me = "promif_ldom_setprop";
+
+	if (ds_primary_handle != DS_INVALID_HDL)
+		ds_handle = ds_primary_handle;
+	else if (ds_backup_handle != DS_INVALID_HDL)
+		ds_handle = ds_backup_handle;
+	else
+		return (-1);
+
+	req = kmem_zalloc(sizeof (var_config_hdr_t) + paylen, KM_SLEEP);
+	req->var_config_cmd = VAR_CONFIG_SET_REQ;
+	setp = &req->var_config_set;
+	(void) strcpy(setp->name_and_value, name);
+	(void) strncpy(&setp->name_and_value[namelen + 1], value, valuelen);
+
+	if ((rv = (*ds_send)(ds_handle, req,
+	    sizeof (var_config_hdr_t) + paylen)) != 0) {
+		cmn_err(CE_WARN, "%s: ds_cap_send failed: %d", me, rv);
+		kmem_free(req, sizeof (var_config_hdr_t) + paylen);
+		return (-1);
+	}
+
+	kmem_free(req, sizeof (var_config_hdr_t) + paylen);
+
+	/*
+	 * Since we are emulating OBP, we must comply with the promif
+	 * infrastructure and execute only on the originating cpu.
+	 */
+	thread_affinity_set(curthread, CPU_CURRENT);
+
+	mutex_enter(&promif_prop_lock);
+	if (cv_timedwait(&promif_prop_cv,
+	    &promif_prop_lock, lbolt + PROMIF_DS_TIMEOUT_SEC * hz) == -1) {
+		cmn_err(CE_WARN, "%s: ds response timeout", me);
+		rv = -1;
+		goto out;
+	}
+
+	cmd = promif_ds_resp.vc_hdr.cmd;
+	if (cmd != VAR_CONFIG_SET_RESP) {
+		cmn_err(CE_WARN, "%s: bad response type: %d", me, cmd);
+		rv = -1;
+		goto out;
+	}
+	rv = (cfg_rsp->result == VAR_CONFIG_SUCCESS) ? valuelen : -1;
+
+out:
+	mutex_exit(&promif_prop_lock);
+	thread_affinity_clear(curthread);
+	return (rv);
+}
+
+int
+promif_setprop(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t node;
+	caddr_t	name;
+	caddr_t	value;
+	int	len;
+
+	ASSERT(ci[1] == 4);
+
+	node  = p1275_cell2dnode(ci[3]);
+	ASSERT(node == prom_optionsnode());
+	name  = p1275_cell2ptr(ci[4]);
+	value = p1275_cell2ptr(ci[5]);
+	len = p1275_cell2int(ci[6]);
+
+	if (promif_stree_getproplen(node, name) != -1)
+		len = promif_ldom_setprop(name, value, len);
+
+	if (len >= 0)
+		len = promif_stree_setprop(node, name, (void *)value, len);
+
+
+	ci[7] = p1275_int2cell(len);
+
+	return ((len == -1) ? len : 0);
+}
+
+#endif
+
+int
+promif_getprop(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	node;
+	caddr_t	name;
+	caddr_t	value;
+	int	len;
+
+	ASSERT(ci[1] == 4);
+
+	node  = p1275_cell2dnode(ci[3]);
+	name  = p1275_cell2ptr(ci[4]);
+	value = p1275_cell2ptr(ci[5]);
+
+	len = promif_stree_getprop(node, name, value);
+
+	ci[7] = p1275_int2cell(len);
+
+	return ((len == -1) ? len : 0);
+}
+
+int
+promif_getproplen(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	node;
+	caddr_t	name;
+	int	len;
+
+	ASSERT(ci[1] == 2);
+
+	node = p1275_cell2dnode(ci[3]);
+	name = p1275_cell2ptr(ci[4]);
+
+	len = promif_stree_getproplen(node, name);
+
+	ci[5] = p1275_int2cell(len);
+
+	return (0);
+}
+
+int
+promif_nextprop(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	pnode_t	node;
+	caddr_t	prev;
+	caddr_t	next;
+
+	ASSERT(ci[1] == 3);
+
+	node = p1275_cell2dnode(ci[3]);
+	prev = p1275_cell2ptr(ci[4]);
+	next = p1275_cell2ptr(ci[5]);
+
+	(void) promif_stree_nextprop(node, prev, next);
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_reboot.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/hypervisor_api.h>
+
+/*
+ * Reboot Command String
+ *
+ * The prom_reboot() CIF handler takes an optional string containing
+ * arguments to the boot command that are to be applied to the reboot.
+ * This information is used to create a full boot command string that
+ * is stored in a well known ldom variable (REBOOT_CMD_VAR_NAME). The
+ * string is constructed to take the following form:
+ *
+ *	boot <specified boot arguments><NULL>
+ *
+ * When the domain comes back up, OBP consults this variable. If set,
+ * it will use the unmodified boot command string to boot the domain.
+ * The maximum length of the boot command is specified by the constant
+ * REBOOT_CMD_MAX_LEN. If the specified arguments cause the command
+ * string to exceed this length, the arguments are truncated.
+ */
+#define	REBOOT_CMD_VAR_NAME		"reboot-command"
+#define	REBOOT_CMD_BASE			"boot "
+#define	REBOOT_CMD_MAX_LEN		256
+#define	REBOOT_CMD_ARGS_MAX_LEN		(REBOOT_CMD_MAX_LEN - 		\
+					prom_strlen(REBOOT_CMD_BASE) - 1)
+int
+promif_reboot(void *p)
+{
+	cell_t	*ci = (cell_t *)p;
+	int	rv = 0;
+#ifndef _KMDB
+	char	*bootargs;
+	char	bootcmd[REBOOT_CMD_MAX_LEN];
+	char	*cmd_end;
+	int	cmd_len;
+#endif
+
+	/* one argument expected */
+	ASSERT(ci[1] == 1);
+
+#ifndef _KMDB
+	bootargs = p1275_cell2ptr(ci[3]);
+
+	if (bootargs == NULL)
+		bootargs = "";
+
+	/* verify the length of the command string */
+	cmd_len = prom_strlen(REBOOT_CMD_BASE) + prom_strlen(bootargs) + 1;
+
+	if (cmd_len > REBOOT_CMD_MAX_LEN) {
+		/*
+		 * Unable to set the requested boot arguments.
+		 * Truncate them so that the boot command will
+		 * fit within the maximum length. This follows
+		 * the policy also used by OBP.
+		 */
+		cmd_end = bootargs + REBOOT_CMD_ARGS_MAX_LEN;
+		*cmd_end = '\0';
+
+		prom_printf("WARNING: reboot command length (%d) too long, "
+		    "truncating command arguments\n", cmd_len);
+	}
+
+	/* construct the boot command string */
+	(void) prom_sprintf(bootcmd, "%s%s", REBOOT_CMD_BASE, bootargs);
+
+	cmd_len = prom_strlen(bootcmd) + 1;
+	ASSERT(cmd_len <= REBOOT_CMD_MAX_LEN);
+
+	CIF_DBG_REBOOT("bootcmd='%s'\n", bootcmd);
+
+	/* attempt to set the ldom variable */
+	if (promif_ldom_setprop(REBOOT_CMD_VAR_NAME, bootcmd, cmd_len) == -1) {
+		prom_printf("WARNING: unable to store boot command for "
+		    "use on reboot\n");
+	}
+#endif
+
+	prom_printf("Resetting...\n");
+
+	rv = hv_mach_sir();
+
+	/* should not return */
+	ASSERT(0);
+
+	return (rv);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_stree.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,455 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/kmem.h>
+#include <sys/machsystm.h>
+
+/*
+ * A property attached to a node in the kernel's
+ * shadow copy of the PROM device tree.
+ */
+typedef struct prom_prop {
+	struct prom_prop *pp_next;
+	char		 *pp_name;
+	int		 pp_len;
+	void		 *pp_val;
+} prom_prop_t;
+
+/*
+ * A node in the kernel's shadow copy of the PROM
+ * device tree.
+ */
+typedef struct prom_node {
+	pnode_t			pn_nodeid;
+	struct prom_prop	*pn_propp;
+	struct prom_node	*pn_parent;
+	struct prom_node	*pn_child;
+	struct prom_node	*pn_sibling;
+} prom_node_t;
+
+static prom_node_t *promif_root;
+
+static prom_node_t *find_node(pnode_t nodeid);
+static prom_node_t *find_node_work(prom_node_t *np, pnode_t node);
+static int getproplen(prom_node_t *pnp, char *name);
+static void *getprop(prom_node_t *pnp, char *name);
+static char *nextprop(prom_node_t *pnp, char *name);
+
+#ifndef _KMDB
+static void create_prop(prom_node_t *pnp, char *name, void *val, int len);
+static prom_node_t *create_node(prom_node_t *parent, pnode_t node);
+static void create_peers(prom_node_t *pnp, pnode_t node);
+static void create_children(prom_node_t *pnp, pnode_t parent);
+#endif
+
+/*
+ * Hooks for kmdb for accessing the PROM shadow tree. The driver portion
+ * of kmdb will retrieve the root of the tree and pass it down to the
+ * debugger portion of kmdb. As the kmdb debugger is standalone, it has
+ * its own promif_root pointer that it will be set to the value passed by
+ * the driver so that kmdb points to the shadow tree maintained by the kernel.
+ * So the "get" function is in the kernel while the "set" function is in kmdb.
+ */
+#ifdef _KMDB
+void
+promif_stree_setroot(void *root)
+{
+	promif_root = (prom_node_t *)root;
+}
+#else
+void *
+promif_stree_getroot(void)
+{
+	return (promif_root);
+}
+#endif
+
+/*
+ * Interfaces used internally by promif functions.
+ * These hide all accesses to the shadow tree.
+ */
+
+pnode_t
+promif_stree_parentnode(pnode_t nodeid)
+{
+	prom_node_t *pnp;
+
+	pnp = find_node(nodeid);
+	if (pnp && pnp->pn_parent) {
+		return (pnp->pn_parent->pn_nodeid);
+	}
+
+	return (OBP_NONODE);
+}
+
+pnode_t
+promif_stree_childnode(pnode_t nodeid)
+{
+	prom_node_t *pnp;
+
+	pnp = find_node(nodeid);
+	if (pnp && pnp->pn_child)
+		return (pnp->pn_child->pn_nodeid);
+
+	return (OBP_NONODE);
+}
+
+pnode_t
+promif_stree_nextnode(pnode_t nodeid)
+{
+	prom_node_t *pnp;
+
+	/*
+	 * Note: next(0) returns the root node
+	 */
+	pnp = find_node(nodeid);
+	if (pnp && (nodeid == OBP_NONODE))
+		return (pnp->pn_nodeid);
+	if (pnp && pnp->pn_sibling)
+		return (pnp->pn_sibling->pn_nodeid);
+
+	return (OBP_NONODE);
+}
+
+int
+promif_stree_getproplen(pnode_t nodeid, char *name)
+{
+	prom_node_t *pnp;
+
+	pnp = find_node(nodeid);
+	if (pnp == NULL)
+		return (-1);
+
+	return (getproplen(pnp, name));
+}
+
+int
+promif_stree_getprop(pnode_t nodeid, char *name, void *value)
+{
+	prom_node_t	*pnp;
+	void		*prop;
+	int		len;
+
+	pnp = find_node(nodeid);
+	if (pnp == NULL) {
+		prom_printf("find_node: no node?\n");
+		return (-1);
+	}
+
+	len = getproplen(pnp, name);
+	if (len > 0) {
+		prop = getprop(pnp, name);
+		bcopy(prop, value, len);
+	} else {
+		prom_printf("find_node: getproplen: %d\n", len);
+	}
+
+	return (len);
+}
+
+char *
+promif_stree_nextprop(pnode_t nodeid, char *name, char *next)
+{
+	prom_node_t	*pnp;
+	char		*propname;
+
+	next[0] = '\0';
+
+	pnp = find_node(nodeid);
+	if (pnp == NULL)
+		return (NULL);
+
+	propname = nextprop(pnp, name);
+	if (propname == NULL)
+		return (next);
+
+	(void) prom_strcpy(next, propname);
+
+	return (next);
+}
+
+static prom_node_t *
+find_node_work(prom_node_t *np, pnode_t node)
+{
+	prom_node_t *nnp;
+
+	if (np->pn_nodeid == node)
+		return (np);
+
+	if (np->pn_child)
+		if ((nnp = find_node_work(np->pn_child, node)) != NULL)
+			return (nnp);
+
+	if (np->pn_sibling)
+		if ((nnp = find_node_work(np->pn_sibling, node)) != NULL)
+			return (nnp);
+
+	return (NULL);
+}
+
+static prom_node_t *
+find_node(pnode_t nodeid)
+{
+
+	if (nodeid == OBP_NONODE)
+		return (promif_root);
+
+	if (promif_root == NULL)
+		return (NULL);
+
+	return (find_node_work(promif_root, nodeid));
+}
+
+static int
+getproplen(prom_node_t *pnp, char *name)
+{
+	struct prom_prop *propp;
+
+	for (propp = pnp->pn_propp; propp != NULL; propp = propp->pp_next)
+		if (prom_strcmp(propp->pp_name, name) == 0)
+			return (propp->pp_len);
+
+	return (-1);
+}
+
+static void *
+getprop(prom_node_t *np, char *name)
+{
+	struct prom_prop *propp;
+
+	for (propp = np->pn_propp; propp != NULL; propp = propp->pp_next)
+		if (prom_strcmp(propp->pp_name, name) == 0)
+			return (propp->pp_val);
+
+	return (NULL);
+}
+
+static char *
+nextprop(prom_node_t *pnp, char *name)
+{
+	struct prom_prop *propp;
+
+	/*
+	 * getting next of NULL or a null string returns the first prop name
+	 */
+	if (name == NULL || *name == '\0')
+		if (pnp->pn_propp)
+			return (pnp->pn_propp->pp_name);
+
+	for (propp = pnp->pn_propp; propp != NULL; propp = propp->pp_next)
+		if (prom_strcmp(propp->pp_name, name) == 0)
+			if (propp->pp_next)
+				return (propp->pp_next->pp_name);
+
+	return (NULL);
+}
+
+#ifndef _KMDB
+
+int
+promif_stree_setprop(pnode_t nodeid, char *name, void *value, int len)
+{
+	prom_node_t		*pnp;
+	struct prom_prop	*prop;
+
+	pnp = find_node(nodeid);
+	if (pnp == NULL) {
+		prom_printf("find_node: no node?\n");
+		return (-1);
+	}
+
+	/*
+	 * If a property with this name exists, replace the existing
+	 * value.
+	 */
+	for (prop = pnp->pn_propp; prop; prop = prop->pp_next)
+		if (prom_strcmp(prop->pp_name, name) == 0) {
+			kmem_free(prop->pp_val, prop->pp_len);
+			prop->pp_val = NULL;
+			if (len > 0) {
+				prop->pp_val = kmem_zalloc(len, KM_SLEEP);
+				bcopy(value, prop->pp_val, len);
+			}
+			prop->pp_len = len;
+			return (len);
+		}
+
+	return (-1);
+}
+
+/*
+ * Create a promif private copy of boot's device tree.
+ */
+void
+promif_stree_init(void)
+{
+	pnode_t		node;
+	prom_node_t	*pnp;
+
+	node = prom_rootnode();
+	promif_root = pnp = create_node(OBP_NONODE, node);
+
+	create_peers(pnp, node);
+	create_children(pnp, node);
+}
+
+static void
+create_children(prom_node_t *pnp, pnode_t parent)
+{
+	prom_node_t	*cnp;
+	pnode_t		child;
+
+	_NOTE(CONSTCOND)
+	while (1) {
+		child = prom_childnode(parent);
+		if (child == 0)
+			break;
+		if (prom_getproplen(child, "name") <= 0) {
+			parent = child;
+			continue;
+		}
+		cnp = create_node(pnp, child);
+		pnp->pn_child = cnp;
+		create_peers(cnp, child);
+		pnp = cnp;
+		parent = child;
+	}
+}
+
+static void
+create_peers(prom_node_t *np, pnode_t node)
+{
+	prom_node_t	*pnp;
+	pnode_t		peer;
+
+	_NOTE(CONSTCOND)
+	while (1) {
+		peer = prom_nextnode(node);
+		if (peer == 0)
+			break;
+		if (prom_getproplen(peer, "name") <= 0) {
+			node = peer;
+			continue;
+		}
+		pnp = create_node(np->pn_parent, peer);
+		np->pn_sibling = pnp;
+		create_children(pnp, peer);
+		np = pnp;
+		node = peer;
+	}
+}
+
+static prom_node_t *
+create_node(prom_node_t *parent, pnode_t node)
+{
+	prom_node_t	*pnp;
+	char		prvname[OBP_MAXPROPNAME];
+	char		propname[OBP_MAXPROPNAME];
+	int		proplen;
+	void		*propval;
+
+	pnp = kmem_zalloc(sizeof (prom_node_t), KM_SLEEP);
+	pnp->pn_nodeid = node;
+	pnp->pn_parent = parent;
+
+	prvname[0] = '\0';
+
+	_NOTE(CONSTCOND)
+	while (1) {
+		(void) prom_nextprop(node, prvname, propname);
+		if (prom_strlen(propname) == 0)
+			break;
+		if ((proplen = prom_getproplen(node, propname)) == -1)
+			continue;
+		propval = NULL;
+		if (proplen != 0) {
+			propval = kmem_zalloc(proplen, KM_SLEEP);
+			(void) prom_getprop(node, propname, propval);
+		}
+		create_prop(pnp, propname, propval, proplen);
+
+		(void) prom_strcpy(prvname, propname);
+	}
+
+	return (pnp);
+}
+
+static void
+create_prop(prom_node_t *pnp, char *name, void *val, int len)
+{
+	struct prom_prop	*prop;
+	struct prom_prop	*newprop;
+
+	newprop = kmem_zalloc(sizeof (*newprop), KM_SLEEP);
+	newprop->pp_name = kmem_zalloc(prom_strlen(name) + 1, KM_SLEEP);
+	(void) prom_strcpy(newprop->pp_name, name);
+	newprop->pp_val = val;
+	newprop->pp_len = len;
+
+	if (pnp->pn_propp == NULL) {
+		pnp->pn_propp = newprop;
+		return;
+	}
+
+	/* move to the end of the prop list */
+	for (prop = pnp->pn_propp; prop->pp_next != NULL; prop = prop->pp_next)
+		/* empty */;
+
+	/* append the new prop */
+	prop->pp_next = newprop;
+}
+
+static void
+promif_dump_tree(prom_node_t *pnp)
+{
+	int		i;
+	static int	level = 0;
+
+	if (pnp == NULL)
+		return;
+
+	for (i = 0; i < level; i++) {
+		prom_printf("    ");
+	}
+
+	prom_printf("Node 0x%x (parent=0x%x, sibling=0x%x)\n", pnp->pn_nodeid,
+	    (pnp->pn_parent) ? pnp->pn_parent->pn_nodeid : 0,
+	    (pnp->pn_sibling) ? pnp->pn_sibling->pn_nodeid : 0);
+
+	if (pnp->pn_child != NULL) {
+		level++;
+		promif_dump_tree(pnp->pn_child);
+		level--;
+	}
+
+	if (pnp->pn_sibling != NULL)
+		promif_dump_tree(pnp->pn_sibling);
+}
+
+#endif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_test.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+
+int
+promif_test(void *p)
+{
+	cell_t		*ci = (cell_t *)p;
+	char		*opname;
+	cif_func_t	func;
+	int		rv;
+
+	ASSERT(ci[1] == 1);
+
+	opname = p1275_cell2ptr(ci[3]);
+
+	func = promif_find_cif_callback(opname);
+
+	/* zero indicates operation is supported */
+	rv = (func != NULL) ? 0 : 1;
+
+	ci[4] = p1275_int2cell(rv);
+
+	return (0);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/promif/promif_version.c	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,82 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/promif_impl.h>
+#include <sys/hypervisor_api.h>
+
+/*
+ * Wrappers to get/set the API version with Hypervisor.
+ */
+
+int
+promif_set_sun4v_api_version(void *p)
+{
+	cell_t *ci = (cell_t *)p;
+	uint64_t api_group;
+	uint64_t major;
+	uint64_t minor;
+	uint64_t status;
+	uint64_t supported_minor;
+
+	ASSERT(ci[1] == 3);
+	ASSERT(ci[2] == 2);
+
+	api_group = (uint64_t)p1275_cell2int(ci[3]);
+	major = (uint64_t)p1275_cell2int(ci[4]);
+	minor = (uint64_t)p1275_cell2int(ci[5]);
+
+	status = hv_api_set_version(api_group, major, minor, &supported_minor);
+
+	ci[6] = p1275_int2cell(status);
+	ci[7] = p1275_int2cell(supported_minor);
+
+	return ((status == H_EOK) ? 0 : -1);
+}
+
+int
+promif_get_sun4v_api_version(void *p)
+{
+	cell_t *ci = (cell_t *)p;
+	uint64_t api_group;
+	uint64_t major;
+	uint64_t minor;
+	uint64_t status;
+
+	ASSERT(ci[1] == 1);
+	ASSERT(ci[2] == 3);
+
+	api_group = (uint64_t)p1275_cell2int(ci[3]);
+
+	status = hv_api_get_version(api_group, &major, &minor);
+
+	ci[4] = p1275_int2cell(status);
+	ci[5] = p1275_int2cell(major);
+	ci[6] = p1275_int2cell(minor);
+
+	return ((status == H_EOK) ? 0 : -1);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/cnex.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,98 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _CNEX_H
+#define	_CNEX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Channel nexus "reg" spec
+ */
+typedef struct cnex_regspec {
+	uint64_t physaddr;
+	uint64_t size;
+} cnex_regspec_t;
+
+/*
+ * Channel nexus interrupt map
+ */
+struct cnex_pil_map {
+	ldc_dev_t	devclass;	/* LDC device class */
+	uint32_t	pil;		/* PIL for device class */
+};
+
+/*
+ * Channel interrupt information
+ */
+typedef struct cnex_intr {
+	uint64_t	ino;		/* dev intr number */
+	uint64_t	icookie;	/* dev intr cookie */
+	uint_t		(*hdlr)();	/* intr handler */
+	caddr_t		arg1;		/* intr argument 1 */
+	caddr_t		arg2;		/* intr argument 2 */
+	void		*ssp;		/* back ptr to soft state */
+} cnex_intr_t;
+
+/* cnex interrupt types */
+typedef enum {
+	CNEX_TX_INTR = 1,		/* transmit interrupt */
+	CNEX_RX_INTR			/* receive interrupt */
+} cnex_intrtype_t;
+
+/*
+ * Channel information
+ */
+typedef struct cnex_ldc {
+	kmutex_t	lock;		/* Channel lock */
+	struct cnex_ldc	*next;
+
+	uint64_t	id;
+	ldc_dev_t 	devclass;	/* Device class channel belongs to */
+
+	cnex_intr_t	tx;		/* Transmit interrupt */
+	cnex_intr_t	rx;		/* Receive interrupt */
+} cnex_ldc_t;
+
+/*
+ * Channel nexus soft state pointer
+ */
+typedef struct cnex_soft_state {
+	dev_info_t 	*devi;
+	uint64_t	cfghdl;		/* cnex config handle */
+	kmutex_t	clist_lock;	/* lock to protect channel list */
+	cnex_ldc_t	*clist;		/* list of registered channels */
+} cnex_soft_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _CNEX_H */
--- a/usr/src/uts/sun4v/sys/cpu_module.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/cpu_module.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -154,6 +153,18 @@
 int	cpu_trapstat_conf(int cmd);
 void	cpu_trapstat_data(void *buf, uint_t pgszs);
 
+#define	NO_EU_MAPPING_FOUND		0xffffffff
+/*
+ * Default MMU pagesize mask for sun4v architecture.
+ */
+#define	DEFAULT_SUN4V_MMU_PAGESIZE_MASK	((1 << TTE8K) | (1 << TTE64K) \
+					    | (1 << TTE4M))
+
+void	cpu_setup_common(char **);
+
+boolean_t	broken_md_flag;
+int	va_bits;
+
 #endif /* _KERNEL */
 
 #ifdef	__cplusplus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/dr_cpu.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DR_CPU_H
+#define	_DR_CPU_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * CPU DR Control Protocol
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * CPU DR Message Header
+ */
+typedef struct {
+	uint64_t	req_num;	/* request number */
+	uint32_t	msg_type;	/* message type */
+	uint32_t	num_records;	/* number of records */
+} dr_cpu_hdr_t;
+
+/*
+ * CPU command and response messages
+ */
+
+#define	DR_CPU_DS_ID		"dr-cpu"
+
+#define	DR_CPU_CONFIGURE	('C')
+#define	DR_CPU_UNCONFIGURE	('U')
+#define	DR_CPU_FORCE_UNCONFIG	('F')
+#define	DR_CPU_STATUS		('S')
+
+#define	DR_CPU_OK		('o')
+#define	DR_CPU_ERROR		('e')
+
+/*
+ * Response Message
+ */
+typedef struct {
+	uint32_t	cpuid;		/* virtual CPU ID */
+	uint32_t	result;		/* result of the operation */
+	uint32_t	status;		/* status of the CPU */
+	uint32_t	string_off;	/* informational string offset */
+} dr_cpu_stat_t;
+
+/*
+ * Result Codes
+ */
+#define	DR_CPU_RES_OK			0x0	/* operation succeeded */
+#define	DR_CPU_RES_FAILURE		0x1	/* operation failed */
+#define	DR_CPU_RES_BLOCKED		0x2	/* operation was blocked */
+#define	DR_CPU_RES_CPU_NOT_RESPONDING	0x3	/* CPU was not responding */
+#define	DR_CPU_RES_NOT_IN_MD		0x4	/* CPU not defined in MD */
+
+/*
+ * Status Codes
+ */
+#define	DR_CPU_STAT_NOT_PRESENT		0x0	/* CPU ID not in MD */
+#define	DR_CPU_STAT_UNCONFIGURED	0x1	/* CPU unconfigured */
+#define	DR_CPU_STAT_CONFIGURED		0x2	/* CPU configured */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_CPU_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/dr_util.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DR_UTIL_H
+#define	_DR_UTIL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v Common DR Header
+ */
+
+#include <sys/ksynch.h>
+#include <sys/cmn_err.h>
+#include <sys/note.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Debugging support
+ */
+#ifdef DEBUG
+
+extern uint_t	dr_debug;
+
+#define	DR_DBG_FLAG_CTL		0x01
+#define	DR_DBG_FLAG_CPU		0x02
+#define	DR_DBG_FLAG_MEM		0x04
+#define	DR_DBG_FLAG_IO		0x08
+#define	DR_DBG_FLAG_TRANS	0x10
+
+#define	DR_DBG_ALL	if (dr_debug)			  printf
+#define	DR_DBG_CTL	if (dr_debug & DR_DBG_FLAG_CTL)	  printf
+#define	DR_DBG_CPU	if (dr_debug & DR_DBG_FLAG_CPU)	  printf
+#define	DR_DBG_MEM	if (dr_debug & DR_DBG_FLAG_MEM)	  printf
+#define	DR_DBG_IO	if (dr_debug & DR_DBG_FLAG_IO)	  printf
+#define	DR_DBG_TRANS	if (dr_debug & DR_DBG_FLAG_TRANS) printf
+
+#define	DR_DBG_DUMP_MSG(buf, len)	dr_dbg_dump_msg(buf, len)
+
+extern void dr_dbg_dump_msg(void *buf, size_t len);
+
+#else /* DEBUG */
+
+#define	DR_DBG_ALL	_NOTE(CONSTCOND) if (0)	printf
+#define	DR_DBG_CTL	DR_DBG_ALL
+#define	DR_DBG_CPU	DR_DBG_ALL
+#define	DR_DBG_MEM	DR_DBG_ALL
+#define	DR_DBG_IO	DR_DBG_ALL
+#define	DR_DBG_TRANS	DR_DBG_ALL
+
+#define	DR_DBG_DUMP_MSG(buf, len)
+
+#endif /* DEBUG */
+
+typedef enum {
+	DR_TYPE_INVAL,
+	DR_TYPE_CPU,
+	DR_TYPE_MEM,
+	DR_TYPE_VIO,
+	DR_TYPE_DIO
+} dr_type_t;
+
+/*
+ * Macro to convert a dr_type_t into a string. These strings are
+ * used to generate DR events and should only be modified using
+ * extreme caution.
+ */
+#define	DR_TYPE2STR(t)	((t) == DR_TYPE_INVAL ? "invalid" :	\
+			    (t) == DR_TYPE_CPU ? OBP_CPU : 	\
+			    (t) == DR_TYPE_MEM ? "memory" :	\
+			    (t) == DR_TYPE_VIO ? "vio" :	\
+			    (t) == DR_TYPE_DIO ? "dio" :	\
+			    "unknown")
+
+extern boolean_t dr_is_disabled(dr_type_t type);
+extern void dr_generate_event(dr_type_t type, int se_hint);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DR_UTIL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/ds.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,114 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DS_H
+#define	_DS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Domain Services Client Interface
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint64_t	ds_svc_hdl_t;	/* opaque service handle */
+typedef void		*ds_cb_arg_t;	/* client specified callback arg */
+
+#define	DS_INVALID_HDL	(0)		/* a ds handle cannot be zero */
+
+/*
+ * Domain Services Versioning
+ */
+typedef struct ds_ver {
+	uint16_t	major;
+	uint16_t	minor;
+} ds_ver_t;
+
+/*
+ * Domain Services Capability
+ *
+ * A DS capability is exported by a client using a unique service
+ * identifier string. Along with this identifier is the list of
+ * versions of the capability that the client supports.
+ */
+typedef struct ds_capability {
+	char		*svc_id;	/* service identifier */
+	ds_ver_t	*vers;		/* list of supported versions */
+	int		nvers;		/* number of supported versions */
+} ds_capability_t;
+
+/*
+ * Domain Services Client Event Callbacks
+ *
+ * A client implementing a DS capability provides a set of callbacks
+ * when it registers with the DS framework. The use of these callbacks
+ * is described below:
+ *
+ *    ds_reg_cb(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl)
+ *
+ *	    The ds_reg_cb() callback is invoked when the DS framework
+ *	    has successfully completed version negotiation with the
+ *	    remote endpoint for the capability. It provides the client
+ *	    with the negotiated version and a handle to use when sending
+ *	    data.
+ *
+ *    ds_unreg_cb(ds_cb_arg_t arg)
+ *
+ *	    The ds_unreg_cb() callback is invoked when the DS framework
+ *	    detects an event that causes the registered capability to
+ *	    become unavailable. This includes an explicit unregister
+ *	    message, a failure in the underlying communication transport,
+ *	    etc. Any such event invalidates the service handle that was
+ *	    received from the register callback.
+ *
+ *    ds_data_cb(ds_cb_arg_t arg, void *buf, size_t buflen)
+ *
+ *	    The ds_data_cb() callback is invoked whenever there is an
+ *	    incoming data message for the client to process. It provides
+ *	    the contents of the message along with the message length.
+ */
+typedef struct ds_clnt_ops {
+	void (*ds_reg_cb)(ds_cb_arg_t arg, ds_ver_t *ver, ds_svc_hdl_t hdl);
+	void (*ds_unreg_cb)(ds_cb_arg_t arg);
+	void (*ds_data_cb)(ds_cb_arg_t arg, void *buf, size_t buflen);
+	ds_cb_arg_t cb_arg;
+} ds_clnt_ops_t;
+
+/*
+ * Domain Services Capability Interface
+ */
+extern int ds_cap_init(ds_capability_t *cap, ds_clnt_ops_t *ops);
+extern int ds_cap_fini(ds_capability_t *cap);
+extern int ds_cap_send(ds_svc_hdl_t hdl, void *buf, size_t buflen);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DS_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/ds_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,332 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _DS_IMPL_H
+#define	_DS_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The Domain Services Protocol
+ *
+ * The DS protocol is divided into two parts. The first is fixed and
+ * must remain exactly the same for *all* versions of the DS protocol.
+ * The only messages supported by the fixed portion of the protocol are
+ * to negotiate a version to use for the rest of the protocol.
+ */
+
+/*
+ * Domain Services Header
+ */
+typedef struct ds_hdr {
+	uint32_t	msg_type;	/* message type */
+	uint32_t	payload_len;	/* payload length */
+} ds_hdr_t;
+
+#define	DS_HDR_SZ	(sizeof (ds_hdr_t))
+
+/*
+ * DS Fixed Message Types
+ */
+#define	DS_INIT_REQ		0x0	/* initiate DS connection */
+#define	DS_INIT_ACK		0x1	/* initiation acknowledgement */
+#define	DS_INIT_NACK		0x2	/* initiation negative acknowledgment */
+
+/*
+ * DS Fixed Initialization Messages
+ */
+typedef struct ds_init_req {
+	uint16_t	major_vers;	/* requested major version */
+	uint16_t	minor_vers;	/* requested minor version */
+} ds_init_req_t;
+
+typedef struct ds_init_ack {
+	uint16_t	minor_vers;	/* highest supported minor version */
+} ds_init_ack_t;
+
+typedef struct ds_init_nack {
+	uint16_t	major_vers;	/* alternate supported major version */
+} ds_init_nack_t;
+
+/*
+ * DS Message Types for Version 1.0
+ */
+#define	DS_REG_REQ		0x3	/* register a service */
+#define	DS_REG_ACK		0x4	/* register acknowledgement */
+#define	DS_REG_NACK		0x5	/* register failed */
+#define	DS_UNREG		0x6	/* unregister a service */
+#define	DS_UNREG_ACK		0x7	/* unregister acknowledgement */
+#define	DS_UNREG_NACK		0x8	/* unregister failed */
+#define	DS_DATA			0x9	/* data message */
+#define	DS_NACK			0xa	/* data error */
+
+/* result codes */
+#define	DS_OK			0x0	/* success */
+#define	DS_REG_VER_NACK		0x1	/* unsupported major version */
+#define	DS_REG_DUP		0x2	/* duplicate registration attempted */
+#define	DS_INV_HDL		0x3	/* service handle not valid */
+#define	DS_TYPE_UNKNOWN		0x4	/* unknown message type received */
+
+/*
+ * Service Register Messages
+ */
+typedef struct ds_reg_req {
+	uint64_t	svc_handle;	/* service handle to register */
+	uint16_t	major_vers;	/* requested major version */
+	uint16_t	minor_vers;	/* requested minor version */
+	char		svc_id[1];	/* service identifier string */
+} ds_reg_req_t;
+
+typedef struct ds_reg_ack {
+	uint64_t	svc_handle;	/* service handle sent in register */
+	uint16_t	minor_vers;	/* highest supported minor version */
+} ds_reg_ack_t;
+
+typedef struct ds_reg_nack {
+	uint64_t	svc_handle;	/* service handle sent in register */
+	uint64_t	result;		/* reason for the failure */
+	uint16_t	major_vers;	/* alternate supported major version */
+} ds_reg_nack_t;
+
+/*
+ * Service Unregister Messages
+ */
+typedef struct ds_unreg_req {
+	uint64_t	svc_handle;	/* service handle to unregister */
+} ds_unreg_req_t;
+
+typedef struct ds_unreg_ack {
+	uint64_t	svc_handle;	/* service handle sent in unregister */
+} ds_unreg_ack_t;
+
+typedef struct ds_unreg_nack {
+	uint64_t	svc_handle;	/* service handle sent in unregister */
+} ds_unreg_nack_t;
+
+/*
+ * Data Transfer Messages
+ */
+typedef struct ds_data_handle {
+	uint64_t	svc_handle;	/* service handle for data */
+} ds_data_handle_t;
+
+typedef struct ds_data_nack {
+	uint64_t	svc_handle;	/* service handle sent in data msg */
+	uint64_t	result;		/* reason for failure */
+} ds_data_nack_t;
+
+/*
+ * Message Processing Utilities
+ */
+#define	DS_MSG_TYPE_VALID(type)		((type) <= DS_NACK)
+#define	DS_MSG_LEN(ds_type)		(sizeof (ds_hdr_t) + sizeof (ds_type))
+
+
+/*
+ * Domain Service Port
+ *
+ * A DS port is a logical representation of an LDC dedicated to
+ * communication between DS endpoints. The ds_port_t maintains state
+ * associated with a connection to a remote endpoint. This includes
+ * the state of the port, the LDC state, the current version of the
+ * DS protocol in use on the port, and other port properties.
+ *
+ * Locking: The port is protected by a single mutex. It must be held
+ *   while the port structure is being accessed and also when data is
+ *   being read or written using the port.
+ */
+typedef enum {
+	DS_PORT_FREE,			/* port structure not in use */
+	DS_PORT_INIT,			/* port structure created */
+	DS_PORT_LDC_INIT,		/* ldc successfully initialized */
+	DS_PORT_INIT_REQ,		/* initialization handshake sent */
+	DS_PORT_READY			/* init handshake completed */
+} ds_port_state_t;
+
+typedef struct ds_ldc {
+	uint64_t	id;		/* LDC id */
+	ldc_handle_t	hdl;		/* LDC handle */
+	ldc_status_t	state;		/* current LDC state */
+} ds_ldc_t;
+
+typedef struct ds_port {
+	kmutex_t	lock;		/* port lock */
+	uint64_t	id;		/* port id from MD */
+	ds_port_state_t	state;		/* state of the port */
+	ds_ver_t	ver;		/* DS protocol version in use */
+	uint32_t	ver_idx;	/* index of version during handshake */
+	ds_ldc_t	ldc;		/* LDC for this port */
+} ds_port_t;
+
+/*
+ * A DS portset is a bitmap that represents a collection of DS
+ * ports. Each bit represents a particular port id. The current
+ * implementation constrains the maximum number of ports to 64.
+ */
+typedef uint64_t ds_portset_t;
+
+#define	DS_MAX_PORTS			((sizeof (ds_portset_t)) * 8)
+#define	DS_MAX_PORT_ID			(DS_MAX_PORTS - 1)
+
+#define	DS_PORT_SET(port)		(1UL << (port))
+#define	DS_PORT_IN_SET(set, port)	((set) & DS_PORT_SET(port))
+#define	DS_PORTSET_ADD(set, port)	((void)((set) |= DS_PORT_SET(port)))
+#define	DS_PORTSET_DEL(set, port)	((void)((set) &= ~DS_PORT_SET(port)))
+#define	DS_PORTSET_ISNULL(set)		((set) == 0)
+#define	DS_PORTSET_DUP(set1, set2)	((void)((set1) = (set2)))
+
+/*
+ * LDC Information
+ */
+#define	DS_QUEUE_LEN	128		/* LDC queue size */
+
+/*
+ * Machine Description Constants
+ */
+#define	DS_MD_PORT_NAME		"domain-services-port"
+#define	DS_MD_CHAN_NAME		"channel-endpoint"
+
+/*
+ * DS Services
+ *
+ * A DS Service is a mapping between a DS capability and a client
+ * of the DS framework that provides that capability. It includes
+ * information on the state of the service, the currently negotiated
+ * version of the capability specific protocol, the port that is
+ * currently in use by the capability, etc.
+ */
+
+typedef enum {
+	DS_SVC_INVAL,			/* svc structure uninitialized */
+	DS_SVC_FREE,			/* svc structure not in use */
+	DS_SVC_INACTIVE,		/* svc not registered */
+	DS_SVC_REG_PENDING,		/* register message sent */
+	DS_SVC_ACTIVE			/* register message acknowledged */
+} ds_svc_state_t;
+
+typedef struct ds_svc {
+	ds_capability_t	cap;		/* capability information */
+	ds_clnt_ops_t	ops;		/* client ops vector */
+	ds_svc_hdl_t	hdl;		/* handle assigned by DS */
+	ds_svc_state_t	state;		/* current service state */
+	ds_ver_t	ver;		/* svc protocol version in use */
+	uint_t		ver_idx;	/* index into client version array */
+	ds_port_t	*port;		/* port for this service */
+	ds_portset_t	avail;		/* ports available to this service */
+} ds_svc_t;
+
+#define	DS_SVC_ISFREE(svc)	((svc == NULL) || (svc->state == DS_SVC_FREE))
+
+/*
+ * A service handle is a 64 bit value with two pieces of information
+ * encoded in it. The upper 32 bits is the index into the table of
+ * a particular service structure. The lower 32 bits is a counter
+ * that is incremented each time a service structure is reused.
+ */
+#define	DS_IDX_SHIFT			32
+#define	DS_COUNT_MASK			0xfffffffful
+
+#define	DS_ALLOC_HDL(_idx, _count)	(((uint64_t)_idx << DS_IDX_SHIFT) | \
+					((uint64_t)(_count + 1) &	    \
+					DS_COUNT_MASK))
+#define	DS_HDL2IDX(hdl)			(hdl >> DS_IDX_SHIFT)
+#define	DS_HDL2COUNT(hdl)		(hdl & DS_COUNT_MASK)
+
+/*
+ * DS Message Logging
+ *
+ * The DS framework logs all incoming and outgoing messages to a
+ * ring buffer. This provides the ability to reconstruct a trace
+ * of DS activity for use in debugging. In addition to the message
+ * data, each log entry contains a timestamp and the destination
+ * of the message. The destination is based on the port number the
+ * message passed through (port number + 1). The sign of the dest
+ * field distinguishes incoming messages from outgoing messages.
+ * Incoming messages have a negative destination field.
+ */
+
+typedef struct ds_log_entry {
+	struct ds_log_entry	*next;		/* next in log or free list */
+	struct ds_log_entry	*prev;		/* previous in log */
+	time_t			timestamp;	/* time message added to log */
+	size_t			datasz;		/* size of the data */
+	void			*data;		/* the data itself */
+	int32_t			dest;		/* message destination */
+} ds_log_entry_t;
+
+#define	DS_LOG_IN(pid)		(-(pid + 1))
+#define	DS_LOG_OUT(pid)		(pid + 1)
+
+/*
+ * DS Log Limits:
+ *
+ * The size of the log is controlled by two limits. The first is
+ * a soft limit that is configurable by the user (via the global
+ * variable ds_log_sz). When this limit is exceeded, each new
+ * message that is added to the log replaces the oldest message.
+ *
+ * The second is a hard limit that is calculated based on the soft
+ * limit (DS_LOG_LIMIT). It is defined to be ~3% above the soft limit.
+ * Once this limit is exceeded, a thread is scheduled to delete old
+ * messages until the size of the log is below the soft limit.
+ */
+#define	DS_LOG_DEFAULT_SZ	(128 * 1024)	/* 128 KB */
+
+#define	DS_LOG_LIMIT		(ds_log_sz + (ds_log_sz >> 5))
+
+#define	DS_LOG_ENTRY_SZ(ep)	(sizeof (ds_log_entry_t) + (ep)->datasz)
+
+/*
+ * DS Log Memory Usage:
+ *
+ * The log free list is initialized from a pre-allocated pool of entry
+ * structures (the global ds_log_entry_pool). The number of entries
+ * in the pool (DS_LOG_NPOOL) is the number of entries that would
+ * take up half the default size of the log.
+ *
+ * As messages are added to the log, entry structures are pulled from
+ * the free list. If the free list is empty, memory is allocated for
+ * the entry. When entries are removed from the log, they are placed
+ * on the free list. Allocated memory is only deallocated when the
+ * entire log is destroyed.
+ */
+#define	DS_LOG_NPOOL		((DS_LOG_DEFAULT_SZ >> 1) / \
+				sizeof (ds_log_entry_t))
+
+#define	DS_LOG_POOL_END		(ds_log_entry_pool + DS_LOG_NPOOL)
+
+#define	DS_IS_POOL_ENTRY(ep)	(((ep) >= ds_log_entry_pool) && \
+				((ep) <= &(ds_log_entry_pool[DS_LOG_NPOOL])))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _DS_IMPL_H */
--- a/usr/src/uts/sun4v/sys/error.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/error.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -33,15 +32,11 @@
 extern "C" {
 #endif
 
-#define	CPU_RQ_ENTRIES		64
-#define	CPU_NRQ_ENTRIES		64
-
-
 /*
  * Resumable and Non-resumable queues
  */
-#define	CPU_RQ			0x3e
-#define	CPU_NRQ			0x3f
+#define	CPU_RQ_ENTRIES		64
+#define	CPU_NRQ_ENTRIES		64
 #define	Q_ENTRY_SIZE		64
 #define	CPU_RQ_SIZE		(CPU_RQ_ENTRIES * Q_ENTRY_SIZE)
 #define	CPU_NRQ_SIZE		(CPU_NRQ_ENTRIES * Q_ENTRY_SIZE)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/fault_iso.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FAULT_ISO_H
+#define	_FAULT_ISO_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* FMA CPU message numbers */
+#define	FMA_CPU_REQ_STATUS	0x0
+#define	FMA_CPU_REQ_OFFLINE	0x1
+#define	FMA_CPU_REQ_ONLINE	0x2
+
+typedef struct {
+	uint64_t	req_num;
+	uint32_t	msg_type;
+	uint32_t	cpu_id;
+} fma_cpu_service_req_t;
+
+/* FMA CPU result codes */
+#define	FMA_CPU_RESP_OK		0x0
+#define	FMA_CPU_RESP_FAILURE	0x1
+
+/* FMA CPU status codes */
+#define	FMA_CPU_STAT_ONLINE	0x0
+#define	FMA_CPU_STAT_OFFLINE	0x1
+#define	FMA_CPU_STAT_ILLEGAL	0x2
+
+typedef struct {
+	uint64_t	req_num;
+	uint32_t	result;
+	uint32_t	status;
+} fma_cpu_resp_t;
+
+/* FMA memory services message numbers */
+#define	FMA_MEM_REQ_STATUS	0x0
+#define	FMA_MEM_REQ_RETIRE	0x1
+#define	FMA_MEM_REQ_RESURRECT	0x2
+
+typedef struct {
+	uint64_t	req_num;
+	uint32_t	msg_type;
+	uint32_t	_resvd;
+	uint64_t	real_addr;
+	uint64_t	length;
+} fma_mem_service_req_t;
+
+/* FMA result codes */
+#define	FMA_MEM_RESP_OK		0x0
+#define	FMA_MEM_RESP_FAILURE	0x1
+
+/* FMA status codes */
+#define	FMA_MEM_STAT_NOTRETIRED		0x0
+#define	FMA_MEM_STAT_RETIRED		0x1
+#define	FMA_MEM_STAT_ILLEGAL		0x2
+
+typedef struct {
+	uint64_t	req_num;
+	uint32_t	result;
+	uint32_t	status;
+	uint64_t	res_addr;
+	uint64_t	res_length;
+} fma_mem_resp_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FAULT_ISO_H */
--- a/usr/src/uts/sun4v/sys/hsvc.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/hsvc.h	Tue May 16 16:05:21 2006 -0700
@@ -43,6 +43,7 @@
  */
 #define	HSVC_GROUP_SUN4V		0x0000
 #define	HSVC_GROUP_CORE			0x0001
+#define	HSVC_GROUP_INTR			0x0002
 #define	HSVC_GROUP_VPCI			0x0100
 #define	HSVC_GROUP_LDC			0x0101
 #define	HSVC_GROUP_VSC			0x0102
--- a/usr/src/uts/sun4v/sys/hypervisor_api.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/hypervisor_api.h	Tue May 16 16:05:21 2006 -0700
@@ -49,6 +49,8 @@
 #define	MMU_MAP_ADDR		0x83
 #define	MMU_UNMAP_ADDR		0x84
 
+#define	CORE_TRAP		0xff
+
 /*
  * Error returns in %o0.
  * (Additional result is returned in %o1.)
@@ -71,6 +73,8 @@
 #define	H_ENOMAP		14	/* Mapping is not valid, */
 					/* no translation exists */
 #define	H_EBUSY			17	/* Resource busy */
+#define	H_ETOOMANY		15	/* Hard resource limit exceeded */
+#define	H_ECHANNEL		16	/* Illegal LDC channel */
 
 #define	H_BREAK			-1	/* Console Break */
 #define	H_HUP			-2	/* Console Break */
@@ -85,9 +89,15 @@
  */
 #define	HV_MACH_EXIT		0x00
 #define	HV_MACH_DESC		0x01
+#define	HV_MACH_SIR		0x02
+
+#define	HV_CPU_START		0x10
+#define	HV_CPU_STOP		0x11
 #define	HV_CPU_YIELD		0x12
-#define	CPU_QCONF		0x14
+#define	HV_CPU_QCONF		0x14
 #define	HV_CPU_STATE		0x17
+#define	HV_CPU_SET_RTBA		0x18
+
 #define	MMU_TSB_CTX0		0x20
 #define	MMU_TSB_CTXNON0		0x21
 #define	MMU_DEMAP_PAGE		0x22
@@ -95,20 +105,24 @@
 #define	MMU_DEMAP_ALL		0x24
 #define	MAP_PERM_ADDR		0x25
 #define	MMU_SET_INFOPTR		0x26
+#define	MMU_ENABLE		0x27
 #define	UNMAP_PERM_ADDR		0x28
+
 #define	HV_MEM_SCRUB		0x31
 #define	HV_MEM_SYNC		0x32
+
 #define	HV_INTR_SEND		0x42
+
 #define	TOD_GET			0x50
 #define	TOD_SET			0x51
-#define	CONS_READ		0x60
-#define	CONS_WRITE		0x61
+
+#define	CONS_GETCHAR		0x60
+#define	CONS_PUTCHAR		0x61
 
 #define	TTRACE_BUF_CONF		0x90
 #define	TTRACE_BUF_INFO		0x91
 #define	TTRACE_ENABLE		0x92
 #define	TTRACE_FREEZE		0x93
-
 #define	DUMP_BUF_UPDATE		0x94
 
 #define	HVIO_INTR_DEVINO2SYSINO	0xa0
@@ -119,6 +133,31 @@
 #define	HVIO_INTR_GETTARGET	0xa5
 #define	HVIO_INTR_SETTARGET	0xa6
 
+#define	VINTR_GET_COOKIE	0xa7
+#define	VINTR_SET_COOKIE	0xa8
+#define	VINTR_GET_VALID		0xa9
+#define	VINTR_SET_VALID		0xaa
+#define	VINTR_GET_STATE		0xab
+#define	VINTR_SET_STATE		0xac
+#define	VINTR_GET_TARGET	0xad
+#define	VINTR_SET_TARGET	0xae
+
+#define	LDC_TX_QCONF		0xe0
+#define	LDC_TX_QINFO		0xe1
+#define	LDC_TX_GET_STATE	0xe2
+#define	LDC_TX_SET_QTAIL	0xe3
+#define	LDC_RX_QCONF		0xe4
+#define	LDC_RX_QINFO		0xe5
+#define	LDC_RX_GET_STATE	0xe6
+#define	LDC_RX_SET_QHEAD	0xe7
+
+#define	LDC_SET_MAP_TABLE	0xea
+#define	LDC_GET_MAP_TABLE	0xeb
+#define	LDC_COPY		0xec
+#define	LDC_MAPIN		0xed
+#define	LDC_UNMAP		0xee
+#define	LDC_REVOKE		0xef
+
 #ifdef SET_MMU_STATS
 #define	MMU_STAT_AREA		0xfc
 #endif /* SET_MMU_STATS */
@@ -127,6 +166,15 @@
 #define	HV_HPRIV		0x201
 
 /*
+ * Function numbers for CORE_TRAP.
+ */
+#define	API_SET_VERSION		0x00
+#define	API_PUT_CHAR		0x01
+#define	API_EXIT		0x02
+#define	API_GET_VERSION		0x03
+
+
+/*
  * Bits for MMU functions flags argument:
  *	arg3 of MMU_MAP_ADDR
  *	arg3 of MMU_DEMAP_CTX
@@ -188,14 +236,14 @@
 };
 #endif /* SET_MMU_STATS */
 
-#endif /* _ASM */
+#endif /* ! _ASM */
 
 /*
  * CPU States
  */
 #define	CPU_STATE_INVALID	0x0
-#define	CPU_STATE_IDLE		0x1	/* cpu not started */
-#define	CPU_STATE_GUEST		0x2	/* cpu running guest code */
+#define	CPU_STATE_STOPPED	0x1	/* cpu not started */
+#define	CPU_STATE_RUNNING	0x2	/* cpu running guest code */
 #define	CPU_STATE_ERROR		0x3	/* cpu is in the error state */
 #define	CPU_STATE_LAST_PUBLIC	CPU_STATE_ERROR	/* last valid state */
 
@@ -256,19 +304,34 @@
 #define	HVIO_DMA_SYNC_DIR_TO_DEV	0x01
 #define	HVIO_DMA_SYNC_DIR_FROM_DEV	0x02
 
+/*
+ * LDC Channel States
+ */
+#define	LDC_CHANNEL_DOWN	0x0
+#define	LDC_CHANNEL_UP		0x1
+#define	LDC_CHANNEL_RESET	0x2
+
 #ifndef _ASM
 
 extern uint64_t hv_mmu_map_perm_addr(void *, int, uint64_t, int);
 extern uint64_t	hv_mmu_unmap_perm_addr(void *, int, int);
+extern uint64_t hv_mach_exit(uint64_t exit_code);
+extern uint64_t hv_mach_sir(void);
+
+extern uint64_t hv_cpu_start(uint64_t cpuid, uint64_t pc, uint64_t rtba,
+    uint64_t arg);
+extern uint64_t hv_cpu_stop(uint64_t cpuid);
+extern uint64_t hv_cpu_set_rtba(uint64_t *rtba);
+
 extern uint64_t	hv_set_ctx0(uint64_t, uint64_t);
 extern uint64_t	hv_set_ctxnon0(uint64_t, uint64_t);
+extern uint64_t hv_mmu_fault_area_conf(void *raddr);
 #ifdef SET_MMU_STATS
 extern uint64_t hv_mmu_set_stat_area(uint64_t, uint64_t);
 #endif /* SET_MMU_STATS */
 
 extern uint64_t hv_cpu_qconf(int queue, uint64_t paddr, int size);
-extern uint64_t hv_cpu_yield();
-
+extern uint64_t hv_cpu_yield(void);
 extern uint64_t hv_cpu_state(uint64_t cpuid, uint64_t *cpu_state);
 extern uint64_t hv_mem_scrub(uint64_t real_addr, uint64_t length,
     uint64_t *scrubbed_len);
@@ -282,7 +345,6 @@
 extern uint64_t hv_service_getstatus(uint64_t s_id, uint64_t *vreg);
 extern uint64_t hv_service_setstatus(uint64_t s_id, uint64_t bits);
 extern uint64_t hv_service_clrstatus(uint64_t s_id, uint64_t bits);
-
 extern uint64_t hv_mach_desc(uint64_t buffer_ra, uint64_t *buffer_sizep);
 
 extern uint64_t hv_ttrace_buf_info(uint64_t *, uint64_t *);
@@ -300,16 +362,64 @@
 extern uint64_t hvio_intr_devino_to_sysino(uint64_t dev_hdl, uint32_t devino,
     uint64_t *sysino);
 extern uint64_t hvio_intr_getvalid(uint64_t sysino,
-    int *intr_valid_state);
+	int *intr_valid_state);
 extern uint64_t hvio_intr_setvalid(uint64_t sysino,
-    int intr_valid_state);
+	int intr_valid_state);
 extern uint64_t hvio_intr_getstate(uint64_t sysino,
-    int *intr_state);
+	int *intr_state);
 extern uint64_t hvio_intr_setstate(uint64_t sysino, int intr_state);
 extern uint64_t hvio_intr_gettarget(uint64_t sysino, uint32_t *cpuid);
 extern uint64_t hvio_intr_settarget(uint64_t sysino, uint32_t cpuid);
 
-#endif
+extern uint64_t hv_ldc_tx_qconf(uint64_t channel, uint64_t ra_base,
+    uint64_t nentries);
+extern uint64_t hv_ldc_tx_qinfo(uint64_t channel, uint64_t *ra_base,
+    uint64_t *nentries);
+extern uint64_t hv_ldc_tx_get_state(uint64_t channel, uint64_t *headp,
+    uint64_t *tailp, uint64_t *state);
+extern uint64_t hv_ldc_tx_set_qtail(uint64_t channel, uint64_t tail);
+extern uint64_t hv_ldc_rx_qconf(uint64_t channel, uint64_t ra_base,
+    uint64_t nentries);
+extern uint64_t hv_ldc_rx_qinfo(uint64_t channel, uint64_t *ra_base,
+    uint64_t *nentries);
+extern uint64_t hv_ldc_rx_get_state(uint64_t channel, uint64_t *headp,
+    uint64_t *tailp, uint64_t *state);
+extern uint64_t hv_ldc_rx_set_qhead(uint64_t channel, uint64_t head);
+
+extern uint64_t hv_ldc_set_map_table(uint64_t channel, uint64_t tbl_ra,
+    uint64_t tbl_entries);
+extern uint64_t hv_ldc_get_map_table(uint64_t channel, uint64_t *tbl_ra,
+    uint64_t *tbl_entries);
+extern uint64_t hv_ldc_copy(uint64_t channel, uint64_t request,
+    uint64_t cookie, uint64_t raddr, uint64_t length, uint64_t *lengthp);
+extern uint64_t hv_ldc_mapin(uint64_t channel, uint64_t cookie,
+    uint64_t *raddr, uint64_t *perm);
+extern uint64_t hv_ldc_unmap(uint64_t raddr);
+extern uint64_t hv_ldc_revoke(uint64_t raddr);
+extern uint64_t hv_api_get_version(uint64_t api_group, uint64_t *majorp,
+    uint64_t *minorp);
+extern uint64_t hv_api_set_version(uint64_t api_group, uint64_t major,
+    uint64_t minor, uint64_t *supported_minor);
+
+extern uint64_t hvldc_intr_getcookie(uint64_t dev_hdl, uint32_t devino,
+    uint64_t *cookie);
+extern uint64_t hvldc_intr_setcookie(uint64_t dev_hdl, uint32_t devino,
+    uint64_t cookie);
+extern uint64_t hvldc_intr_getvalid(uint64_t dev_hdl, uint32_t devino,
+    int *intr_valid_state);
+extern uint64_t hvldc_intr_setvalid(uint64_t dev_hdl, uint32_t devino,
+    int intr_valid_state);
+extern uint64_t hvldc_intr_getstate(uint64_t dev_hdl, uint32_t devino,
+    int *intr_state);
+extern uint64_t hvldc_intr_setstate(uint64_t dev_hdl, uint32_t devino,
+    int intr_state);
+extern uint64_t hvldc_intr_gettarget(uint64_t dev_hdl, uint32_t devino,
+    uint32_t *cpuid);
+extern uint64_t hvldc_intr_settarget(uint64_t dev_hdl, uint32_t devino,
+    uint32_t cpuid);
+
+#endif /* ! _ASM */
+
 
 #ifdef __cplusplus
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/ldc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,221 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LDC_H
+#define	_LDC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ioctl.h>
+#include <sys/processor.h>
+
+/* Types */
+typedef uint64_t ldc_handle_t;		/* Channel handle */
+typedef uint64_t ldc_mem_handle_t;	/* Channel memory handle */
+typedef uint64_t ldc_dring_handle_t;	/* Descriptor ring handle */
+
+/* LDC transport mode */
+typedef enum {
+	LDC_MODE_RAW,			/* Raw mode */
+	LDC_MODE_UNRELIABLE,		/* Unreliable packet mode */
+	LDC_MODE_RELIABLE,		/* Reliable packet mode */
+	LDC_MODE_STREAM			/* Reliable byte stream */
+} ldc_mode_t;
+
+/* LDC message payload sizes */
+#define	LDC_ELEM_SIZE			8		/* size in bytes */
+#define	LDC_PACKET_SIZE			(LDC_ELEM_SIZE * 8)
+#define	LDC_PAYLOAD_SIZE_RAW		(LDC_PACKET_SIZE)
+#define	LDC_PAYLOAD_SIZE_UNRELIABLE	(LDC_PACKET_SIZE - LDC_ELEM_SIZE)
+#define	LDC_PAYLOAD_SIZE_RELIABLE	(LDC_PACKET_SIZE - (LDC_ELEM_SIZE * 2))
+
+/* LDC Channel Status */
+typedef enum {
+	LDC_INIT = 1,			/* Channel initialized */
+	LDC_OPEN,			/* Channel open */
+	LDC_READY,			/* Channel peer opened (hw-link-up) */
+	LDC_UP				/* Channel UP - ready for data xfer */
+} ldc_status_t;
+
+/* Callback return values */
+#define	LDC_SUCCESS	0
+#define	LDC_FAILURE	1
+
+/* LDC callback mode */
+typedef enum {
+	LDC_CB_ENABLE,			/* Enable callbacks */
+	LDC_CB_DISABLE			/* Disable callbacks */
+} ldc_cb_mode_t;
+
+/* Callback events */
+#define	LDC_EVT_DOWN		0x1	/* Channel DOWN, status = OPEN */
+#define	LDC_EVT_RESET		0x2	/* Channel RESET, status = READY */
+#define	LDC_EVT_UP		0x4	/* Channel UP, status = UP */
+#define	LDC_EVT_READ		0x8	/* Channel has data for read */
+#define	LDC_EVT_WRITE		0x10	/* Channel has space for write */
+
+/* LDC device classes */
+typedef enum {
+	LDC_DEV_GENERIC = 1,		/* generic device */
+	LDC_DEV_BLK,			/* block device, eg. vdc */
+	LDC_DEV_BLK_SVC,		/* block device service, eg. vds */
+	LDC_DEV_NT,			/* network device, eg. vnet */
+	LDC_DEV_NT_SVC,			/* network service eg. vsw */
+	LDC_DEV_SERIAL			/* serial device eg. vldc, vcc */
+} ldc_dev_t;
+
+/* Channel nexus registration */
+typedef struct ldc_cnex {
+	dev_info_t	*dip;		/* dip of channel nexus */
+	int		(*reg_chan)();	/* interface for channel register */
+	int		(*unreg_chan)(); /* interface for channel unregister */
+	int		(*add_intr)();	/* interface for adding interrupts */
+	int		(*rem_intr)();	/* interface for removing interrupts */
+	int		(*clr_intr)();	/* interface for clearing interrupts */
+} ldc_cnex_t;
+
+/* LDC attribute structure */
+
+/*
+ * FIXME: Attribute passed in should be an MTU size
+ * Allocate the queue internally to ldc module to accomodate
+ * based on MTU size. For streaming mode, size can be zero.
+ */
+
+typedef struct ldc_attr {
+	ldc_dev_t	devclass;	/* device class */
+	uint64_t	instance;	/* device class instance */
+	ldc_mode_t	mode;		/* channel mode */
+	uint64_t	qlen;		/* channel queue elements */
+} ldc_attr_t;
+
+/* LDC memory cookie */
+typedef struct ldc_mem_cookie {
+	uint64_t	addr;		/* cookie address */
+	uint64_t	size;		/* size @ offset */
+} ldc_mem_cookie_t;
+
+/*
+ * LDC Memory Map Type
+ * Specifies how shared memory being created is shared with its
+ * peer and/or how the peer has mapped in the exported memory.
+ */
+#define	LDC_SHADOW_MAP		0x1	/* share mem via shadow copy only */
+#define	LDC_DIRECT_MAP		0x2	/* share mem direct access */
+#define	LDC_IO_MAP		0x4	/* share mem for IOMMU/DMA access */
+
+/* LDC Memory Access Permissions  */
+#define	LDC_MEM_R		0x1	/* Memory region is read only */
+#define	LDC_MEM_W		0x2	/* Memory region is write only */
+#define	LDC_MEM_X		0x4	/* Memory region is execute only */
+#define	LDC_MEM_RW		(LDC_MEM_R|LDC_MEM_W)
+#define	LDC_MEM_RWX		(LDC_MEM_R|LDC_MEM_W|LDC_MEM_X)
+
+/* LDC Memory Copy Direction */
+#define	LDC_COPY_IN		0x0	/* Copy data to VA from cookie mem */
+#define	LDC_COPY_OUT		0x1	/* Copy data from VA to cookie mem */
+
+/* LDC memory/dring (handle) status */
+typedef enum {
+	LDC_UNBOUND,			/* Memory handle is unbound */
+	LDC_BOUND,			/* Memory handle is bound */
+	LDC_MAPPED			/* Memory handle is mapped */
+} ldc_mstatus_t;
+
+/* LDC [dring] memory info */
+typedef struct ldc_mem_info {
+	uint8_t		mtype;		/* map type */
+	uint8_t		perm;		/* RWX permissions */
+	caddr_t		vaddr;		/* base VA */
+	uintptr_t	raddr;		/* base RA */
+	ldc_mstatus_t	status;		/* dring/mem handle status */
+} ldc_mem_info_t;
+
+/* API functions */
+int ldc_register(ldc_cnex_t *cinfo);
+int ldc_unregister(ldc_cnex_t *cinfo);
+
+int ldc_init(uint64_t id, ldc_attr_t *attr, ldc_handle_t *handle);
+int ldc_fini(ldc_handle_t handle);
+int ldc_open(ldc_handle_t handle);
+int ldc_close(ldc_handle_t handle);
+int ldc_up(ldc_handle_t handle);
+int ldc_reset(ldc_handle_t handle);
+int ldc_reg_callback(ldc_handle_t handle,
+    uint_t(*callback)(uint64_t event, caddr_t arg), caddr_t arg);
+int ldc_unreg_callback(ldc_handle_t handle);
+int ldc_set_cb_mode(ldc_handle_t handle, ldc_cb_mode_t imode);
+int ldc_chkq(ldc_handle_t handle, boolean_t *isempty);
+int ldc_read(ldc_handle_t handle, caddr_t buf, size_t *size);
+int ldc_write(ldc_handle_t handle, caddr_t buf, size_t *size);
+int ldc_status(ldc_handle_t handle, ldc_status_t *status);
+
+int ldc_mem_alloc_handle(ldc_handle_t handle, ldc_mem_handle_t *mhandle);
+int ldc_mem_free_handle(ldc_mem_handle_t mhandle);
+int ldc_mem_bind_handle(ldc_mem_handle_t mhandle, caddr_t vaddr, size_t len,
+    uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *cookie, uint32_t *ccount);
+int ldc_mem_unbind_handle(ldc_mem_handle_t mhandle);
+int ldc_mem_info(ldc_mem_handle_t mhandle, ldc_mem_info_t *minfo);
+int ldc_mem_nextcookie(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie);
+int ldc_mem_copy(ldc_handle_t handle, caddr_t vaddr, uint64_t off, size_t *len,
+    ldc_mem_cookie_t *cookies, uint32_t ccount, uint8_t direction);
+int ldc_mem_rdwr_pa(ldc_handle_t handle, caddr_t vaddr, size_t *size,
+    caddr_t paddr, uint8_t  direction);
+int ldc_mem_map(ldc_mem_handle_t mhandle, ldc_mem_cookie_t *cookie,
+    uint32_t ccount, uint8_t mtype, caddr_t *vaddr, caddr_t *raddr);
+int ldc_mem_acquire(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size);
+int ldc_mem_release(ldc_mem_handle_t mhandle, uint64_t offset, uint64_t size);
+
+int ldc_mem_dring_create(uint32_t len, uint32_t dsize,
+    ldc_dring_handle_t *dhandle);
+int ldc_mem_dring_destroy(ldc_dring_handle_t dhandle);
+int ldc_mem_dring_bind(ldc_handle_t handle, ldc_dring_handle_t dhandle,
+    uint8_t mtype, uint8_t perm, ldc_mem_cookie_t *dcookie, uint32_t *ccount);
+int ldc_mem_dring_nextcookie(ldc_dring_handle_t mhandle,
+    ldc_mem_cookie_t *cookie);
+int ldc_mem_dring_unbind(ldc_dring_handle_t dhandle);
+int ldc_mem_dring_info(ldc_dring_handle_t dhandle, ldc_mem_info_t *minfo);
+int ldc_mem_dring_map(ldc_handle_t handle, ldc_mem_cookie_t *cookie,
+    uint32_t ccount, uint32_t len, uint32_t dsize, uint8_t mtype,
+    ldc_dring_handle_t *dhandle);
+int ldc_mem_dring_unmap(ldc_dring_handle_t dhandle);
+int ldc_mem_dring_acquire(ldc_dring_handle_t dhandle, uint64_t start,
+    uint64_t end);
+int ldc_mem_dring_release(ldc_dring_handle_t dhandle, uint64_t start,
+    uint64_t end);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LDC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/ldc_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,487 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LDC_IMPL_H
+#define	_LDC_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ioctl.h>
+
+/* Memory map table size */
+#define	MTBL_MAX_SIZE		65536	/* 64K */
+
+/* Define LDC Queue info */
+#define	LDC_PACKET_SHIFT	6
+#define	LDC_QUEUE_ENTRIES	128
+#define	LDC_QUEUE_SIZE		(LDC_QUEUE_ENTRIES << LDC_PACKET_SHIFT)
+#define	LDC_STREAM_MTU		(LDC_QUEUE_SIZE >> 1)
+
+/*
+ * LDC Reliable mode - initial packet seqid
+ * - If peer initiated handshake, RDX should contain init_seqid + 1
+ * - If this endpoint initiated handshake first data packet should
+ *   contain the message init_seqid + 1
+ */
+#define	LDC_INIT_SEQID	0x0
+
+/* LDC Message types */
+#define	LDC_CTRL	0x01	/* Control Pkt */
+#define	LDC_DATA	0x02	/* Data Pkt */
+#define	LDC_ERR		0x10	/* Error Pkt */
+
+/* LDC Message Subtypes */
+#define	LDC_INFO	0x01	/* Control/Data/Error info pkt */
+#define	LDC_ACK		0x02	/* Control/Data ACK */
+#define	LDC_NACK	0x04	/* Control/Data NACK */
+
+/* LDC Control Messages */
+#define	LDC_VER		0x01	/* Version message */
+#define	LDC_RTS		0x02	/* Request to Send */
+#define	LDC_RTR		0x03	/* Ready To Receive */
+#define	LDC_RDX		0x04	/* Ready for data exchange */
+
+#define	LDC_CTRL_MASK	0x0f	/* Mask to read control bits */
+
+/* LDC Channel Transport State (tstate) */
+#define	TS_TXQ_RDY	0x01	/* allocated TX queue */
+#define	TS_RXQ_RDY	0x02	/* allocated RX queue */
+#define	TS_INIT		(TS_TXQ_RDY | TS_RXQ_RDY)
+#define	TS_QCONF_RDY	0x04	/* registered queues with HV */
+#define	TS_CNEX_RDY	0x08	/* registered channel with cnex */
+#define	TS_OPEN		(TS_INIT | TS_QCONF_RDY | TS_CNEX_RDY)
+#define	TS_LINK_READY	0x10	/* both endpts registered Rx queues */
+#define	TS_READY	(TS_OPEN | TS_LINK_READY)
+#define	TS_VER_DONE	0x20	/* negotiated version */
+#define	TS_VREADY	(TS_READY | TS_VER_DONE)
+#define	TS_HSHAKE_DONE	0x40	/* completed handshake */
+#define	TS_UP		(TS_READY | TS_VER_DONE | TS_HSHAKE_DONE)
+
+/*  LDC Channel Transport Handshake states */
+#define	TS_SENT_RTS	0x01	/* Sent RTS */
+#define	TS_RCVD_RTR	0x02	/* Received RTR */
+#define	TS_SENT_RDX	0x04	/* Sent RDX */
+#define	TS_RCVD_RTS	0x10	/* Received RTS */
+#define	TS_SENT_RTR	0x20	/* Sent RTR */
+#define	TS_RCVD_RDX	0x40	/* Received RDX */
+
+/* LDC MSG Envelope */
+#define	LDC_LEN_MASK	0x3F
+#define	LDC_FRAG_MASK	0xC0
+
+#define	LDC_FRAG_START	0x40	/* frag_info = 0x01 */
+#define	LDC_FRAG_STOP	0x80	/* frag_info = 0x02 */
+#define	LDC_FRAG_CONT	0x00	/* frag_info = 0x00 */
+
+/*
+ * LDC fragmented xfer loop wait cnt
+ * When data is arriving in fragments, the read thread will
+ * look for a packet 'LDC_CHK_CNT' times. Between each check
+ * it will loop 'LDC_LOOP_CNT' times
+ */
+#define	LDC_CHK_CNT	1000
+#define	LDC_LOOP_CNT	1000
+
+/*
+ * LDC Version information
+ */
+#define	LDC_PAYLOAD_VER_OFF	8	/* offset of version in payload */
+
+typedef struct ldc_ver {
+	uint16_t	major;
+	uint16_t	minor;
+} ldc_ver_t;
+
+/*
+ * Each guest consists of one or more LDC endpoints represented by a ldc_chan
+ * structure. Each ldc_chan structure points to a ldc_mtbl structure that
+ * contains information about the map table associated with this LDC endpoint.
+ * The map table contains the list of pages being shared by this guest over
+ * this endpoint with the guest at the other end of this endpoint. Each LDC
+ * endpoint also points to a list of memory handles used to bind and export
+ * memory segments from this guest. If a memory segment is bound, it points to
+ * a memory segment structure, which inturn consists of an array of ldc_page
+ * structure for all the pages within that segment. Each ldc_page structure
+ * contains information about the shared page and also points to the
+ * corresponding entry in the map table.
+ *
+ * Each LDC endpoint also points to a list of ldc_dring structures that refer
+ * to both imported and exported descriptor rings. If it is a exported
+ * descriptor ring, it then points to memory handle/memseg corresponding to
+ * the region of memory associated with the descriptor ring.
+ *
+ *     +----------+   +----------+   +----------+
+ *     | ldc_chan |-->| ldc_chan |-->| ldc_chan |-->....
+ *     +----------+   +----------+   +----------+
+ *       |  |  |
+ *       |  |  |
+ *       |  |  |      +-----------+     +-----------+
+ *       |  |  +----->| ldc_dring |---->| ldc_dring |---->......
+ *       |  |         +-----------+     +-----------+
+ *       |  |               |
+ *       |  |               +----------------------------+
+ *       |  |                                            |
+ *       |  |                                            v
+ *       |  |      +----------+     +----------+     +----------+
+ *       |  +----->| ldc_mhdl |---->| ldc_mhdl |---->| ldc_mhdl |---> ....
+ *       |         +----------+     +----------+     +----------+
+ *       v                 |                             |
+ *  +----------+           |    +------------+           |    +------------+
+ *  | ldc_mtbl |--+        +--->| ldc_memseg |-----+     +--->| ldc_memseg |
+ *  +----------+  |             +------------+     |          +------------+
+ *                |                   |            |            |       |
+ *                v                   v            v            |       v
+ *     +--------------+         +----------+  +--------+        |   +--------+
+ *     | ldc_mte_slot |<--------| ldc_page |  | cookie |        |   | cookie |
+ *     +--------------+         +----------+  +--------+        |   +--------+
+ *     | ldc_mte_slot |<--------| ldc_page |  | cookie |        v
+ *     +--------------+         +----------+  +--------+   +----------+
+ *     | ldc_mte_slot |<-----------------------------------| ldc_page |
+ *     +--------------+                                    +----------+
+ *     | ldc_mte_slot |
+ *     +--------------+
+ *     |    ......    |/ +------------+
+ *     +--------------+  |   entry    |
+ *     | ldc_mte_slot |  +------------+
+ *     +--------------+  | inv_cookie |
+ *                     \ +------------+
+ *
+ */
+
+/*
+ * Message format of each packet sent over the LDC channel.
+ * Each packet is 64-bytes long.
+ *
+ * Each packet that is sent over LDC can contain either data or acks.
+ * The type will reflect the contents. The len will contain in bytes
+ * the amount of data being sent. In the case of ACKs, the seqid and
+ * data fields will contain the SEQIDs of messages for which ACKs are
+ * being sent.
+ *
+ * Raw pkt format:
+ *
+ *          +------------------------------------------------------+
+ *  0 - 7   |                 data payload                         |
+ *          +------------------------------------------------------+
+ *
+ * Unreliable pkt format:
+ *
+ *          +------------------------------------------------------+
+ *      0   |          seqid          | env  | ctrl | stype | type |
+ *          +------------------------------------------------------+
+ *  1 - 7   |                 data payload                         |
+ *          +------------------------------------------------------+
+ *
+ * Reliable pkt format:
+ *
+ *          +------------------------------------------------------+
+ *      0   |            seqid        | env  | ctrl | stype | type |
+ *          +------------------------------------------------------+
+ *      1   |          ackid          |         unused             |
+ *          +------------------------------------------------------+
+ *  2 - 7   |                 data payload                         |
+ *          +------------------------------------------------------+
+ */
+
+typedef struct ldc_msg {
+	union {
+		struct {
+			uint8_t		_type;	/* Message type */
+			uint8_t		_stype;	/* Message subtype */
+			uint8_t		_ctrl;	/* Control/Error Message */
+			uint8_t 	_env;	/* Message Envelope */
+			uint32_t	_seqid;	/* Sequence ID */
+
+			union {
+				uint8_t	_ud[LDC_PAYLOAD_SIZE_UNRELIABLE];
+						/* Unreliable data payload */
+				struct {
+					uint32_t _unused;	/* unused */
+					uint32_t _ackid;	/* ACK ID */
+					uint8_t	_rd[LDC_PAYLOAD_SIZE_RELIABLE];
+						/* Reliable data payload */
+				} _rl;
+			} _data;
+		} _tpkt;
+
+		uint8_t		_raw[LDC_PAYLOAD_SIZE_RAW];
+	} _pkt;
+
+} ldc_msg_t;
+
+#define	raw		_pkt._raw
+#define	type		_pkt._tpkt._type
+#define	stype		_pkt._tpkt._stype
+#define	ctrl		_pkt._tpkt._ctrl
+#define	env		_pkt._tpkt._env
+#define	seqid		_pkt._tpkt._seqid
+#define	udata		_pkt._tpkt._data._ud
+#define	ackid		_pkt._tpkt._data._rl._ackid
+#define	rdata		_pkt._tpkt._data._rl._rd
+
+/*
+ * LDC Map Table Entry (MTE)
+ *
+ *   6    6                               1    1  1
+ *  |3    0|                       psz|   3|   1| 0| 9| 8| 7|6|5|4|      0|
+ *  +------+--------------------------+----+----+--+--+--+--+-+-+-+-------+
+ *  | rsvd |           PFN            | 0  | 0  |CW|CR|IW|IR|X|W|R| pgszc |
+ *  +------+--------------------------+----+----+--+--+--+--+-+-+-+-------+
+ *  |                       hv invalidation cookie                        |
+ *  +---------------------------------------------------------------------+
+ */
+typedef union {
+	struct {
+		uint64_t	_rsvd2:8,	/* <63:56> reserved */
+				rpfn:43,	/* <55:13> real pfn */
+				_rsvd1:2,	/* <12:11> reserved */
+				cw:1,		/* <10> copy write access */
+				cr:1,		/* <9> copy read perm */
+				iw:1,		/* <8> iommu write perm */
+				ir:1,		/* <7> iommu read perm */
+				x:1,		/* <6> execute perm */
+				w:1,		/* <5> write perm */
+				r:1,		/* <4> read perm */
+				pgszc:4;	/* <3:0> pgsz code */
+	} mte_bit;
+
+	uint64_t 		ll;
+
+} ldc_mte_t;
+
+#define	mte_rpfn	mte_bit.rpfn
+#define	mte_cw		mte_bit.cw
+#define	mte_cr		mte_bit.cr
+#define	mte_iw		mte_bit.iw
+#define	mte_ir		mte_bit.ir
+#define	mte_x		mte_bit.x
+#define	mte_w		mte_bit.w
+#define	mte_r		mte_bit.r
+#define	mte_pgszc	mte_bit.pgszc
+
+#define	MTE_BSZS_SHIFT(sz)	((sz) * 3)
+#define	MTEBYTES(sz)    	(MMU_PAGESIZE << MTE_BSZS_SHIFT(sz))
+#define	MTEPAGES(sz)    	(1 << MTE_BSZS_SHIFT(sz))
+#define	MTE_PAGE_SHIFT(sz)	(MMU_PAGESHIFT + MTE_BSZS_SHIFT(sz))
+#define	MTE_PAGE_OFFSET(sz)	(MTEBYTES(sz) - 1)
+#define	MTE_PAGEMASK(sz)	(~MTE_PAGE_OFFSET(sz))
+#define	MTE_PFNMASK(sz)		(~(MTE_PAGE_OFFSET(sz) >> MMU_PAGESHIFT))
+
+/*
+ * LDC Map Table Slot
+ */
+typedef struct ldc_mte_slot {
+	ldc_mte_t	entry;
+	uint64_t	cookie;
+} ldc_mte_slot_t;
+
+/*
+ * LDC Memory Map Table
+ *
+ * Each LDC has a memory map table it uses to list all the pages
+ * it exporting to its peer over the channel. This structure
+ * contains information about the map table and is pointed to
+ * by the ldc_chan structure.
+ */
+typedef struct ldc_mtbl {
+	kmutex_t		lock;		/* Table lock */
+	size_t			size;		/* Table size (in bytes) */
+	uint64_t		next_entry;	/* Next entry to use */
+	uint64_t		num_entries;	/* Num entries in table */
+	uint64_t		num_avail;	/* Num of available entries */
+	ldc_mte_slot_t		*table;		/* The table itself */
+} ldc_mtbl_t;
+
+/*
+ * LDC page and memory segment information
+ */
+typedef struct ldc_page {
+	uintptr_t		raddr;		/* Exported page RA */
+	uint64_t		offset;		/* Exported page offset */
+	size_t			size;		/* Exported page size */
+	uint64_t		index;		/* Index in map table */
+	ldc_mte_slot_t		*mte;		/* Map table entry */
+} ldc_page_t;
+
+typedef struct ldc_memseg {
+	caddr_t			vaddr;		/* Exported segment VA */
+	uintptr_t		raddr;		/* Exported segment VA */
+	size_t			size;		/* Exported segment size */
+	uint64_t		npages;		/* Number of pages */
+	ldc_page_t		*pages;		/* Array of exported pages */
+	uint32_t		ncookies;	/* Number of cookies */
+	ldc_mem_cookie_t	*cookies;
+	uint64_t		next_cookie;	/* Index to next cookie */
+} ldc_memseg_t;
+
+/*
+ * LDC Cookie address format
+ *
+ *   6       6          m+n
+ *  |3|      0|          |                  m|                  0|
+ *  +-+-------+----------+-------------------+-------------------+
+ *  |X| pgszc |   rsvd   |      table_idx    |     page_offset   |
+ *  +-+-------+----------+-------------------+-------------------+
+ */
+#define	LDC_COOKIE_PGSZC_MASK	0x7
+#define	LDC_COOKIE_PGSZC_SHIFT	60
+
+/*
+ * LDC Memory handle
+ */
+typedef struct ldc_chan ldc_chan_t;
+
+typedef struct ldc_mhdl {
+	kmutex_t		lock;		/* Mutex for memory handle */
+	ldc_mstatus_t		status;		/* Memory map status */
+
+	uint8_t			mtype;		/* Type of sharing */
+	uint8_t			perm;		/* Access permissions */
+	boolean_t		myshadow;	/* TRUE=alloc'd shadow mem */
+
+	ldc_chan_t		*ldcp;		/* Pointer to channel struct */
+	ldc_memseg_t		*memseg;	/* Bound memory segment */
+	struct ldc_mhdl		*next;		/* Next memory handle */
+} ldc_mhdl_t;
+
+/*
+ * LDC Descriptor rings
+ */
+
+typedef struct ldc_dring {
+	kmutex_t		lock;		/* Desc ring lock */
+	ldc_mstatus_t		status;		/* Desc ring status */
+
+	uint32_t		dsize;		/* Descriptor size */
+	uint32_t		length;		/* Descriptor ring length */
+	uint64_t		size;		/* Desc ring size (in bytes) */
+	caddr_t			base;		/* Descriptor ring base addr */
+
+	ldc_chan_t		*ldcp;		/* Pointer to bound channel */
+	ldc_mem_handle_t	mhdl;		/* Mem handle to desc ring */
+
+	struct ldc_dring	*ch_next;	/* Next dring in channel */
+	struct ldc_dring 	*next;		/* Next dring overall */
+
+} ldc_dring_t;
+
+
+/*
+ * Channel specific information is kept in a separate
+ * structure. These are then stored on a array indexed
+ * by the channel number.
+ */
+struct ldc_chan {
+	ldc_chan_t	*next;		/* Next channel */
+
+	kmutex_t	lock;		/* Channel lock */
+	uint64_t	id;		/* Channel ID */
+	ldc_status_t	status;		/* Channel status */
+	uint32_t	tstate;		/* Channel transport state */
+	uint32_t	hstate;		/* Channel transport handshake state */
+
+	ldc_dev_t	devclass;	/* Associated device class */
+	uint64_t	devinst;	/* Associated device instance */
+	ldc_mode_t	mode;		/* Channel mode */
+
+	uint64_t	mtu;		/* Max TU size (streaming for now) */
+
+	ldc_ver_t	version;	/* Channel version */
+	uint32_t	next_vidx;	/* Next version to match */
+
+	uint_t		(*cb)(uint64_t event, caddr_t arg);
+	caddr_t		cb_arg;		/* Channel callback and arg */
+	boolean_t	cb_inprogress;	/* Channel callback in progress */
+	boolean_t	cb_enabled;	/* Channel callbacks are enabled */
+
+	boolean_t	intr_pending;	/* TRUE if interrupts are pending */
+
+	uint64_t	tx_q_entries;	/* Num entries in transmit queue */
+	uint64_t	tx_q_va;	/* Virtual addr of transmit queue */
+	uint64_t	tx_q_ra;	/* Real addr of transmit queue */
+	uint64_t	tx_head;	/* Tx queue head */
+	uint64_t	tx_ackd_head;	/* Tx queue ACKd head (Reliable) */
+	uint64_t	tx_tail;	/* Tx queue tail */
+
+	uint64_t	rx_q_entries;	/* Num entries in receive queue */
+	uint64_t	rx_q_va;	/* Virtual addr of receive queue */
+	uint64_t	rx_q_ra;	/* Real addr of receive queue */
+
+	uint64_t	link_state;	/* Underlying HV channel state */
+
+	ldc_mtbl_t	*mtbl;		/* Memory table used by channel */
+	ldc_mhdl_t	*mhdl_list;	/* List of memory handles */
+	kmutex_t	mlist_lock;	/* Mem handle list lock */
+
+	ldc_dring_t	*exp_dring_list; /* Exported desc ring list */
+	kmutex_t	exp_dlist_lock;	/* Lock for exported desc ring list */
+	ldc_dring_t	*imp_dring_list; /* Imported desc ring list */
+	kmutex_t	imp_dlist_lock;	/* Lock for imported desc ring list */
+
+	uint8_t		pkt_payload;	/* Size of packet payload */
+
+	uint32_t	first_fragment;	/* Seqid of first msg fragment */
+	uint32_t	last_msg_snt;	/* Seqid of last packet sent */
+	uint32_t	last_ack_rcd;	/* Seqid of last ACK recd */
+	uint32_t	last_msg_rcd;	/* Seqid of last packet received */
+
+	uint32_t	stream_remains;	/* Number of bytes in stream */
+					/* packet buffer */
+	uint32_t	stream_offset;	/* Offset into packet buffer for */
+					/* next read */
+	uint8_t		*stream_bufferp; /* Stream packet buffer */
+
+	int		(*read_p)(ldc_chan_t *ldcp, caddr_t bufferp,
+				size_t *sizep);
+	int		(*write_p)(ldc_chan_t *ldcp, caddr_t bufferp,
+				size_t *sizep);
+};
+
+
+/*
+ * LDC module soft state structure
+ */
+typedef struct ldc_soft_state {
+	kmutex_t 	lock;		/* Protects ldc_soft_state_t  */
+	ldc_cnex_t	cinfo;		/* channel nexus info */
+	uint64_t	channel_count;	/* Number of channels */
+	uint64_t	channels_open;	/* Number of open channels */
+	ldc_chan_t 	*chan_list;	/* List of LDC endpoints */
+	ldc_dring_t	*dring_list;	/* Descriptor rings (for export) */
+} ldc_soft_state_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LDC_IMPL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/ldoms.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_LDOMS_H
+#define	_LDOMS_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/param.h>	/* for MAXHOSTNAMELEN */
+
+/*
+ * Global LDoms definitions.
+ */
+
+/* Maximum number of logical domains supported */
+#define	LDOMS_MAX_DOMAINS	32
+
+/* maximum number of characters in the logical domain name */
+#define	LDOMS_MAX_NAME_LEN	MAXHOSTNAMELEN
+
+/*
+ * Global flag that indicates whether domaining features are
+ * available. The value is set at boot time based on the value
+ * of the 'domaining-enabled' property in the MD.  Updates to
+ * this variable after boot are not supported.
+ */
+extern uint_t domaining_enabled;
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _LDOMS_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/lpad.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LPAD_H
+#define	_LPAD_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * sun4v Landing Pad
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _ASM
+
+#include <sys/pte.h>
+
+typedef union {
+	struct {
+		unsigned int	rsvd0:32;
+		unsigned int	rsvd1:29;
+		unsigned int	perm:1;
+		unsigned int	mmuflags:2;
+	} flag_bits;
+	uint64_t	ll;
+} lpad_map_flag_t;
+
+typedef struct lpad_map {
+	lpad_map_flag_t	flags;
+	uint64_t	va;
+	tte_t		tte;
+} lpad_map_t;
+
+#define	flag_mmuflags	flags.flag_bits.mmuflags
+#define	flag_perm	flags.flag_bits.perm
+
+typedef struct lpad_data {
+	uint64_t	magic;		/* magic value for sanity checking */
+	uint64_t	*inuse;		/* clear flag when done with lpad */
+	uint64_t	mmfsa_ra;	/* RA of MMU fault status area */
+	uint64_t	pc;		/* VA of CPU startup function */
+	uint64_t	arg;		/* argument to startup function */
+	uint64_t	nmap;		/* number of mappings */
+	lpad_map_t	map[1];		/* array of mappings */
+} lpad_data_t;
+
+extern uint64_t *lpad_setup(int cpuid, uint64_t pc, uint64_t arg);
+
+#endif /* ! _ASM */
+
+/*
+ * General landing pad constants
+ */
+#define	LPAD_TEXT_SIZE		1024
+#define	LPAD_DATA_SIZE		1024
+#define	LPAD_SIZE		(LPAD_TEXT_SIZE + LPAD_DATA_SIZE)
+#define	LPAD_MAGIC_VAL		0x4C502D4D41474943	/* "LP-MAGIC" */
+
+/*
+ * Masks for the lpad_map_t flag bitfield
+ */
+#define	FLAG_MMUFLAGS_MASK	0x3
+#define	FLAG_LOCK_MASK		0x4
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _LPAD_H */
--- a/usr/src/uts/sun4v/sys/mach_descrip.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/mach_descrip.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -19,8 +18,9 @@
  *
  * CDDL HEADER END
  */
+
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,22 +33,58 @@
 extern "C" {
 #endif
 
+#include <sys/kstat.h>
+#include <sys/ksynch.h>
+#include <sys/mdesc.h>
+
 /*
- * Common structure between kernel and mdesc driver
- * enabling the current machine description to be retrieved
- * or updated.
+ * MD memory operations (memops) are of two types:
+ * buf:
+ * 	Buffer allocator routines used to allocate the MD buffer.
+ *	Allocator must support an alignment argument.
+ *
+ * meta:
+ *	Meta allocator routines to allocate meta data structures.
+ *	These allocations are small and don't have alignment
+ *	requirements. Examples, md_t handles and the machine_descrip_t
+ *	structure.
  */
-struct machine_descrip_s {
-	void		*va;
-	uint64_t	pa;
-	uint64_t	size;
-	uint64_t	space;
-	kstat_t		*ksp;
-};
+typedef struct machine_descrip_memops {
+	void 		*(*buf_allocp)(size_t size, size_t align);
+	void 		(*buf_freep)(void *, size_t size);
+	void 		*(*meta_allocp)(size_t size);
+	void 		(*meta_freep)(void *, size_t size);
+} machine_descrip_memops_t;
 
-typedef struct machine_descrip_s machine_descrip_t;
+/*
+ * Common structure/list between kernel and mdesc driver enabling
+ * the current machine description to be retrieved or updated.
+ *
+ * Locks:
+ * The current global MD is protected by the curr_mach_descrip_lock.
+ * Each Machine description has a lock to synchronize its ref count.
+ * The Obsolete MD list is protected by the obs_list_lock.
+ */
+typedef struct machine_descrip_s {
+	uint64_t	gen;		/* Generation number for MD */
+	kmutex_t	lock;		/* synchronize access to MD */
+	void		*va;		/* virtual address */
+	uint64_t	size;		/* size of MD */
+	uint64_t	space;		/* space allocated for MD */
+	int		refcnt;		/* MD ref count */
+	struct machine_descrip_s *next;	/* Next MD in list */
+	machine_descrip_memops_t *memops; /* Memory operations for MD */
+} machine_descrip_t;
 
-extern machine_descrip_t machine_descrip;
+/*
+ * Utility wrappers to get/fini a handle to the current MD.
+ */
+extern md_t *md_get_handle(void);
+extern int md_fini_handle(md_t *);
+extern caddr_t md_get_md_raw(md_t *);
+extern int md_alloc_scan_dag(md_t *, mde_cookie_t, char *, char *,
+	    mde_cookie_t **);
+extern void md_free_scan_dag(md_t *, mde_cookie_t **);
 
 #ifdef __cplusplus
 }
--- a/usr/src/uts/sun4v/sys/machcpuvar.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/machcpuvar.h	Tue May 16 16:05:21 2006 -0700
@@ -207,6 +207,7 @@
 	int	ecache_associativity;
 	int	ecache_setsize;
 	uint64_t	device_id;
+	id_t	exec_unit_mapping;
 };
 
 extern struct cpu_node cpunodes[];
--- a/usr/src/uts/sun4v/sys/machparam.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/machparam.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -281,22 +280,26 @@
  * names defined in sun4u/os/mach_cpu_states.c which should be kept up to
  * date if new #defines are added.
  */
-#define	PTL1_BAD_DEBUG		0
-#define	PTL1_BAD_WTRAP		1
-#define	PTL1_BAD_KMISS		2
-#define	PTL1_BAD_KPROT_FAULT	3
-#define	PTL1_BAD_ISM		4
-#define	PTL1_BAD_MMUTRAP	5
-#define	PTL1_BAD_TRAP		6
-#define	PTL1_BAD_FPTRAP		7
-#define	PTL1_BAD_INTR_REQ	8
-#define	PTL1_BAD_TRACE_PTR	9
-#define	PTL1_BAD_STACK		10
-#define	PTL1_BAD_DTRACE_FLAGS	11
-#define	PTL1_BAD_CTX_STEAL	12
-#define	PTL1_BAD_ECC		13
-#define	PTL1_BAD_HCALL		14
-#define	PTL1_BAD_GL		15
+#define	PTL1_BAD_DEBUG				0
+#define	PTL1_BAD_WTRAP				1
+#define	PTL1_BAD_KMISS				2
+#define	PTL1_BAD_KPROT_FAULT			3
+#define	PTL1_BAD_ISM				4
+#define	PTL1_BAD_MMUTRAP			5
+#define	PTL1_BAD_TRAP				6
+#define	PTL1_BAD_FPTRAP				7
+#define	PTL1_BAD_INTR_REQ			8
+#define	PTL1_BAD_TRACE_PTR			9
+#define	PTL1_BAD_STACK				10
+#define	PTL1_BAD_DTRACE_FLAGS			11
+#define	PTL1_BAD_CTX_STEAL			12
+#define	PTL1_BAD_ECC				13
+#define	PTL1_BAD_HCALL				14
+#define	PTL1_BAD_GL				15
+#define	PTL1_BAD_WATCHDOG			16
+#define	PTL1_BAD_RED				17
+#define	PTL1_BAD_HCALL_UNMAP_PERM_EINVAL	18
+#define	PTL1_BAD_HCALL_UNMAP_PERM_ENOMAP	19
 
 /*
  * Defines the max trap level allowed
--- a/usr/src/uts/sun4v/sys/machsystm.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/machsystm.h	Tue May 16 16:05:21 2006 -0700
@@ -337,6 +337,7 @@
 extern void set_idle_cpu(int);
 extern void unset_idle_cpu(int);
 extern void mp_cpu_quiesce(struct cpu *);
+extern int stopcpu_bycpuid(int);
 
 /*
  * Panic at TL > 0
@@ -396,6 +397,12 @@
 extern caddr_t	kpm_vbase;
 
 #define	INVALID_VADDR(a)	(((a) >= hole_start && (a) < hole_end))
+#define	VA_ADDRESS_SPACE_BITS		64
+#define	RA_ADDRESS_SPACE_BITS		56
+#define	MAX_REAL_ADDRESS		(1ull << RA_ADDRESS_SPACE_BITS)
+#define	DEFAULT_VA_ADDRESS_SPACE_BITS	48	/* def. Niagara (broken MD) */
+#define	PAGESIZE_MASK_BITS		16
+#define	MAX_PAGESIZE_MASK		((1<<PAGESIZE_MASK_BITS) - 1)
 
 extern void adjust_hw_copy_limits(int);
 
@@ -466,6 +473,25 @@
 #define	HV_TOD_RETRY_THRESH	100
 #define	HV_TOD_WAIT_USEC	5
 
+/*
+ * Interrupt Queues and Error Queues
+ */
+
+#define	INTR_CPU_Q	0x3c
+#define	INTR_DEV_Q	0x3d
+#define	CPU_RQ		0x3e
+#define	CPU_NRQ		0x3f
+#define	DEFAULT_CPU_Q_ENTRIES	0x100
+#define	DEFAULT_DEV_Q_ENTRIES	0x100
+#define	INTR_REPORT_SIZE	64
+
+#ifndef	_ASM
+extern uint64_t cpu_q_entries;
+extern uint64_t dev_q_entries;
+extern uint64_t cpu_rq_entries;
+extern uint64_t cpu_nrq_entries;
+#endif /* _ASM */
+
 #endif /* _KERNEL */
 
 #ifdef __cplusplus
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/mdeg.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _MDEG_H
+#define	_MDEG_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * MD Event Generator (mdeg) interface.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/mdesc.h>
+
+/*
+ * Specification of a node property.
+ */
+typedef struct {
+	uint8_t		type;
+	char		*namep;
+	union {
+		char		*strp;
+		uint64_t	val;
+	} _p;
+
+} mdeg_prop_spec_t;
+
+#define	ps_str	_p.strp
+#define	ps_val	_p.val
+
+/*
+ * Specification of unique node in the MD. The array
+ * of property name value pairs is used to determine
+ * whether the node matches the specification.
+ */
+typedef struct {
+	char			*namep;
+	mdeg_prop_spec_t	*specp;
+} mdeg_node_spec_t;
+
+/*
+ * Specification of a method to match nodes. The
+ * array of properties are used to match two nodes
+ * from different MDs. If the specified properties
+ * match, the nodes are the same.
+ */
+typedef struct {
+	char		*namep;
+	md_prop_match_t	*matchp;
+} mdeg_node_match_t;
+
+/*
+ * The result of the MD update as communicated
+ * through the parameter to the registered callback.
+ */
+typedef struct {
+	md_t		*mdp;
+	mde_cookie_t	*mdep;
+	uint_t		nelem;
+} mdeg_diff_t;
+
+/*
+ * Results of the MD update for a specific registration
+ */
+typedef struct {
+	mdeg_diff_t	added;
+	mdeg_diff_t	removed;
+	mdeg_diff_t	match_curr;
+	mdeg_diff_t	match_prev;
+} mdeg_result_t;
+
+/*
+ * Client Interface
+ */
+
+#define	MDEG_SUCCESS	0
+#define	MDEG_FAILURE	1
+
+typedef uint64_t mdeg_handle_t;
+
+typedef int (*mdeg_cb_t)(void *cb_argp, mdeg_result_t *resp);
+
+int mdeg_register(mdeg_node_spec_t *pspecp, mdeg_node_match_t *nmatchp,
+    mdeg_cb_t cb, void *cb_argp, mdeg_handle_t *hdlp);
+
+int mdeg_unregister(mdeg_handle_t hdl);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _MDEG_H */
--- a/usr/src/uts/sun4v/sys/mmu.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/mmu.h	Tue May 16 16:05:21 2006 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -117,6 +117,10 @@
  */
 #define	FLUSH_ADDR	(KERNELBASE + 2 * MMU_PAGESIZE4M)
 
+#define	MAX_NCTXS_BITS			16	/* sun4v max. contexts bits */
+#define	MIN_NCTXS_BITS			2
+#define	MAX_NCTXS	(1ull << MAX_NCTXS_BITS)
+
 #ifdef	__cplusplus
 }
 #endif
--- a/usr/src/uts/sun4v/sys/ncp.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/ncp.h	Tue May 16 16:05:21 2006 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -30,7 +29,12 @@
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/types.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
 #include <sys/kmem.h>
+#include <sys/mdesc.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
 #include <sys/ncs.h>
 
 #ifdef	__cplusplus
@@ -45,11 +49,6 @@
 #define	FALSE		0
 #define	TRUE		1
 
-/*
- * XXX
- * NCP_MAX_NMAUS should come from OBP/HV
- * NCP_MAX_CPUS_PER_MAU should come from OBP/HV
- */
 #define	NCP_MAX_NMAUS		8
 #define	NCP_MAX_CPUS_PER_MAU	4
 #define	NCP_CPUID2MAUID(c)	((c) / NCP_MAX_CPUS_PER_MAU)
@@ -96,8 +95,6 @@
 typedef struct ncp_listnode ncp_listnode_t;
 typedef struct ncp_request ncp_request_t;
 typedef struct ncp_stat ncp_stat_t;
-typedef struct ncp_mau_queue ncp_mau_queue_t;
-typedef struct ncp_desc ncp_desc_t;
 
 
 
@@ -246,46 +243,16 @@
 	kstat_named_t		ns_status;
 	kstat_named_t		ns_algs[DS_MAX];
 	struct {
+		kstat_named_t	ns_mauid;
+		kstat_named_t	ns_mauhandle;
+		kstat_named_t	ns_maustate;
 		kstat_named_t	ns_submit;
 		kstat_named_t	ns_qfull;
+		kstat_named_t	ns_qbusy;
 		kstat_named_t	ns_qupdate_failure;
 	}			ns_mau[NCP_MAX_NMAUS];
 };
 
-
-struct ncp {
-	kmutex_t			n_lock;
-	kmem_cache_t			*n_ds_cache;
-	kmem_cache_t			*n_mactl_cache;
-	kmem_cache_t			*n_mabuf_cache;
-	dev_info_t			*n_dip;
-	minor_t				n_minor;
-	int				n_nmaus;
-	int				n_max_nmaus;
-	int				*n_mauids;
-	ncp_mau_queue_t			*n_mau_q;
-	int				n_mau_q_size;
-
-	ddi_taskq_t			*n_taskq;
-
-	unsigned			n_flags;	/* dev state flags */
-
-	kstat_t				*n_ksp;
-	kstat_t				*n_intrstats;
-	u_longlong_t			n_stats[DS_MAX];
-	u_longlong_t			n_qfull[NCP_MAX_NMAUS];
-	u_longlong_t			n_qupdate_failure[NCP_MAX_NMAUS];
-
-	ulong_t				n_pagesize;
-	crypto_kcf_provider_handle_t	n_prov;
-
-	kmutex_t			n_freereqslock;
-	ncp_listnode_t			n_freereqs;   /* available requests */
-
-	kmutex_t			n_ctx_list_lock;
-	ncp_listnode_t			n_ctx_list;
-};
-
 /*
  * Device flags (ncp_t.ncp_flags)
  */
@@ -294,10 +261,25 @@
 
 /*
  * IMPORTANT:
- *	(NCP_MAQUEUE_NENTRIES * sizeof (ncs_hvdesc_t)) <= PAGESIZE
+ *	NCP_MAQUEUE_NENTRIES *must* be a power-of-2.
+ *	requirement: sizeof (ncs_hvdesc_t) == 64
  */
-#define	NCP_MAQUEUE_NENTRIES	64
+#define	NCP_MAQUEUE_NENTRIES	(1 << 9)	/* 512 */
 #define	NCP_MAQUEUE_WRAPMASK	(NCP_MAQUEUE_NENTRIES - 1)
+#define	NCP_MAQUEUE_SIZE	(NCP_MAQUEUE_NENTRIES * sizeof (ncs_hvdesc_t))
+#define	NCP_MAQUEUE_ALIGN	(NCP_MAQUEUE_SIZE - 1)
+/* Free slots in the circular MA queue: (head - tail - 1) mod NENTRIES */
+#define	NCP_MAQUEUE_SLOTS_AVAIL(q)	\
+		(((q)->nmq_head > (q)->nmq_tail) ? \
+			((q)->nmq_head - (q)->nmq_tail - 1) : \
+			(NCP_MAQUEUE_NENTRIES - \
+			((q)->nmq_tail - (q)->nmq_head) - 1))
+
+#define	NCP_QINDEX_TO_QOFFSET(i)	((i) * sizeof (ncs_hvdesc_t))
+#define	NCP_QOFFSET_TO_QINDEX(o)	((o) / sizeof (ncs_hvdesc_t))
+#define	NCP_QINDEX_INCR(i)		(((i) + 1) & NCP_MAQUEUE_WRAPMASK)
+#define	NCP_QINDEX_IS_VALID(i)		(((i) >= 0) && \
+						((i) < NCP_MAQUEUE_NENTRIES))
+#define	NCP_QTIMEOUT_SECONDS		15
 
 typedef struct ncp_ma {
 	kmutex_t	nma_lock;
@@ -305,24 +287,141 @@
 	int		nma_ref;	/* # of descriptor references */
 } ncp_ma_t;
 
+typedef struct ncp_desc ncp_desc_t;
 struct ncp_desc {
 	ncs_hvdesc_t	nd_hv;
 	ncp_desc_t	*nd_link;	/* to string related descriptors */
 	ncp_ma_t	*nd_ma;		/* referenced MA buffer */
 };
 
+typedef struct ncp_descjob {
+	int			dj_id;
+	kcondvar_t		dj_cv;
+	ncp_desc_t		*dj_jobp;
+	struct ncp_descjob	*dj_prev;
+	struct ncp_descjob	*dj_next;
+} ncp_descjob_t;
+
 /*
  * nmq_head, nmq_tail = indexes into nmq_desc[].
  */
-struct ncp_mau_queue {
-	int		nmq_id;
+typedef struct {
+	uint64_t	nmq_mauhandle;
+	uint64_t	nmq_devino;
+	int		nmq_inum;
+	int		nmq_mauid;
+	int		nmq_init;
+	int		nmq_busy_wait;
+	kcondvar_t	nmq_busy_cv;
 	kmutex_t	nmq_lock;
 	int		nmq_head;
 	int		nmq_tail;
 	uint_t		nmq_wrapmask;
+	ncp_descjob_t	**nmq_jobs;
+	size_t		nmq_jobs_size;
 	ncs_hvdesc_t	*nmq_desc;	/* descriptor array */
-	int		nmq_desc_size;
-	uint64_t	nmq_njobs;
+	char		*nmq_mem;
+	size_t		nmq_memsize;
+	ncp_descjob_t	*nmq_joblist;
+	int		nmq_joblistcnt;
+	struct {
+		uint64_t	qks_njobs;
+		uint64_t	qks_qfull;
+		uint64_t	qks_qbusy;
+		uint64_t	qks_qfail;
+	} nmq_ks;
+} ncp_mau_queue_t;
+
+#define	MAU_STATE_ERROR		(-1)
+#define	MAU_STATE_OFFLINE	0
+#define	MAU_STATE_ONLINE	1
+
+typedef struct {
+	int		mm_mauid;
+	int		mm_cpulistsz;
+	int		*mm_cpulist;
+	int		mm_ncpus;
+	int		mm_nextcpuidx;
+	/*
+	 * Only protects mm_nextcpuidx field.
+	 */
+	kmutex_t	mm_lock;
+	/*
+	 * xxx - maybe need RW lock for mm_state?
+	 */
+	int		mm_state;	/* MAU_STATE_... */
+
+	ncp_mau_queue_t	mm_queue;
+} mau_entry_t;
+
+typedef struct {
+	int		mc_cpuid;
+	int		mc_mauid;
+	/*
+	 * xxx - maybe need RW lock for mm_state?
+	 * Mirrors mm_state in mau_entry_t.  Duplicated
+	 * for speed so we don't have to search mau_entry
+	 * table.  Field rarely updated.
+	 */
+	int		mc_state;	/* MAU_STATE_... */
+} cpu_entry_t;
+
+typedef struct {
+	/*
+	 * MAU stuff
+	 */
+	int		m_maulistsz;
+	mau_entry_t	*m_maulist;
+	int		m_nmaus;
+	int		m_nextmauidx;
+	/*
+	 * Only protects m_nextmauidx field.
+	 */
+	kmutex_t	m_lock;
+
+	/*
+	 * CPU stuff
+	 */
+	int		m_cpulistsz;
+	cpu_entry_t	*m_cpulist;
+	int		m_ncpus;
+} ncp_mau2cpu_map_t;
+
+struct ncp {
+	uint_t				n_hvapi_minor_version;
+	kmutex_t			n_lock;
+	kmem_cache_t			*n_ds_cache;
+	kmem_cache_t			*n_mactl_cache;
+	kmem_cache_t			*n_mabuf_cache;
+	dev_info_t			*n_dip;
+	minor_t				n_minor;
+
+	ddi_taskq_t			*n_taskq;
+
+	unsigned			n_flags;	/* dev state flags */
+
+	kstat_t				*n_ksp;
+	kstat_t				*n_intrstats;
+	u_longlong_t			n_stats[DS_MAX];
+
+	ddi_intr_handle_t		*n_htable;
+	int				n_intr_mid[NCP_MAX_NMAUS];
+	int				n_intr_type;
+	int				n_intr_cnt;
+	size_t				n_intr_size;
+	uint_t				n_intr_pri;
+
+	ulong_t				n_pagesize;
+	crypto_kcf_provider_handle_t	n_prov;
+
+	kmutex_t			n_freereqslock;
+	ncp_listnode_t			n_freereqs;   /* available requests */
+
+	kmutex_t			n_ctx_list_lock;
+	ncp_listnode_t			n_ctx_list;
+
+	md_t				*n_mdp;
+	ncp_mau2cpu_map_t		n_maumap;
 };
 
 #endif	/* _KERNEL */
@@ -343,14 +442,18 @@
 #define	DMA_LDST	0x00000004
 #define	DNCS_QTAIL	0x00000008
 #define	DATTACH		0x00000010
-#define	DMOD		0x00000040  /* _init/_fini/_info/attach/detach */
-#define	DENTRY		0x00000080  /* crypto routine entry/exit points */
+#define	DMD		0x00000020
+#define	DHV		0x00000040
+#define	DINTR		0x00000080
+#define	DMOD		0x00000100  /* _init/_fini/_info/attach/detach */
+#define	DENTRY		0x00000200  /* crypto routine entry/exit points */
 #define	DALL		0xFFFFFFFF
 
 #define	DBG0	ncp_dprintf
 #define	DBG1	ncp_dprintf
 #define	DBG2	ncp_dprintf
 #define	DBG3	ncp_dprintf
+#define	DBG4	ncp_dprintf
 #define	DBGCALL(flag, func)	{ if (ncp_dflagset(flag)) (void) func; }
 
 void	ncp_dprintf(ncp_t *, int, const char *, ...);
@@ -363,6 +466,7 @@
 #define	DBG1(vca, lvl, fmt, arg1)
 #define	DBG2(vca, lvl, fmt, arg1, arg2)
 #define	DBG3(vca, lvl, fmt, arg1, arg2, arg3)
+#define	DBG4(vca, lvl, fmt, arg1, arg2, arg3, arg4)
 #define	DBGCALL(flag, func)
 
 #endif	/* !defined(DEBUG) */
@@ -404,6 +508,16 @@
 			crypto_data_t *, int, crypto_req_handle_t, int);
 
 /*
+ * ncp_md.
+ */
+int	ncp_init_mau2cpu_map(ncp_t *);
+void	ncp_deinit_mau2cpu_map(ncp_t *);
+int	ncp_map_cpu_to_mau(ncp_t *, int);
+int	ncp_map_mau_to_cpu(ncp_t *, int);
+int	ncp_map_nextmau(ncp_t *);
+mau_entry_t	*ncp_map_findmau(ncp_t *, int);
+
+/*
  * ncp_kstat.c
  */
 void	ncp_ksinit(ncp_t *);
--- a/usr/src/uts/sun4v/sys/ncs.h	Tue May 16 15:54:21 2006 -0700
+++ b/usr/src/uts/sun4v/sys/ncs.h	Tue May 16 16:05:21 2006 -0700
@@ -33,12 +33,24 @@
 #endif
 
 /*
- * NCS HV API versioni definitions.
+ * NCS HV API version definitions.
  */
 #define	NCS_MAJOR_VER		1
-#define	NCS_MINOR_VER		0
+#define	NCS_MINOR_VER		1
 
+/*
+ * NCS HV API v1.0
+ */
 #define	HV_NCS_REQUEST		0x110
+/*
+ * NCS HV API v1.1
+ */
+#define	HV_NCS_QCONF			0x111
+#define	HV_NCS_QINFO			0x112
+#define	HV_NCS_GETHEAD			0x113
+#define	HV_NCS_GETTAIL			0x114
+#define	HV_NCS_SETTAIL			0x115
+#define	HV_NCS_QHANDLE_TO_DEVINO	0x116
 
 #ifndef _ASM
 /* Forward typedefs */
@@ -62,7 +74,7 @@
 		uint64_t	length:6;
 	} bits;
 };
-#endif	/* !_ASM */
+#endif /* _ASM */
 
 /* Values for ma_ctl operation field */
 #define	MA_OP_LOAD		0x0
@@ -114,7 +126,7 @@
 #endif	/* !_ASM */
 
 /*
- * NCS API definitions
+ * NCS HV API v1.0 definitions (PSARC/2005/125)
  */
 
 /*
@@ -164,8 +176,8 @@
 } ma_regs_t;
 
 #define	ND_TYPE_UNASSIGNED	0
-#define	ND_TYPE_MA		1
-#define	ND_TYPE_SPU		2
+#define	ND_TYPE_MA		1	/* v1.0 only */
+#define	ND_TYPE_SPU		2	/* v1.0 only */
 
 #define	ND_STATE_FREE		0
 #define	ND_STATE_PENDING	1
@@ -190,7 +202,50 @@
 
 extern uint64_t hv_ncs_request(int, uint64_t, size_t);
 
-#endif	/* !_ASM */
+#endif	/* _ASM */
+
+/*
+ * NCS HV API v1.1 definitions (FWARC/2006/174)
+ *
+ * Some of the structures above (v1.0) are inherited for v1.1
+ */
+/*
+ * In v1.1, the nhd_type field has the following values
+ * when non-zero (unassigned).  The nhd_type field indicates
+ * whether the descriptor is the beginning of a crypto job,
+ * the continuation, or the end/last descriptor in a job.
+ * A job may be comprised of multiple descriptors.
+ */
+#define	ND_TYPE_START		0x01
+#define	ND_TYPE_CONT		0x02
+#define	ND_TYPE_END		0x80
+
+/*
+ * Types of queues supported by NCS
+ */
+#define	NCS_QTYPE_MAU		0x1
+#define	NCS_QTYPE_CWQ		0x2
+
+/*
+ * This structure is accessed with offsets in ml/hcall.s.
+ * Any changes to this structure will require updates to
+ * the hv_ncs_qinfo entrypoint in ml/hcall.s.
+ */
+#ifndef _ASM
+typedef struct ncs_qinfo {
+	uint64_t	qi_qtype;
+	uint64_t	qi_baseaddr;
+	uint64_t	qi_qsize;
+} ncs_qinfo_t;
+
+extern uint64_t	hv_ncs_qconf(uint64_t, uint64_t, uint64_t, uint64_t *);
+extern uint64_t	hv_ncs_qinfo(uint64_t, ncs_qinfo_t *);
+extern uint64_t	hv_ncs_gethead(uint64_t, uint64_t *);
+extern uint64_t	hv_ncs_gettail(uint64_t, uint64_t *);
+extern uint64_t	hv_ncs_settail(uint64_t, uint64_t);
+extern uint64_t	hv_ncs_qhandle_to_devino(uint64_t, uint64_t *);
+extern uint64_t	hv_ncs_intr_clrstate(uint64_t);
+#endif /* _ASM */
 
 #ifdef	__cplusplus
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/platsvc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _PLATSVC_H
+#define	_PLATSVC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/ds.h>
+
+#define	MAX_REASON_SIZE		1
+
+/*
+ * PLATSVC STATUS
+ */
+#define	PLATSVC_SUCCESS		0x0
+#define	PLATSVC_FAILURE		0x1
+#define	PLATSVC_INVALID_MESG	0x2
+
+#define	MD_UPDATE_SUCCESS		PLATSVC_SUCCESS
+#define	MD_UPDATE_FAILURE		PLATSVC_FAILURE
+#define	MD_UPDATE_INVALID_MSG		PLATSVC_INVALID_MESG
+
+#define	DOMAIN_SHUTDOWN_SUCCESS		PLATSVC_SUCCESS
+#define	DOMAIN_SHUTDOWN_FAILURE		PLATSVC_FAILURE
+#define	DOMAIN_SHUTDOWN_INVALID_MSG	PLATSVC_INVALID_MESG
+
+#define	DOMAIN_PANIC_SUCCESS		PLATSVC_SUCCESS
+#define	DOMAIN_PANIC_FAILURE		PLATSVC_FAILURE
+#define	DOMAIN_PANIC_INVALID_MSG	PLATSVC_INVALID_MESG
+
+typedef struct platsvc_md_update_req {
+	uint64_t	req_num;
+} platsvc_md_update_req_t;
+
+typedef struct platsvc_md_update_resp {
+	uint64_t	req_num;
+	uint32_t	result;
+} platsvc_md_update_resp_t;
+
+
+typedef struct platsvc_shutdown_req {
+	uint64_t	req_num;
+	uint32_t	delay;
+} platsvc_shutdown_req_t;
+
+typedef struct platsvc_shutdown_resp {
+	uint64_t	req_num;
+	uint32_t	result;
+	char		reason[MAX_REASON_SIZE];
+} platsvc_shutdown_resp_t;
+
+typedef struct platsvc_panic_req {
+	uint64_t	req_num;
+} platsvc_panic_req_t;
+
+typedef struct platsvc_panic_resp {
+	uint64_t	req_num;
+	uint32_t	result;
+	char		reason[MAX_REASON_SIZE];
+} platsvc_panic_resp_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PLATSVC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/promif_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,144 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_PROMIF_IMPL_H
+#define	_SYS_PROMIF_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/promimpl.h>
+#endif
+#include <sys/obpdefs.h>
+#include <sys/cmn_err.h>
+#include <sys/note.h>
+
+/*
+ * CIF handler functions
+ */
+typedef int (*cif_func_t) (void *);
+extern int promif_getprop(void *p);
+extern int promif_getproplen(void *p);
+extern int promif_nextprop(void *p);
+extern int promif_nextnode(void *p);
+extern int promif_childnode(void *p);
+extern int promif_parentnode(void *p);
+extern int promif_enter_mon(void *p);
+extern int promif_exit_to_mon(void *p);
+extern int promif_reboot(void *p);
+extern int promif_write(void *p);
+extern int promif_read(void *p);
+extern int promif_interpret(void *p);
+extern int promif_finddevice(void *p);
+extern int promif_instance_to_package(void *p);
+#ifndef _KMDB
+extern int promif_setprop(void *p);
+extern int promif_test(void *p);
+extern int promif_instance_to_path(void *p);
+extern int promif_power_off(void *p);
+extern int promif_asr_list_keys_len(void *p);
+extern int promif_asr_list_keys(void *p);
+extern int promif_asr_export_len(void *p);
+extern int promif_asr_export(void *p);
+extern int promif_set_security_key(void *p);
+extern int promif_get_security_key(void *p);
+extern int promif_start_cpu(void *p);
+extern int promif_set_mmfsa_traptable(void *p);
+extern int promif_set_sun4v_api_version(void *p);
+extern int promif_get_sun4v_api_version(void *p);
+#endif
+
+/*
+ * Shadow device tree access functions
+ */
+extern pnode_t promif_stree_nextnode(pnode_t nodeid);
+extern pnode_t promif_stree_childnode(pnode_t nodeid);
+extern pnode_t promif_stree_parentnode(pnode_t nodeid);
+extern int promif_stree_getproplen(pnode_t, char *name);
+extern int promif_stree_getprop(pnode_t, char *name, void *value);
+extern int promif_stree_setprop(pnode_t, char *name, void *value, int len);
+extern char *promif_stree_nextprop(pnode_t nodeid, char *name, char *next);
+
+/*
+ * Hooks for kmdb to get and set a pointer to the PROM shadow tree
+ */
+#ifdef _KMDB
+extern void promif_stree_setroot(void *root);
+extern caddr_t promif_stree_getroot(void);
+#endif
+
+/*
+ * Miscellaneous functions
+ */
+extern cif_func_t promif_find_cif_callback(char *opname);
+extern int promif_ldom_setprop(char *name, void *value, int valuelen);
+
+/*
+ * Initialization functions
+ */
+#ifdef _KMDB
+extern void cif_init(char *, caddr_t, ihandle_t, ihandle_t,
+    phandle_t, phandle_t, pnode_t, pnode_t);
+extern void promif_io_init(ihandle_t, ihandle_t, phandle_t, phandle_t);
+extern void promif_set_nodes(pnode_t, pnode_t);
+#else
+extern void promif_io_init(void);
+extern void promif_stree_init(void);
+extern void promif_prop_init(void);
+#endif
+
+/*
+ * Debugging support
+ */
+#ifdef DEBUG
+
+extern uint_t cif_debug;
+
+#define	CIF_DBG_FLAG_NODE		0x01
+#define	CIF_DBG_FLAG_REBOOT		0x02
+
+#define	CIF_DBG_ALL	if (cif_debug)				prom_printf
+#define	CIF_DBG_NODE	if (cif_debug & CIF_DBG_FLAG_NODE)	prom_printf
+#define	CIF_DBG_REBOOT	if (cif_debug & CIF_DBG_FLAG_REBOOT)	prom_printf
+
+#else /* DEBUG */
+
+#define	CIF_DBG_ALL	_NOTE(CONSTCOND) if (0)	prom_printf
+#define	CIF_DBG_NODE	CIF_DBG_ALL
+#define	CIF_DBG_REBOOT	CIF_DBG_ALL
+
+#endif /* DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_PROMIF_IMPL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/varconfig.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_VARCONFIG_H
+#define	_SYS_VARCONFIG_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+typedef enum {
+	VAR_CONFIG_SET_REQ,
+	VAR_CONFIG_DELETE_REQ,
+	VAR_CONFIG_SET_RESP,
+	VAR_CONFIG_DELETE_RESP
+} var_config_cmd_t;
+
+typedef struct  {
+	var_config_cmd_t cmd;
+} var_config_hdr_t;
+
+
+typedef struct {
+	char name_and_value[1];
+} var_config_set_req_t;
+
+typedef struct {
+	char name[1];
+} var_config_delete_req_t;
+
+
+typedef enum {
+	VAR_CONFIG_SUCCESS = 0,
+	VAR_CONFIG_NO_SPACE,
+	VAR_CONFIG_INVALID_VAR,
+	VAR_CONFIG_INVALID_VAL,
+	VAR_CONFIG_VAR_NOT_PRESENT
+} var_config_status_t;
+
+typedef struct {
+	var_config_status_t result;
+} var_config_resp_t;
+
+
+typedef struct {
+	var_config_hdr_t vc_hdr;
+	union {
+		var_config_set_req_t vc_set;
+		var_config_delete_req_t vc_delete;
+		var_config_resp_t vc_resp;
+	} un;
+} var_config_msg_t;
+
+#define	var_config_cmd		vc_hdr.cmd
+#define	var_config_set		un.vc_set
+#define	var_config_delete	un.vc_delete
+#define	var_config_resp		un.vc_resp
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_VARCONFIG_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vcc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,110 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VCC_H
+#define	_VCC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/stream.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ioctl.h>
+
+/*
+ * vcc and vntsd exchange information using ioctl commands. When vntsd starts,
+ * it uses VCC_NUM_CONSOLE to get number of existing ports and
+ * VCC_CONS_TBL to obtain the table of existing consoles. In this table,
+ * vcc returns information about each of the console ports using vcc_console_t
+ * structure. Vntsd then sleeps on polling vcc control port.
+ *
+ * When there is a change in configuration, such as addition or deletion
+ * of a console port, vcc wakes up vntsd via the poll events. Subsequently,
+ * vntsd uses VCC_INQUIRY ioctl to determine the reason for wakeup. In
+ * response to the inquiry, vcc provides a vcc_response_t structure
+ * containing reason and port number.
+ *
+ * If a port is being added or updated (group change), vntsd uses
+ * VCC_CONS_INFO ioctl with port number to obtain configuration of
+ * the port.
+ *
+ * If the port is being deleted, vntsd uses VCC_DEL_CONS_OK ioctl to notify
+ * vcc after its clean up is done. Vcc subsequently tears down
+ * its internal configuration and removes the associated TTY minor node.
+ *
+ * Only one open is allowed for each vcc port. If vntsd opens a port that is
+ * already open, vntsd will use VNTSD_FORCE_CLOSE to take port from other
+ * application
+ */
+
+/* VCC CNTRL IOCTL */
+
+#define	    VCC_IOCTL_CMD		('c' << 8)
+
+
+#define	    VCC_NUM_CONSOLE	VCC_IOCTL_CMD | 0x1	/* num of consoles */
+#define	    VCC_CONS_TBL	VCC_IOCTL_CMD | 0x2	/* config table */
+#define	    VCC_INQUIRY		VCC_IOCTL_CMD | 0x3	/* inquiry by vntsd */
+#define	    VCC_CONS_INFO	VCC_IOCTL_CMD | 0x4	/* config */
+#define	    VCC_CONS_STATUS	VCC_IOCTL_CMD | 0x5	/* console status */
+#define	    VCC_FORCE_CLOSE	VCC_IOCTL_CMD | 0x6	/* force to close */
+
+/* reasons to wake up vntsd */
+typedef enum {
+	VCC_CONS_ADDED,		    /* a port was added */
+	VCC_CONS_DELETED,	    /* a port was removed */
+	/* XXX not implemented yet */
+	VCC_CONS_UPDATED	    /* a port configuration was changed */
+} vcc_reason_t;
+
+/*
+ * structure that vcc returns to vntsd in response to VCC_CONS_TBL and
+ * VCC_CONS_INFO  ioctl call.
+ */
+typedef struct vcc_console {
+	int		cons_no;		    /* console port number  */
+	uint64_t	tcp_port;		    /* tcp port for the group */
+	char		domain_name[MAXPATHLEN];    /* domain name */
+	char		group_name[MAXPATHLEN];	    /* group name */
+	char		dev_name[MAXPATHLEN];
+} vcc_console_t;
+
+/* structure that vcc sends to vntsd in response to wake up inquiry */
+typedef struct vcc_response {
+	int		cons_no;	/* console port number */
+	vcc_reason_t	reason;		/* wake up reason */
+} vcc_response_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VCC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vcc_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,304 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VCC_IMPL_H
+#define	_VCC_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ioctl.h>
+#include <sys/vcc.h>
+
+#define	    VCC_DEV_TO_INST(dev)		(getminor(dev))
+#define	    VCC_INST_TO_DEV(instance)		(instance)
+
+#define	    VCC_DRIVER_NAME			"vcc"
+#define	    VCC_NAME				VCC_DRIVER_NAME
+
+/*
+ * VCC Port States
+ */
+
+/*
+ * There is one lock in port structure to protect the states of the port.
+ * States of the port are:
+ * 1. VCC_PORT_AVAIL
+ * 2. VCC_PORT_OPEN
+ * 3. VCC_PORT_USE_READ_LDC  - There is a thread doing vcc_read.
+ * 4. VCC_PORT_USE_WRITE_LDC - There is a thread doing vcc_write.
+ * 5. VCC_PORT_LDC_DATA_READY - Data is ready from ldc.
+ * 6. VCC_PORT_LDC_WRITE_READY - Ldc has space to receive data.
+ * 7. VCC_PORT_LDC_CHANNEL_READY - Ldc channel is up.
+ * 8. VCC_PORT_ADDED		- A new port was added.
+ * 9. VCC_PORT_TERM_RD		- Terminal read is enabled vs suspended
+ * 10. VCC_PORT_TERM_WR		- Terminal write is enabled vs suspended
+ * 11. VCC_PORT_NONBLOCK	- A port was opened with non blocking flag.
+ * 12. VCC_PORT_LDC_LINK_DOWN
+ *
+ *
+ * Code flow for port to transit from one state to another is as follows:
+ *
+ * 1. VCC_PORT_AVAIL
+ *
+ *	    Transition from unavailable to available
+ *		- obtain port lock
+ *		Transit port to available and added states
+ *		- release port lock
+ *		- obtain softstate lock
+ *		Increase total number of ports
+ *		- release softstate lock
+ *
+ *		after download added port to vntsd
+ *		- obtain port lock
+ *		Transit port to not added state
+ *		- release port lock
+ *
+ *	    Transition from available to unavailable
+ *		- obtain port lock
+ *		- cv_wait read available
+ *		Transit port to read unavailable
+ *		- cv_wait write available
+ *		Transit port to write unavailable
+ *		Transit port to not ready. (close ldc channel)
+ *		Transit port to deleted state
+ *		Transit port to read and write available
+ *		- cv_broadcast
+ *		- release lock
+ *
+ *		vntsd close the deleted port
+ *		- obtain port lock
+ *		Transit port to close and deleted state
+ *		- release port lock
+ *
+ *		after vntsd deletion of the port
+ *		- obtain softstate lock
+ *		- cv_wait port table unlocked
+ *		Transit softstate to port table locked
+ *		- release softstate lock
+ *		- obtain port lock
+ *		Transit port to unavailable
+ *		destroy port lock
+ *		- obtain softstate lock
+ *		Transit softstate to port table unlocked
+ *		- cv_broadcast
+ *		- release softstate lock
+ *
+ * 2. VCC_PORT_OPEN
+ *
+ *	    Transition from close to open
+ *		- obtain port lock
+ *		transit port to open
+ *		- release port lock
+ *
+ *	    Transition from open to close
+ *		- obtain port lock
+ *		- cv_wait read available
+ *		Transit port to read unavailable
+ *		- cv_wait write available
+ *		Transit port to write unavailable
+ *		Transit port to not ready. (close ldc channel)
+ *		Transit port to close state
+ *		Transit port to read and write available
+ *		- cv_broadcast
+ *		- release lock
+ *
+ * 3. VCC_PORT_USE_READ_LDC/VCC_PORT_USE_WRITE_LDC
+ *	    Transition from read available/write available
+ *	    to read unavailable/write unavailable
+ *		- obtain port lock
+ *		- cv_wait read available
+ *		Transit to read/write unavailable
+ *		- release port lock
+ *
+ *	    Transition from read unavailable/write unavailable
+ *	    to read available/write available
+ *		- obtain port lock
+ *		Transit to read/write available
+ *		- cv_broadcast
+ *		- release port lock
+ *
+ * 4. VCC_PORT_LDC_DATA_READY
+ *	    Transition from data not ready to data ready
+ *		- obtain port lock
+ *		Transit to data ready
+ *		- cv_broadcast
+ *		- release port lock
+ *
+ *	    Transition from data ready to data not ready
+ *		- obtain port lock
+ *		Transit to data not ready
+ *		- release port lock
+ */
+
+#define	    VCC_PORT_AVAIL		0x1	/* port is configured */
+#define	    VCC_PORT_OPEN		0x2	/* port is opened */
+#define	    VCC_PORT_LDC_CHANNEL_READY	0x4	/* ready for data transfer */
+#define	    VCC_PORT_USE_READ_LDC	0x8	/* read lock */
+#define	    VCC_PORT_USE_WRITE_LDC	0x10	/* write lock */
+#define	    VCC_PORT_LDC_DATA_READY	0x20	/* data ready */
+#define	    VCC_PORT_LDC_WRITE_READY	0x40	/* ldc ready receive data */
+#define	    VCC_PORT_ADDED		0x80	/* added, no ack from vntsd */
+#define	    VCC_PORT_UPDATED		0x100	/* updated, no ack from vntsd */
+#define	    VCC_PORT_TERM_RD		0x200	/* terminal read enabled */
+#define	    VCC_PORT_TERM_WR		0x400	/* terminal write enabled */
+#define	    VCC_PORT_NONBLOCK		0x800	/* open with non block flag */
+#define	    VCC_PORT_LDC_LINK_DOWN	0x1000	/* ldc link down */
+
+/* Poll Flags */
+#define	    VCC_POLL_CONFIG	    0x1	    /* poll configuration change  */
+
+/* Poll events */
+#define	    VCC_POLL_ADD_PORT	    0x10    /* add a console port */
+#define	    VCC_POLL_UPDATE_PORT    0x20    /* update a console port  */
+
+/* softstate port table state */
+#define	    VCC_LOCK_PORT_TBL		0x1
+
+/* VCC limits */
+#define	    VCC_MAX_PORTS	    0x800	    /* number of domains */
+#define	    VCC_MAX_MINORS	    VCC_MAX_PORTS   /* number of minors */
+
+
+#define	    VCC_MAX_PORT_MINORS		(VCC_MAX_MINORS - 1)
+#define	    VCC_CONTROL_MINOR_IDX	(VCC_MAX_MINORS - 1)
+
+/* size of vcc message data */
+#define	    VCC_MTU_SZ		    56
+
+
+/* Default values */
+#define	    VCC_HDR_SZ		    8	    /* header size */
+#define	    VCC_BUF_SZ		    (VCC_HDR_SZ + VCC_MTU_SZ)
+
+#define	    VCC_CONTROL_PORT	    0x7ff   /* port 2047 is control port  */
+#define	    VCC_INST_SHIFT	    11
+#define	    VCC_INVALID_CHANNEL	    -1
+#define	    VCC_NO_PID_BLOCKING	    -1
+
+#define	    VCC_QUEUE_LEN	    0x80    /* ldc queue size */
+
+#define	    VCC_MINOR_NAME_PREFIX   "ldom-" /* device name prefix */
+
+/* HV message data type */
+#define	    LDC_CONSOLE_CTRL	    0x1	    /* ctrl msg */
+#define	    LDC_CONSOLE_DATA	    0x2	    /* data msg */
+
+/* HV control messages */
+#define	    LDC_CONSOLE_BREAK	    -1	    /* brk */
+#define	    LDC_CONSOLE_HUP	    -2	    /* hup */
+
+/*  minor number to port number */
+#define	    VCCPORT(p, minor)	    (p->minor_tbl[(minor & \
+    VCC_CONTROL_PORT)].portno)
+
+/*  minor number to minor pointer */
+#define	    VCCMINORP(p, minor)	    (&(p->minor_tbl[(minor & \
+    VCC_CONTROL_PORT)]))
+
+/* minor number to instance */
+#define	    VCCINST(minor)	    ((minor) >> VCC_INST_SHIFT)
+
+
+/* hv console packet format */
+typedef struct vcc_msg {
+	uint8_t		type;		    /* type - data or ctrl */
+	uint8_t		size;		    /* data size */
+	uint16_t	unused;		    /* not used */
+	int32_t		ctrl_msg;	    /* data if type is ctrl */
+	uint8_t		data[VCC_MTU_SZ];   /* data if type is data */
+} vcc_msg_t;
+
+/*
+ *  minor node to port mapping table
+ */
+typedef struct vcc_minor {
+	uint_t		portno;			    /* port number */
+	char		domain_name[MAXPATHLEN];    /* domain name */
+} vcc_minor_t;
+
+/* console port structure */
+typedef struct vcc_port {
+
+	kmutex_t 	lock;		/* protects port */
+	kcondvar_t	read_cv;	/* cv to sleep for reads */
+	kcondvar_t	write_cv;	/* cv to sleep for writes */
+
+	uint_t		number;		/* port number */
+	uint32_t	status;		/* port status */
+
+	char		group_name[MAXPATHLEN];
+	uint64_t	tcp_port;	/* tcp port num */
+
+	struct	termios	term;		/* terminal emulation */
+
+	vcc_minor_t	*minorp;	/* pointer to minor table entry */
+
+	uint64_t	ldc_id;		/* Channel number */
+	ldc_handle_t	ldc_handle;	/* Channel handle */
+	ldc_status_t	ldc_status;	/* Channel Status */
+
+	uint_t		pollflag;	/* indicates poll status */
+	struct pollhead	poll;
+	uint32_t	pollevent;
+	pid_t		valid_pid;	/* pid that allows cb_ops */
+
+} vcc_port_t;
+
+/*
+ * vcc  driver's soft state structure
+ */
+typedef struct vcc {
+
+	/* protects vcc_t (soft state)  */
+	kmutex_t		lock;
+
+	uint_t			status;
+
+	dev_info_t		*dip;			   /* dev_info */
+
+	mdeg_node_spec_t	*md_ispecp;		   /* mdeg prop spec */
+	mdeg_handle_t		mdeg_hdl;		   /* mdeg handle */
+
+	vcc_port_t		port[VCC_MAX_PORTS];	   /* port table */
+	uint_t			num_ports;		   /* avail ports */
+
+	vcc_minor_t		minor_tbl[VCC_MAX_PORTS];   /* minor table */
+	uint_t			minors_assigned;	   /* assigned minors */
+} vcc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VCC_IMPL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vdc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,260 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_VDC_H
+#define	_VDC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Virtual disk client implementation definitions
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/note.h>
+
+#include <sys/ldc.h>
+#include <sys/vio_mailbox.h>
+#include <sys/vdsk_mailbox.h>
+#include <sys/vdsk_common.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	VDC_DRIVER_NAME		"vdc"
+
+/*
+ * Bit-field values to indicate if parts of the vdc driver are initialised.
+ */
+#define	VDC_SOFT_STATE	0x0001
+#define	VDC_LOCKS	0x0002
+#define	VDC_MINOR	0x0004
+#define	VDC_THREAD	0x0008
+#define	VDC_LDC		0x0010
+#define	VDC_LDC_INIT	0x0020
+#define	VDC_LDC_CB	0x0040
+#define	VDC_LDC_OPEN	0x0080
+#define	VDC_DRING_INIT	0x0100	/* The DRing was created */
+#define	VDC_DRING_BOUND	0x0200	/* The DRing was bound to an LDC channel */
+#define	VDC_DRING_LOCAL	0x0400	/* The local private DRing was allocated */
+#define	VDC_DRING_ENTRY	0x0800	/* At least one DRing entry was initialised */
+#define	VDC_DRING	(VDC_DRING_INIT | VDC_DRING_BOUND |	\
+				VDC_DRING_LOCAL | VDC_DRING_ENTRY)
+#define	VDC_HANDSHAKE	0x1000	/* Indicates if a handshake is in progress */
+#define	VDC_HANDSHAKE_STOP	0x2000	/* stop further handshakes */
+
+/*
+ * Bit-field values to indicate status of local DRing entry
+ *
+ * The lowest 8 bits are reserved for the DRing state.
+ */
+#define	VDC_ALLOC_HANDLE	0x10
+
+/*
+ * Definitions of strings to be used to create device node properties.
+ * (vdc uses the capitalised versions of these properties as they are 64-bit)
+ */
+#define	VDC_NBLOCKS_PROP_NAME		"Nblocks"
+#define	VDC_SIZE_PROP_NAME		"Size"
+
+/*
+ * Definitions of MD nodes/properties.
+ */
+#define	VDC_MD_CHAN_NAME		"channel-endpoint"
+#define	VDC_MD_VDEV_NAME		"virtual-device"
+#define	VDC_MD_DISK_NAME		"disk"
+#define	VDC_MD_CFG_HDL			"cfg-handle"
+#define	VDC_ID_PROP			"id"
+
+/*
+ * Scheme to store the instance number and the slice number in the minor number.
+ * (Uses the same format and definitions as the sd(7D) driver)
+ */
+#define	VD_MAKE_DEV(instance, minor)	((instance << SDUNIT_SHIFT) | minor)
+
+/*
+ * variables controlling how long to wait before timing out and how many
+ * retries to attempt before giving up when communicating with vds.
+ */
+#define	VDC_RETRIES	10
+
+#define	VDC_USEC_TIMEOUT_MIN	(30 * MICROSEC)		/* 30 sec */
+
+#define	VD_GET_TIMEOUT_HZ(mul)	\
+	(ddi_get_lbolt() + (vdc_hz_timeout * MAX(1, mul)))
+
+/*
+ * Macros to manipulate Descriptor Ring variables in the soft state
+ * structure.
+ */
+#define	VDC_GET_NEXT_REQ_ID(vdc)	((vdc->req_id)++)
+
+#define	VDC_GET_DRING_ENTRY_PTR(vdc, idx)	\
+		(vd_dring_entry_t *)(vdc->dring_mem_info.vaddr +	\
+			(idx * vdc->dring_entry_size))
+
+#define	VDC_MARK_DRING_ENTRY_FREE(vdc, idx)			\
+	{ \
+		vd_dring_entry_t *dep = NULL;				\
+		ASSERT(vdc != NULL);					\
+		ASSERT((idx >= 0) && (idx < VD_DRING_LEN));		\
+		ASSERT(vdc->dring_mem_info.vaddr != NULL);		\
+		dep = (vd_dring_entry_t *)(vdc->dring_mem_info.vaddr +	\
+			(idx * vdc->dring_entry_size));			\
+		ASSERT(dep != NULL);					\
+		dep->hdr.dstate = VIO_DESC_FREE;			\
+	}
+
+/* Initialise the Session ID and Sequence Num in the DRing msg */
+#define	VDC_INIT_DRING_DATA_MSG_IDS(dmsg, vdc)		\
+		ASSERT(vdc != NULL);			\
+		dmsg.tag.vio_sid = vdc->session_id;	\
+		dmsg.seq_num = ++(vdc->seq_num);
+
+/*
+ * The states the message processing thread can be in.
+ */
+typedef enum vdc_thr_state {
+	VDC_THR_RUNNING,	/* thread is running & ready to process */
+	VDC_THR_STOP,		/* The detach func signals the thread to stop */
+	VDC_THR_DONE		/* Thread has exited */
+} vdc_thr_state_t;
+
+/*
+ * Local Descriptor Ring entry
+ *
+ * vdc creates a Local (private) descriptor ring the same size as the
+ * public descriptor ring it exports to vds.
+ */
+typedef struct vdc_local_desc {
+	kmutex_t		lock;		/* protects all fields */
+	kcondvar_t		cv;		/* indicate processing done */
+	int			flags;		/* Dring entry state, etc */
+	int			operation;	/* VD_OP_xxx to be performed */
+	caddr_t			addr;		/* addr passed in by consumer */
+	caddr_t			align_addr;	/* used if addr non-aligned */
+	struct buf 		*buf;		/* buf passed to strategy() */
+	ldc_mem_handle_t	desc_mhdl;	/* Mem handle of buf */
+	vd_dring_entry_t	*dep;		/* public Dring Entry Pointer */
+} vdc_local_desc_t;
+
+/*
+ * vdc soft state structure
+ */
+typedef struct vdc {
+	kmutex_t	attach_lock;	/* used by CV which waits in attach */
+	kcondvar_t	attach_cv;	/* signal when attach can finish */
+
+	kmutex_t	lock;		/* protects next 2 sections of vars */
+	kcondvar_t	cv;		/* signal when upper layers can send */
+
+	dev_info_t	*dip;		/* device info pointer */
+	int		instance;	/* driver instance number */
+	int		initialized;	/* keeps track of what's init'ed */
+	int		open;		/* count of outstanding opens */
+	int		dkio_flush_pending;	/* # outstanding DKIO flushes */
+
+	uint64_t	session_id;	/* common ID sent with all messages */
+	uint64_t	seq_num;	/* most recent sequence num generated */
+	uint64_t	seq_num_reply;	/* Last seq num ACK/NACK'ed by vds */
+	uint64_t	req_id;		/* Most recent Request ID generated */
+	vd_state_t	state;		/* Current handshake state */
+	vd_disk_type_t	vdisk_type;	/* type of device/disk being imported */
+	uint64_t	vdisk_size;	/* device size in bytes */
+	uint64_t	max_xfer_sz;	/* maximum block size of a descriptor */
+	uint64_t	block_size;	/* device block size used */
+	struct dk_cinfo	*cinfo;		/* structure to store DKIOCINFO data */
+	struct dk_minfo	*minfo;		/* structure for DKIOCGMEDIAINFO data */
+	struct vtoc	*vtoc;		/* structure to store VTOC data */
+
+	/*
+	 * The mutex 'msg_proc_lock' protects the following group of fields.
+	 *
+	 * The callback function checks to see if LDC triggered it due to
+	 * there being data available and the callback will signal to
+	 * the message processing thread waiting on 'msg_proc_cv'.
+	 */
+	kmutex_t		msg_proc_lock;
+	kcondvar_t		msg_proc_cv;
+	boolean_t		msg_pending;
+	vdc_thr_state_t		msg_proc_thr_state;
+	kthread_t		*msg_proc_thr_id;
+
+	/*
+	 * The mutex 'dring_lock'  protects the following group of fields.
+	 */
+	kmutex_t		dring_lock;
+	ldc_mem_info_t		dring_mem_info;
+	uint_t			dring_curr_idx;
+	uint32_t		dring_len;
+	uint32_t		dring_cookie_count;
+	uint32_t		dring_entry_size;
+	ldc_mem_cookie_t	*dring_cookie;
+	uint64_t		dring_ident;
+
+	vdc_local_desc_t	*local_dring;
+
+	uint64_t		ldc_id;
+	ldc_status_t		ldc_state;
+	ldc_handle_t		ldc_handle;
+	ldc_dring_handle_t	ldc_dring_hdl;
+} vdc_t;
+
+/*
+ * Debugging macros
+ */
+#ifdef DEBUG
+extern int	vdc_msglevel;
+
+#define	PR0 if (vdc_msglevel > 0)	\
+		vdc_msg
+
+#define	PR1 if (vdc_msglevel > 1)	\
+		vdc_msg
+
+#define	PR2 if (vdc_msglevel > 2)	\
+		vdc_msg
+
+#define	VDC_DUMP_DRING_MSG(dmsgp)					\
+		vdc_msg("sq:%d start:%d end:%d ident:%x\n",		\
+			dmsgp->seq_num, dmsgp->start_idx,		\
+			dmsgp->end_idx, dmsgp->dring_ident);
+
+#else	/* !DEBUG */
+#define	PR0(...)
+#define	PR1(...)
+#define	PR2(...)
+
+#define	VDC_DUMP_DRING_MSG(dmsgp)
+
+#endif	/* !DEBUG */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VDC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vdsk_common.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,194 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_VDSK_COMMON_H
+#define	_VDSK_COMMON_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This header file contains the private LDoms Virtual Disk (vDisk) definitions
+ * common to both the server (vds) and the client (vdc)
+ */
+
+#include <sys/machparam.h>
+#include <sys/vtoc.h>
+
+#include <sys/ldc.h>
+#include <sys/vio_common.h>
+#include <sys/vio_mailbox.h>
+
+/*
+ * vDisk definitions
+ */
+
+/*
+ * The number of Descriptor Ring entries
+ *
+ * Constraints:
+ * 	- overall DRing size must be greater than 8K (MMU_PAGESIZE)
+ *	- overall DRing size should be 8K aligned (desirable but not enforced)
+ *	- DRing entry must be 8 byte aligned
+ */
+#define	VD_DRING_LEN		512
+
+/*
+ * Size of a DRing entry, including the maximum-length cookie array
+ */
+#define	VD_DRING_ENTRY_SZ	(sizeof (vd_dring_entry_t) + 		\
+		(sizeof (ldc_mem_cookie_t) * (VD_MAX_COOKIES - 1)))
+
+/*
+ * The maximum block size we can transmit using one Descriptor Ring entry
+ *
+ * Currently no FS uses more than 128K and it doesn't look like they
+ * will either as there is no perf gain to be had by larger values.
+ * ( see ZFS comment at definition of SPA_MAXBLOCKSIZE ).
+ *
+ * We choose 256K to give us some headroom.
+ */
+#define	VD_MAX_BLOCK_SIZE	(256 * 1024)
+
+#define	VD_MAX_COOKIES		((VD_MAX_BLOCK_SIZE / PAGESIZE) + 1)
+#define	VD_USEC_TIMEOUT		20000
+#define	VD_LDC_IDS_PROP		"ldc-ids"
+#define	VD_LDC_QLEN		32
+
+/*
+ * Flags used by ioctl routines to indicate if a copyin/copyout is needed
+ */
+#define	VD_COPYOUT		0x1
+#define	VD_COPYIN		0x2
+
+/*
+ * vDisk operations on physical devices
+ */
+#define	VD_OP_BREAD		0x01	/* Block Read */
+#define	VD_OP_BWRITE		0x02	/* Block Write */
+#define	VD_OP_FLUSH		0x03	/* Flush disk write cache contents */
+#define	VD_OP_GET_WCE		0x04	/* Get disk W$ status */
+#define	VD_OP_SET_WCE		0x05	/* Enable/Disable disk W$ */
+#define	VD_OP_GET_VTOC		0x06	/* Get VTOC */
+#define	VD_OP_SET_VTOC		0x07	/* Set VTOC */
+#define	VD_OP_GET_DISKGEOM	0x08	/* Get disk geometry */
+#define	VD_OP_SET_DISKGEOM	0x09	/* Set disk geometry */
+#define	VD_OP_SCSICMD		0x0a	/* SCSI control command */
+#define	VD_OP_MASK		0xFF	/* mask of all possible operations */
+#define	VD_OP_COUNT		10	/* Number of operations */
+
+/*
+ * Definitions of the various ways vds can export disk support to vdc.
+ */
+typedef enum vd_disk_type {
+	VD_DISK_TYPE_UNK = 0,		/* Unknown device type */
+	VD_DISK_TYPE_SLICE,		/* slice in block device */
+	VD_DISK_TYPE_DISK		/* entire disk (slice 2) */
+} vd_disk_type_t;
+
+/*
+ * vDisk Descriptor payload
+ */
+typedef struct vd_dring_payload {
+	uint64_t	req_id;		/* The request ID being processed */
+	uint8_t		operation;	/* operation for server to perform */
+	uint8_t		slice;		/* The disk slice being accessed */
+	uint16_t	resv1;		/* padding */
+	uint32_t	status;		/* "errno" of server operation */
+	uint64_t	addr;		/* LP64	diskaddr_t (block I/O) */
+	uint64_t	nbytes;		/* LP64 size_t */
+	uint32_t	ncookies;	/* Number of cookies used */
+	uint32_t	resv2;		/* padding */
+
+	ldc_mem_cookie_t	cookie[1];	/* variable sized array */
+} vd_dring_payload_t;
+
+
+/*
+ * vDisk Descriptor entry
+ */
+typedef struct vd_dring_entry {
+	vio_dring_entry_hdr_t		hdr;		/* common header */
+	vd_dring_payload_t		payload;	/* disk specific data */
+} vd_dring_entry_t;
+
+
+/*
+ * vDisk control operation structures
+ *
+ * XXX FIXME - future support - add structures for VD_OP_XXXX
+ */
+
+/*
+ * VTOC message
+ *
+ * vDisk Get Volume Table of Contents (VD_OP_GET_VTOC)
+ *
+ */
+typedef struct vd_partition {
+	uint16_t	p_tag;		/* ID tag of partition */
+	uint16_t	p_flag;		/* permission flags */
+	uint32_t	reserved;	/* padding */
+	int64_t		p_start;	/* start sector no of partition */
+	int64_t		p_size;		/* # of blocks in partition */
+} vd_partition_t;
+
+typedef struct vd_vtoc {
+	uint8_t		v_volume[LEN_DKL_VVOL]; /* volume name */
+	uint16_t	v_sectorsz;		/* sector size in bytes */
+	uint16_t	v_nparts;		/* num of partitions */
+	uint32_t	reserved;		/* padding */
+	uint8_t		v_asciilabel[LEN_DKL_ASCII];    /* for compatibility */
+
+} vd_vtoc_t;
+
+
+/*
+ * vDisk Get Geometry (VD_OP_GET_DISKGEOM)
+ */
+typedef struct vd_geom {
+	uint16_t	dkg_ncyl;	/* # of data cylinders */
+	uint16_t	dkg_acyl;	/* # of alternate cylinders */
+	uint16_t	dkg_bcyl;	/* cyl offset (for fixed head area) */
+	uint16_t	dkg_nhead;	/* # of heads */
+	uint16_t	dkg_nsect;	/* # of data sectors per track */
+	uint16_t	dkg_intrlv;	/* interleave factor */
+	uint16_t	dkg_apc;	/* alternates per cyl (SCSI only) */
+	uint16_t	dkg_rpm;	/* revolutions per minute */
+	uint16_t	dkg_pcyl;	/* # of physical cylinders */
+	uint16_t	dkg_write_reinstruct;	/* # sectors to skip, writes */
+	uint16_t	dkg_read_reinstruct;	/* # sectors to skip, reads */
+} vd_geom_t;
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VDSK_COMMON_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vdsk_mailbox.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_VDSK_MAILBOX_H
+#define	_VDSK_MAILBOX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This header file contains the private LDoms Virtual Disk (vDisk) mailbox
+ * definitions common to both the server (vds) and the client (vdc)
+ */
+
+#include <sys/vio_mailbox.h>
+#include <sys/vio_common.h>
+#include <sys/vdsk_common.h>
+
+/*
+ * Definition of the various states the vDisk state machine can
+ * be in during the handshake between vdc and vds.
+ */
+typedef enum vd_state {
+	VD_STATE_INIT = 0,
+	VD_STATE_VER,
+	VD_STATE_ATTR,
+	VD_STATE_DRING,
+	VD_STATE_RDX,
+	VD_STATE_DATA
+} vd_state_t;
+
+#define	VD_VER_MAJOR		0x1
+#define	VD_VER_MINOR		0x0
+
+/*
+ * vDisk device attributes information message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_ATTR_INFO
+ */
+typedef struct vd_attr_msg {
+	/* Common tag */
+	vio_msg_tag_t 	tag;
+
+	/* vdisk-attribute-specific payload */
+	uint8_t		xfer_mode;	/* data exchange method. */
+	uint8_t		vdisk_type;	/* disk, slice, read-only, etc. */
+	uint16_t	resv;		/* padding */
+	uint32_t	vdisk_block_size;	/* bytes per disk block */
+	uint64_t	operations;	/* bit-field of server supported ops */
+	uint64_t	vdisk_size;	/* size for Nblocks property. */
+	uint64_t	max_xfer_sz;	/* maximum block transfer size */
+
+	uint64_t	resv2[VIO_PAYLOAD_ELEMS - 4];	/* padding */
+} vd_attr_msg_t;
+
+/*
+ * vDisk inband descriptor message.
+ *
+ * For clients that do not use descriptor rings, the descriptor contents
+ * are sent as part of an inband message.
+ */
+typedef struct vd_dring_inband_msg {
+	vio_inband_desc_msg_hdr_t	hdr;
+	vd_dring_payload_t		payload;
+} vd_dring_inband_msg_t;
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VDSK_MAILBOX_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vio_common.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VIO_COMMON_H
+#define	_SYS_VIO_COMMON_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+ *  Common header for VIO descriptor ring entries
+ */
+typedef struct vio_dring_entry_hdr {
+	uint8_t		dstate;		/* Current state of Dring entry */
+	uint8_t		ack:1;		/* 1 => receiver must ACK when DONE */
+
+	/*
+	 * Padding.
+	 */
+	uint16_t	resv[3];
+} vio_dring_entry_hdr_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_VIO_COMMON_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vio_mailbox.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,331 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VIO_MAILBOX_H
+#define	_SYS_VIO_MAILBOX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/ldc.h>
+
+/* Message types */
+#define		VIO_TYPE_CTRL	0x1
+#define		VIO_TYPE_DATA	0x2
+#define		VIO_TYPE_ERR	0x4
+
+/* Message sub-types */
+#define		VIO_SUBTYPE_INFO	0x1
+#define		VIO_SUBTYPE_ACK		0x2
+#define		VIO_SUBTYPE_NACK	0x4
+
+/*
+ * VIO specific control envelopes:  0x0000 - 0x00FF
+ * VNET specific control envelopes: 0x0100 - 0x01FF
+ * VDSK specific control envelopes: 0x0200 - 0x02FF
+ * UNUSED envelopes:                0x0300 - 0x0FFF
+ */
+
+/*
+ * Generic Control Subtype Envelopes:
+ * 	type == VIO_TYPE_CTRL
+ *	subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ *
+ * 	0x0000 - 0x003F
+ */
+#define	VIO_VER_INFO		0x0001
+#define	VIO_ATTR_INFO		0x0002
+#define	VIO_DRING_REG		0x0003
+#define	VIO_DRING_UNREG		0x0004
+#define	VIO_RDX			0x0005
+
+/*
+ * Generic subtype Data envelopes
+ * 	type == VIO_TYPE_DATA
+ * 	subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ *
+ * 	0x0040 - 0x007F
+ */
+#define	VIO_PKT_DATA		0x0040
+#define	VIO_DESC_DATA		0x0041
+#define	VIO_DRING_DATA		0x0042
+
+
+/*
+ * Generic subtype Error envelopes
+ * 	type == VIO_TYPE_ERR
+ * 	subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ *
+ * 	0x0080 - 0x00FF
+ *
+ * Currently unused
+ */
+
+/*
+ * Supported Device Types
+ */
+#define	VDEV_NETWORK		0x1
+#define	VDEV_NETWORK_SWITCH	0x2
+#define	VDEV_DISK		0x3
+#define	VDEV_DISK_SERVER	0x4
+
+/* addr_type */
+#define	ADDR_TYPE_MAC	0x1	/* XXX move to vnet_mailbox.h ? */
+
+/*
+ * VIO data transfer mode
+ */
+#define	VIO_PKT_MODE	0x1
+#define	VIO_DESC_MODE	0x2
+#define	VIO_DRING_MODE	0x3
+
+/*
+ * VIO Descriptor Ring registration options
+ * (intended use for Descriptor Ring)
+ */
+#define	VIO_TX_DRING	0x1
+#define	VIO_RX_DRING	0x2
+
+/*
+ * Size of message payload
+ */
+#define	VIO_MSGTAG_SZ		(sizeof (vio_msg_tag_t))	/* bytes */
+#define	VIO_PAYLOAD_SZ		(LDC_PAYLOAD_SIZE_UNRELIABLE - VIO_MSGTAG_SZ)
+#define	VIO_PAYLOAD_ELEMS	(VIO_PAYLOAD_SZ / LDC_ELEM_SIZE) /* num words */
+
+/*
+ * VIO device message tag.
+ *
+ * These 64 bits are used as a common header for all VIO message types.
+ */
+typedef union vio_msg_tag {
+	struct {
+		uint8_t		_msgtype;
+		uint8_t		_subtype;
+		uint16_t	_subtype_env;
+		uint32_t	_sid;		/* session id */
+	} _hdr;
+	uint64_t	tagword;
+} vio_msg_tag_t;
+
+#define	vio_msgtype		_hdr._msgtype
+#define	vio_subtype		_hdr._subtype
+#define	vio_subtype_env		_hdr._subtype_env
+#define	vio_sid			_hdr._sid
+
+/*
+ * VIO version negotiation message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_VER_INFO
+ */
+typedef struct vio_ver_msg {
+	/* Common tag */
+	vio_msg_tag_t		tag;
+
+	/* version specific payload */
+	uint32_t		ver_major:16,	/* major version number */
+				ver_minor:16;	/* minor version number */
+
+	uint8_t			dev_class;	/* type of device */
+
+	/* padding */
+	uint8_t			resv1;
+	uint16_t		resv2;
+	uint64_t		resv3[VIO_PAYLOAD_ELEMS - 1];
+} vio_ver_msg_t;
+
+/*
+ * VIO Descriptor Ring Register message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_DRING_REG
+ */
+typedef struct vio_dring_reg_msg {
+	/* Common tag */
+	vio_msg_tag_t		tag;
+
+	/* Descriptor ring information */
+	uint64_t		dring_ident;	/* =0 for SUBTYPE_INFO msg */
+	uint32_t		num_descriptors; /* # of desc in the ring */
+	uint32_t		descriptor_size; /* size of each entry */
+	uint16_t		options;	/* intended use */
+	uint16_t		resv;		/* padding */
+	uint32_t		ncookies;	/* # cookies exporting ring */
+
+	/*
+	 * cookie is a variable sized array.  If the number of cookies is 1,
+	 * the message can be sent by LDC without fragmentation.
+	 */
+	ldc_mem_cookie_t	cookie[1];
+} vio_dring_reg_msg_t;
+
+/*
+ * VIO Descriptor Ring Unregister message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_DRING_UNREG
+ */
+typedef struct vio_dring_unreg_msg {
+	/* Common tag */
+	vio_msg_tag_t	tag;
+
+	/* Descriptor ring information */
+	uint64_t	dring_ident;
+	uint64_t	resv[VIO_PAYLOAD_ELEMS - 1];
+} vio_dring_unreg_msg_t;
+
+
+/*
+ * Definition of a generic VIO message (with no payload) which can be cast
+ * to other message types.
+ */
+typedef struct vio_msg {
+	/* Common tag */
+	vio_msg_tag_t		tag;
+
+	/* no payload */
+	uint64_t		resv[VIO_PAYLOAD_ELEMS];
+} vio_msg_t;
+
+/*
+ * VIO Ready to Receive message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK}
+ * tag.subtype_env == VIO_RDX
+ */
+typedef vio_msg_t	vio_rdx_msg_t;
+
+/*
+ * VIO error message.
+ *
+ * tag.msgtype == VIO_TYPE_ERR
+ * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == TBD
+ */
+typedef vio_msg_t	vio_err_msg_t;
+
+/*
+ * VIO descriptor ring data message.
+ *
+ * tag.msgtype == VIO_TYPE_DATA
+ * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_DRING_DATA
+ */
+typedef struct vio_dring_msg {
+	/* Common message tag */
+	vio_msg_tag_t		tag;
+
+	/* Data dring info */
+	uint64_t		seq_num;
+	uint64_t		dring_ident;	/* ident of modified DRing */
+	uint32_t		start_idx;	/* Indx of first updated elem */
+	int32_t			end_idx;	/* Indx of last updated elem */
+
+	/*
+	 * Padding.
+	 */
+	uint64_t	resv[VIO_PAYLOAD_ELEMS - 3];
+} vio_dring_msg_t;
+
+/*
+ * VIO Common header for inband descriptor messages.
+ *
+ * Clients will then combine this header with a device specific payload.
+ */
+typedef struct vio_inband_desc_msg_hdr {
+	/* Common message tag */
+	vio_msg_tag_t		tag;
+
+	uint64_t		seq_num;	/* sequence number */
+	uint64_t		desc_handle;	/* opaque descriptor handle */
+} vio_inband_desc_msg_hdr_t;
+
+/*
+ * VIO raw data message.
+ *
+ * tag.msgtype == VIO_TYPE_DATA
+ * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_PKT_DATA
+ *
+ * Note the data payload is so small to keep this message
+ * within the size LDC can cope with without fragmentation.
+ * If it turns out in the future that we are not concerned
+ * with fragmentation then we can increase the size of this
+ * field.
+ */
+typedef struct vio_raw_data_msg {
+	/* Common message tag */
+	vio_msg_tag_t		tag;
+
+	/* Raw data packet payload */
+	uint64_t		seq_num;	/* sequence number */
+	uint64_t		data[VIO_PAYLOAD_ELEMS - 1];
+} vio_raw_data_msg_t;
+
+/*
+ * Definitions of the valid states a Descriptor can be in.
+ */
+#define	VIO_DESC_FREE		0x1
+#define	VIO_DESC_READY		0x2
+#define	VIO_DESC_ACCEPTED	0x3
+#define	VIO_DESC_DONE		0x4
+#define	VIO_DESC_MASK		0xf
+
+/* Macro to check that the state supplied is a valid DRing entry state */
+#define	VIO_IS_VALID_DESC_STATE(flag)					\
+	((((flag) & VIO_DESC_MASK) == VIO_DESC_FREE) ||			\
+		(((flag) & VIO_DESC_MASK) == VIO_DESC_READY) ||		\
+		(((flag) & VIO_DESC_MASK) == VIO_DESC_ACCEPTED) ||	\
+		(((flag) & VIO_DESC_MASK) == VIO_DESC_DONE))
+
+#define	VIO_SET_DESC_STATE(flag, state)					\
+	{								\
+		(flag) &= ~VIO_DESC_MASK;				\
+		(flag) |= ((state) & VIO_DESC_MASK);			\
+	}
+
+#define	VIO_GET_DESC_STATE(flag)	((flag) & VIO_DESC_MASK)
+
+/* Macro to populate the generic fields of the DRing data msg */
+#define	VIO_INIT_DRING_DATA_TAG(dmsg)	\
+		dmsg.tag.vio_msgtype = VIO_TYPE_DATA;	\
+		dmsg.tag.vio_subtype = VIO_SUBTYPE_INFO;	\
+		dmsg.tag.vio_subtype_env = VIO_DRING_DATA;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_VIO_MAILBOX_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vldc.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VLDC_H
+#define	_VLDC_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+
+/* Channel IOCTL Commands */
+
+#define	VLDC_IOCTL_SHIFT	8
+#define	VLDC_IOCTL		('1' << VLDC_IOCTL_SHIFT)
+
+#define	VLDC_IOCTL_OPT_OP	(VLDC_IOCTL | 0x1)	/* ctrl op */
+#define	VLDC_IOCTL_READ_COOKIE	(VLDC_IOCTL | 0x2)   	/* read cookie */
+#define	VLDC_IOCTL_WRITE_COOKIE	(VLDC_IOCTL | 0x3)   	/* write cookie */
+
+/* supported ctrl operations */
+#define	VLDC_OP_GET		0x1	    /* get specified value */
+#define	VLDC_OP_SET		0x2	    /* set specified value */
+
+/* supported ctrl operation options */
+#define	VLDC_OPT_MTU_SZ		0x1	    /* MTU */
+#define	VLDC_OPT_STATUS		0x2	    /* port status */
+#define	VLDC_OPT_MODE		0x3	    /* port channel mode */
+
+/* values returned by VLDC_OPT_OP_STATUS */
+#define	VLDC_PORT_CLOSED	0x1	    /* port is closed */
+#define	VLDC_PORT_OPEN		0x2	    /* port is already open */
+#define	VLDC_PORT_READY		0x4	    /* port is open and ready */
+
+/*
+ * Values for VLDC_OPT_MODE are defined in ldc.h.
+ */
+
+/*
+ * Structure that is used by vldc driver and all its clients to communicate
+ * the type and nature of the option as well as for clients to get port
+ * status.
+ */
+typedef struct vldc_opt_op {
+	int32_t		op_sel;		/* operation selector(ex: GET) */
+	int32_t		opt_sel;	/* option selector (ex: MTU) */
+	uint32_t	opt_val;	/* option value to set or returned */
+} vldc_opt_op_t;
+
+/*
+ * Structure that is used by the LDom manager to download instruction
+ * sequences and read/write new machine descriptions.
+ */
+typedef struct vldc_data {
+	uint64_t	src_addr;	/* source address */
+	uint64_t	dst_addr;	/* destination address */
+	uint64_t	length;		/* size of transfer */
+} vldc_data_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VLDC_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vldc_impl.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VLDC_IMPL_H
+#define	_VLDC_IMPL_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/stream.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/ldc.h>
+#include <sys/vldc.h>
+
+/* default values */
+#define	VLDC_DEFAULT_MTU	0x800	/* default mtu size */
+
+/* VLDC limits */
+#define	VLDC_MAX_COOKIE		0x40000	/* max. size of xfer to/from HV */
+#define	VLDC_MAX_MTU		0x40000	/* 256K */
+#define	VLDC_MAX_PORTS		0x800
+#define	VLDC_MAX_MINORS		VLDC_MAX_PORTS
+#define	VLDC_QUEUE_LEN		0x80
+
+#define	VLDC_MINOR_MASK		(VLDC_MAX_PORTS - 1)
+#define	VLDC_INST_SHIFT		11
+
+/* get port number from minor number */
+#define	VLDCPORT(vldcp, minor)	\
+		((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK].portno)
+
+/* get minor table entry from minor number */
+#define	VLDCMINOR(vldcp, minor)	\
+		(&((vldcp)->minor_tbl[(minor) & VLDC_MINOR_MASK]))
+
+/* get instance number from minor number */
+#define	VLDCINST(minor)		((minor) >> VLDC_INST_SHIFT)
+
+/* indicates an invalid port number */
+#define	VLDC_INVALID_PORTNO	((uint_t)-1)
+
+/*
+ * Minor node number to port number mapping table.
+ *
+ * The lock field in the vldc_minor structure is used to serialize operations
+ * on the port associated with the minor node. It also protects the minor node
+ * in_use field which is used to track the number of active users of the minor
+ * node.  Driver ops will either hold the lock over the whole operation or
+ * will increment (and then decrement) the in use count if they need to
+ * release and re-acquire the lock, e.g. when copying data in from or out to
+ * userland. When the MDEG framework calls into the driver via the callback to
+ * remove a port, the driver must wait until the in use count for the minor
+ * node associated with the port drops to zero, before it can remove the
+ * port.
+ */
+typedef struct vldc_minor {
+	kmutex_t 	lock;			/* protects port/in_use count */
+	kcondvar_t	cv;			/* for waiting on in use */
+	uint_t		in_use;			/* in use counter */
+	uint_t		portno;			/* port number */
+	char		sname[MAXPATHLEN];	/* service name */
+} vldc_minor_t;
+
+typedef struct vldc_port {
+	uint_t		number;			/* port number */
+	uint32_t	status;			/* port status */
+	vldc_minor_t	*minorp;		/* minor table entry pointer */
+	uint32_t	mtu;			/* port mtu */
+	caddr_t		send_buf;		/* send buffer */
+	caddr_t		recv_buf;		/* receive buffer */
+
+	uint64_t	ldc_id;			/* Channel number */
+	ldc_handle_t	ldc_handle;		/* Channel handle */
+	ldc_mode_t	ldc_mode;		/* Channel mode */
+
+	boolean_t	is_stream;		/* streaming mode */
+	boolean_t	hanged_up;		/* port hanged up */
+
+	struct pollhead	poll;			/* for poll */
+} vldc_port_t;
+
+/*
+ * vldc driver's soft state structure
+ */
+typedef struct vldc {
+	kmutex_t 		lock;		/* serializes detach and MDEG */
+	boolean_t		detaching; 	/* true iff busy detaching */
+	dev_info_t		*dip;		/* dev_info */
+	mdeg_node_spec_t	*inst_spec;	/* vldc instance specifier */
+	mdeg_handle_t		mdeg_hdl;	/* MD event handle */
+
+	uint_t 			num_ports;
+	vldc_port_t		port[VLDC_MAX_PORTS];
+
+	/* table for assigned minors */
+	vldc_minor_t		minor_tbl[VLDC_MAX_MINORS];
+
+	/* number of minors already assigned */
+	uint_t			minors_assigned;
+} vldc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VLDC_IMPL_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnet.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VNET_H
+#define	_VNET_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	VNET_SUCCESS		(0)	/* successful return */
+#define	VNET_FAILURE		(-1)	/* unsuccessful return */
+
+#define	KMEM_FREE(_p)		kmem_free((_p), sizeof (*(_p)))
+
+#define	VNET_NTXDS		512		/* power of 2 tx descriptors */
+#define	VNET_RECLAIM_LOWAT	32		/* tx reclaim low watermark */
+#define	VNET_RECLAIM_HIWAT	(512 - 32)	/* tx reclaim high watermark */
+#define	VNET_LDCWD_INTERVAL	1000		/* watchdog freq in msec */
+#define	VNET_LDCWD_TXTIMEOUT	1000		/* tx timeout in msec */
+#define	VNET_LDC_QLEN		1024		/* ldc qlen */
+
+/*
+ * vnet proxy transport layer information. There is one instance of this for
+ * every transport being used by a vnet device and a list of these transports
+ * is maintained by vnet.
+ */
+typedef struct vp_tl {
+	struct vp_tl		*nextp;			/* next in list */
+	mac_t			*macp;			/* transport ops */
+	char			name[LIFNAMSIZ];	/* device name */
+	major_t			major;			/* driver major # */
+	uint_t			instance;		/* dev instance */
+} vp_tl_t;
+
+/*
+ * Forwarding database (FDB) entry, used by vnet to provide switching
+ * functionality. Each fdb entry corresponds to a destination vnet device
+ * within the ldoms which is directly reachable by invoking a transmit
+ * function provided by a vnet proxy transport layer. Currently, the generic
+ * transport layer adds/removes/modifies entries in fdb.
+ */
+typedef struct fdb {
+	struct fdb	*nextp;			/* next entry in the list */
+	uint8_t		macaddr[ETHERADDRL];	/* destination mac address */
+	mac_tx_t	m_tx;			/* transmit function */
+	void		*txarg;			/* arg to the transmit func */
+} fdb_t;
+
+/* FDB hash queue head */
+typedef struct fdbf_s {
+	fdb_t		*headp;			/* head of fdb entries */
+	krwlock_t	rwlock;			/* protect the list */
+} fdb_fanout_t;
+
+#define	VNET_NFDB_HASH	4	/* default number of hash queues in fdb */
+#define	VNET_NFDB_HASH_MAX 32	/* max number of hash queues in fdb */
+
+/* Hash calculation using the mac address */
+#define	MACHASH(a, n)	((*(((uchar_t *)(a)) + 0) ^		\
+			*(((uchar_t *)(a)) + 1) ^		\
+			*(((uchar_t *)(a)) + 2) ^		\
+			*(((uchar_t *)(a)) + 3) ^		\
+			*(((uchar_t *)(a)) + 4) ^		\
+			*(((uchar_t *)(a)) + 5)) % (uint32_t)n)
+
+/* rwlock macros */
+#define	READ_ENTER(x)	rw_enter(x, RW_READER)
+#define	WRITE_ENTER(x)	rw_enter(x, RW_WRITER)
+#define	RW_EXIT(x)	rw_exit(x)
+
+/*
+ * vnet instance state information
+ */
+typedef struct vnet {
+	int			instance;	/* instance # */
+	dev_info_t		*dip;		/* dev_info */
+	struct vnet		*nextp;		/* next in list */
+	mac_t 			*macp;		/* MAC - macinfo */
+	uchar_t			vendor_addr[ETHERADDRL]; /* orig macadr */
+	uchar_t			curr_macaddr[ETHERADDRL]; /* current macadr */
+	vp_tl_t			*tlp;		/* list of vp_tl */
+	krwlock_t		trwlock;	/* lock for vp_tl list */
+	char			vgen_name[MAXNAMELEN];	/* name of generic tl */
+	fdb_fanout_t		*fdbhp;		/* fdb hash queues */
+	int			nfdb_hash;	/* num fdb hash queues */
+} vnet_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VNET_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnet_common.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,76 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VNET_COMMON_H
+#define	_VNET_COMMON_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/vio_common.h>
+#include <sys/vio_mailbox.h>
+#include <sys/ethernet.h>
+
+/*
+ * This header file contains definitions common to LDoms Virtual Network
+ * server (vsw) and client (vnet).
+ */
+
+/* max # of cookies per frame size */
+#define	MAX_COOKIES	 ((ETHERMAX >> MMU_PAGESHIFT) + 2)
+
+/* initial send sequence number */
+#define	VNET_ISS		0x1
+
+/* vnet descriptor */
+typedef struct vnet_public_desc {
+	vio_dring_entry_hdr_t	hdr;		/* descriptor header */
+	uint32_t		nbytes;		/* data length */
+	uint32_t		ncookies;	/* number of data cookies */
+	ldc_mem_cookie_t	memcookie[MAX_COOKIES]; /* data cookies */
+} vnet_public_desc_t;
+
+/*
+ * VIO in-band descriptor. Used by those vio clients
+ * such as OBP who do not use descriptor rings.
+ */
+typedef struct vio_ibnd_desc {
+	vio_inband_desc_msg_hdr_t	hdr;
+
+	/* payload */
+	uint32_t			nbytes;
+	uint32_t			ncookies;
+	ldc_mem_cookie_t		memcookie[MAX_COOKIES];
+} vio_ibnd_desc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VNET_COMMON_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnet_gen.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VNET_GEN_H
+#define	_VNET_GEN_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	VGEN_SUCCESS		(0)	/* successful return */
+#define	VGEN_FAILURE		(-1)	/* unsuccessful return */
+
+#define	VGEN_NUM_VER		1	/* max # of vgen versions */
+
+#define	VGEN_LOCAL	1	/* local ldc end-point */
+#define	VGEN_PEER	2	/* peer ldc end-point */
+
+/* vgen_t flags */
+#define	VGEN_STOPPED		0x0
+#define	VGEN_STARTED		0x1
+
+#define	KMEM_FREE(_p)		kmem_free((_p), sizeof (*(_p)))
+
+#define	VGEN_INIT_MCTAB_SIZE	16	/* initial size of multicast table */
+
+#define	READ_ENTER(x)	rw_enter(x, RW_READER)
+#define	WRITE_ENTER(x)	rw_enter(x, RW_WRITER)
+#define	RW_EXIT(x)	rw_exit(x)
+
+/* channel flags */
+#define	CHANNEL_ATTACHED	0x1
+#define	CHANNEL_STARTED		0x2
+
+/* transmit return values */
+#define	VGEN_TX_SUCCESS		0	/* transmit success */
+#define	VGEN_TX_FAILURE		1	/* transmit failure */
+#define	VGEN_TX_NORESOURCES	2	/* out of tbufs/txds */
+
+/* private descriptor flags */
+#define	VGEN_PRIV_DESC_FREE	0x0	/* desc is available */
+#define	VGEN_PRIV_DESC_BUSY	0x1	/* desc in use */
+
+#define	LDC_TO_VNET(ldcp)  ((ldcp)->portp->vgenp->vnetp)
+#define	LDC_TO_VGEN(ldcp)  ((ldcp)->portp->vgenp)
+
+/* get the address of next tbuf */
+#define	NEXTTBUF(ldcp, tbufp)	(((tbufp) + 1) == (ldcp)->tbufendp    \
+		? (ldcp)->tbufp : ((tbufp) + 1))
+
+/* increment recv index */
+#define	INCR_RXI(i, ldcp)	\
+		((i) = (((i) + 1) & ((ldcp)->num_rxds - 1)))
+
+/* decrement recv index */
+#define	DECR_RXI(i, ldcp)	\
+		((i) = (((i) - 1) & ((ldcp)->num_rxds - 1)))
+
+/* increment tx index */
+#define	INCR_TXI(i, ldcp)	\
+		((i) = (((i) + 1) & ((ldcp)->num_txds - 1)))
+
+/* decrement tx index */
+#define	DECR_TXI(i, ldcp)	\
+		((i) = (((i) - 1) & ((ldcp)->num_txds - 1)))
+
+/* bounds check rx index */
+#define	CHECK_RXI(i, ldcp)	\
+		(((i) >= 0) && ((i) < (ldcp)->num_rxds))
+
+/* bounds check tx index */
+#define	CHECK_TXI(i, ldcp)	\
+		(((i) >= 0) && ((i) < (ldcp)->num_txds))
+
+/* private descriptor */
+typedef struct vgen_priv_desc {
+	uint64_t		flags;		/* flag bits */
+	vnet_public_desc_t	*descp;		/* associated public desc */
+	ldc_mem_handle_t	memhandle;	/* mem handle for data */
+	mblk_t			*mp;		/* associated packet */
+	uint64_t		datap;		/* mp->b_rptr */
+	uint64_t		datalen;	/* total actual datalen */
+	uint64_t		seqnum;		/* sequence number of pkt */
+	uint64_t		ncookies;	/* num ldc_mem_cookies */
+	ldc_mem_cookie_t	memcookie[MAX_COOKIES];	/* data cookies */
+} vgen_private_desc_t;
+
+/*
+ * Handshake parameters (per vio_mailbox.h) of each ldc end point, used
+ * during handshake negotiation.
+ */
+typedef struct vgen_handshake_params {
+	/* version specific params */
+	uint32_t	ver_major:16,
+			ver_minor:16;		/* major, minor version */
+	uint8_t		dev_class;		/* device class */
+
+	/* attributes specific params */
+	uint64_t		mtu;		/* max transfer unit size */
+	uint64_t		addr;		/* address of the device */
+	uint8_t			addr_type;	/* type of address */
+	uint8_t			xfer_mode;	/* SHM or PKT */
+	uint16_t		ack_freq;	/* dring data ack freq */
+
+	/* descriptor ring params */
+	uint32_t		num_desc;	/* # of descriptors in ring */
+	uint32_t		desc_size;	/* size of descriptor */
+	ldc_mem_cookie_t	dring_cookie;	/* desc ring cookie */
+	uint32_t		num_dcookies;	/* # of dring cookies */
+	uint64_t		dring_ident;	/* ident=0 for INFO msg */
+	boolean_t		dring_ready;   /* dring ready flag */
+} vgen_hparams_t;
+
+/* version info */
+typedef struct vgen_ver {
+	uint32_t	ver_major:16,
+			ver_minor:16;
+} vgen_ver_t;
+
+typedef struct vgen_stats {
+
+	/* Link Input/Output stats */
+	uint64_t	ipackets;
+	uint64_t	ierrors;
+	uint64_t	opackets;
+	uint64_t	oerrors;
+#if 0
+	uint64_t	collisions;
+#endif
+
+	/* MIB II variables */
+	uint64_t	rbytes;		/* # bytes received */
+	uint64_t	obytes;		/* # bytes transmitted */
+	uint32_t	multircv;	/* # multicast packets received */
+	uint32_t	multixmt;	/* # multicast packets for xmit */
+	uint32_t	brdcstrcv;	/* # broadcast packets received */
+	uint32_t	brdcstxmt;	/* # broadcast packets for xmit */
+	uint32_t	norcvbuf;	/* # rcv packets discarded */
+	uint32_t	noxmtbuf;	/* # xmit packets discarded */
+
+	/* Tx Statistics */
+	uint32_t	tx_no_desc;
+	uint32_t	tx_allocb_fail;
+
+	/* Rx Statistics */
+	uint32_t	rx_no_desc;
+	uint32_t	rx_allocb_fail;
+	uint32_t	rx_lost_pkts;
+
+	/* Callback statistics */
+	uint32_t	callbacks;
+	uint32_t	dring_data_acks;
+
+} vgen_stats_t;
+
+typedef struct vgen_kstats {
+	/*
+	 * Link Input/Output stats
+	 */
+	kstat_named_t	ipackets;
+	kstat_named_t	ipackets64;
+	kstat_named_t	ierrors;
+	kstat_named_t	opackets;
+	kstat_named_t	opackets64;
+	kstat_named_t	oerrors;
+#if 0
+	kstat_named_t	collisions;
+#endif
+	/*
+	 * required by kstat for MIB II objects(RFC 1213)
+	 */
+	kstat_named_t	rbytes; 	/* MIB - ifInOctets */
+	kstat_named_t	rbytes64;
+	kstat_named_t	obytes; 	/* MIB - ifOutOctets */
+	kstat_named_t	obytes64;
+	kstat_named_t	multircv; 	/* MIB - ifInNUcastPkts */
+	kstat_named_t	multixmt; 	/* MIB - ifOutNUcastPkts */
+	kstat_named_t	brdcstrcv;	/* MIB - ifInNUcastPkts */
+	kstat_named_t	brdcstxmt;	/* MIB - ifOutNUcastPkts */
+	kstat_named_t	norcvbuf; 	/* MIB - ifInDiscards */
+	kstat_named_t	noxmtbuf; 	/* MIB - ifOutDiscards */
+
+	/* Tx Statistics */
+	kstat_named_t	tx_no_desc;
+	kstat_named_t	tx_allocb_fail;
+
+	/* Rx Statistics */
+	kstat_named_t	rx_no_desc;
+	kstat_named_t	rx_allocb_fail;
+	kstat_named_t	rx_lost_pkts;
+
+	/* Callback statistics */
+	kstat_named_t	callbacks;
+	kstat_named_t	dring_data_acks;
+
+} vgen_kstats_t;
+
+/* Channel information associated with a vgen-port */
+typedef struct vgen_ldc {
+
+	struct vgen_ldc		*nextp;		/* next ldc in the list */
+	struct vgen_port	*portp;		/* associated port */
+
+	/*
+	 * Locks:
+	 * locking hierarchy when more than one lock is held concurrently:
+	 * cblock > txlock > tclock.
+	 */
+	kmutex_t		cblock;		/* sync callback processing */
+	kmutex_t		txlock;		/* sync transmits */
+	kmutex_t		tclock;		/* tx reclaim lock */
+
+	/* channel info from ldc layer */
+	uint64_t		ldc_id;		/* channel number */
+	uint64_t		ldc_handle;	/* channel handle */
+	ldc_status_t		ldc_status;	/* channel status */
+
+	/* handshake info */
+	vgen_ver_t		vgen_versions[VGEN_NUM_VER]; /* versions */
+	int			hphase;		/* handshake phase */
+	int			hstate;		/* handshake state bits */
+	uint32_t		local_sid;	/* local session id */
+	uint32_t		peer_sid;	/* session id of peer */
+	vgen_hparams_t		local_hparams;	/* local handshake params */
+	vgen_hparams_t		peer_hparams;	/* peer's handshake params */
+	timeout_id_t		htid;		/* handshake wd timeout id */
+
+	/* transmit and receive descriptor ring info */
+	ldc_dring_handle_t	tx_dhandle;	/* tx descriptor ring handle */
+	ldc_mem_cookie_t	tx_dcookie;	/* tx descriptor ring cookie */
+	ldc_dring_handle_t	rx_dhandle;	/* mapped rx dhandle */
+	ldc_mem_cookie_t	rx_dcookie;	/* rx descriptor ring cookie */
+	vnet_public_desc_t	*txdp;		/* transmit frame descriptors */
+	vnet_public_desc_t	*txdendp;	/* txd ring end */
+	vgen_private_desc_t	*tbufp;		/* associated tx resources */
+	vgen_private_desc_t	*tbufendp;	/* tbuf ring end */
+	vgen_private_desc_t	*next_tbufp;	/* next free tbuf */
+	vgen_private_desc_t	*cur_tbufp;	/* next reclaim tbuf */
+	uint64_t		next_txseq;	/* next tx sequence number */
+	uint32_t		num_txdcookies;	/* # of tx dring cookies */
+	uint32_t		num_rxdcookies;	/* # of rx dring cookies */
+	uint32_t		next_txi;	/* next tx descriptor index */
+	uint32_t		num_txds;	/* number of tx descriptors */
+	uint32_t		reclaim_lowat;	/* lowat for tx reclaim */
+	uint32_t		reclaim_hiwat;	/* hiwat for tx reclaim */
+	clock_t			reclaim_lbolt;	/* time of last tx reclaim */
+	timeout_id_t		wd_tid;		/* tx watchdog timeout id */
+	vnet_public_desc_t	*rxdp;		/* receive frame descriptors */
+	uint64_t		next_rxseq;	/* next expected recv seqnum */
+	uint32_t		next_rxi;	/* next expected recv index */
+	uint32_t		num_rxds;	/* number of rx descriptors */
+
+	/* misc */
+	uint32_t		flags;		/* flags */
+	boolean_t		need_resched;	/* reschedule tx */
+	boolean_t		need_ldc_reset; /* ldc_reset needed */
+	boolean_t		need_mcast_sync; /* sync mcast table with vsw */
+	uint32_t		hretries;	/* handshake retry count */
+
+	/* channel statistics */
+	vgen_stats_t		*statsp;	/* channel statistics */
+	kstat_t			*ksp;		/* channel kstats */
+
+} vgen_ldc_t;
+
+/* Channel list structure */
+typedef struct vgen_ldclist_s {
+	vgen_ldc_t	*headp;		/* head of the list */
+	krwlock_t	rwlock;		/* sync access to the list */
+	int		num_ldcs;	/* number of channels in the list */
+} vgen_ldclist_t;
+
+/* port information  structure */
+typedef struct vgen_port {
+	struct vgen_port	*nextp;		/* next port in the list */
+	struct vgen		*vgenp;		/* associated vgen_t */
+	int			port_num;	/* port number */
+	vgen_ldclist_t		ldclist;	/* list of ldcs for this port */
+	struct ether_addr	macaddr;	/* mac address of peer */
+} vgen_port_t;
+
+/* port list structure */
+typedef struct vgen_portlist {
+	vgen_port_t	*headp;		/* head of ports */
+	vgen_port_t	*tailp;		/* tail */
+	krwlock_t	rwlock;		/* sync access to the port list */
+} vgen_portlist_t;
+
+/* vgen instance information  */
+typedef struct vgen {
+	void			*vnetp;		/* associated vnet instance */
+	dev_info_t		*vnetdip;	/* dip of vnet */
+	void			*vnetmacp;	/* mac_t of vnet */
+	uint8_t			macaddr[ETHERADDRL];	/* mac addr of vnet */
+	mac_resource_handle_t	mrh;		/* handle for mac_rx() */
+	kmutex_t		lock;		/* synchornize ops */
+	int			flags;		/* flags */
+	vgen_portlist_t		vgenports;	/* Port List */
+	mdeg_node_spec_t	*mdeg_parentp;
+	mdeg_handle_t		mdeg_hdl;
+	vgen_port_t		*vsw_portp;	/* port connected to vsw */
+	mac_t			vgenmac;	/* vgen mac ops */
+	struct ether_addr	*mctab;		/* multicast addr table */
+	uint32_t		mcsize;		/* allocated size of mctab */
+	uint32_t		mccount;	/* # of valid addrs in mctab */
+} vgen_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VNET_GEN_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnet_mailbox.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,95 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_VNET_MAILBOX_H
+#define	_SYS_VNET_MAILBOX_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/vio_mailbox.h>
+#include <sys/ethernet.h>
+
+/*
+ * VNET specific Control envelopes: 0x0100 - 0x01FF
+ *	type == VIO_TYPE_CTRL
+ *	subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ */
+#define	VNET_MCAST_INFO		0x0101
+
+/*
+ * Vnet/Vswitch device attributes information message.
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.submsgtype = VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VIO_ATTR_INFO
+ */
+typedef struct vnet_attr_msg {
+	/* Common tag */
+	vio_msg_tag_t		tag;
+
+	/* attributes specific payload */
+	uint8_t			xfer_mode;	/* data transfer mode */
+	uint8_t			addr_type;	/* device address type */
+	uint16_t		ack_freq;	/* ack after rcving # of pkts */
+	uint32_t		resv1;		/* padding */
+
+	uint64_t		addr;		/* device address */
+	uint64_t		mtu;		/* maximum data xfer unit */
+
+	/* padding to align things */
+	uint64_t		resv2[3];
+
+} vnet_attr_msg_t;
+
+/*
+ * Vnet/Vswitch enable/disable multicast address msg
+ *
+ * tag.msgtype == VIO_TYPE_CTRL
+ * tag.subtype == VIO_SUBTYPE_{INFO|ACK|NACK}
+ * tag.subtype_env == VNET_MCAST_INFO
+ */
+#define	VNET_NUM_MCAST	7	/* max # of multicast addresses in the msg */
+
+typedef struct vnet_mcast_msg {
+	/* Common tag */
+	vio_msg_tag_t		tag;
+
+	/* multicast address information */
+	uint8_t			set;	/* add if set to 1, else remove */
+	uint8_t			count;	/* number of addrs in the msg */
+	struct ether_addr	mca[VNET_NUM_MCAST];	/* mcast addrs */
+	uint32_t		resv1;	/* padding */
+} vnet_mcast_msg_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_VNET_MAILBOX_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnet_proxy.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,133 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VNET_PROXY_H
+#define	_VNET_PROXY_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * vnet proxy client is a low level driver which provides link specific
+ * functionality required by the vnet device. The vnet leaf driver and vnet
+ * proxy implement generic mac functionality required by the mac module as
+ * part of NEMO network stack. A vnet proxy provides these entry points
+ * as defined below in the vnet_proxy_ops structure. Note that some of the
+ * entry points may not be implemented by certain modules and will be
+ * initialized to NULL. All entry points return 0 for success and non zero
+ * for failure.
+ */
+
+typedef	uint64_t	vp_handle_t;	/* vnet proxy handle */
+
+typedef struct vnet_proxy_ops {
+
+/*
+ * vp_start() enables the client to send and receive data and generate
+ * interrupts. In addition a client may register opaque objects to be
+ * passed during transmit. This is done by a client which provides links
+ * to specific destination mac addresses by calling vnet_add_fdb().
+ * (described below: Functions exported by vnet).
+ * vp_stop() disables the client from generating interrupts and IO.
+ * The client will also unregister any opaque objects using vnet_del_fdb().
+ */
+    int	(*vp_start)(vp_handle_t vp_handle);
+    int	(*vp_stop)(vp_handle_t vp_handle);
+
+/*
+ * vp_tx() is invoked to transmit a packet. The first argument points
+ * to the client specific opaque object.
+ * The vp_tx routine must return 0 if unable to send the packet (eg, due to
+ * lack of resources).
+ */
+    int	(*vp_tx)(void *arg, mblk_t *mp);
+
+/*
+ * vp_resources() is called to enable the client to register its receive
+ * resources.
+ */
+    int	(*vp_resources)(vp_handle_t vp_handle);
+
+/*
+ * vp_multicast() is used to add/remove addresses to and from the set of
+ * multicast addresses for which the client will receive packets.
+ * If the second argument is B_TRUE then the address pointed to by the
+ * third argument should be added to the set. If the second argument is
+ * B_FALSE then the address pointed to by the third argument should be
+ * removed.
+ */
+    int	(*vp_multicast)(vp_handle_t vp_handle, boolean_t add,
+		const uint8_t *mca);
+
+/*
+ * vp_promisc() is used to set the promiscuity of the client.
+ * If the second argument is B_TRUE then the client should receive all
+ * packets. If it is set to B_FALSE then only packets destined for the
+ * vnet device's unicast address and broadcast address should be received.
+ */
+    int	(*vp_promisc)(vp_handle_t vp_handle, boolean_t on);
+
+/* vp_unicast() is used to set a new unicast address for the vnet device */
+    int	(*vp_unicast)(vp_handle_t vp_handle, const uint8_t *mca);
+
+/* TBD: vp_statistics */
+    uint64_t	(*vp_statistics)(vp_handle_t vp_handle, enum mac_stat);
+
+/* TBD: vp_ctl is used to support client specific control commands */
+    int	(*vp_ctl)(vp_handle_t vp_handle, mblk_t *mp);
+
+} vnet_proxy_ops_t;
+
+/* vnet_proxy entry point types */
+
+typedef int	(*vp_start_t)(vp_handle_t);
+typedef int 	(*vp_stop_t)(vp_handle_t);
+typedef int	(*vp_tx_t)(void *, mblk_t *);
+typedef int	(*vp_resources_t)(vp_handle_t);
+typedef int	(*vp_multicast_t)(vp_handle_t, boolean_t,
+			const uint8_t *);
+typedef int 	(*vp_promisc_t)(vp_handle_t, boolean_t);
+typedef int	(*vp_unicast_t)(vp_handle_t, const uint8_t *);
+typedef uint64_t	(*vp_statistics_t)(vp_handle_t, enum mac_stat);
+typedef int	(*vp_ctl_t)(vp_handle_t, mblk_t *);
+
+/*
+ * The client calls these functions to add/remove an entry in vnet's FDB.
+ */
+void vnet_add_fdb(void *arg, uint8_t *macaddr, vp_tx_t vp_tx, void *txarg);
+void vnet_del_fdb(void *arg, uint8_t *macaddr);
+void vnet_modify_fdb(void *arg, uint8_t *macaddr, vp_tx_t vp_tx, void *txarg);
+void vnet_add_def_rte(void *arg, vp_tx_t vp_tx, void *txarg);
+void vnet_del_def_rte(void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VNET_PROXY_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vnetmsg.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,81 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _VNETMSG_H
+#define	_VNETMSG_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	LM_SIGNATURE	0x564E45544C4D5347	/* "VNETLMSG" */
+
+/* lm_type (below) */
+#define	LM_DATA	0x1
+#define	LM_ACK	0x2
+
+/*
+ * msg protocol used for ldc_mem IO. currently, 2 cookies are supported.
+ * (In Unreliable mode LDC-maxpayload is 56 bytes).
+ */
+
+typedef struct vnet_ldc_msg {
+	uint64_t		lm_signature;	/* signature: "VNETLMSG" */
+	uint8_t			lm_type;	/* data or ack */
+	uint8_t			lm_ncookies;	/* # of cookies in the msg */
+	uint16_t		lm_id;		/* opaque id (sender) */
+	uint16_t		lm_dlen;	/* actual data length */
+	uint16_t		lm_resv;	/* reserved */
+	ldc_mem_cookie_t	lm_cookie[2];	/* array of cookies */
+} vnet_ldc_msg_t;
+
+/*
+ * XXX Co-ordinate these def's with Harsha, expect that these will
+ * come from vnet header file.
+ */
+#define	MAX_COOKIES	((ETHERMTU >> MMU_PAGESHIFT) + 2)
+
+#define	VNET_PUB_DESC_FREE	0x0
+#define	VNET_PUB_DESC_READY	0x1
+#define	VNET_PUB_DESC_DONE	0x2
+#define	VNET_PUB_DESC_ACK	0x4
+
+#define	VNET_PRIV_DESC_FREE	0x0
+#define	VNET_PRIV_DESC_BUSY	0x1
+
+typedef struct vnet_public_desc {
+	uint64_t		flags;
+	uint64_t		ncookies;
+	ldc_mem_cookie_t	memcookie[MAX_COOKIES];
+} vnet_public_desc_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _VNETMSG_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vsw.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,455 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This header file contains the basic data structures which the
+ * virtual switch (vsw) uses to communicate with its clients and
+ * the outside world.
+ *
+ * The virtual switch reads the machine description (MD) to
+ * determine how many port_t structures to create (each port_t
+ * can support communications to a single network device). The
+ * port_t's are maintained in a linked list.
+ *
+ * Each port in turn contains a number of logical domain channels
+ * (ldc's) which are inter domain communications channels which
+ * are used for passing small messages between the domains. Their
+ * may be an unlimited number of channels associated with each port,
+ * though most devices only use a single channel.
+ *
+ * The ldc is a bi-directional channel, which is divided up into
+ * two directional 'lanes', one outbound from the switch to the
+ * virtual network device, the other inbound to the switch.
+ * Depending on the type of device each lane may have separate
+ * communication parameters (such as mtu etc).
+ *
+ * For those network clients which use descriptor rings the
+ * rings are associated with the appropriate lane. I.e. rings
+ * which the switch exports are associated with the outbound lanes
+ * while those which the network clients are exporting to the switch
+ * are associated with the inbound lane.
+ *
+ * In diagram form the data structures look as follows:
+ *
+ * vsw instance
+ *     |
+ *     +----->port_t----->port_t----->port_t----->
+ *		|
+ *		+--->ldc_t--->ldc_t--->ldc_t--->
+ *		       |
+ *		       +--->lane_t (inbound)
+ *		       |       |
+ *		       |       +--->dring--->dring--->
+ *		       |
+ *		       +--->lane_t (outbound)
+ *			       |
+ *			       +--->dring--->dring--->
+ *
+ */
+
+#ifndef	_VSW_H
+#define	_VSW_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/vio_mailbox.h>
+#include <sys/vnet_common.h>
+#include <sys/ethernet.h>
+
+/*
+ * Default message type.
+ */
+typedef struct def_msg {
+	uint64_t	data[8];
+} def_msg_t;
+
+/*
+ * Currently only support one major/minor pair.
+ */
+#define	VSW_NUM_VER	1
+
+typedef struct ver_sup {
+	uint32_t	ver_major:16,
+			ver_minor:16;
+} ver_sup_t;
+
+/*
+ * Only support ETHER mtu at moment.
+ */
+#define	VSW_MTU		ETHERMAX
+
+/*
+ * Lane states.
+ */
+#define	VSW_LANE_INACTIV	0x0	/* No params set for lane */
+
+#define	VSW_VER_INFO_SENT	0x1	/* Version # sent to peer */
+#define	VSW_VER_INFO_RECV	0x2	/* Version # recv from peer */
+#define	VSW_VER_ACK_RECV	0x4
+#define	VSW_VER_ACK_SENT	0x8
+#define	VSW_VER_NACK_RECV	0x10
+#define	VSW_VER_NACK_SENT	0x20
+
+#define	VSW_ATTR_INFO_SENT	0x40	/* Attributes sent to peer */
+#define	VSW_ATTR_INFO_RECV	0x80	/* Peer attributes received */
+#define	VSW_ATTR_ACK_SENT	0x100
+#define	VSW_ATTR_ACK_RECV	0x200
+#define	VSW_ATTR_NACK_SENT	0x400
+#define	VSW_ATTR_NACK_RECV	0x800
+
+#define	VSW_DRING_INFO_SENT	0x1000	/* Dring info sent to peer */
+#define	VSW_DRING_INFO_RECV	0x2000	/* Dring info received */
+#define	VSW_DRING_ACK_SENT	0x4000
+#define	VSW_DRING_ACK_RECV	0x8000
+#define	VSW_DRING_NACK_SENT	0x10000
+#define	VSW_DRING_NACK_RECV	0x20000
+
+#define	VSW_RDX_INFO_SENT	0x40000	/* RDX sent to peer */
+#define	VSW_RDX_INFO_RECV	0x80000	/* RDX received from peer */
+#define	VSW_RDX_ACK_SENT	0x100000
+#define	VSW_RDX_ACK_RECV	0x200000
+#define	VSW_RDX_NACK_SENT	0x400000
+#define	VSW_RDX_NACK_RECV	0x800000
+
+#define	VSW_MCST_INFO_SENT	0x1000000
+#define	VSW_MCST_INFO_RECV	0x2000000
+#define	VSW_MCST_ACK_SENT	0x4000000
+#define	VSW_MCST_ACK_RECV	0x8000000
+#define	VSW_MCST_NACK_SENT	0x10000000
+#define	VSW_MCST_NACK_RECV	0x20000000
+
+#define	VSW_LANE_ACTIVE		0x40000000	/* Lane open to xmit data */
+
+/* Handshake milestones */
+#define	VSW_MILESTONE0		0x1	/* ver info exchanged */
+#define	VSW_MILESTONE1		0x2	/* attribute exchanged */
+#define	VSW_MILESTONE2		0x4	/* dring info exchanged */
+#define	VSW_MILESTONE3		0x8	/* rdx exchanged */
+#define	VSW_MILESTONE4		0x10	/* handshake complete */
+
+/*
+ * Lane direction (relative to ourselves).
+ */
+#define	INBOUND			0x1
+#define	OUTBOUND		0x2
+
+/* Peer session id received */
+#define	VSW_PEER_SESSION	0x1
+
+/*
+ * Maximum number of consecutive reads of data from channel
+ */
+#define	VSW_MAX_CHAN_READ	50
+
+/*
+ * LDC queue length
+ */
+#define	VSW_LDC_QLEN		1024
+
+/*
+ * Currently only support one ldc per port.
+ */
+#define	VSW_PORT_MAX_LDCS	1	/* max # of ldcs per port */
+
+/*
+ * Used for port add/deletion.
+ */
+#define	VSW_PORT_UPDATED	0x1
+
+#define	LDC_TX_SUCCESS		0	/* ldc transmit success */
+#define	LDC_TX_FAILURE		1	/* ldc transmit failure */
+#define	LDC_TX_NORESOURCES	2	/* out of descriptors */
+
+/* ID of the source of a frame being switched */
+#define	VSW_PHYSDEV		1	/* physical device associated */
+#define	VSW_VNETPORT		2	/* port connected to vnet (over ldc) */
+#define	VSW_LOCALDEV		4	/* vsw configured as an eth interface */
+
+/*
+ * Descriptor ring info
+ *
+ * Each descriptor element has a pre-allocated data buffer
+ * associated with it, into which data being transmitted is
+ * copied. By pre-allocating we speed up the copying process.
+ * The buffer is re-used once the peer has indicated that it is
+ * finished with the descriptor.
+ */
+#define	VSW_RING_NUM_EL		512	/* Num of entries in ring */
+#define	VSW_RING_EL_DATA_SZ	2048	/* Size of data section (bytes) */
+#define	VSW_PRIV_SIZE	sizeof (vnet_private_desc_t)
+#define	VSW_PUB_SIZE	sizeof (vnet_public_desc_t)
+
+#define	VSW_MAX_COOKIES		((ETHERMTU >> MMU_PAGESHIFT) + 2)
+
+/*
+ * Private descriptor
+ */
+typedef struct vsw_private_desc {
+	uint64_t		dstate;
+	vnet_public_desc_t	*descp;
+	ldc_mem_handle_t	memhandle;
+	void			*datap;
+	uint64_t		datalen;
+	uint64_t		ncookies;
+	ldc_mem_cookie_t	memcookie[VSW_MAX_COOKIES];
+	int			bound;
+} vsw_private_desc_t;
+
+/*
+ * Descriptor ring structure
+ */
+typedef struct dring_info {
+	struct	dring_info	*next;	/* next ring in chain */
+	kmutex_t		dlock;
+	uint32_t		num_descriptors;
+	uint32_t		descriptor_size;
+	uint32_t		options;
+	uint32_t		ncookies;
+	ldc_mem_cookie_t	cookie[1];
+
+	ldc_dring_handle_t	handle;
+	uint64_t		ident;	/* identifier sent to peer */
+	uint64_t		end_idx;	/* last idx processed */
+
+	/*
+	 * base address of private and public portions of the
+	 * ring (where appropriate), and data block.
+	 */
+	void			*pub_addr;	/* base of public section */
+	void			*priv_addr;	/* base of private section */
+	void			*data_addr;	/* base of data section */
+	size_t			data_sz;	/* size of data section */
+} dring_info_t;
+
+/*
+ * Each ldc connection is comprised of two lanes, incoming
+ * from a peer, and outgoing to that peer. Each lane shares
+ * common ldc parameters and also has private lane-specific
+ * parameters.
+ */
+typedef struct lane {
+	uint64_t	lstate;		/* Lane state */
+	uint32_t	ver_major:16,	/* Version major number */
+			ver_minor:16;	/* Version minor number */
+	uint64_t	seq_num;	/* Sequence number */
+	uint64_t	mtu;		/* ETHERMTU */
+	uint64_t	addr;		/* Unique physical address */
+	uint8_t		addr_type;	/* Only MAC address at moment */
+	uint8_t		xfer_mode;	/* Dring or Pkt based */
+	uint8_t		ack_freq;	/* Only non zero for Pkt based xfer */
+	dring_info_t	*dringp;	/* List of drings for this lane */
+} lane_t;
+
+/* channel drain states */
+#define	VSW_LDC_INIT		0x1	/* Initial non-drain state */
+#define	VSW_LDC_DRAINING	0x2	/* Channel draining */
+
+/* ldc information associated with a vsw-port */
+typedef struct vsw_ldc {
+	struct vsw_ldc		*ldc_next;	/* next ldc in the list */
+	struct vsw_port		*ldc_port;	/* associated port */
+	struct vsw		*ldc_vswp;	/* associated vsw */
+	kmutex_t		ldc_cblock;	/* sync callback processing */
+	kmutex_t		ldc_txlock;	/* sync transmits */
+	uint64_t		ldc_id;		/* channel number */
+	ldc_handle_t		ldc_handle;	/* channel handle */
+	kmutex_t		drain_cv_lock;
+	kcondvar_t		drain_cv;	/* channel draining */
+	int			drain_state;
+	uint32_t		hphase;		/* handshake phase */
+	int			hcnt;		/* # handshake attempts */
+	ldc_status_t		ldc_status;	/* channel status */
+	uint64_t		local_session;	/* Our session id */
+	uint64_t		peer_session;	/* Our peers session id */
+	uint8_t			session_status;	/* Session recv'd, sent */
+	kmutex_t		hss_lock;
+	uint32_t		hss_id;		/* Handshake session id */
+	uint64_t		next_ident;	/* Next dring ident # to use */
+	lane_t			lane_in;	/* Inbound lane */
+	lane_t			lane_out;	/* Outbound lane */
+	uint8_t			dev_class;	/* Peer device class */
+} vsw_ldc_t;
+
+/* list of ldcs per port */
+typedef struct vsw_ldc_list {
+	vsw_ldc_t	*head;		/* head of the list */
+	krwlock_t	lockrw;		/* sync access(rw) to the list */
+	int		num_ldcs;	/* number of ldcs in the list */
+} vsw_ldc_list_t;
+
+/* multicast addresses port is interested in */
+typedef struct mcst_addr {
+	struct mcst_addr	*nextp;
+	uint64_t		addr;
+} mcst_addr_t;
+
+/* Port detach states */
+#define	VSW_PORT_INIT		0x1	/* Initial non-detach state */
+#define	VSW_PORT_DETACHING	0x2	/* In process of being detached */
+#define	VSW_PORT_DETACHABLE	0x4	/* Safe to detach */
+
+/* port information associated with a vsw */
+typedef struct vsw_port {
+	int			p_instance;	/* port instance */
+	struct vsw_port		*p_next;	/* next port in the list */
+	struct vsw		*p_vswp;	/* associated vsw */
+	vsw_ldc_list_t		p_ldclist;	/* list of ldcs for this port */
+
+	kmutex_t		tx_lock;	/* transmit lock */
+	int			(*transmit)(vsw_ldc_t *, mblk_t *);
+
+	int			state;		/* port state */
+	kmutex_t		state_lock;
+	kcondvar_t		state_cv;
+
+	int			ref_cnt;	/* # of active references */
+	kmutex_t		ref_lock;
+	kcondvar_t		ref_cv;
+
+	kmutex_t		mca_lock;	/* multicast lock */
+	mcst_addr_t		*mcap;		/* list of multicast addrs */
+
+	/*
+	 * mac address of the port & connected device
+	 */
+	struct ether_addr	p_macaddr;
+} vsw_port_t;
+
+/* list of ports per vsw */
+typedef struct vsw_port_list {
+	vsw_port_t	*head;		/* head of the list */
+	krwlock_t	lockrw;		/* sync access(rw) to the list */
+	int		num_ports;	/* number of ports in the list */
+} vsw_port_list_t;
+
+/*
+ * Taskq control message
+ */
+typedef struct vsw_ctrl_task {
+	vsw_ldc_t	*ldcp;
+	def_msg_t	pktp;
+	uint32_t	hss_id;
+} vsw_ctrl_task_t;
+
+/*
+ * Number of hash chains in the multicast forwarding database.
+ */
+#define		VSW_NCHAINS	8
+
+/*
+ * State of interface if switch plumbed as network device.
+ */
+#define		VSW_IF_UP	0x1	/* Interface UP */
+#define		VSW_IF_PROMISC	0x2	/* Interface in promiscuous mode */
+
+#define		VSW_U_P(state)	\
+			(state == (VSW_IF_UP | VSW_IF_PROMISC))
+
+/*
+ * Switching modes.
+ */
+#define		VSW_LAYER2		0x1	/* Layer 2 - MAC switching */
+#define		VSW_LAYER2_PROMISC	0x2	/* Layer 2 + promisc mode */
+#define		VSW_LAYER3		0x4	/* Layer 3 - IP switching */
+
+#define		NUM_SMODES	3	/* number of switching modes */
+
+/*
+ * Bits indicating which properties we've read from MD.
+ */
+#define		VSW_MD_PHYSNAME	0x1
+#define		VSW_MD_MACADDR	0x2
+#define		VSW_MD_SMODE	0x4
+
+/*
+ * vsw instance state information.
+ */
+typedef struct	vsw {
+	int			instance;	/* instance # */
+	dev_info_t		*dip;		/* associated dev_info */
+	struct vsw		*next;		/* next in list */
+	char			physname[LIFNAMSIZ];	/* phys-dev */
+	uint8_t			smode[NUM_SMODES];	/* switching mode */
+	int			smode_idx;	/* curr pos in smode array */
+	uint8_t			mdprops;	/* bitmask of props found */
+	vsw_port_list_t		plist;		/* associated ports */
+	ddi_taskq_t		*taskq_p;	/* VIO ctrl msg taskq */
+	mod_hash_t		*fdb;		/* forwarding database */
+
+	mod_hash_t		*mfdb;		/* multicast FDB */
+	krwlock_t		mfdbrw;		/* rwlock for mFDB */
+
+	/* mac layer */
+	mac_handle_t		mh;
+	mac_rx_handle_t		mrh;
+	mac_notify_handle_t	mnh;
+	const mac_txinfo_t	*txinfo;	/* MAC tx routine */
+
+	/* Initial promisc setting of interface */
+	boolean_t		init_promisc;
+
+	/* Machine Description updates  */
+	mdeg_node_spec_t	*inst_spec;
+	mdeg_handle_t		mdeg_hdl;
+
+	/* if configured as an ethernet interface */
+	mac_t			*if_macp;	/* MAC structure */
+	mac_resource_handle_t	if_mrh;
+	struct ether_addr	if_addr;	/* interface address */
+	krwlock_t		if_lockrw;
+	uint8_t			if_state;	/* interface state */
+
+	/* multicast addresses when configured as eth interface */
+	kmutex_t		mca_lock;	/* multicast lock */
+	mcst_addr_t		*mcap;		/* list of multicast addrs */
+} vsw_t;
+
+
+/*
+ * Ethernet broadcast address definition.
+ */
+static	struct	ether_addr	etherbroadcastaddr = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+#define	IS_BROADCAST(ehp) \
+	(ether_cmp(&ehp->ether_dhost, &etherbroadcastaddr) == 0)
+#define	IS_MULTICAST(ehp) \
+	((ehp->ether_dhost.ether_addr_octet[0] & 01) == 1)
+
+#define	READ_ENTER(x)	rw_enter(x, RW_READER)
+#define	WRITE_ENTER(x)	rw_enter(x, RW_WRITER)
+#define	RW_EXIT(x)	rw_exit(x)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VSW_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/sys/vsw_fdb.h	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_VSW_FDB_H
+#define	_VSW_FDB_H
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Convert ethernet (mac) address to hash table key.
+ */
+#define	KEY_HASH(key, addr) \
+	(key = ((((uint64_t)addr.ether_addr_octet[0]) << 40) | \
+	(((uint64_t)addr.ether_addr_octet[1]) << 32) | \
+	(((uint64_t)addr.ether_addr_octet[2]) << 24) | \
+	(((uint64_t)addr.ether_addr_octet[3]) << 16) | \
+	(((uint64_t)addr.ether_addr_octet[4]) << 8) | \
+	((uint64_t)addr.ether_addr_octet[5])));
+
+/*
+ * Multicast forwarding database (mFDB) is a hashtable
+ * keyed off the mac address, with the value being a linked
+ * list of mfdb_ent_t structures, each of which is a destination
+ * (either a vsw_port or the vsw instance itself when plumbed as
+ * a network device) to which the multicast pkt should be forwarded.
+ */
+typedef struct mfdb_ent {
+	struct mfdb_ent		*nextp;		/* next entry in list */
+	void			*d_addr;	/* address of dest */
+	uint8_t			d_type;		/* destination type */
+} mfdb_ent_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _VSW_FDB_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vcc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,106 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# uts/sun4v/vcc/Makefile
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the vcc driver kernel module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vcc
+OBJECTS		= $(VCC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VCC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# compiler fails with "statement not reached" warnings
+#
+CERRWARN += -erroff=E_STATEMENT_NOT_REACHED
+
+#
+# module dependencies
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vdc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,108 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# uts/sun4v/vdc/Makefile
+#
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the vdc driver module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vdc
+OBJECTS		= $(VDC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VDC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS	+= $(CCVERBOSE)
+CFLAGS	+= -errwarn=%all
+
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc
+
+#
+# Re-enable C99 compilation to use stack allocation of variable-sized arrays.
+# According to usr/src/uts/Makefile.uts, C99 is disabled until a problem seen
+# on x86 machines can be fully diagnosed; presumably a sun4v (i.e., SPARC)
+# module should be "safe".  Furthermore, only the variable-sized array
+# extension is needed/used.
+#
+C99MODE = $(C99_ENABLE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vds/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,106 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# uts/sun4v/vds/Makefile
+#
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the vds driver module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vds
+OBJECTS		= $(VDS_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VDS_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Module dependencies: vds links dynamically against the ldc and platsvc
+# misc modules (see LDFLAGS below).
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc
+
+#
+# Manually turn on C99 compilation (disabled by default in Makefile.uts) until the sync with snv_38 re-enables it
+#
+C99MODE = $(C99_ENABLE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vldc/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,101 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+# uts/sun4v/vldc/Makefile
+#
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#
+#	This makefile drives the production of the vldc driver module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vldc
+OBJECTS		= $(VLDC_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VLDC_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Module dependencies: vldc links dynamically against the ldc and platsvc
+# misc modules (see LDFLAGS below).
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/platsvc
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vnet/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,105 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#
+# uts/sun4v/vnet/Makefile
+#
+#	This makefile drives the production of the vnet driver module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vnet
+OBJECTS		= $(VNET_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VNET_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+CFLAGS += -DVGEN_HANDLE_LOST_PKTS
+#CFLAGS	+= -DVGEN_USE_MAC_TX_UPDATE
+#CFLAGS += -DVGEN_REXMIT
+
+
+#
+# Driver depends on the MAC and IP modules, plus the ldc and platsvc
+# misc modules (see LDFLAGS below).
+#
+LDFLAGS         += -dy -N misc/mac -N drv/ip -N misc/ldc -N misc/platsvc
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/sun4v/vsw/Makefile	Tue May 16 16:05:21 2006 -0700
@@ -0,0 +1,112 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+#
+# uts/sun4v/vsw/Makefile
+#
+#	This makefile drives the production of the vsw driver module.
+#
+#	sun4v implementation architecture dependent
+#
+
+#
+#	Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE	= ../..
+
+#
+#	Define the module and object file sets.
+#
+MODULE		= vsw
+OBJECTS		= $(VSW_OBJS:%=$(OBJS_DIR)/%)
+LINTS		= $(VSW_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE	= $(ROOT_PSM_DRV_DIR)/$(MODULE)
+
+#
+#	Include common rules.
+#
+include $(UTSBASE)/sun4v/Makefile.sun4v
+
+#
+#	Override defaults to build a unique, local modstubs.o.
+#
+MODSTUBS_DIR	= $(OBJS_DIR)
+
+CLEANFILES	+= $(MODSTUBS_O)
+
+#
+#	Define targets
+#
+ALL_TARGET	= $(BINARY)
+LINT_TARGET	= $(MODULE).lint
+INSTALL_TARGET	= $(BINARY) $(ROOTMODULE)
+
+#
+# lint pass one enforcement
+#
+CFLAGS += $(CCVERBOSE)
+
+#
+# Module dependencies
+#
+LDFLAGS += -dy -Nmisc/ldc -Nmisc/mac -Nmisc/platsvc
+
+#
+# Re-enable C99 compilation to use stack allocation of variable-sized arrays.
+# According to usr/src/uts/Makefile.uts, C99 is disabled until a problem seen
+# on x86 machines can be fully diagnosed; presumably a sun4v (i.e., SPARC)
+# module should be "safe".  Furthermore, only the variable-sized array
+# extension is needed/used.
+#
+# C99 mode also gives us macros such as __func__
+#
+C99MODE = $(C99_ENABLE)
+
+#
+#	Default build targets.
+#
+.KEEP_STATE:
+
+def:		$(DEF_DEPS)
+
+all:		$(ALL_DEPS)
+
+clean:		$(CLEAN_DEPS)
+
+clobber:	$(CLOBBER_DEPS)
+
+lint:		$(LINT_DEPS)
+
+modlintlib:	$(MODLINTLIB_DEPS)
+
+clean.lint:	$(CLEAN_LINT_DEPS)
+
+install:	$(INSTALL_DEPS)
+
+#
+#	Include common targets.
+#
+include $(UTSBASE)/$(PLATFORM)/Makefile.targ