changeset 7589:7de800909a06

PSARC 2008/560 Intel IOMMU
6714111 Solaris needs to support the Intel IOMMU
author Vikram Hegde <Vikram.Hegde@Sun.COM>
date Sun, 14 Sep 2008 19:52:20 -0700
parents fc605a2defdc
children c9805cafd4a9
files usr/src/uts/common/os/main.c usr/src/uts/common/sys/ddi_impldefs.h usr/src/uts/i86pc/Makefile.files usr/src/uts/i86pc/io/dmar_acpi.c usr/src/uts/i86pc/io/intel_iommu.c usr/src/uts/i86pc/io/iommu_rscs.c usr/src/uts/i86pc/io/rootnex.c usr/src/uts/i86pc/os/acpi_fw.h usr/src/uts/i86pc/os/cpuid.c usr/src/uts/i86pc/os/fakebop.c usr/src/uts/i86pc/sys/dmar_acpi.h usr/src/uts/i86pc/sys/intel_iommu.h usr/src/uts/i86pc/sys/iommu_rscs.h usr/src/uts/i86pc/sys/machsystm.h usr/src/uts/i86pc/sys/rootnex.h usr/src/uts/i86pc/vm/htable.c usr/src/uts/i86pc/vm/vm_machdep.c usr/src/uts/intel/ia32/ml/i86_subr.s usr/src/uts/intel/io/pci/pci_boot.c usr/src/uts/intel/sys/archsystm.h usr/src/uts/intel/sys/x86_archext.h
diffstat 21 files changed, 5053 insertions(+), 69 deletions(-) [+]
--- a/usr/src/uts/common/os/main.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/common/os/main.c	Sun Sep 14 19:52:20 2008 -0700
@@ -27,8 +27,6 @@
 /*	  All Rights Reserved  	*/
 
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"	/* from SVr4.0 1.31 */
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/sysmacros.h>
@@ -346,6 +344,9 @@
 	lwp_rtt();
 }
 
+extern void return_instr(void);
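+
+/*
+ * hook for installing the Intel IOMMU fault event handler; it defaults
+ * to a no-op (return_instr) and is expected to be repointed by rootnex
+ * when Intel IOMMU support is present
+ */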
+void (*rootnex_iommu_add_intr)(void) = (void (*)(void))return_instr;
+
 void
 main(void)
 {
@@ -446,6 +447,11 @@
 	(void) spl0();
 	interrupts_unleashed = 1;
 
+	/*
+	 * add intel iommu fault event handler
+	 */
+	rootnex_iommu_add_intr();
+
 	vfs_mountroot();	/* Mount the root file system */
 	errorq_init();		/* after vfs_mountroot() so DDI root is ready */
 	cpu_kstat_init(CPU);	/* after vfs_mountroot() so TOD is valid */
--- a/usr/src/uts/common/sys/ddi_impldefs.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/common/sys/ddi_impldefs.h	Sun Sep 14 19:52:20 2008 -0700
@@ -26,8 +26,6 @@
 #ifndef _SYS_DDI_IMPLDEFS_H
 #define	_SYS_DDI_IMPLDEFS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/t_lock.h>
@@ -223,6 +221,9 @@
 	/* Declarations of the pure dynamic properties to snapshot */
 	struct i_ddi_prop_dyn	*devi_prop_dyn_driver;	/* prop_op */
 	struct i_ddi_prop_dyn	*devi_prop_dyn_parent;	/* bus_prop_op */
+
+	/* For intel iommu support */
+	void		*devi_iommu_private;
 };
 
 #define	DEVI(dev_info_type)	((struct dev_info *)(dev_info_type))
--- a/usr/src/uts/i86pc/Makefile.files	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/Makefile.files	Sun Sep 14 19:52:20 2008 -0700
@@ -184,7 +184,7 @@
 
 ACPIPPM_OBJS	+= acpippm.o acpisleep.o
 
-ROOTNEX_OBJS += rootnex.o
+ROOTNEX_OBJS += rootnex.o iommu_rscs.o dmar_acpi.o intel_iommu.o
 TZMON_OBJS	+= tzmon.o
 UPPC_OBJS += uppc.o psm_common.o
 XSVC_OBJS += xsvc.o
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/dmar_acpi.c	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,662 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/list.h>
+#include <sys/pci.h>
+#include <sys/pci_cfgspace.h>
+#include <sys/pci_impl.h>
+#include <sys/sunndi.h>
+#include <sys/ksynch.h>
+#include <sys/cmn_err.h>
+#include <sys/bootconf.h>
+#include <sys/int_fmtio.h>
+#include <sys/dmar_acpi.h>
+
+/*
+ * the following pci config space access function pointers
+ * are defined in pci_cfgspace.h
+ */
+#define	pci_getb	(*pci_getb_func)
+
+/*
+ * define for debug
+ */
+int intel_dmar_acpi_debug = 0;
+#define	dcmn_err	if (intel_dmar_acpi_debug) cmn_err
+
+/*
+ * global variables
+ */
+boolean_t intel_iommu_support;
+intel_dmar_info_t *dmar_info;
+
+/*
+ * internal variables
+ */
+static void *dmart;
+
+/*
+ * helper functions to release the allocated resources
+ * on failure
+ */
+static void
+release_dev_scope(list_t *lp)
+{
+	pci_dev_scope_t *devs;
+
+	if (list_is_empty(lp))
+		return;
+
+	while ((devs = list_head(lp)) != NULL) {
+		list_remove(lp, devs);
+		kmem_free(devs, sizeof (pci_dev_scope_t));
+	}
+}
+
+static void
+release_drhd_info(void)
+{
+	drhd_info_t *drhd;
+	list_t *lp;
+	int i;
+
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		lp = &dmar_info->dmari_drhd[i];
+		if (list_is_empty(lp))
+			break;
+
+		while ((drhd = list_head(lp)) != NULL) {
+			list_remove(lp, drhd);
+
+			/*
+			 * release the device scope
+			 */
+			release_dev_scope(&drhd->di_dev_list);
+			list_destroy(&drhd->di_dev_list);
+			kmem_free(drhd, sizeof (drhd_info_t));
+		}
+	}
+}
+
+static void
+release_rmrr_info(void)
+{
+	rmrr_info_t *rmrr;
+	list_t *lp;
+	int i;
+
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		lp = &dmar_info->dmari_rmrr[i];
+		if (list_is_empty(lp))
+			break;
+
+		while ((rmrr = list_head(lp)) != NULL) {
+			list_remove(lp, rmrr);
+			release_dev_scope(&rmrr->ri_dev_list);
+			list_destroy(&rmrr->ri_dev_list);
+			kmem_free(rmrr, sizeof (rmrr_info_t));
+		}
+	}
+}
+
+/*
+ * intel_iommu_release_dmar_info()
+ *   global function, called to release dmar_info
+ *   when the dmar info is no longer needed.
+ */
+void
+intel_iommu_release_dmar_info(void)
+{
+	int i;
+
+	intel_iommu_support = B_FALSE;
+	release_drhd_info();
+	release_rmrr_info();
+
+	/*
+	 * destroy the drhd and rmrr list
+	 */
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		list_destroy(&dmar_info->dmari_drhd[i]);
+		list_destroy(&dmar_info->dmari_rmrr[i]);
+	}
+
+	kmem_free(dmar_info, sizeof (intel_dmar_info_t));
+}
+
+/*
+ * create_dmar_devi()
+ *
+ *   create a dev_info node in the device tree for
+ *   each drhd unit; each node is a child of the
+ *   root nexus
+ */
+static void
+create_dmar_devi(void)
+{
+	dev_info_t *dip;
+	drhd_info_t *drhd;
+	struct regspec reg;
+	struct ddi_parent_private_data *pdptr;
+	char nodename[64];
+	int i, j;
+
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+
+		/*
+		 * ignore the empty list
+		 */
+		if (list_is_empty(&dmar_info->dmari_drhd[i]))
+			break;
+
+		/*
+		 * alloc dev_info per drhd unit
+		 */
+		j = 0;
+		for_each_in_list(&dmar_info->dmari_drhd[i], drhd) {
+			(void) snprintf(nodename, sizeof (nodename),
+			    "dmar%d,%d", drhd->di_segment, j++);
+			ndi_devi_alloc_sleep(ddi_root_node(), nodename,
+			    DEVI_SID_NODEID, &dip);
+			drhd->di_dip = dip;
+			reg.regspec_bustype = 0;
+			reg.regspec_addr = drhd->di_reg_base;
+			reg.regspec_size = IOMMU_REG_SIZE;
+
+			/*
+			 * update the reg properties
+			 *
+			 *   reg property will be used for register
+			 *   set access
+			 *
+			 * refer to the bus_map of root nexus driver
+			 * I/O or memory mapping:
+			 *
+			 * <bustype=0, addr=x, len=x>: memory
+			 * <bustype=1, addr=x, len=x>: i/o
+			 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
+			 */
+			(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
+			    dip, "reg", (int *)&reg,
+			    sizeof (struct regspec) / sizeof (int));
+
+			pdptr = (struct ddi_parent_private_data *)
+			    kmem_zalloc(sizeof (struct ddi_parent_private_data)
+			    + sizeof (struct regspec), KM_SLEEP);
+			pdptr->par_nreg = 1;
+			pdptr->par_reg = (struct regspec *)(pdptr + 1);
+			pdptr->par_reg->regspec_bustype = 0;
+			pdptr->par_reg->regspec_addr = drhd->di_reg_base;
+			pdptr->par_reg->regspec_size = IOMMU_REG_SIZE;
+			ddi_set_parent_data(dip, pdptr);
+		}
+	}
+}
+
+/*
+ * parse_dmar_dev_scope()
+ *   parse the device scope attached to drhd or rmrr
+ */
+static int
+parse_dmar_dev_scope(dmar_acpi_dev_scope_t *scope, pci_dev_scope_t **devs)
+{
+	int depth;
+	int bus, dev, func;
+	pci_dev_scope_t *entry;
+
+	struct path_to_dev {
+		uint8_t device;
+		uint8_t function;
+	} *path;
+
+	path = (struct path_to_dev *)(scope + 1);
+	depth = (scope->ds_length - 6)/2;
+	bus = scope->ds_sbusnum;
+	dev = path->device;
+	func = path->function;
+
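+	/*
+	 * the device scope path is a list of (device, function) pairs
+	 * leading through PCI bridges; follow each bridge's secondary
+	 * bus number down to the target device's bus/dev/func
+	 */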
+	while (--depth) {
+		path++;
+		bus = pci_getb(bus, dev, func, PCI_BCNF_SECBUS);
+		dev = path->device;
+		func = path->function;
+	}
+
+	entry = (pci_dev_scope_t *)kmem_zalloc(
+	    sizeof (pci_dev_scope_t), KM_SLEEP);
+	entry->pds_bus = bus;
+	entry->pds_dev = dev;
+	entry->pds_func = func;
+	entry->pds_type = scope->ds_type;
+
+	*devs = entry;
+	return (PARSE_DMAR_SUCCESS);
+}
+
+/*
+ * parse_dmar_rmrr()
+ *   parse the rmrr units in the dmar table
+ */
+static int
+parse_dmar_rmrr(dmar_acpi_unit_head_t *head)
+{
+	dmar_acpi_rmrr_t *rmrr;
+	rmrr_info_t *rinfo;
+	dmar_acpi_dev_scope_t *scope;
+	pci_dev_scope_t *devs;
+
+	rmrr = (dmar_acpi_rmrr_t *)head;
+	ASSERT(head->uh_type == DMAR_UNIT_TYPE_RMRR);
+	ASSERT(rmrr->rm_segment <= DMAR_MAX_SEGMENT);
+
+	/*
+	 * for each rmrr, limiaddr must be greater than baseaddr
+	 */
+	if (rmrr->rm_baseaddr >= rmrr->rm_limiaddr) {
+		cmn_err(CE_WARN, "parse_dmar_rmrr: buggy rmrr,"
+		    " baseaddr = 0x%" PRIx64
+		    ", limiaddr = 0x%" PRIx64 "",
+		    rmrr->rm_baseaddr, rmrr->rm_limiaddr);
+		return (PARSE_DMAR_FAIL);
+	}
+
+	/*
+	 * allocate and setup the device info structure
+	 */
+	rinfo = (rmrr_info_t *)kmem_zalloc(sizeof (rmrr_info_t),
+	    KM_SLEEP);
+	rinfo->ri_segment = rmrr->rm_segment;
+	rinfo->ri_baseaddr = rmrr->rm_baseaddr;
+	rinfo->ri_limiaddr = rmrr->rm_limiaddr;
+	list_create(&rinfo->ri_dev_list, sizeof (pci_dev_scope_t),
+	    offsetof(pci_dev_scope_t, node));
+
+	/*
+	 * parse the device scope
+	 */
+	scope = (dmar_acpi_dev_scope_t *)(rmrr + 1);
+	while ((unsigned long)scope < ((unsigned long)rmrr + head->uh_length)) {
+		if (parse_dmar_dev_scope(scope, &devs)
+		    != PARSE_DMAR_SUCCESS) {
+			return (PARSE_DMAR_FAIL);
+		}
+
+		list_insert_tail(&rinfo->ri_dev_list, devs);
+		scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope
+		    + scope->ds_length);
+	}
+
+	/*
+	 * save this info structure
+	 */
+	list_insert_tail(&dmar_info->dmari_rmrr[rinfo->ri_segment], rinfo);
+	return (PARSE_DMAR_SUCCESS);
+}
+
+/*
+ * parse_dmar_drhd()
+ *   parse the drhd units in the dmar table
+ */
+static int
+parse_dmar_drhd(dmar_acpi_unit_head_t *head)
+{
+	dmar_acpi_drhd_t *drhd;
+	drhd_info_t *dinfo;
+	dmar_acpi_dev_scope_t *scope;
+	list_t *lp;
+	pci_dev_scope_t *devs;
+
+	drhd = (dmar_acpi_drhd_t *)head;
+	ASSERT(head->uh_type == DMAR_UNIT_TYPE_DRHD);
+
+	/*
+	 * assert the segment boundary
+	 */
+	ASSERT(drhd->dr_segment <= DMAR_MAX_SEGMENT);
+
+	/*
+	 * allocate and setup the info structure
+	 */
+	dinfo = (drhd_info_t *)kmem_zalloc(sizeof (drhd_info_t), KM_SLEEP);
+	dinfo->di_segment = drhd->dr_segment;
+	dinfo->di_reg_base = drhd->dr_baseaddr;
+	dinfo->di_include_all = (drhd->dr_flags & INCLUDE_PCI_ALL) ?
+	    B_TRUE : B_FALSE;
+	list_create(&dinfo->di_dev_list, sizeof (pci_dev_scope_t),
+	    offsetof(pci_dev_scope_t, node));
+
+	/*
+	 * parse the device scope
+	 */
+	scope = (dmar_acpi_dev_scope_t *)(drhd + 1);
+	while ((unsigned long)scope < ((unsigned long)drhd +
+	    head->uh_length)) {
+
+		if (parse_dmar_dev_scope(scope, &devs)
+		    != PARSE_DMAR_SUCCESS) {
+			return (PARSE_DMAR_FAIL);
+		}
+
+		list_insert_tail(&dinfo->di_dev_list, devs);
+		scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope +
+		    scope->ds_length);
+	}
+
+	lp = &dmar_info->dmari_drhd[dinfo->di_segment];
+	list_insert_tail(lp, dinfo);
+	return (PARSE_DMAR_SUCCESS);
+}
+
+/*
+ * parse_dmar()
+ *   parse the dmar table
+ */
+static int
+parse_dmar(void)
+{
+	dmar_acpi_head_t *dmar_head;
+	dmar_acpi_unit_head_t *unit_head;
+	drhd_info_t *drhd;
+	int i;
+
+	dmar_head = (dmar_acpi_head_t *)dmart;
+
+	/*
+	 * do a sanity check
+	 */
+	if (!dmar_head || strncmp(dmar_head->dh_sig, "DMAR", 4)) {
+		dcmn_err(CE_CONT, "wrong DMAR signature: %c%c%c%c",
+		    dmar_head->dh_sig[0], dmar_head->dh_sig[1],
+		    dmar_head->dh_sig[2], dmar_head->dh_sig[3]);
+		return (PARSE_DMAR_FAIL);
+	}
+
+	dmar_info->dmari_haw = dmar_head->dh_haw + 1;
+	dmar_info->dmari_intr_remap = dmar_head->dh_flags & 0x1 ?
+	    B_TRUE : B_FALSE;
+
+	/*
+	 * parse each unit
+	 *    only DRHD and RMRR are parsed, others are ignored
+	 */
+	unit_head = (dmar_acpi_unit_head_t *)(dmar_head + 1);
+	while ((unsigned long)unit_head < (unsigned long)dmar_head +
+	    dmar_head->dh_len) {
+		switch (unit_head->uh_type) {
+		case DMAR_UNIT_TYPE_DRHD:
+			if (parse_dmar_drhd(unit_head) !=
+			    PARSE_DMAR_SUCCESS) {
+
+				/*
+				 * the caller will release all the
+				 * drhd info structures, just return
+				 * failure here
+				 */
+				return (PARSE_DMAR_FAIL);
+			}
+			break;
+		case DMAR_UNIT_TYPE_RMRR:
+			if (parse_dmar_rmrr(unit_head) !=
+			    PARSE_DMAR_SUCCESS)
+				return (PARSE_DMAR_FAIL);
+			break;
+		default:
+			cmn_err(CE_WARN,
+			    "unit type %d ignored\n", unit_head->uh_type);
+		}
+		unit_head = (dmar_acpi_unit_head_t *)
+		    ((unsigned long)unit_head +
+		    unit_head->uh_length);
+	}
+
+#ifdef	DEBUG
+	/*
+	 * make sure the include_all drhd is the
+	 * last drhd in the list, this is only for
+	 * debug
+	 */
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		if (list_is_empty(&dmar_info->dmari_drhd[i]))
+			break;
+
+		for_each_in_list(&dmar_info->dmari_drhd[i], drhd) {
+			if (drhd->di_include_all &&
+			    list_next(&dmar_info->dmari_drhd[i], drhd)
+			    != NULL) {
+				list_remove(&dmar_info->dmari_drhd[i], drhd);
+				list_insert_tail(&dmar_info->dmari_drhd[i],
+				    drhd);
+				dcmn_err(CE_CONT,
+				    "include_all drhd is adjusted\n");
+			}
+		}
+	}
+#endif
+
+	return (PARSE_DMAR_SUCCESS);
+}
+
+/*
+ * detect_dmar()
+ *   detect the dmar acpi table
+ */
+static boolean_t
+detect_dmar(void)
+{
+	int len;
+	char *intel_iommu;
+
+	/*
+	 * if "intel-iommu = no" boot property is set,
+	 * ignore intel iommu
+	 */
+	if ((len = do_bsys_getproplen(NULL, "intel-iommu")) > 0) {
+		intel_iommu = kmem_alloc(len, KM_SLEEP);
+		(void) do_bsys_getprop(NULL, "intel-iommu", intel_iommu);
+		if (strcmp(intel_iommu, "no") == 0) {
+			dcmn_err(CE_CONT, "\"intel-iommu=no\" was set\n");
+			kmem_free(intel_iommu, len);
+			return (B_FALSE);
+		}
+		kmem_free(intel_iommu, len);
+	}
+
+	/*
+	 * get dmar-table from system properties
+	 */
+	if ((len = do_bsys_getproplen(NULL, DMAR_TABLE_PROPNAME)) <= 0) {
+		dcmn_err(CE_CONT, "dmar-table getprop failed\n");
+		return (B_FALSE);
+	}
+	dcmn_err(CE_CONT, "dmar-table length = %d\n", len);
+	dmart = kmem_alloc(len, KM_SLEEP);
+	(void) do_bsys_getprop(NULL, DMAR_TABLE_PROPNAME, dmart);
+
+	return (B_TRUE);
+}
+
+/*
+ * print dmar_info for debug
+ */
+static void
+print_dmar_info(void)
+{
+	drhd_info_t *drhd;
+	rmrr_info_t *rmrr;
+	pci_dev_scope_t *dev;
+	int i;
+
+	/* print the title */
+	cmn_err(CE_CONT, "dmar_info->:\n");
+	cmn_err(CE_CONT, "\thaw = %d\n", dmar_info->dmari_haw);
+	cmn_err(CE_CONT, "\tintr_remap = %d\n",
+	    dmar_info->dmari_intr_remap ? 1 : 0);
+
+	/* print drhd info list */
+	cmn_err(CE_CONT, "\ndrhd list:\n");
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		if (list_is_empty(&dmar_info->dmari_drhd[i]))
+			break;
+		for (drhd = list_head(&dmar_info->dmari_drhd[i]);
+		    drhd != NULL; drhd = list_next(&dmar_info->dmari_drhd[i],
+		    drhd)) {
+			cmn_err(CE_CONT, "\n\tsegment = %d\n",
+			    drhd->di_segment);
+			cmn_err(CE_CONT, "\treg_base = 0x%" PRIx64 "\n",
+			    drhd->di_reg_base);
+			cmn_err(CE_CONT, "\tinclude_all = %s\n",
+			    drhd->di_include_all ? "yes" : "no");
+			cmn_err(CE_CONT, "\tdip = 0x%p\n",
+			    (void *)drhd->di_dip);
+			cmn_err(CE_CONT, "\tdevice list:\n");
+			for (dev = list_head(&drhd->di_dev_list);
+			    dev != NULL; dev = list_next(&drhd->di_dev_list,
+			    dev)) {
+				cmn_err(CE_CONT, "\n\t\tbus = %d\n",
+				    dev->pds_bus);
+				cmn_err(CE_CONT, "\t\tdev = %d\n",
+				    dev->pds_dev);
+				cmn_err(CE_CONT, "\t\tfunc = %d\n",
+				    dev->pds_func);
+				cmn_err(CE_CONT, "\t\ttype = %d\n",
+				    dev->pds_type);
+			}
+		}
+	}
+
+	/* print rmrr info list */
+	cmn_err(CE_CONT, "\nrmrr list:\n");
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		if (list_is_empty(&dmar_info->dmari_rmrr[i]))
+			break;
+		for (rmrr = list_head(&dmar_info->dmari_rmrr[i]);
+		    rmrr != NULL; rmrr = list_next(&dmar_info->dmari_rmrr[i],
+		    rmrr)) {
+			cmn_err(CE_CONT, "\n\tsegment = %d\n",
+			    rmrr->ri_segment);
+			cmn_err(CE_CONT, "\tbaseaddr = 0x%" PRIx64 "\n",
+			    rmrr->ri_baseaddr);
+			cmn_err(CE_CONT, "\tlimiaddr = 0x%" PRIx64 "\n",
+			    rmrr->ri_limiaddr);
+			cmn_err(CE_CONT, "\tdevice list:\n");
+			for (dev = list_head(&rmrr->ri_dev_list);
+			    dev != NULL;
+			    dev = list_next(&rmrr->ri_dev_list, dev)) {
+				cmn_err(CE_CONT, "\n\t\tbus = %d\n",
+				    dev->pds_bus);
+				cmn_err(CE_CONT, "\t\tdev = %d\n",
+				    dev->pds_dev);
+				cmn_err(CE_CONT, "\t\tfunc = %d\n",
+				    dev->pds_func);
+				cmn_err(CE_CONT, "\t\ttype = %d\n",
+				    dev->pds_type);
+			}
+		}
+	}
+}
+
+/*
+ * intel_iommu_probe_and_parse()
+ *   called from rootnex driver
+ */
+void
+intel_iommu_probe_and_parse(void)
+{
+	int i, len;
+	char *opt;
+
+	intel_iommu_support = B_FALSE;
+	dmar_info = NULL;
+
+	/*
+	 * retrieve the print-dmar-acpi boot option
+	 */
+	if ((len = do_bsys_getproplen(NULL, "print-dmar-acpi")) > 0) {
+		opt = kmem_alloc(len, KM_SLEEP);
+		(void) do_bsys_getprop(NULL, "print-dmar-acpi", opt);
+		if (strcmp(opt, "yes") == 0) {
+			intel_dmar_acpi_debug = 1;
+			cmn_err(CE_CONT, "\"print-dmar-acpi=yes\" was set\n");
+		} else if (strcmp(opt, "no") == 0) {
+			intel_dmar_acpi_debug = 0;
+			cmn_err(CE_CONT, "\"print-dmar-acpi=no\" was set\n");
+		}
+		kmem_free(opt, len);
+	}
+
+	dcmn_err(CE_CONT, "intel iommu detect start\n");
+
+	if (detect_dmar() == B_FALSE) {
+		dcmn_err(CE_CONT, "no intel iommu detected\n");
+		return;
+	}
+
+	/*
+	 * the platform has intel iommu, setup globals
+	 */
+	intel_iommu_support = B_TRUE;
+	dmar_info = kmem_zalloc(sizeof (intel_dmar_info_t),
+	    KM_SLEEP);
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		list_create(&(dmar_info->dmari_drhd[i]), sizeof (drhd_info_t),
+		    offsetof(drhd_info_t, node));
+		list_create(&(dmar_info->dmari_rmrr[i]), sizeof (rmrr_info_t),
+		    offsetof(rmrr_info_t, node));
+	}
+
+	/*
+	 * parse dmar acpi table
+	 */
+	if (parse_dmar() != PARSE_DMAR_SUCCESS) {
+		intel_iommu_release_dmar_info();
+		dcmn_err(CE_CONT, "DMAR parse failed\n");
+		return;
+	}
+
+	/*
+	 * create a dev_info structure per drhd
+	 * and prepare it for driver binding
+	 */
+	create_dmar_devi();
+
+	/*
+	 * print the dmar info if the debug
+	 * switch is set
+	 */
+	if (intel_dmar_acpi_debug)
+		print_dmar_info();
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/intel_iommu.c	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,2959 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+/*
+ * Intel IOMMU implementation
+ */
+#include <sys/conf.h>
+#include <sys/modctl.h>
+#include <sys/pci.h>
+#include <sys/pci_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/ddi.h>
+#include <sys/ddidmareq.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/ddifm.h>
+#include <sys/sunndi.h>
+#include <sys/debug.h>
+#include <sys/fm/protocol.h>
+#include <sys/note.h>
+#include <sys/apic.h>
+#include <vm/hat_i86.h>
+#include <sys/smp_impldefs.h>
+#include <sys/spl.h>
+#include <sys/archsystm.h>
+#include <sys/x86_archext.h>
+#include <sys/rootnex.h>
+#include <sys/avl.h>
+#include <sys/bootconf.h>
+#include <sys/bootinfo.h>
+#include <sys/intel_iommu.h>
+#include <sys/atomic.h>
+
+/*
+ * internal variables
+ *   iommu_states	- the list of iommu structures
+ *   reserve_memory	- the list of reserved regions
+ *   page_num		- the count of pages for iommu page tables
+ */
+static list_t iommu_states;
+static list_t reserve_memory;
+static uint_t page_num;
+
+/*
+ * record some frequently used dips
+ */
+static dev_info_t *pci_top_devinfo = NULL;
+static dev_info_t *isa_top_devinfo = NULL;
+static dev_info_t *lpc_devinfo = NULL;
+
+/*
+ * dvma cache related variables
+ */
+static uint_t dvma_cache_high = 64;
+static dvma_cookie_head_t cookie_cache[MAX_COOKIE_CACHE_SIZE];
+
+/*
+ * switch to turn on/off the gfx dma remapping unit,
+ * this is used when there is a dedicated drhd for the
+ * gfx
+ */
+int gfx_drhd_disable = 0;
+static dev_info_t *gfx_devinfo = NULL;
+
+/*
+ * switch to disable the dmar remapping units, even after the
+ * initialization work has finished
+ */
+int dmar_drhd_disable = 0;
+
+static char *dmar_fault_reason[] = {
+	"Reserved",
+	"The present field in root-entry is Clear",
+	"The present field in context-entry is Clear",
+	"Hardware detected invalid programming of a context-entry",
+	"The DMA request attempted to access an address beyond max support",
+	"The Write field in a page-table entry is Clear when DMA write",
+	"The Read field in a page-table entry is Clear when DMA read",
+	"Access the next level page table resulted in error",
+	"Access the root-entry table resulted in error",
+	"Access the context-entry table resulted in error",
+	"Reserved field not initialized to zero in a present root-entry",
+	"Reserved field not initialized to zero in a present context-entry",
+	"Reserved field not initialized to zero in a present page-table entry",
+	"DMA blocked due to the Translation Type field in context-entry",
+	"Incorrect fault event reason number"
+};
+
+#define	DMAR_MAX_REASON_NUMBER	(14)
+
+/*
+ * cpu_clflush()
+ *   flush the cpu cache lines covering the given range
+ */
+static void
+cpu_clflush(caddr_t addr, uint_t size)
+{
+	uint_t i;
+
+	for (i = 0; i < size; i += x86_clflush_size) {
+		clflush_insn(addr+i);
+	}
+
+	mfence_insn();
+}
+
+/*
+ * iommu_page_init()
+ *   do some init work for the iommu page allocator
+ */
+static void
+iommu_page_init(void)
+{
+	page_num = 0;
+}
+
+/*
+ * iommu_get_page()
+ *   get a 4k iommu page and zero it out
+ */
+static paddr_t
+iommu_get_page(intel_iommu_state_t *iommu, int kmflag)
+{
+	paddr_t paddr;
+	caddr_t vaddr;
+
+	paddr = iommu_page_alloc(kmflag);
+	vaddr = iommu_page_map(paddr);
+	bzero(vaddr, IOMMU_PAGE_SIZE);
+	iommu->iu_dmar_ops->do_clflush(vaddr, IOMMU_PAGE_SIZE);
+	iommu_page_unmap(vaddr);
+
+	page_num++;
+
+	return (paddr);
+}
+
+/*
+ * iommu_free_page()
+ *   free the iommu page allocated with iommu_get_page
+ */
+static void
+iommu_free_page(paddr_t paddr)
+{
+	iommu_page_free(paddr);
+	page_num--;
+}
+
+#define	iommu_get_reg32(iommu, offset)	ddi_get32((iommu)->iu_reg_handle, \
+		(uint32_t *)(iommu->iu_reg_address + (offset)))
+#define	iommu_get_reg64(iommu, offset)	ddi_get64((iommu)->iu_reg_handle, \
+		(uint64_t *)(iommu->iu_reg_address + (offset)))
+#define	iommu_put_reg32(iommu, offset, val)	ddi_put32\
+		((iommu)->iu_reg_handle, \
+		(uint32_t *)(iommu->iu_reg_address + (offset)), val)
+#define	iommu_put_reg64(iommu, offset, val)	ddi_put64\
+		((iommu)->iu_reg_handle, \
+		(uint64_t *)(iommu->iu_reg_address + (offset)), val)
+
+/*
+ * calculate_agaw()
+ *   calculate agaw from gaw
+ */
+static int
+calculate_agaw(int gaw)
+{
+	int r, agaw;
+
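+	/*
+	 * each page-table level translates 9 address bits above the
+	 * 12-bit page offset, so round gaw up to the next level
+	 * boundary and cap the result at 64
+	 */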
+	r = (gaw - 12) % 9;
+
+	if (r == 0)
+		agaw = gaw;
+	else
+		agaw = gaw + 9 - r;
+
+	if (agaw > 64)
+		agaw = 64;
+
+	return (agaw);
+}
+
+/*
+ * destroy_iommu_state()
+ *   destroy an iommu state
+ */
+static void
+destroy_iommu_state(intel_iommu_state_t *iommu)
+{
+	iommu_free_page(iommu->iu_root_entry_paddr);
+	iommu_rscs_fini(&(iommu->iu_domain_id_hdl));
+	mutex_destroy(&(iommu->iu_reg_lock));
+	mutex_destroy(&(iommu->iu_root_context_lock));
+	ddi_regs_map_free(&(iommu->iu_reg_handle));
+	kmem_free(iommu->iu_dmar_ops, sizeof (struct dmar_ops));
+	kmem_free(iommu, sizeof (intel_iommu_state_t));
+}
+
+/*
+ * iommu_update_stats - update iommu private kstat counters
+ *
+ * This routine copies the iommu's internal statistics
+ * counters into the kstat data area on demand.
+ */
+static int
+iommu_update_stats(kstat_t *ksp, int rw)
+{
+	intel_iommu_state_t *iommu;
+	iommu_kstat_t *iommu_ksp;
+	const char *state;
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	iommu = (intel_iommu_state_t *)ksp->ks_private;
+	ASSERT(iommu != NULL);
+	iommu_ksp = (iommu_kstat_t *)ksp->ks_data;
+	ASSERT(iommu_ksp != NULL);
+
+	state = iommu->iu_enabled ? "enabled" : "disabled";
+	(void) strcpy(iommu_ksp->is_enabled.value.c, state);
+	iommu_ksp->is_iotlb_psi.value.ui64 =
+	    iommu->iu_statistics.st_iotlb_psi;
+	iommu_ksp->is_iotlb_domain.value.ui64 =
+	    iommu->iu_statistics.st_iotlb_domain;
+	iommu_ksp->is_iotlb_global.value.ui64 =
+	    iommu->iu_statistics.st_iotlb_global;
+	iommu_ksp->is_write_buffer.value.ui64 =
+	    iommu->iu_statistics.st_write_buffer;
+	iommu_ksp->is_context_cache.value.ui64 =
+	    iommu->iu_statistics.st_context_cache;
+	iommu_ksp->is_wait_complete_us.value.ui64 =
+	    drv_hztousec(iommu->iu_statistics.st_wait_complete_us);
+	iommu_ksp->is_domain_alloc.value.ui64 =
+	    iommu->iu_statistics.st_domain_alloc;
+	iommu_ksp->is_page_used.value.ui64 = page_num;
+
+	return (0);
+}
+
+/*
+ * iommu_init_stats - initialize kstat data structures
+ *
+ * This routine will create and initialize the iommu private
+ * statistics counters.
+ */
+int
+iommu_init_stats(intel_iommu_state_t *iommu)
+{
+	kstat_t *ksp;
+	iommu_kstat_t *iommu_ksp;
+
+	/*
+	 * Create and init kstat
+	 */
+	ksp = kstat_create("rootnex", 0,
+	    ddi_node_name(iommu->iu_drhd->di_dip),
+	    "misc", KSTAT_TYPE_NAMED,
+	    sizeof (iommu_kstat_t) / sizeof (kstat_named_t), 0);
+
+	if (ksp == NULL) {
+		cmn_err(CE_WARN,
+		    "Could not create kernel statistics for %s",
+		    ddi_node_name(iommu->iu_drhd->di_dip));
+		return (DDI_FAILURE);
+	}
+
+	iommu->iu_kstat = ksp;
+	iommu_ksp = (iommu_kstat_t *)ksp->ks_data;
+
+	/*
+	 * Initialize all the statistics
+	 */
+	kstat_named_init(&(iommu_ksp->is_enabled), "iommu_enable",
+	    KSTAT_DATA_CHAR);
+	kstat_named_init(&(iommu_ksp->is_iotlb_psi), "iotlb_psi",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_iotlb_domain), "iotlb_domain",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_iotlb_global), "iotlb_global",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_write_buffer), "write_buffer",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_context_cache), "context_cache",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_wait_complete_us), "wait_complete_us",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_page_used), "physical_page_used",
+	    KSTAT_DATA_UINT64);
+	kstat_named_init(&(iommu_ksp->is_domain_alloc), "domain_allocated",
+	    KSTAT_DATA_UINT64);
+
+	/*
+	 * Function to provide kernel stat update on demand
+	 */
+	ksp->ks_update = iommu_update_stats;
+
+	/*
+	 * Pointer into provider's raw statistics
+	 */
+	ksp->ks_private = (void *)iommu;
+
+	/*
+	 * Add kstat to the system's kstat chain
+	 */
+	kstat_install(ksp);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * iommu_intr_handler()
+ *   the fault event handler for a single drhd
+ */
+static int
+iommu_intr_handler(intel_iommu_state_t *iommu)
+{
+	uint32_t status;
+	int index, fault_reg_offset;
+	int sindex, max_fault_index;
+
+	mutex_enter(&(iommu->iu_reg_lock));
+
+	/* read the fault status */
+	status = iommu_get_reg32(iommu, IOMMU_REG_FAULT_STS);
+
+	/* check if we have a pending fault for this IOMMU */
+	if (!(status & IOMMU_FAULT_STS_PPF)) {
+		mutex_exit(&(iommu->iu_reg_lock));
+		return (0);
+	}
+
+	/*
+	 * handle all primary pending faults
+	 */
+	sindex = index = IOMMU_FAULT_GET_INDEX(status);
+	max_fault_index =  IOMMU_CAP_GET_NFR(iommu->iu_capability) - 1;
+	fault_reg_offset = IOMMU_CAP_GET_FRO(iommu->iu_capability);
+
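+	/*
+	 * each fault recording register is 128 bits (16 bytes) wide,
+	 * hence the index * 16 byte stride used below
+	 */
+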
+	/*
+	 * don't loop forever for a misbehaving IOMMU; return after one
+	 * pass over the fault registers so that we make some progress.
+	 */
+	do {
+		uint64_t val;
+		uint8_t fault_reason;
+		uint8_t fault_type;
+		uint16_t sid;
+		uint64_t pg_addr;
+
+		if (index > max_fault_index)
+			index = 0;
+
+		/* read the higher 64bits */
+		val = iommu_get_reg64(iommu,
+		    fault_reg_offset + index * 16 + 8);
+
+		/* check if pending fault */
+		if (!IOMMU_FRR_GET_F(val))
+			break;
+
+		/* get the fault reason, fault type and sid */
+		fault_reason = IOMMU_FRR_GET_FR(val);
+		fault_type = IOMMU_FRR_GET_FT(val);
+		sid = IOMMU_FRR_GET_SID(val);
+
+		/* read the first 64bits */
+		val = iommu_get_reg64(iommu,
+		    fault_reg_offset + index * 16);
+		pg_addr = val & IOMMU_PAGE_MASK;
+
+		/* clear the fault */
+		iommu_put_reg32(iommu, fault_reg_offset + index * 16 + 12,
+		    (((uint32_t)1) << 31));
+
+		/* report the fault info */
+		cmn_err(CE_WARN,
+		    "%s generated a fault event when translating DMA %s\n"
+		    "\t on address 0x%" PRIx64 " for PCI(%d, %d, %d), "
+		    "the reason is:\n\t %s",
+		    ddi_node_name(iommu->iu_drhd->di_dip),
+		    fault_type ? "read" : "write", pg_addr,
+		    (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
+		    dmar_fault_reason[MIN(fault_reason,
+		    DMAR_MAX_REASON_NUMBER)]);
+
+	} while (++index < sindex);
+
+	/*
+	 * At this point we have cleared the overflow if any
+	 */
+	status = iommu_get_reg32(iommu, IOMMU_REG_FAULT_STS);
+
+	/* clear overflow */
+	if (status & IOMMU_FAULT_STS_PFO) {
+#ifdef	DEBUG
+		cmn_err(CE_WARN, "Primary Fault logging overflow detected. "
+		    "Clearing fault overflow");
+#endif
+		iommu_put_reg32(iommu, IOMMU_REG_FAULT_STS, 1);
+	}
+
+	mutex_exit(&(iommu->iu_reg_lock));
+
+	return (1);
+}
+
+/*
+ * intel_iommu_intr_handler()
+ *   call iommu_intr_handler for each iommu
+ */
+static uint_t
+intel_iommu_intr_handler(caddr_t arg)
+{
+	int claimed = 0;
+	intel_iommu_state_t *iommu;
+	list_t *lp = (list_t *)arg;
+
+	for_each_in_list(lp, iommu) {
+		claimed |= iommu_intr_handler(iommu);
+	}
+
+	return (claimed ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
+}
+
+/*
+ * intel_iommu_add_intr()
+ *   the interface to hook up the dmar interrupt handler
+ */
+static void
+intel_iommu_add_intr(void)
+{
+	int ipl, irq, vect;
+	intel_iommu_state_t *iommu;
+	uint32_t msi_addr, msi_data;
+	ipl = IOMMU_INTR_IPL;
+
+	irq = psm_get_ipivect(ipl, -1);
+	vect = apic_irq_table[irq]->airq_vector;
+	(void) add_avintr((void *)NULL, ipl, (avfunc)(intel_iommu_intr_handler),
+	    "iommu intr", irq, (caddr_t)&iommu_states, NULL, NULL, NULL);
+
+	msi_addr = (MSI_ADDR_HDR |
+	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
+	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
+	    apic_cpus[0].aci_local_id);
+	msi_data = ((MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | vect);
+
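+	/*
+	 * program each unit to deliver its fault events as a fixed,
+	 * edge-triggered MSI to CPU 0's local APIC, using the vector
+	 * reserved above
+	 */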
+	for_each_in_list(&iommu_states, iommu) {
+		(void) iommu_intr_handler(iommu);
+		mutex_enter(&(iommu->iu_reg_lock));
+		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_ADDR, msi_addr);
+		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_UADDR, 0);
+		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_DATA, msi_data);
+		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_CON, 0);
+		mutex_exit(&(iommu->iu_reg_lock));
+	}
+}
+
+/*
+ * wait max 60s for the hardware completion
+ */
+#define	IOMMU_WAIT_TIME		60000000
+#define	iommu_wait_completion(iommu, offset, getf, completion, status) \
+{ \
+	clock_t stick = ddi_get_lbolt(); \
+	clock_t ntick; \
+	_NOTE(CONSTCOND) \
+	while (1) { \
+		status = getf(iommu, offset); \
+		ntick = ddi_get_lbolt(); \
+		if (completion) {\
+			atomic_add_64\
+			    (&(iommu->iu_statistics.st_wait_complete_us),\
+			    ntick - stick);\
+			break; \
+		} \
+		if (ntick - stick >= drv_usectohz(IOMMU_WAIT_TIME)) { \
+			cmn_err(CE_PANIC, \
+			    "iommu wait completion time out\n"); \
+		} else { \
+			iommu_cpu_nop();\
+		}\
+	}\
+}
+
+/*
+ * dmar_flush_write_buffer()
+ *   flush the write buffer
+ */
+static void
+dmar_flush_write_buffer(intel_iommu_state_t *iommu)
+{
+	uint32_t status;
+
+	mutex_enter(&(iommu->iu_reg_lock));
+	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
+	    iommu->iu_global_cmd_reg | IOMMU_GCMD_WBF);
+	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
+	    iommu_get_reg32, !(status & IOMMU_GSTS_WBFS), status);
+	mutex_exit(&(iommu->iu_reg_lock));
+
+	/* record the statistics */
+	atomic_inc_64(&(iommu->iu_statistics.st_write_buffer));
+}
+
+/*
+ * dmar_flush_iotlb_common()
+ *   flush the iotlb cache
+ */
+static void
+dmar_flush_iotlb_common(intel_iommu_state_t *iommu, uint_t domain_id,
+    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type)
+{
+	uint64_t command = 0, iva = 0, status;
+	uint_t iva_offset, iotlb_offset;
+
+	iva_offset = IOMMU_ECAP_GET_IRO(iommu->iu_excapability);
+	iotlb_offset = iva_offset + 8;
+
+	/*
+	 * prepare drain read/write command
+	 */
+	if (IOMMU_CAP_GET_DWD(iommu->iu_capability)) {
+		command |= TLB_INV_DRAIN_WRITE;
+	}
+
+	if (IOMMU_CAP_GET_DRD(iommu->iu_capability)) {
+		command |= TLB_INV_DRAIN_READ;
+	}
+
+	/*
+	 * if the hardware can't do page selective invalidation for this
+	 * request, fall back to domain selective invalidation
+	 */
+	switch (type) {
+	case TLB_INV_G_PAGE:
+		if (!IOMMU_CAP_GET_PSI(iommu->iu_capability) ||
+		    am > IOMMU_CAP_GET_MAMV(iommu->iu_capability) ||
+		    addr & IOMMU_PAGE_OFFSET) {
+			goto ignore_psi;
+		}
+		command |= TLB_INV_PAGE | TLB_INV_IVT |
+		    TLB_INV_DID(domain_id);
+		iva = addr | am | TLB_IVA_HINT(hint);
+		break;
+ignore_psi:
+	case TLB_INV_G_DOMAIN:
+		command |= TLB_INV_DOMAIN | TLB_INV_IVT |
+		    TLB_INV_DID(domain_id);
+		break;
+	case TLB_INV_G_GLOBAL:
+		command |= TLB_INV_GLOBAL | TLB_INV_IVT;
+		break;
+	default:
+		cmn_err(CE_WARN, "incorrect iotlb flush type");
+		return;
+	}
+
+	/*
+	 * do the actual flush
+	 */
+	mutex_enter(&(iommu->iu_reg_lock));
+	/* verify there is no pending command */
+	iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64,
+	    !(status & TLB_INV_IVT), status);
+	if (iva)
+		iommu_put_reg64(iommu, iva_offset, iva);
+	iommu_put_reg64(iommu, iotlb_offset, command);
+	iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64,
+	    !(status & TLB_INV_IVT), status);
+	mutex_exit(&(iommu->iu_reg_lock));
+
+	/*
+	 * check the result and record the statistics
+	 */
+	switch (TLB_INV_GET_IAIG(status)) {
+	/* global */
+	case 1:
+		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_global));
+		break;
+	/* domain */
+	case 2:
+		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_domain));
+		break;
+	/* psi */
+	case 3:
+		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_psi));
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * dmar_flush_iotlb_psi()
+ *   register based iotlb psi invalidation
+ */
+static void
+dmar_flush_iotlb_psi(intel_iommu_state_t *iommu, uint_t domain_id,
+    uint64_t dvma, uint_t count, uint_t hint)
+{
+	uint_t am = 0;
+	uint_t max_am = 0;
+	uint64_t align = 0;
+	uint64_t dvma_pg = 0;
+	uint_t used_count = 0;
+
+	/* choose page selective invalidation */
+	if (IOMMU_CAP_GET_PSI(iommu->iu_capability)) {
+		/* MAMV is valid only if PSI is set */
+		max_am = IOMMU_CAP_GET_MAMV(iommu->iu_capability);
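+		/*
+		 * a page selective invalidation covers 2^am pages and
+		 * must start on an address aligned to that size, so
+		 * the loop below splits the range into suitably
+		 * aligned power-of-2 chunks
+		 */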
+		while (count != 0) {
+			/* First calculate alignment of DVMA */
+			dvma_pg = IOMMU_BTOP(dvma);
+			ASSERT(dvma_pg != NULL);
+			ASSERT(count >= 1);
+			for (align = 1; (dvma_pg & align) == 0; align <<= 1)
+				;
+			/* truncate count to the nearest power of 2 */
+			for (used_count = 1, am = 0; count >> used_count != 0;
+			    used_count <<= 1, am++)
+				;
+			if (am > max_am) {
+				am = max_am;
+				used_count = 1 << am;
+			}
+			if (align >= used_count) {
+				dmar_flush_iotlb_common(iommu, domain_id,
+				    dvma, am, hint, TLB_INV_G_PAGE);
+			} else {
+				/* align < used_count */
+				used_count = align;
+				for (am = 0; (1 << am) != used_count; am++)
+					;
+				dmar_flush_iotlb_common(iommu, domain_id,
+				    dvma, am, hint, TLB_INV_G_PAGE);
+			}
+			count -= used_count;
+			dvma = (dvma_pg + used_count) << IOMMU_PAGE_SHIFT;
+		}
+	/* choose domain invalidation */
+	} else {
+		dmar_flush_iotlb_common(iommu, domain_id, dvma,
+		    0, 0, TLB_INV_G_DOMAIN);
+	}
+}
+
+/*
+ * dmar_flush_iotlb_dsi()
+ *   flush dsi iotlb
+ */
+static void
+dmar_flush_iotlb_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
+{
+	dmar_flush_iotlb_common(iommu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN);
+}
+
+/*
+ * dmar_flush_iotlb_glb()
+ *   flush the global iotlb
+ */
+static void
+dmar_flush_iotlb_glb(intel_iommu_state_t *iommu)
+{
+	dmar_flush_iotlb_common(iommu, 0, 0, 0, 0, TLB_INV_G_GLOBAL);
+}
+
+
+/*
+ * dmar_flush_context_cache()
+ *   flush the context cache
+ */
+static void
+dmar_flush_context_cache(intel_iommu_state_t *iommu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type)
+{
+	uint64_t command = 0, status;
+
+	/*
+	 * define the command
+	 */
+	switch (type) {
+	case CTT_INV_G_DEVICE:
+		command |= CCMD_INV_ICC | CCMD_INV_DEVICE
+		    | CCMD_INV_DID(domain_id)
+		    | CCMD_INV_SID(source_id) | CCMD_INV_FM(function_mask);
+		break;
+	case CTT_INV_G_DOMAIN:
+		command |= CCMD_INV_ICC | CCMD_INV_DOMAIN
+		    | CCMD_INV_DID(domain_id);
+		break;
+	case CTT_INV_G_GLOBAL:
+		command |= CCMD_INV_ICC | CCMD_INV_GLOBAL;
+		break;
+	default:
+		cmn_err(CE_WARN, "incorrect context cache flush type");
+		return;
+	}
+
+	mutex_enter(&(iommu->iu_reg_lock));
+	/* verify there is no pending command */
+	iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64,
+	    !(status & CCMD_INV_ICC), status);
+	iommu_put_reg64(iommu, IOMMU_REG_CONTEXT_CMD, command);
+	iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64,
+	    !(status & CCMD_INV_ICC), status);
+	mutex_exit(&(iommu->iu_reg_lock));
+
+	/* record the context cache statistics */
+	atomic_inc_64(&(iommu->iu_statistics.st_context_cache));
+}
+
+/*
+ * dmar_flush_context_fsi()
+ *   function based context cache flush
+ */
+static void
+dmar_flush_context_fsi(intel_iommu_state_t *iommu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id)
+{
+	dmar_flush_context_cache(iommu, function_mask, source_id,
+	    domain_id, CTT_INV_G_DEVICE);
+}
+
+/*
+ * dmar_flush_context_dsi()
+ *   domain based context cache flush
+ */
+static void
+dmar_flush_context_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
+{
+	dmar_flush_context_cache(iommu, 0, 0, domain_id, CTT_INV_G_DOMAIN);
+}
+
+/*
+ * dmar_flush_context_gbl()
+ *   flush global context cache
+ */
+static void
+dmar_flush_context_gbl(intel_iommu_state_t *iommu)
+{
+	dmar_flush_context_cache(iommu, 0, 0, 0, CTT_INV_G_GLOBAL);
+}
+
+/*
+ * dmar_set_root_table()
+ *   set the root entry table
+ */
+static void
+dmar_set_root_table(intel_iommu_state_t *iommu)
+{
+	uint32_t status;
+
+	mutex_enter(&(iommu->iu_reg_lock));
+	iommu_put_reg64(iommu, IOMMU_REG_ROOTENTRY,
+	    iommu->iu_root_entry_paddr);
+	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
+	    iommu->iu_global_cmd_reg | IOMMU_GCMD_SRTP);
+	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
+	    iommu_get_reg32, (status & IOMMU_GSTS_RTPS), status);
+	mutex_exit(&(iommu->iu_reg_lock));
+}
+
+/*
+ * dmar_enable_unit()
+ *   enable the dmar unit
+ */
+static void
+dmar_enable_unit(intel_iommu_state_t *iommu)
+{
+	uint32_t status;
+
+	mutex_enter(&(iommu->iu_reg_lock));
+	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
+	    IOMMU_GCMD_TE);
+	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
+	    iommu_get_reg32, (status & IOMMU_GSTS_TES), status);
+	mutex_exit(&(iommu->iu_reg_lock));
+	iommu->iu_global_cmd_reg |= IOMMU_GCMD_TE;
+	cmn_err(CE_CONT, "?\t%s enabled\n",
+	    ddi_node_name(iommu->iu_drhd->di_dip));
+}
+
+/*
+ * iommu_bringup_unit()
+ *   the process of bringing up a dmar unit
+ */
+static void
+iommu_bringup_unit(intel_iommu_state_t *iommu)
+{
+	/*
+	 * flush the iommu write buffer
+	 */
+	iommu->iu_dmar_ops->do_flwb(iommu);
+
+	/*
+	 * set root entry table
+	 */
+	iommu->iu_dmar_ops->do_set_root_table(iommu);
+
+	/*
+	 * flush the context cache
+	 */
+	iommu->iu_dmar_ops->do_context_gbl(iommu);
+
+	/*
+	 * flush the iotlb cache
+	 */
+	iommu->iu_dmar_ops->do_iotlb_gbl(iommu);
+
+	/*
+	 * at last enable the unit
+	 */
+	iommu->iu_dmar_ops->do_enable(iommu);
+}
+
+/*
+ * iommu_dvma_cache_get()
+ *   get a dvma from the cache
+ */
+static uint64_t
+iommu_dvma_cache_get(dmar_domain_state_t *domain,
+    size_t size, size_t align, size_t nocross)
+{
+	dvma_cache_node_t *cache_node = NULL;
+	dvma_cache_head_t *cache_head;
+	uint_t index = IOMMU_BTOP(size) - 1;
+	uint64_t ioaddr;
+
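+	/*
+	 * cache buckets are indexed by allocation size in pages;
+	 * sizes beyond DVMA_CACHE_HEAD_CNT pages are not cached
+	 */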
+	if (index >= DVMA_CACHE_HEAD_CNT)
+		return (0);
+
+	cache_head = &(domain->dm_dvma_cache[index]);
+	mutex_enter(&(cache_head->dch_free_lock));
+	for_each_in_list(&(cache_head->dch_free_list), cache_node) {
+		if ((cache_node->dcn_align >= align) &&
+		    ((nocross == 0) ||
+		    ((cache_node->dcn_dvma ^ (cache_node->dcn_dvma + size - 1))
+		    < (nocross - 1)))) {
+			list_remove(&(cache_head->dch_free_list),
+			    cache_node);
+			cache_head->dch_free_count--;
+			break;
+		}
+	}
+	mutex_exit(&(cache_head->dch_free_lock));
+
+	if (cache_node) {
+		ioaddr = cache_node->dcn_dvma;
+		mutex_enter(&(cache_head->dch_mem_lock));
+		list_insert_head(&(cache_head->dch_mem_list), cache_node);
+		mutex_exit(&(cache_head->dch_mem_lock));
+		return (ioaddr);
+	}
+
+	return (0);
+}
+
+/*
+ * iommu_dvma_cache_put()
+ *   put a dvma to the cache after use
+ */
+static void
+iommu_dvma_cache_put(dmar_domain_state_t *domain, uint64_t dvma,
+    size_t size, size_t align)
+{
+	dvma_cache_node_t *cache_node = NULL;
+	dvma_cache_head_t *cache_head;
+	uint_t index = IOMMU_BTOP(size) - 1;
+	boolean_t shrink = B_FALSE;
+
+	/* out of cache range */
+	if (index >= DVMA_CACHE_HEAD_CNT) {
+		vmem_xfree(domain->dm_dvma_map,
+		    (void *)(intptr_t)dvma, size);
+		return;
+	}
+
+	cache_head = &(domain->dm_dvma_cache[index]);
+
+	/* get a node block */
+	mutex_enter(&(cache_head->dch_mem_lock));
+	cache_node = list_head(&(cache_head->dch_mem_list));
+	if (cache_node) {
+		list_remove(&(cache_head->dch_mem_list), cache_node);
+	}
+	mutex_exit(&(cache_head->dch_mem_lock));
+
+	/* no cache, alloc one */
+	if (cache_node == NULL) {
+		cache_node = kmem_alloc(sizeof (dvma_cache_node_t), KM_SLEEP);
+	}
+
+	/* initialize this node */
+	cache_node->dcn_align = align;
+	cache_node->dcn_dvma = dvma;
+
+	/* insert into the free list */
+	mutex_enter(&(cache_head->dch_free_lock));
+	list_insert_head(&(cache_head->dch_free_list), cache_node);
+
+	/* shrink the cache list */
+	if (cache_head->dch_free_count++ > dvma_cache_high) {
+		cache_node = list_tail(&(cache_head->dch_free_list));
+		list_remove(&(cache_head->dch_free_list), cache_node);
+		shrink = B_TRUE;
+		cache_head->dch_free_count--;
+	}
+	mutex_exit(&(cache_head->dch_free_lock));
+
+	if (shrink) {
+		ASSERT(cache_node);
+		vmem_xfree(domain->dm_dvma_map,
+		    (void *)(intptr_t)(cache_node->dcn_dvma), size);
+		kmem_free(cache_node, sizeof (dvma_cache_node_t));
+	}
+}
+
+/*
+ * iommu_dvma_cache_flush()
+ *   flush the dvma caches when vmem_xalloc() failed
+ */
+static void
+iommu_dvma_cache_flush(dmar_domain_state_t *domain, dev_info_t *dip)
+{
+	dvma_cache_node_t *cache_node;
+	dvma_cache_head_t *cache_head;
+	uint_t index;
+
+	cmn_err(CE_NOTE, "domain dvma cache for %s flushed",
+	    ddi_node_name(dip));
+
+	for (index = 0; index < DVMA_CACHE_HEAD_CNT; index++) {
+		cache_head = &(domain->dm_dvma_cache[index]);
+		mutex_enter(&(cache_head->dch_free_lock));
+		cache_node = list_head(&(cache_head->dch_free_list));
+		while (cache_node) {
+			list_remove(&(cache_head->dch_free_list), cache_node);
+			vmem_xfree(domain->dm_dvma_map,
+			    (void *)(intptr_t)(cache_node->dcn_dvma),
+			    IOMMU_PTOB(index + 1));
+			kmem_free(cache_node, sizeof (dvma_cache_node_t));
+			cache_head->dch_free_count--;
+			cache_node = list_head(&(cache_head->dch_free_list));
+		}
+		ASSERT(cache_head->dch_free_count == 0);
+		mutex_exit(&(cache_head->dch_free_lock));
+	}
+}
+
+/*
+ * get_dvma_cookie_array()
+ *   get a dvma cookie array from the cache or allocate
+ */
+static iommu_dvma_cookie_t *
+get_dvma_cookie_array(uint_t array_size)
+{
+	dvma_cookie_head_t *cache_head;
+	iommu_dvma_cookie_t *cookie = NULL;
+
+	if (array_size > MAX_COOKIE_CACHE_SIZE) {
+		return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size,
+		    KM_SLEEP));
+	}
+
+	cache_head = &(cookie_cache[array_size - 1]);
+	mutex_enter(&(cache_head->dch_lock));
+	/* LINTED E_EQUALITY_NOT_ASSIGNMENT */
+	if (cookie = cache_head->dch_next) {
+		cache_head->dch_next = cookie->dc_next;
+		cache_head->dch_count--;
+	}
+	mutex_exit(&(cache_head->dch_lock));
+
+	if (cookie) {
+		return (cookie);
+	} else {
+		return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size,
+		    KM_SLEEP));
+	}
+}
+
+/*
+ * put_dvma_cookie_array()
+ *   put a dvma cookie array to the cache or free
+ */
+static void
+put_dvma_cookie_array(iommu_dvma_cookie_t *dcookies, uint_t array_size)
+{
+	dvma_cookie_head_t *cache_head;
+
+	if (array_size > MAX_COOKIE_CACHE_SIZE) {
+		kmem_free(dcookies, sizeof (iommu_dvma_cookie_t) * array_size);
+		return;
+	}
+
+	cache_head = &(cookie_cache[array_size - 1]);
+	mutex_enter(&(cache_head->dch_lock));
+	dcookies->dc_next = cache_head->dch_next;
+	cache_head->dch_next = dcookies;
+	cache_head->dch_count++;
+	mutex_exit(&(cache_head->dch_lock));
+}
+
+/*
+ * dmar_reg_plant_wait()
+ *   the plant wait operation for register based cache invalidation
+ */
+static void
+dmar_reg_plant_wait(intel_iommu_state_t *iommu, iommu_dvma_cookie_t *dcookies,
+    uint_t count, uint_t array_size)
+{
+	iotlb_pend_node_t *node = NULL;
+	iotlb_pend_head_t *head;
+
+	head = &(iommu->iu_pend_head);
+
+	/* get a node */
+	mutex_enter(&(head->ich_mem_lock));
+	node = list_head(&(head->ich_mem_list));
+	if (node) {
+		list_remove(&(head->ich_mem_list), node);
+	}
+	mutex_exit(&(head->ich_mem_lock));
+
+	/* no cache, alloc one */
+	if (node == NULL) {
+		node = kmem_alloc(sizeof (iotlb_pend_node_t), KM_SLEEP);
+	}
+
+	/* initialize this node */
+	node->icn_dcookies = dcookies;
+	node->icn_count = count;
+	node->icn_array_size = array_size;
+
+	/* insert into the pend list */
+	mutex_enter(&(head->ich_pend_lock));
+	list_insert_tail(&(head->ich_pend_list), node);
+	head->ich_pend_count++;
+	mutex_exit(&(head->ich_pend_lock));
+}
+
+/*
+ * dmar_release_dvma_cookie()
+ *   release the dvma cookie
+ */
+static void
+dmar_release_dvma_cookie(iommu_dvma_cookie_t *dcookies,
+    uint_t count, uint_t array_size)
+{
+	uint_t i;
+
+	/* free dvma */
+	for (i = 0; i < count; i++) {
+		iommu_dvma_cache_put(dcookies[i].dc_domain,
+		    dcookies[i].dc_addr, dcookies[i].dc_size,
+		    dcookies[i].dc_align);
+	}
+
+	/* free the cookie array */
+	put_dvma_cookie_array(dcookies, array_size);
+}
+
+/*
+ * dmar_reg_reap_wait()
+ *   the reap wait operation for register based cache invalidation
+ */
+static void
+dmar_reg_reap_wait(intel_iommu_state_t *iommu)
+{
+	iotlb_pend_node_t *node;
+	iotlb_pend_head_t *head;
+
+	head = &(iommu->iu_pend_head);
+	mutex_enter(&(head->ich_pend_lock));
+	node = list_head(&(head->ich_pend_list));
+	if (node) {
+		list_remove(&(head->ich_pend_list), node);
+		head->ich_pend_count--;
+	}
+	mutex_exit(&(head->ich_pend_lock));
+
+	if (node) {
+		dmar_release_dvma_cookie(node->icn_dcookies,
+		    node->icn_count, node->icn_array_size);
+		/* put the node into the node cache */
+		mutex_enter(&(head->ich_mem_lock));
+		list_insert_head(&(head->ich_mem_list), node);
+		mutex_exit(&(head->ich_mem_lock));
+	}
+}
+
+/*
+ * dmar_init_ops()
+ *   init dmar ops
+ */
+static void
+dmar_init_ops(intel_iommu_state_t *iommu)
+{
+	struct dmar_ops *ops;
+
+	ASSERT(iommu);
+	ops = kmem_alloc(sizeof (struct dmar_ops), KM_SLEEP);
+
+	/* initialize the dmar operations */
+	ops->do_enable = dmar_enable_unit;
+	ops->do_fault = iommu_intr_handler;
+
+	/* cpu clflush */
+	if (iommu->iu_coherency) {
+		ops->do_clflush = (void (*)(caddr_t, uint_t))return_instr;
+	} else {
+		ASSERT(x86_feature & X86_CLFSH);
+		ops->do_clflush = cpu_clflush;
+	}
+
+	/* write buffer */
+	if (IOMMU_CAP_GET_RWBF(iommu->iu_capability)) {
+		ops->do_flwb = dmar_flush_write_buffer;
+	} else {
+		ops->do_flwb = (void (*)(intel_iommu_state_t *))return_instr;
+	}
+
+	/* cache related functions */
+	ops->do_iotlb_psi = dmar_flush_iotlb_psi;
+	ops->do_iotlb_dsi = dmar_flush_iotlb_dsi;
+	ops->do_iotlb_gbl = dmar_flush_iotlb_glb;
+	ops->do_context_fsi = dmar_flush_context_fsi;
+	ops->do_context_dsi = dmar_flush_context_dsi;
+	ops->do_context_gbl = dmar_flush_context_gbl;
+	ops->do_plant_wait = dmar_reg_plant_wait;
+	ops->do_reap_wait = dmar_reg_reap_wait;
+
+	ops->do_set_root_table = dmar_set_root_table;
+
+	iommu->iu_dmar_ops = ops;
+}
+
+/*
+ * create_iommu_state()
+ *   alloc and setup the iommu state
+ */
+static int
+create_iommu_state(drhd_info_t *drhd)
+{
+	intel_iommu_state_t *iommu;
+	int mgaw, sagaw, agaw;
+	int bitnum;
+	int ret;
+
+	static ddi_device_acc_attr_t ioattr = {
+		DDI_DEVICE_ATTR_V0,
+		DDI_NEVERSWAP_ACC,
+		DDI_STRICTORDER_ACC,
+	};
+
+	iommu = kmem_alloc(sizeof (intel_iommu_state_t), KM_SLEEP);
+	drhd->di_iommu = (void *)iommu;
+	iommu->iu_drhd = drhd;
+
+	/*
+	 * map the register address space
+	 */
+	ret = ddi_regs_map_setup(iommu->iu_drhd->di_dip, 0,
+	    (caddr_t *)&(iommu->iu_reg_address), (offset_t)0,
+	    (offset_t)IOMMU_REG_SIZE, &ioattr,
+	    &(iommu->iu_reg_handle));
+
+	if (ret != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "iommu register map failed: %d", ret);
+		kmem_free(iommu, sizeof (intel_iommu_state_t));
+		return (DDI_FAILURE);
+	}
+
+	mutex_init(&(iommu->iu_reg_lock), NULL, MUTEX_DRIVER,
+	    (void *)ipltospl(IOMMU_INTR_IPL));
+	mutex_init(&(iommu->iu_root_context_lock), NULL, MUTEX_DRIVER, NULL);
+
+	/*
+	 * get the register value
+	 */
+	iommu->iu_capability = iommu_get_reg64(iommu, IOMMU_REG_CAP);
+	iommu->iu_excapability = iommu_get_reg64(iommu, IOMMU_REG_EXCAP);
+
+	/*
+	 * if the hardware access is non-coherent, we need clflush
+	 */
+	if (IOMMU_ECAP_GET_C(iommu->iu_excapability)) {
+		iommu->iu_coherency = B_TRUE;
+	} else {
+		iommu->iu_coherency = B_FALSE;
+		if (!(x86_feature & X86_CLFSH)) {
+			cmn_err(CE_WARN, "drhd can't be enabled due to "
+			    "missing clflush functionality");
+			ddi_regs_map_free(&(iommu->iu_reg_handle));
+			kmem_free(iommu, sizeof (intel_iommu_state_t));
+			return (DDI_FAILURE);
+		}
+	}
+
+	/*
+	 * retrieve the maximum number of domains
+	 */
+	iommu->iu_max_domain = IOMMU_CAP_ND(iommu->iu_capability);
+
+	/*
+	 * setup the domain id allocator
+	 *  domain id 0 is reserved by the architecture
+	 */
+	iommu_rscs_init(1, iommu->iu_max_domain, &(iommu->iu_domain_id_hdl));
+
+	/*
+	 * calculate the agaw
+	 */
+	mgaw = IOMMU_CAP_MGAW(iommu->iu_capability);
+	sagaw = IOMMU_CAP_SAGAW(iommu->iu_capability);
+	iommu->iu_gaw = mgaw;
+	agaw = calculate_agaw(iommu->iu_gaw);
+	bitnum = (agaw - 30) / 9;
+
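+	/*
+	 * SAGAW bit n set means the unit supports an (n + 2) level
+	 * page table covering a 30 + 9 * n bit address width; pick
+	 * the smallest supported width that covers agaw
+	 */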
+	while (bitnum < 5) {
+		if (sagaw & (1 << bitnum))
+			break;
+		else
+			bitnum++;
+	}
+
+	if (bitnum >= 5) {
+		cmn_err(CE_PANIC, "can't determine agaw");
+		/*NOTREACHED*/
+		return (DDI_FAILURE);
+	} else {
+		iommu->iu_agaw = 30 + bitnum * 9;
+		if (iommu->iu_agaw > 64)
+			iommu->iu_agaw = 64;
+		iommu->iu_level = bitnum + 2;
+	}
+
+	/*
+	 * the iommu is initially disabled
+	 */
+	iommu->iu_enabled = B_FALSE;
+	iommu->iu_global_cmd_reg = 0;
+
+	/*
+	 * init kstat
+	 */
+	(void) iommu_init_stats(iommu);
+	bzero(&(iommu->iu_statistics), sizeof (iommu_stat_t));
+
+	/*
+	 * init dmar ops
+	 */
+	dmar_init_ops(iommu);
+
+	/*
+	 * alloc the root entry table; this must come after dmar_init_ops()
+	 */
+	iommu->iu_root_entry_paddr = iommu_get_page(iommu, KM_SLEEP);
+
+	/*
+	 * initialize the iotlb pending list and cache
+	 */
+	mutex_init(&(iommu->iu_pend_head.ich_pend_lock), NULL,
+	    MUTEX_DRIVER, NULL);
+	list_create(&(iommu->iu_pend_head.ich_pend_list),
+	    sizeof (iotlb_pend_node_t),
+	    offsetof(iotlb_pend_node_t, node));
+	iommu->iu_pend_head.ich_pend_count = 0;
+
+	mutex_init(&(iommu->iu_pend_head.ich_mem_lock), NULL,
+	    MUTEX_DRIVER, NULL);
+	list_create(&(iommu->iu_pend_head.ich_mem_list),
+	    sizeof (iotlb_pend_node_t),
+	    offsetof(iotlb_pend_node_t, node));
+
+	/*
+	 * insert this iommu into the list
+	 */
+	list_insert_tail(&iommu_states, iommu);
+
+	/*
+	 * report this unit
+	 */
+	cmn_err(CE_CONT, "?\t%s state structure created\n",
+	    ddi_node_name(iommu->iu_drhd->di_dip));
+
+	return (DDI_SUCCESS);
+}
+
+#define	IS_OVERLAP(new, old)	(((new)->rm_pfn_start <= (old)->rm_pfn_end) && \
+				((new)->rm_pfn_end >= (old)->rm_pfn_start))
+
+/*
+ * memory_region_overlap()
+ *   handle the pci mmio pages overlap condition
+ */
+static boolean_t
+memory_region_overlap(dmar_reserve_pages_t *rmem)
+{
+	dmar_reserve_pages_t *temp;
+
+	for_each_in_list(&reserve_memory, temp) {
+		if (IS_OVERLAP(rmem, temp)) {
+			temp->rm_pfn_start = MIN(temp->rm_pfn_start,
+			    rmem->rm_pfn_start);
+			temp->rm_pfn_end = MAX(temp->rm_pfn_end,
+			    rmem->rm_pfn_end);
+			return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * collect_pci_mmio_walk
+ *   reserve a single device's mmio resources
+ */
+static int
+collect_pci_mmio_walk(dev_info_t *dip, void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	int i, length, account;
+	pci_regspec_t *assigned;
+	uint64_t mmio_hi, mmio_lo, mmio_size;
+	dmar_reserve_pages_t *rmem;
+
+	/*
+	 * ignore the devices which have no assigned-addresses
+	 * property
+	 */
+	if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "assigned-addresses", (caddr_t)&assigned,
+	    &length) != DDI_PROP_SUCCESS)
+		return (DDI_WALK_CONTINUE);
+
+	account = length / sizeof (pci_regspec_t);
+
+	for (i = 0; i < account; i++) {
+
+		/*
+		 * check the memory-mapped io assigned-addresses;
+		 * refer to pci.h for the bit definitions of
+		 * pci_phys_hi
+		 */
+		if (((assigned[i].pci_phys_hi & PCI_ADDR_MASK)
+		    == PCI_ADDR_MEM32) ||
+		    ((assigned[i].pci_phys_hi & PCI_ADDR_MASK)
+		    == PCI_ADDR_MEM64)) {
+			mmio_lo = (((uint64_t)assigned[i].pci_phys_mid) << 32) |
+			    (uint64_t)assigned[i].pci_phys_low;
+			mmio_size =
+			    (((uint64_t)assigned[i].pci_size_hi) << 32) |
+			    (uint64_t)assigned[i].pci_size_low;
+			mmio_hi = mmio_lo + mmio_size - 1;
+
+			rmem = kmem_alloc(sizeof (dmar_reserve_pages_t),
+			    KM_SLEEP);
+			rmem->rm_pfn_start = IOMMU_BTOP(mmio_lo);
+			rmem->rm_pfn_end = IOMMU_BTOP(mmio_hi);
+			if (!memory_region_overlap(rmem)) {
+				list_insert_tail(&reserve_memory, rmem);
+			}
+		}
+	}
+
+	kmem_free(assigned, length);
+
+	return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * collect_pci_mmio()
+ *   walk through the pci device tree, and collect the mmio resources
+ */
+static int
+collect_pci_mmio(dev_info_t *pdip)
+{
+	int count;
+	ASSERT(pdip);
+
+	/*
+	 * walk through the device tree under pdip
+	 * normally, pdip should be the pci root nexus
+	 */
+	ndi_devi_enter(pdip, &count);
+	ddi_walk_devs(ddi_get_child(pdip),
+	    collect_pci_mmio_walk, NULL);
+	ndi_devi_exit(pdip, count);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * iommu_collect_reserve_memory()
+ *   collect the reserved memory region
+ */
+static void
+iommu_collect_reserve_memory(void)
+{
+	dmar_reserve_pages_t *rmem;
+
+	/*
+	 * reserve pages for pci memory mapped io
+	 */
+	(void) collect_pci_mmio(pci_top_devinfo);
+
+	/*
+	 * reserve pages for ioapic
+	 */
+	rmem = kmem_alloc(sizeof (dmar_reserve_pages_t), KM_SLEEP);
+	rmem->rm_pfn_start = IOMMU_BTOP(IOAPIC_REGION_START);
+	rmem->rm_pfn_end = IOMMU_BTOP(IOAPIC_REGION_END);
+	list_insert_tail(&reserve_memory, rmem);
+}
+
+/*
+ * match_dip_sbdf()
+ *   walk function for get_dip_from_info()
+ */
+static int
+match_dip_sbdf(dev_info_t *dip, void *arg)
+{
+	iommu_private_t *private = DEVI(dip)->devi_iommu_private;
+	pci_dev_info_t *info = arg;
+
+	if (private &&
+	    (info->pdi_seg == private->idp_seg) &&
+	    (info->pdi_bus == private->idp_bus) &&
+	    (info->pdi_devfn == private->idp_devfn)) {
+		info->pdi_dip = dip;
+		return (DDI_WALK_TERMINATE);
+	}
+	return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * get_dip_from_info()
+ *   look up the dev_info structure for a given seg/bus/dev/func
+ */
+static int
+get_dip_from_info(pci_dev_info_t *info)
+{
+	int count;
+	info->pdi_dip = NULL;
+
+	ndi_devi_enter(pci_top_devinfo, &count);
+	ddi_walk_devs(ddi_get_child(pci_top_devinfo),
+	    match_dip_sbdf, info);
+	ndi_devi_exit(pci_top_devinfo, count);
+
+	if (info->pdi_dip)
+		return (DDI_SUCCESS);
+	else
+		return (DDI_FAILURE);
+}
+
+/*
+ * get_pci_top_bridge()
+ *   get the top level bridge for a pci device
+ */
+static dev_info_t *
+get_pci_top_bridge(dev_info_t *dip)
+{
+	iommu_private_t *private;
+	dev_info_t *tmp, *pdip;
+
+	tmp = NULL;
+	pdip = ddi_get_parent(dip);
+	while (pdip != pci_top_devinfo) {
+		private = DEVI(pdip)->devi_iommu_private;
+		if ((private->idp_bbp_type == IOMMU_PPB_PCIE_PCI) ||
+		    (private->idp_bbp_type == IOMMU_PPB_PCI_PCI))
+			tmp = pdip;
+		pdip = ddi_get_parent(pdip);
+	}
+
+	return (tmp);
+}
+
+/*
+ * domain_vmem_init_reserve()
+ *   carve the reserved pages out of the domain vmem arena
+ */
+static void
+domain_vmem_init_reserve(dmar_domain_state_t *domain)
+{
+	dmar_reserve_pages_t *rmem;
+	uint64_t lo, hi;
+	size_t size;
+
+	for_each_in_list(&reserve_memory, rmem) {
+		lo = IOMMU_PTOB(rmem->rm_pfn_start);
+		hi = IOMMU_PTOB(rmem->rm_pfn_end + 1);
+		size = hi - lo;
+
+		if (vmem_xalloc(domain->dm_dvma_map,
+		    size,		/* size */
+		    IOMMU_PAGE_SIZE,	/* align/quantum */
+		    0,			/* phase */
+		    0,			/* nocross */
+		    (void *)(uintptr_t)lo,	/* minaddr */
+		    (void *)(uintptr_t)hi,	/* maxaddr */
+		    VM_NOSLEEP) == NULL) {
+			cmn_err(CE_WARN,
+			    "region [%" PRIx64 ",%" PRIx64 ") not reserved",
+			    lo, hi);
+		}
+	}
+}
+
+/*
+ * domain_vmem_init()
+ *   initialize the domain vmem arena
+ */
+static void
+domain_vmem_init(dmar_domain_state_t *domain)
+{
+	char vmem_name[64];
+	uint64_t base, size;
+	static uint_t vmem_instance = 0;
+
+	/*
+	 * create an arena covering the whole available dvma space
+	 * and carve out the reserved memory regions with xalloc
+	 */
+	(void) snprintf(vmem_name, sizeof (vmem_name),
+	    "domain_vmem_%d", vmem_instance++);
+	base = IOMMU_PAGE_SIZE;
+	size = IOMMU_SIZE_4G - base;
+
+	domain->dm_dvma_map = vmem_create(vmem_name,
+	    (void *)(uintptr_t)base,	/* base */
+	    size,			/* size */
+	    IOMMU_PAGE_SIZE,		/* quantum */
+	    NULL,			/* afunc */
+	    NULL,			/* ffunc */
+	    NULL,			/* source */
+	    0,				/* qcache_max */
+	    VM_SLEEP);
+
+	/*
+	 * carve out the reserved pages
+	 */
+	domain_vmem_init_reserve(domain);
+}
+
+/*
+ * iommu_domain_init()
+ *   initialize a domain
+ */
+static int
+iommu_domain_init(dmar_domain_state_t *domain)
+{
+	uint_t i;
+
+	/*
+	 * allocate the domain id
+	 */
+	if (iommu_rscs_alloc(domain->dm_iommu->iu_domain_id_hdl,
+	    &(domain->dm_domain_id)) != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "domain id exhausted %p, assign 1",
+		    (void *)domain->dm_iommu);
+		domain->dm_domain_id = 1;
+	}
+
+	/*
+	 * record the domain statistics
+	 */
+	atomic_inc_64(&(domain->dm_iommu->iu_statistics.st_domain_alloc));
+
+	/*
+	 * create vmem map
+	 */
+	domain_vmem_init(domain);
+
+	/*
+	 * create the first level page table
+	 */
+	domain->dm_page_table_paddr =
+	    iommu_get_page(domain->dm_iommu, KM_SLEEP);
+
+	/*
+	 * init the cpu-side (virtual) view of the page tables
+	 */
+	domain->dm_pt_tree.vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP);
+	domain->dm_pt_tree.pp = iommu_page_map(domain->dm_page_table_paddr);
+	domain->dm_identity = B_FALSE;
+
+	/*
+	 * init the dvma cache
+	 */
+	for (i = 0; i < DVMA_CACHE_HEAD_CNT; i++) {
+		/* init the free list */
+		mutex_init(&(domain->dm_dvma_cache[i].dch_free_lock),
+		    NULL, MUTEX_DRIVER, NULL);
+		list_create(&(domain->dm_dvma_cache[i].dch_free_list),
+		    sizeof (dvma_cache_node_t),
+		    offsetof(dvma_cache_node_t, node));
+		domain->dm_dvma_cache[i].dch_free_count = 0;
+
+		/* init the memory cache list */
+		mutex_init(&(domain->dm_dvma_cache[i].dch_mem_lock),
+		    NULL, MUTEX_DRIVER, NULL);
+		list_create(&(domain->dm_dvma_cache[i].dch_mem_list),
+		    sizeof (dvma_cache_node_t),
+		    offsetof(dvma_cache_node_t, node));
+	}
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * dmar_check_sub()
+ *   check to see if the device is under the scope of a p2p bridge
+ */
+static boolean_t
+dmar_check_sub(dev_info_t *dip, pci_dev_scope_t *devs)
+{
+	dev_info_t *pdip, *pci_root;
+	iommu_private_t *private;
+	int bus = devs->pds_bus;
+	int devfn = ((devs->pds_dev << 3) | devs->pds_func);
+
+	pdip = ddi_get_parent(dip);
+	pci_root = pci_top_devinfo;
+	while (pdip != pci_root) {
+		private = DEVI(pdip)->devi_iommu_private;
+		if (private && (private->idp_bus == bus) &&
+		    (private->idp_devfn == devfn))
+			return (B_TRUE);
+		pdip = ddi_get_parent(pdip);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * iommu_get_dmar()
+ *   get the iommu structure for a device
+ */
+static intel_iommu_state_t *
+iommu_get_dmar(dev_info_t *dip)
+{
+	iommu_private_t *private =
+	    DEVI(dip)->devi_iommu_private;
+	int seg = private->idp_seg;
+	int bus = private->idp_bus;
+	int dev = private->idp_devfn >> 3;
+	int func = private->idp_devfn & 7;
+	pci_dev_scope_t *devs;
+	drhd_info_t *drhd;
+
+	/*
+	 * walk the drhd list for a match
+	 */
+	for_each_in_list(&(dmar_info->dmari_drhd[seg]), drhd) {
+
+		/*
+		 * an include_all drhd matches every device on this segment
+		 */
+		if (drhd->di_include_all)
+			return ((intel_iommu_state_t *)
+			    drhd->di_iommu);
+
+		/*
+		 * try to match the device scope
+		 */
+		for_each_in_list(&(drhd->di_dev_list), devs) {
+
+			/*
+			 * get a perfect match
+			 */
+			if (devs->pds_bus == bus &&
+			    devs->pds_dev == dev &&
+			    devs->pds_func == func) {
+				return ((intel_iommu_state_t *)
+				    (drhd->di_iommu));
+			}
+
+			/*
+			 * the device may be under the scope of a p2p bridge
+			 */
+			if (devs->pds_type == 0x2 &&
+			    dmar_check_sub(dip, devs))
+				return ((intel_iommu_state_t *)
+				    (drhd->di_iommu));
+		}
+	}
+
+	/*
+	 * shouldn't get here
+	 */
+	cmn_err(CE_PANIC, "can't match iommu for %s\n",
+	    ddi_node_name(dip));
+
+	return (NULL);
+}
+
+/*
+ * domain_set_root_context()
+ *   set the root and context entries for a single device
+ */
+static void
+domain_set_root_context(dmar_domain_state_t *domain,
+    pci_dev_info_t *info, uint_t agaw)
+{
+	caddr_t root, context;
+	paddr_t paddr;
+	iorce_t rce;
+	uint_t bus, devfn;
+	intel_iommu_state_t *iommu;
+	uint_t aw_code;
+
+	ASSERT(domain);
+	iommu = domain->dm_iommu;
+	ASSERT(iommu);
+	bus = info->pdi_bus;
+	devfn = info->pdi_devfn;
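+	/*
+	 * aw_code is the address-width encoding stored in the context
+	 * entry: an agaw of 39, 48 or 57 bits (3-, 4- or 5-level page
+	 * tables) yields an aw_code of 1, 2 or 3 respectively.
+	 */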
+	aw_code = (agaw - 30) / 9;
+
+	/*
+	 * set root entry
+	 */
+	root = iommu_page_map(iommu->iu_root_entry_paddr);
+	rce = (iorce_t)root + bus;
+	mutex_enter(&(iommu->iu_root_context_lock));
+	if (!ROOT_ENTRY_GET_P(rce)) {
+		paddr = iommu_get_page(iommu, KM_SLEEP);
+		ROOT_ENTRY_SET_P(rce);
+		ROOT_ENTRY_SET_CTP(rce, paddr);
+		iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce));
+		context = iommu_page_map(paddr);
+	} else {
+		paddr = ROOT_ENTRY_GET_CTP(rce);
+		context = iommu_page_map(paddr);
+	}
+
+	/* set context entry */
+	rce = (iorce_t)context + devfn;
+	if (!CONT_ENTRY_GET_P(rce)) {
+		paddr = domain->dm_page_table_paddr;
+		CONT_ENTRY_SET_P(rce);
+		CONT_ENTRY_SET_ASR(rce, paddr);
+		CONT_ENTRY_SET_AW(rce, aw_code);
+		CONT_ENTRY_SET_DID(rce, domain->dm_domain_id);
+		iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce));
+	} else if (CONT_ENTRY_GET_ASR(rce) !=
+	    domain->dm_page_table_paddr) {
+		cmn_err(CE_WARN, "root context entry for"
+		    " %d, %d, %d has already been set", bus,
+		    devfn >> 3, devfn & 0x7);
+	}
+
+	mutex_exit(&(iommu->iu_root_context_lock));
+	iommu_page_unmap(root);
+	iommu_page_unmap(context);
+
+	/* cache mode set, flush context cache */
+	if (IOMMU_CAP_GET_CM(iommu->iu_capability)) {
+		iommu->iu_dmar_ops->do_context_fsi(iommu, 0,
+		    (bus << 8) | devfn, domain->dm_domain_id);
+		iommu->iu_dmar_ops->do_iotlb_dsi(iommu, domain->dm_domain_id);
+	/* cache mode not set, flush write buffer */
+	} else {
+		iommu->iu_dmar_ops->do_flwb(iommu);
+	}
+}
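+
+/*
+ * Example (hypothetical device, for illustration): for a device at
+ * bus 3, device 0x1c, function 2, the root table is indexed by the bus
+ * number (entry 3) and the context table it points to is indexed by
+ * devfn ((0x1c << 3) | 2 == 0xe2); that context entry then holds the
+ * domain id, address width and page table root for the device.
+ */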
+
+/*
+ * setup_single_context()
+ *   setup the root context entry
+ */
+static void
+setup_single_context(dmar_domain_state_t *domain,
+    int seg, int bus, int devfn)
+{
+	pci_dev_info_t info;
+
+	info.pdi_seg = seg;
+	info.pdi_bus = bus;
+	info.pdi_devfn = devfn;
+
+	domain_set_root_context(domain, &info,
+	    domain->dm_iommu->iu_agaw);
+}
+
+/*
+ * setup_context_walk()
+ *   the walk function to set up the possible context entries
+ */
+static int
+setup_context_walk(dev_info_t *dip, void *arg)
+{
+	dmar_domain_state_t *domain = arg;
+	iommu_private_t *private;
+
+	private = DEVI(dip)->devi_iommu_private;
+	ASSERT(private);
+
+	setup_single_context(domain, private->idp_seg,
+	    private->idp_bus, private->idp_devfn);
+	return (DDI_WALK_PRUNECHILD);
+}
+
+/*
+ * setup_possible_contexts()
+ *   set up all the possible context entries for a device under ppb
+ */
+static void
+setup_possible_contexts(dmar_domain_state_t *domain, dev_info_t *dip)
+{
+	int count;
+	iommu_private_t *private;
+	private = DEVI(dip)->devi_iommu_private;
+
+	/* for pci-pci bridge */
+	if (private->idp_bbp_type == IOMMU_PPB_PCI_PCI) {
+		setup_single_context(domain, private->idp_seg,
+		    private->idp_bus, private->idp_devfn);
+		return;
+	}
+
+	/* for pcie-pci bridge */
+	setup_single_context(domain, private->idp_seg,
+	    private->idp_bus, private->idp_devfn);
+	setup_single_context(domain, private->idp_seg,
+	    private->idp_sec, 0);
+
+	/* for functions under pcie-pci bridge */
+	ndi_devi_enter(dip, &count);
+	ddi_walk_devs(ddi_get_child(dip), setup_context_walk, domain);
+	ndi_devi_exit(dip, count);
+}
+
+/*
+ * iommu_alloc_domain()
+ *   allocate a domain for the device; the result is returned in *domain
+ */
+static int
+iommu_alloc_domain(dev_info_t *dip, dmar_domain_state_t **domain)
+{
+	iommu_private_t *private, *b_private;
+	dmar_domain_state_t *new;
+	pci_dev_info_t info;
+	dev_info_t *bdip = NULL;
+	uint_t need_to_set_parent;
+	int count, pcount;
+
+	need_to_set_parent = 0;
+	private = DEVI(dip)->devi_iommu_private;
+	if (private == NULL) {
+		cmn_err(CE_PANIC, "iommu private is NULL (%s)\n",
+		    ddi_node_name(dip));
+	}
+
+	/*
+	 * check if the domain has already been allocated
+	 */
+	if (private->idp_domain) {
+		*domain = private->idp_domain;
+		return (DDI_SUCCESS);
+	}
+
+	/*
+	 * we have to assign a domain to this device
+	 */
+
+	ndi_hold_devi(dip);
+	bdip = get_pci_top_bridge(dip);
+	if (bdip != NULL) {
+		ndi_devi_enter(ddi_get_parent(bdip), &pcount);
+	}
+
+	/*
+	 * hold the parent while modifying its children
+	 */
+	ndi_devi_enter(ddi_get_parent(dip), &count);
+
+	/*
+	 * check to see if it is under a pci bridge
+	 */
+	if (bdip != NULL) {
+		b_private = DEVI(bdip)->devi_iommu_private;
+		if (b_private->idp_domain) {
+			new = b_private->idp_domain;
+			goto get_domain_finish;
+		} else {
+			need_to_set_parent = 1;
+		}
+	}
+
+get_domain_alloc:
+	/*
+	 * OK, we have to allocate a new domain
+	 */
+	new = kmem_alloc(sizeof (dmar_domain_state_t), KM_SLEEP);
+	new->dm_iommu = iommu_get_dmar(dip);
+
+	/*
+	 * setup the domain
+	 */
+	if (iommu_domain_init(new) != DDI_SUCCESS) {
+		ndi_devi_exit(ddi_get_parent(dip), count);
+		if (need_to_set_parent)
+			ndi_devi_exit(ddi_get_parent(bdip), pcount);
+		return (DDI_FAILURE);
+	}
+
+get_domain_finish:
+	/*
+	 * attach the new domain to the device's iommu private data
+	 */
+	private->idp_domain = new;
+	ndi_devi_exit(ddi_get_parent(dip), count);
+
+	if (need_to_set_parent) {
+		b_private->idp_domain = new;
+		ndi_devi_exit(ddi_get_parent(bdip), pcount);
+		setup_possible_contexts(new, bdip);
+	} else if (bdip == NULL) {
+		info.pdi_seg = private->idp_seg;
+		info.pdi_bus = private->idp_bus;
+		info.pdi_devfn = private->idp_devfn;
+		domain_set_root_context(new, &info,
+		    new->dm_iommu->iu_agaw);
+	} else {
+		ndi_devi_exit(ddi_get_parent(bdip), pcount);
+	}
+
+	/*
+	 * return new domain
+	 */
+	*domain = new;
+	return (DDI_SUCCESS);
+}
+
+/*
+ * iommu_get_domain()
+ *   get an iommu domain for dip; the result is returned in domain
+ */
+static int
+iommu_get_domain(dev_info_t *dip, dmar_domain_state_t **domain)
+{
+	iommu_private_t *private;
+	dev_info_t *pdip;
+	private = DEVI(dip)->devi_iommu_private;
+
+	ASSERT(domain);
+
+	/*
+	 * for isa devices attached under lpc
+	 */
+	if (ddi_get_parent(dip) == isa_top_devinfo) {
+		if (lpc_devinfo) {
+			return (iommu_alloc_domain(lpc_devinfo, domain));
+		} else {
+			*domain = NULL;
+			return (DDI_FAILURE);
+		}
+	}
+
+	/*
+	 * for agpgart, use the real graphics devinfo
+	 */
+	if (strcmp(ddi_node_name(dip), "agpgart") == 0) {
+		if (gfx_devinfo) {
+			return (iommu_alloc_domain(gfx_devinfo, domain));
+		} else {
+			*domain = NULL;
+			return (DDI_FAILURE);
+		}
+	}
+
+	/*
+	 * if iommu private is NULL, we share
+	 * the domain with the parent
+	 */
+	if (private == NULL) {
+		pdip = ddi_get_parent(dip);
+		return (iommu_alloc_domain(pdip, domain));
+	}
+
+	/*
+	 * check if the domain has already been allocated
+	 */
+	if (private->idp_domain) {
+		*domain = private->idp_domain;
+		return (DDI_SUCCESS);
+	}
+
+	/*
+	 * allocate a domain for this device
+	 */
+	return (iommu_alloc_domain(dip, domain));
+}
+
+/*
+ * helper functions to manipulate iommu pte
+ */
+static inline void
+set_pte(iopte_t pte, uint_t rw, paddr_t addr)
+{
+	*pte |= (rw & 0x3);
+	*pte |= (addr & IOMMU_PAGE_MASK);
+}
+
+static inline paddr_t
+pte_get_paddr(iopte_t pte)
+{
+	return (*pte & IOMMU_PAGE_MASK);
+}
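+
+/*
+ * With this layout (assuming 4K iommu pages) the low two bits of a pte
+ * carry the read/write permission and the bits above the page offset
+ * carry the page-aligned physical address, which is why set_pte() ors
+ * in (rw & 0x3) and (addr & IOMMU_PAGE_MASK) separately.
+ */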
+
+/*
+ * dvma_level_offset()
+ *   compute the page table offset for a given dvma page number and level
+ */
+static inline uint_t
+dvma_level_offset(uint64_t dvma_pn, uint_t level)
+{
+	uint_t start_bit, offset;
+
+	start_bit = (level - 1) * IOMMU_LEVEL_STRIDE;
+	offset = (dvma_pn >> start_bit) & IOMMU_LEVEL_OFFSET;
+
+	return (offset);
+}
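+
+/*
+ * Worked example (assuming IOMMU_LEVEL_STRIDE == 9 and
+ * IOMMU_LEVEL_OFFSET == 0x1ff): for dvma 0x40302000 the page number is
+ * 0x40302, so the level 1 offset is 0x102, the level 2 offset is 0x1
+ * and the level 3 offset is 0x1.
+ */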
+
+/*
+ * iommu_setup_level_table()
+ *   setup the page table for a level
+ */
+static iovpte_t
+iommu_setup_level_table(dmar_domain_state_t *domain,
+    iovpte_t pvpte, uint_t offset)
+{
+	iopte_t pte;
+	iovpte_t vpte;
+	paddr_t child;
+
+	vpte = (iovpte_t)(pvpte->vp) + offset;
+	pte = (iopte_t)(pvpte->pp) + offset;
+
+	/*
+	 * the pte is nonpresent, alloc new page
+	 */
+	if (*pte == NULL) {
+		child = iommu_get_page(domain->dm_iommu, KM_SLEEP);
+		set_pte(pte, IOMMU_PAGE_PROP_RW, child);
+		domain->dm_iommu->iu_dmar_ops->do_clflush((caddr_t)pte,
+		    sizeof (*pte));
+		vpte->vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP);
+		vpte->pp = iommu_page_map(child);
+	}
+
+	return (vpte);
+}
+
+/*
+ * iommu_setup_page_table()
+ *   setup the page table for a dvma
+ */
+static caddr_t
+iommu_setup_page_table(dmar_domain_state_t *domain, uint64_t dvma)
+{
+	iovpte_t vpte;
+	uint_t level;
+	uint_t offset;
+	int i;
+
+	level = domain->dm_iommu->iu_level;
+	vpte = &(domain->dm_pt_tree);
+
+	for (i = level; i > 1; i--) {
+		offset = dvma_level_offset(IOMMU_BTOP(dvma), i);
+		vpte = iommu_setup_level_table(domain, vpte, offset);
+	}
+
+	return (vpte->pp);
+}
+
+/*
+ * iommu_map_page_range()
+ *   map a range of pages for iommu translation
+ *
+ * domain: the device domain
+ * dvma: the start dvma for mapping
+ * start: the start physical address
+ * end: the end physical address
+ * flags: misc flag
+ */
+static int
+iommu_map_page_range(dmar_domain_state_t *domain, uint64_t dvma,
+    uint64_t start, uint64_t end, int flags)
+{
+	uint_t offset;
+	iopte_t pte;
+	caddr_t vaddr, dirt;
+	uint64_t paddr = start & IOMMU_PAGE_MASK;
+	uint64_t epaddr = end & IOMMU_PAGE_MASK;
+	uint64_t ioaddr = dvma & IOMMU_PAGE_MASK;
+	uint_t count;
+
+	while (paddr <= epaddr) {
+		vaddr = iommu_setup_page_table(domain, ioaddr);
+		offset = dvma_level_offset(IOMMU_BTOP(ioaddr), 1);
+
+		count = 0;
+		dirt = (caddr_t)((iopte_t)vaddr + offset);
+		while ((paddr <= epaddr) && (offset < IOMMU_PTE_MAX)) {
+			pte = (iopte_t)vaddr + offset;
+			if (*pte != NULL) {
+				if (pte_get_paddr(pte) != paddr) {
+					cmn_err(CE_WARN, "try to set "
+					    "non-NULL pte");
+				}
+			} else {
+				set_pte(pte, IOMMU_PAGE_PROP_RW, paddr);
+			}
+			paddr += IOMMU_PAGE_SIZE;
+			offset++;
+			count++;
+		}
+
+		/* flush cpu and iotlb cache */
+		domain->dm_iommu->iu_dmar_ops->do_clflush(dirt,
+		    count * sizeof (uint64_t));
+
+		if (!(flags & IOMMU_PAGE_PROP_NOSYNC)) {
+			/* cache mode set, flush iotlb */
+			if (IOMMU_CAP_GET_CM(domain->dm_iommu->iu_capability)) {
+				domain->dm_iommu->iu_dmar_ops->
+				    do_iotlb_psi(domain->dm_iommu,
+				    0, ioaddr, count, TLB_IVA_WHOLE);
+			/* cache mode not set, flush write buffer */
+			} else {
+				domain->dm_iommu->iu_dmar_ops->
+				    do_flwb(domain->dm_iommu);
+			}
+		}
+
+		ioaddr += IOMMU_PTOB(count);
+	}
+
+	return (DDI_SUCCESS);
+}
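+
+/*
+ * Example (illustrative addresses, assuming 4K iommu pages): mapping
+ * dvma 0x8000000 to the physical range [0x1234000, 0x1237fff] fills
+ * four consecutive leaf ptes, flushes the cpu cache lines covering
+ * them, and then flushes either the iotlb (caching mode) or the write
+ * buffer, unless IOMMU_PAGE_PROP_NOSYNC was passed in flags.
+ */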
+
+/*
+ * build_single_rmrr_identity_map()
+ *   build identity map for a single rmrr unit
+ */
+static void
+build_single_rmrr_identity_map(rmrr_info_t *rmrr)
+{
+	pci_dev_scope_t *devs;
+	pci_dev_info_t info;
+	uint64_t start, end, size;
+	dmar_domain_state_t *domain;
+
+	info.pdi_seg = rmrr->ri_segment;
+	for_each_in_list(&(rmrr->ri_dev_list), devs) {
+		info.pdi_bus = devs->pds_bus;
+		info.pdi_devfn = (devs->pds_dev << 3) |
+		    devs->pds_func;
+
+		if (get_dip_from_info(&info) != DDI_SUCCESS) {
+			cmn_err(CE_WARN, "rmrr: get dip for %d,%d failed",
+			    info.pdi_bus, info.pdi_devfn);
+			continue;
+		}
+
+		if (iommu_get_domain(info.pdi_dip, &domain) != DDI_SUCCESS) {
+			cmn_err(CE_WARN, "rmrr: get domain for %s failed",
+			    ddi_node_name(info.pdi_dip));
+			continue;
+		}
+
+		start = rmrr->ri_baseaddr;
+		end = rmrr->ri_limiaddr;
+		size = end - start + 1;
+
+		/*
+		 * setup the page tables
+		 */
+		if ((vmem_xalloc(domain->dm_dvma_map,
+		    size,		/* size */
+		    IOMMU_PAGE_SIZE,	/* align/quantum */
+		    0,			/* phase */
+		    0,			/* nocross */
+		    (void *)(uintptr_t)start,		/* minaddr */
+		    (void *)(uintptr_t)(end + 1),	/* maxaddr */
+		    VM_NOSLEEP) != NULL)) {
+			(void) iommu_map_page_range(domain,
+			    start, start, end,
+			    DDI_DMA_READ | DDI_DMA_WRITE |
+			    IOMMU_PAGE_PROP_NOSYNC);
+		} else {
+			cmn_err(CE_WARN, "Can't reserve 0x%" PRIx64
+			    " ~ 0x%" PRIx64 " for %s", start, end,
+			    ddi_node_name(info.pdi_dip));
+		}
+	}
+}
+
+/*
+ * build_rmrr_identity_map()
+ *   build identity mapping for devices under rmrr scopes
+ */
+static void
+build_rmrr_identity_map(void)
+{
+	rmrr_info_t *rmrr;
+	int i;
+
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		if (list_is_empty(&(dmar_info->dmari_rmrr[i])))
+			break;
+		for_each_in_list(&(dmar_info->dmari_rmrr[i]), rmrr) {
+			build_single_rmrr_identity_map(rmrr);
+		}
+	}
+}
+
+/*
+ * drhd_only_for_gfx()
+ *   return B_TRUE if the drhd only covers a gfx device
+ */
+static boolean_t
+drhd_only_for_gfx(intel_iommu_state_t *iommu)
+{
+	drhd_info_t *drhd = iommu->iu_drhd;
+	pci_dev_scope_t *devs;
+	pci_dev_info_t info;
+	int dev_num;
+
+	if (drhd->di_include_all)
+		return (B_FALSE);
+
+	/* count the devices attached to this drhd */
+	dev_num = 0;
+	for_each_in_list(&(drhd->di_dev_list), devs) {
+		dev_num++;
+	}
+
+	if (dev_num == 1) {
+		iommu_private_t *private;
+		devs = list_head(&(drhd->di_dev_list));
+		info.pdi_seg = drhd->di_segment;
+		info.pdi_bus = devs->pds_bus;
+		info.pdi_devfn = (devs->pds_dev << 3) +
+		    (devs->pds_func & 0x7);
+
+		if (get_dip_from_info(&info) != DDI_SUCCESS) {
+			return (B_FALSE);
+		}
+
+		private = DEVI(info.pdi_dip)->devi_iommu_private;
+		if (private->idp_is_display)
+			return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * build_gfx_identity_map()
+ *   build identity map for the gfx device
+ */
+static void
+build_gfx_identity_map(dev_info_t *dip)
+{
+	struct memlist *mp;
+	dmar_domain_state_t *domain;
+
+	if (iommu_get_domain(dip, &domain) != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "build identity map for %s failed, "
+		    "this device may not be functional",
+		    ddi_node_name(dip));
+		return;
+	}
+
+	gfx_devinfo = dip;
+
+	ASSERT(bootops != NULL);
+	ASSERT(!modrootloaded);
+	mp = bootops->boot_mem->physinstalled;
+	while (mp != 0) {
+		(void) iommu_map_page_range(domain,
+		    mp->address & IOMMU_PAGE_MASK,
+		    mp->address & IOMMU_PAGE_MASK,
+		    (mp->address + mp->size - 1) & IOMMU_PAGE_MASK,
+		    DDI_DMA_READ | DDI_DMA_WRITE |
+		    IOMMU_PAGE_PROP_NOSYNC);
+		mp = mp->next;
+	}
+
+	/*
+	 * record that this domain is identity mapped; any
+	 * device which uses this domain needs no further
+	 * mapping
+	 */
+	domain->dm_identity = B_TRUE;
+}
+
+/*
+ * build_isa_gfx_identity_walk()
+ *   the walk function for build_isa_gfx_identity_map()
+ */
+static int
+build_isa_gfx_identity_walk(dev_info_t *dip, void *arg)
+{
+	_NOTE(ARGUNUSED(arg))
+
+	iommu_private_t *private;
+	private = DEVI(dip)->devi_iommu_private;
+
+	/* ignore devices with no iommu private data */
+	if (!private)
+		return (DDI_WALK_CONTINUE);
+
+	/* handle the gfx and lpc (isa) devices */
+	if (private->idp_is_display)
+		build_gfx_identity_map(dip);
+	else if (private->idp_is_lpc)
+		lpc_devinfo = dip;
+
+	return (DDI_WALK_CONTINUE);
+}
+
+/*
+ * build_isa_gfx_identity_map()
+ *   build identity map for isa and gfx devices
+ */
+static void
+build_isa_gfx_identity_map(void)
+{
+	int count;
+
+	/*
+	 * walk through the device tree under the pci
+	 * root nexus
+	 */
+	ndi_devi_enter(pci_top_devinfo, &count);
+	ddi_walk_devs(ddi_get_child(pci_top_devinfo),
+	    build_isa_gfx_identity_walk, NULL);
+	ndi_devi_exit(pci_top_devinfo, count);
+}
+
+/*
+ * dmar_check_boot_option()
+ *   check an intel iommu boot option and set *var accordingly
+ */
+static void
+dmar_check_boot_option(char *opt, int *var)
+{
+	int len;
+	char *boot_option;
+
+	if ((len = do_bsys_getproplen(NULL, opt)) > 0) {
+		boot_option = kmem_alloc(len, KM_SLEEP);
+		(void) do_bsys_getprop(NULL, opt, boot_option);
+		if (strcmp(boot_option, "yes") == 0) {
+			cmn_err(CE_CONT, "\"%s=yes\" was set\n",
+			    opt);
+			*var = 1;
+		} else if (strcmp(boot_option, "no") == 0) {
+			cmn_err(CE_CONT, "\"%s=no\" was set\n",
+			    opt);
+			*var = 0;
+		}
+		kmem_free(boot_option, len);
+	}
+}
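+
+/*
+ * The options checked are "dmar-gfx-disable" and "dmar-drhd-disable";
+ * as a hypothetical example, booting with a property such as
+ * dmar-drhd-disable=yes (e.g. set with -B on the kernel line) would be
+ * picked up here and keep the drhd units from being enabled.
+ */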
+
+extern void (*rootnex_iommu_add_intr)(void);
+
+/*
+ * intel_iommu_attach_dmar_nodes()
+ *   attach intel iommu nodes
+ */
+int
+intel_iommu_attach_dmar_nodes(void)
+{
+	drhd_info_t *drhd;
+	intel_iommu_state_t *iommu;
+	dmar_reserve_pages_t *rmem;
+	int i;
+
+	/*
+	 * retrieve the dmar boot options
+	 */
+	cmn_err(CE_CONT, "?Start to check dmar related boot options\n");
+	dmar_check_boot_option("dmar-gfx-disable", &gfx_drhd_disable);
+	dmar_check_boot_option("dmar-drhd-disable", &dmar_drhd_disable);
+
+	/*
+	 * init the lists
+	 */
+	list_create(&iommu_states, sizeof (intel_iommu_state_t),
+	    offsetof(intel_iommu_state_t, node));
+	list_create(&reserve_memory, sizeof (dmar_reserve_pages_t),
+	    offsetof(dmar_reserve_pages_t, node));
+
+	pci_top_devinfo = ddi_find_devinfo("pci", -1, 0);
+	isa_top_devinfo = ddi_find_devinfo("isa", -1, 0);
+	if (pci_top_devinfo == NULL) {
+		cmn_err(CE_WARN, "can't get pci top devinfo");
+		return (DDI_FAILURE);
+	}
+
+	iommu_page_init();
+
+	/*
+	 * initialize each iommu unit
+	 */
+	cmn_err(CE_CONT, "?Start to create iommu state structures\n");
+	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
+		for_each_in_list(&(dmar_info->dmari_drhd[i]), drhd) {
+			if (create_iommu_state(drhd) != DDI_SUCCESS)
+				goto iommu_init_fail;
+		}
+	}
+
+	/*
+	 * collect the reserved memory pages
+	 */
+	cmn_err(CE_CONT, "?Start to collect the reserved memory\n");
+	iommu_collect_reserve_memory();
+
+	/*
+	 * build identity map for devices in the rmrr scope
+	 */
+	cmn_err(CE_CONT, "?Start to prepare identity map for rmrr\n");
+	build_rmrr_identity_map();
+
+	/*
+	 * build identity map for isa and gfx devices
+	 */
+	cmn_err(CE_CONT, "?Start to prepare identity map for gfx\n");
+	build_isa_gfx_identity_map();
+
+	/*
+	 * initialize the dvma cookie cache
+	 */
+	for (i = 0; i < MAX_COOKIE_CACHE_SIZE; i++) {
+		mutex_init(&(cookie_cache[i].dch_lock), NULL,
+		    MUTEX_DRIVER, NULL);
+		cookie_cache[i].dch_count = 0;
+		cookie_cache[i].dch_next = NULL;
+	}
+
+	/*
+	 * register the intr add function
+	 */
+	rootnex_iommu_add_intr = intel_iommu_add_intr;
+
+	/*
+	 * enable dma remapping
+	 */
+	cmn_err(CE_CONT, "?Start to enable the dmar units\n");
+	if (!dmar_drhd_disable) {
+		for_each_in_list(&iommu_states, iommu) {
+			if (gfx_drhd_disable &&
+			    drhd_only_for_gfx(iommu))
+				continue;
+			iommu_bringup_unit(iommu);
+			iommu->iu_enabled = B_TRUE;
+		}
+	}
+
+	return (DDI_SUCCESS);
+
+iommu_init_fail:
+	/*
+	 * free the reserved memory list
+	 */
+	while (rmem = list_head(&reserve_memory)) {
+		list_remove(&reserve_memory, rmem);
+		kmem_free(rmem, sizeof (dmar_reserve_pages_t));
+	}
+	list_destroy(&reserve_memory);
+
+	/*
+	 * free the iommu state structures
+	 */
+	while (iommu = list_head(&iommu_states)) {
+		list_remove(&iommu_states, iommu);
+		destroy_iommu_state(iommu);
+	}
+	list_destroy(&iommu_states);
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * get_level_table()
+ *   get the level n page table; NULL is returned
+ *   on failure
+ */
+static caddr_t
+get_level_table(dmar_domain_state_t *domain,
+    uint64_t dvma_pn, uint_t n)
+{
+	iovpte_t vpte;
+	uint_t level;
+	uint_t i, offset;
+
+	level = domain->dm_iommu->iu_level;
+	ASSERT(level >= n);
+	vpte = &(domain->dm_pt_tree);
+
+	/* walk to the level n page table */
+	for (i = level; i > n; i--) {
+		offset = dvma_level_offset(dvma_pn, i);
+		vpte = (iovpte_t)(vpte->vp) + offset;
+	}
+
+	return (vpte->pp);
+}
+
+/*
+ * iommu_alloc_cookie_array()
+ *   allocate the cookie array which is needed by map sgl
+ */
+static int
+iommu_alloc_cookie_array(rootnex_dma_t *dma,
+    struct ddi_dma_req *dmareq, uint_t prealloc)
+{
+	int kmflag;
+	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
+
+	/* figure out the rough estimate of array size */
+	sinfo->si_max_pages =
+	    (dmareq->dmar_object.dmao_size + IOMMU_PAGE_OFFSET) /
+	    sinfo->si_max_cookie_size + 1;
+
+	/* the preallocated buffer fits this size */
+	if (sinfo->si_max_pages <= prealloc) {
+		dma->dp_cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
+		dma->dp_need_to_free_cookie = B_FALSE;
+	/* we need to allocate a new array */
+	} else {
+		/* convert the sleep flags */
+		if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
+			kmflag =  KM_SLEEP;
+		} else {
+			kmflag =  KM_NOSLEEP;
+		}
+
+		dma->dp_cookie_size = sinfo->si_max_pages *
+		    sizeof (ddi_dma_cookie_t);
+		dma->dp_cookies = kmem_alloc(dma->dp_cookie_size, kmflag);
+		if (dma->dp_cookies == NULL) {
+			return (IOMMU_SGL_NORESOURCES);
+		}
+		dma->dp_need_to_free_cookie = B_TRUE;
+	}
+
+	/* allocate the dvma cookie array */
+	dma->dp_dvma_cookies = get_dvma_cookie_array(sinfo->si_max_pages);
+
+	return (IOMMU_SGL_SUCCESS);
+}
+
+/*
+ * iommu_alloc_dvma()
+ *   alloc a dvma range for the caller
+ */
+static int
+iommu_alloc_dvma(dmar_domain_state_t *domain, uint_t size,
+    ddi_dma_impl_t *hp, uint64_t *dvma, uint_t cnt)
+{
+	rootnex_dma_t *dma;
+	ddi_dma_attr_t *dma_attr;
+	iommu_dvma_cookie_t *dcookie;
+	uint64_t ioaddr;
+	size_t xsize, align, nocross;
+	uint64_t minaddr, maxaddr;
+
+	/* shortcuts */
+	dma = (rootnex_dma_t *)hp->dmai_private;
+	dma_attr = &(hp->dmai_attr);
+	dcookie = dma->dp_dvma_cookies;
+
+	/* parameters */
+	xsize = (size + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK;
+	align = MAX((size_t)(dma_attr->dma_attr_align), IOMMU_PAGE_SIZE);
+	nocross = (size_t)(dma_attr->dma_attr_seg + 1);
+	minaddr = dma_attr->dma_attr_addr_lo;
+	maxaddr = dma_attr->dma_attr_addr_hi + 1;
+
+	/* handle the rollover cases */
+	if (maxaddr < dma_attr->dma_attr_addr_hi) {
+		maxaddr = dma_attr->dma_attr_addr_hi;
+	}
+
+	/* get from cache first */
+	ioaddr = iommu_dvma_cache_get(domain, xsize, align, nocross);
+
+	if (ioaddr == NULL) {
+		/* allocate from vmem arena */
+		ioaddr = (uint64_t)(uintptr_t)vmem_xalloc(domain->dm_dvma_map,
+		    xsize, align, 0, nocross,
+		    (void *)(uintptr_t)minaddr,
+		    (void *)(uintptr_t)maxaddr,
+		    VM_NOSLEEP);
+
+		/* if xalloc failed, we have to flush the cache and retry */
+		if (ioaddr == NULL) {
+			iommu_dvma_cache_flush(domain, dma->dp_dip);
+			ioaddr = (uint64_t)(uintptr_t)vmem_xalloc(
+			    domain->dm_dvma_map,
+			    xsize, align, 0, nocross,
+			    (void *)(uintptr_t)minaddr,
+			    (void *)(uintptr_t)maxaddr,
+			    VM_NOSLEEP);
+			ASSERT(ioaddr);
+		}
+	}
+
+	ASSERT(ioaddr >= minaddr);
+	ASSERT(ioaddr + size - 1 < maxaddr);
+
+	*dvma = ioaddr;
+
+	/*
+	 * save the dvma range in the device dvma cookie
+	 */
+	dcookie[cnt].dc_addr = ioaddr;
+	dcookie[cnt].dc_size = xsize;
+	dcookie[cnt].dc_domain = domain;
+	dcookie[cnt].dc_align = align;
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * iommu_map_dvma()
+ *   map the dvma to the physical addresses; the number
+ *   of dvma pages actually mapped is returned
+ */
+static int
+iommu_map_dvma(dmar_domain_state_t *domain, uint64_t dvma,
+    uint64_t paddr, uint_t psize, struct ddi_dma_req *dmareq)
+{
+	uint64_t start, end;
+	int flags;
+
+	start = paddr & IOMMU_PAGE_MASK;
+	end = (paddr + psize - 1) & IOMMU_PAGE_MASK;
+	flags = dmareq->dmar_flags & DDI_DMA_RDWR;
+
+	/* map each physical address */
+	(void) iommu_map_page_range(domain, dvma, start, end, flags);
+	return (IOMMU_BTOP(end - start) + 1);
+}
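+
+/*
+ * Example (illustrative values, assuming 4K iommu pages): for paddr
+ * 0x12345678 and psize 0x2000, start is 0x12345000 and end is
+ * 0x12347000, so three pages are mapped and 3 is returned even though
+ * only 8K of data was requested.
+ */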
+
+/*
+ * intel_iommu_map_sgl()
+ *   called from rootnex_dma_bindhdl(), to build dma
+ *   cookies when iommu is enabled
+ */
+int
+intel_iommu_map_sgl(ddi_dma_handle_t handle,
+    struct ddi_dma_req *dmareq, uint_t prealloc)
+{
+	ddi_dma_atyp_t buftype;
+	uint64_t offset;
+	page_t **pparray;
+	uint64_t paddr;
+	uint64_t dvma;
+	uint_t psize;
+	uint_t size;
+	uint64_t maxseg;
+	caddr_t vaddr;
+	uint_t pcnt, cnt;
+	page_t *page;
+	ddi_dma_cookie_t *sgl;
+	rootnex_sglinfo_t *sglinfo;
+	ddi_dma_obj_t *dmar_object;
+	ddi_dma_impl_t *hp;
+	rootnex_dma_t *dma;
+	dmar_domain_state_t *domain;
+	int e;
+
+	hp = (ddi_dma_impl_t *)handle;
+	dma = (rootnex_dma_t *)hp->dmai_private;
+	sglinfo = &(dma->dp_sglinfo);
+	dmar_object = &(dmareq->dmar_object);
+	maxseg = sglinfo->si_max_cookie_size;
+	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
+	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
+	buftype = dmar_object->dmao_type;
+	size = dmar_object->dmao_size;
+
+	/* get domain for the dma request */
+	if (iommu_get_domain(dma->dp_dip, &domain) != DDI_SUCCESS) {
+		cmn_err(CE_WARN, "get domain for %s failed",
+		    ddi_node_name(dma->dp_dip));
+		return (IOMMU_SGL_NORESOURCES);
+	}
+
+	/* return directly if the drhd is disabled or identity mapped */
+	if (!(domain->dm_iommu->iu_enabled) ||
+	    domain->dm_identity)
+		return (IOMMU_SGL_DISABLE);
+
+	/*
+	 * allocate the cookie array; if the pre-allocated
+	 * space is not big enough, a new array is allocated
+	 */
+	if (iommu_alloc_cookie_array(dma, dmareq, prealloc)
+	    != IOMMU_SGL_SUCCESS)
+		return (IOMMU_SGL_NORESOURCES);
+	hp->dmai_cookie = dma->dp_cookies;
+	sgl = dma->dp_cookies;
+
+	pcnt = 0;
+	cnt = 0;
+
+	/* retrieve paddr, psize, offset from dmareq */
+	if (buftype == DMA_OTYP_PAGES) {
+		page = dmar_object->dmao_obj.pp_obj.pp_pp;
+		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
+		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
+		    MMU_PAGEOFFSET;
+		paddr = pfn_to_pa(page->p_pagenum) + offset;
+		psize = MIN((MMU_PAGESIZE - offset), size);
+		sglinfo->si_asp = NULL;
+		page = page->p_next;
+	} else {
+		ASSERT((buftype == DMA_OTYP_VADDR) ||
+		    (buftype == DMA_OTYP_BUFVADDR));
+		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
+		if (sglinfo->si_asp == NULL) {
+			sglinfo->si_asp = &kas;
+		}
+		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
+
+		if (pparray != NULL) {
+			ASSERT(!PP_ISFREE(pparray[pcnt]));
+			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
+			psize = MIN((MMU_PAGESIZE - offset), size);
+			pcnt++;
+		} else {
+			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
+			    vaddr)) + offset;
+			psize = MIN(size, (MMU_PAGESIZE - offset));
+			vaddr += psize;
+		}
+	}
+
+	/* save the iommu page offset */
+	sglinfo->si_buf_offset = offset & IOMMU_PAGE_OFFSET;
+
+	/*
+	 * allocate the dvma and map [paddr, paddr+psize)
+	 */
+	e = iommu_alloc_dvma(domain, MIN(size + sglinfo->si_buf_offset,
+	    maxseg), hp, &dvma, cnt);
+	if (e != DDI_SUCCESS)
+		return (IOMMU_SGL_NORESOURCES);
+	e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
+
+	/*
+	 * set up the first cookie with the dvma of the page
+	 * and its size; the offset into the first page is
+	 * not accounted for yet
+	 */
+	sgl[cnt].dmac_laddress = dvma;
+	sgl[cnt].dmac_size = psize + sglinfo->si_buf_offset;
+	sgl[cnt].dmac_type = 0;
+	dvma += IOMMU_PTOB(e);
+
+	size -= psize;
+	while (size > 0) {
+		/* get the size for this page (i.e. partial or full page) */
+		psize = MIN(size, MMU_PAGESIZE);
+		if (buftype == DMA_OTYP_PAGES) {
+			/* get the paddr from the page_t */
+			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
+			paddr = pfn_to_pa(page->p_pagenum);
+			page = page->p_next;
+		} else if (pparray != NULL) {
+			/* index into the array of page_t's to get the paddr */
+			ASSERT(!PP_ISFREE(pparray[pcnt]));
+			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
+			pcnt++;
+		} else {
+			/* call into the VM to get the paddr */
+			paddr = pfn_to_pa(hat_getpfnum
+			    (sglinfo->si_asp->a_hat, vaddr));
+			vaddr += psize;
+		}
+
+		/*
+		 * check to see if this page would put us
+		 * over the max cookie size
+		 */
+		if ((sgl[cnt].dmac_size + psize) > maxseg) {
+			/* use the next cookie */
+			cnt++;
+
+			/* allocate the dvma and map [paddr, paddr+psize) */
+			e = iommu_alloc_dvma(domain, MIN(size, maxseg),
+			    hp, &dvma, cnt);
+			if (e != DDI_SUCCESS)
+				return (IOMMU_SGL_NORESOURCES);
+			e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
+
+			/* save the cookie information */
+			sgl[cnt].dmac_laddress = dvma;
+			sgl[cnt].dmac_size = psize;
+			sgl[cnt].dmac_type = 0;
+			dvma += IOMMU_PTOB(e);
+
+		/*
+		 * we can add this page to the current cookie
+		 */
+		} else {
+			e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
+			sgl[cnt].dmac_size += psize;
+			dvma += IOMMU_PTOB(e);
+		}
+
+		size -= psize;
+	}
+
+	/* account for the offset into the first page */
+	sgl[0].dmac_laddress += sglinfo->si_buf_offset;
+	sgl[0].dmac_size -= sglinfo->si_buf_offset;
+
+	/* save away how many cookies we have */
+	sglinfo->si_sgl_size = cnt + 1;
+
+	return (IOMMU_SGL_SUCCESS);
+}
+
+/*
+ * iommu_clear_leaf_pte()
+ *   clear the leaf ptes for a dvma range
+ */
+static void
+iommu_clear_leaf_pte(dmar_domain_state_t *domain, uint64_t dvma, uint64_t size)
+{
+	iopte_t pte;
+	uint_t offset;
+	caddr_t leaf_table, dirt;
+	uint64_t csize = 0;
+	uint64_t cdvma = dvma & IOMMU_PAGE_MASK;
+	int count;
+
+	while (csize < size) {
+
+		/* retrieve the leaf page table */
+		leaf_table = get_level_table(domain, IOMMU_BTOP(cdvma), 1);
+		if (!leaf_table) {
+			cmn_err(CE_WARN, "get level 1 table for 0x%"
+			    PRIx64 " failed", cdvma);
+			return;
+		}
+
+		/* map the leaf page and walk to the pte */
+		offset = dvma_level_offset(IOMMU_BTOP(cdvma), 1);
+
+		/* clear the ptes */
+		count = 0;
+		dirt = (caddr_t)((iopte_t)leaf_table + offset);
+		while ((csize < size) &&
+		    (offset < IOMMU_PTE_MAX)) {
+			pte = (iopte_t)leaf_table + offset;
+			if (!*pte) {
+				cmn_err(CE_WARN, "try to clear NULL pte");
+			} else {
+				*pte = 0;
+			}
+			csize += IOMMU_PAGE_SIZE;
+			offset++;
+			count++;
+		}
+
+		/* flush cpu and iotlb cache */
+		domain->dm_iommu->iu_dmar_ops->do_clflush(dirt,
+		    count * sizeof (uint64_t));
+		domain->dm_iommu->iu_dmar_ops->do_iotlb_psi(domain->dm_iommu,
+		    domain->dm_domain_id, cdvma, count, TLB_IVA_LEAF);
+
+		/* move on to the next leaf page */
+		cdvma += IOMMU_PTOB(count);
+	}
+}
+
+/*
+ * intel_iommu_unmap_sgl()
+ *   called from rootnex_dma_unbindhdl(), to unbind dma
+ *   cookies when iommu is enabled
+ */
+void
+intel_iommu_unmap_sgl(ddi_dma_handle_t handle)
+{
+	ddi_dma_impl_t *hp;
+	rootnex_dma_t *dma;
+	dmar_domain_state_t *domain;
+	iommu_dvma_cookie_t *dcookies;
+	rootnex_sglinfo_t *sinfo;
+	uint64_t i;
+
+	hp = (ddi_dma_impl_t *)handle;
+	dma = (rootnex_dma_t *)hp->dmai_private;
+	dcookies = dma->dp_dvma_cookies;
+	sinfo = &(dma->dp_sglinfo);
+
+	/* get the device domain, no return check needed here */
+	(void) iommu_get_domain(dma->dp_dip, &domain);
+
+	/* if the drhd is disabled, nothing will be done */
+	if (!(domain->dm_iommu->iu_enabled) ||
+	    domain->dm_identity)
+		return;
+
+	/* the drhd is enabled */
+	for (i = 0; i < sinfo->si_sgl_size; i++) {
+		/* clear leaf ptes */
+		iommu_clear_leaf_pte(domain, dcookies[i].dc_addr,
+		    dcookies[i].dc_size);
+	}
+
+	domain->dm_iommu->iu_dmar_ops->do_reap_wait(domain->dm_iommu);
+	domain->dm_iommu->iu_dmar_ops->do_plant_wait(domain->dm_iommu,
+	    dcookies, sinfo->si_sgl_size, sinfo->si_max_pages);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/iommu_rscs.c	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,340 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+#include <sys/conf.h>
+#include <sys/autoconf.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/psw.h>
+#include <sys/ddidmareq.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <vm/seg.h>
+#include <vm/seg_kmem.h>
+#include <vm/seg_kpm.h>
+#include <vm/seg_dev.h>
+#include <sys/vmem.h>
+#include <vm/hat.h>
+#include <vm/as.h>
+#include <vm/page.h>
+#include <sys/avintr.h>
+#include <sys/errno.h>
+#include <sys/modctl.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/mach_intr.h>
+#include <vm/hat_i86.h>
+#include <sys/machsystm.h>
+#include <sys/iommu_rscs.h>
+
+
+
+typedef struct iommu_rscs_s {
+	/*
+	 * Bounds of resource allocation. We will start allocating at rs_min
+	 * and rollover at rs_max+1 (rs_max is included). e.g. for rs_min=0
+	 * and rs_max=7, we will have 8 total resources which can be alloced.
+	 */
+	uint_t rs_min;
+	uint_t rs_max;
+
+	/*
+	 * rs_free points to an array of 64-bit values used to track resource
+	 * allocation. rs_free_size is the free buffer size in bytes.
+	 */
+	uint64_t *rs_free;
+	uint_t rs_free_size;
+
+	/*
+	 * last tracks the last alloc'd resource. This allows us to do a round
+	 * robin allocation.
+	 */
+	uint_t rs_last;
+
+	kmutex_t rs_mutex;
+} iommu_rscs_state_t;
+
+
+/*
+ * iommu_page_alloc()
+ *
+ */
+paddr_t
+iommu_page_alloc(int kmflag)
+{
+	paddr_t paddr;
+	page_t *pp;
+
+	ASSERT(kmflag == KM_SLEEP || kmflag == KM_NOSLEEP);
+
+	pp = page_get_physical(kmflag);
+	if (pp == NULL) {
+		return (NULL);
+	}
+
+	paddr =  pa_to_ma((uint64_t)pp->p_pagenum << PAGESHIFT);
+
+	return (paddr);
+}
+
+
+/*
+ * iommu_page_free()
+ */
+void
+iommu_page_free(paddr_t paddr)
+{
+	page_t *pp;
+
+	pp = page_numtopp_nolock(ma_to_pa(paddr) >> PAGESHIFT);
+	page_free_physical(pp);
+}
+
+
+/*
+ * iommu_page_map()
+ *
+ */
+caddr_t
+iommu_page_map(paddr_t addr)
+{
+	paddr_t paddr;
+	caddr_t kva;
+	page_t *pp;
+
+	paddr = ma_to_pa(addr);
+
+	if (kpm_enable) {
+		kva = hat_kpm_pfn2va((pfn_t)btop(paddr));
+	} else {
+		kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP);
+		if (kva == NULL) {
+			return (NULL);
+		}
+		pp = page_numtopp_nolock(paddr >> PAGESHIFT);
+		hat_memload(kas.a_hat, kva, pp,
+		    PROT_READ | PROT_WRITE, HAT_LOAD_LOCK);
+	}
+
+	return (kva);
+}
+
+
+/*
+ * iommu_page_unmap()
+ *
+ */
+void
+iommu_page_unmap(caddr_t kva)
+{
+	if (!kpm_enable) {
+		hat_unload(kas.a_hat, kva, PAGESIZE, HAT_UNLOAD_UNLOCK);
+		vmem_free(heap_arena, kva, PAGESIZE);
+	}
+}
+
+
+
+/*
+ * iommu_rscs_init()
+ *    Initialize the resource structure. init() returns a handle to be
+ *    used for the rest of the resource functions. This code is written assuming
+ *    that min_val will be close to 0. Therefore, we will allocate the free
+ *    buffer only taking max_val into account.
+ */
+void
+iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle)
+{
+	iommu_rscs_state_t *rstruct;
+	uint_t array_size;
+	uint_t index;
+
+
+	ASSERT(handle != NULL);
+	ASSERT(min_val < max_val);
+
+	/* alloc space for resource structure */
+	rstruct = kmem_alloc(sizeof (iommu_rscs_state_t), KM_SLEEP);
+
+	/*
+	 * Allocate space for the free bitmap (8 bytes per uint64_t).
+	 * If the max value is 64-bit aligned, we don't need to
+	 * allocate an extra 64-bit word.
+	 */
+	if ((max_val & 0x3F) == 0) {
+		rstruct->rs_free_size = (max_val >> 6) * 8;
+	} else {
+		rstruct->rs_free_size = ((max_val >> 6) + 1) * 8;
+	}
+	rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP);
+
+	/* Initialize resource structure */
+	rstruct->rs_min = min_val;
+	rstruct->rs_last = min_val;
+	rstruct->rs_max = max_val;
+	mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER, NULL);
+
+	/* Mark all resources as free */
+	array_size = rstruct->rs_free_size >> 3;
+	for (index = 0; index < array_size; index++) {
+		rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF;
+	}
+
+	/* setup handle which is returned from this function */
+	*handle = rstruct;
+}
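+
+/*
+ * Hypothetical usage sketch (for illustration only): a pool of ids in
+ * the range [0, 63] would be used roughly as follows:
+ *
+ *	iommu_rscs_t hdl;
+ *	uint_t id;
+ *
+ *	iommu_rscs_init(0, 63, &hdl);
+ *	if (iommu_rscs_alloc(hdl, &id) == DDI_SUCCESS) {
+ *		(use id)
+ *		iommu_rscs_free(hdl, id);
+ *	}
+ *	iommu_rscs_fini(&hdl);
+ */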
+
+
+/*
+ * iommu_rscs_fini()
+ *    Frees up the space allocated in init().  Notice that a pointer to the
+ *    handle is used for the parameter.  fini() will set the handle to NULL
+ *    before returning.
+ */
+void
+iommu_rscs_fini(iommu_rscs_t *handle)
+{
+	iommu_rscs_state_t *rstruct;
+
+
+	ASSERT(handle != NULL);
+
+	rstruct = (iommu_rscs_state_t *)*handle;
+
+	mutex_destroy(&rstruct->rs_mutex);
+	kmem_free(rstruct->rs_free, rstruct->rs_free_size);
+	kmem_free(rstruct, sizeof (iommu_rscs_state_t));
+
+	/* set handle to null.  This helps catch bugs. */
+	*handle = NULL;
+}
+
+
+/*
+ * iommu_rscs_alloc()
+ *    alloc a resource. If alloc fails, we are out of resources.
+ */
+int
+iommu_rscs_alloc(iommu_rscs_t handle, uint_t *resource)
+{
+	iommu_rscs_state_t *rstruct;
+	uint_t array_idx;
+	uint64_t free;
+	uint_t index;
+	uint_t last;
+	uint_t min;
+	uint_t max;
+
+
+	ASSERT(handle != NULL);
+	ASSERT(resource != NULL);
+
+	rstruct = (iommu_rscs_state_t *)handle;
+
+	mutex_enter(&rstruct->rs_mutex);
+	min = rstruct->rs_min;
+	max = rstruct->rs_max;
+
+	/*
+	 * Find a free resource. This will return out of the loop once it finds
+	 * a free resource. There are a total of 'max'-'min'+1 resources.
+	 * Performs a round robin allocation.
+	 */
+	for (index = min; index <= max; index++) {
+
+		array_idx = rstruct->rs_last >> 6;
+		free = rstruct->rs_free[array_idx];
+		last = rstruct->rs_last & 0x3F;
+
+		/* if the next resource to check is free */
+		if ((free & ((uint64_t)1 << last)) != 0) {
+			/* we are using this resource */
+			*resource = rstruct->rs_last;
+
+			/* take it out of the free list */
+			rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last);
+
+			/*
+			 * increment the last count so we start checking the
+			 * next resource on the next alloc().  Note the rollover
+			 * at 'max'+1.
+			 */
+			rstruct->rs_last++;
+			if (rstruct->rs_last > max) {
+				rstruct->rs_last = rstruct->rs_min;
+			}
+
+			/* unlock the resource structure */
+			mutex_exit(&rstruct->rs_mutex);
+
+			return (DDI_SUCCESS);
+		}
+
+		/*
+		 * This resource is not free, let's go to the next one. Note the
+		 * rollover at 'max'.
+		 */
+		rstruct->rs_last++;
+		if (rstruct->rs_last > max) {
+			rstruct->rs_last = rstruct->rs_min;
+		}
+	}
+
+	mutex_exit(&rstruct->rs_mutex);
+
+	return (DDI_FAILURE);
+}
+
+
+/*
+ * iommu_rscs_free()
+ *    Free the previously alloc'd resource.  Once a resource has been free'd,
+ *    it can be used again when alloc is called.
+ */
+void
+iommu_rscs_free(iommu_rscs_t handle, uint_t resource)
+{
+	iommu_rscs_state_t *rstruct;
+	uint_t array_idx;
+	uint_t offset;
+
+
+	ASSERT(handle != NULL);
+
+	rstruct = (iommu_rscs_state_t *)handle;
+	ASSERT(resource >= rstruct->rs_min);
+	ASSERT(resource <= rstruct->rs_max);
+
+	mutex_enter(&rstruct->rs_mutex);
+
+	/* Put the resource back in the free list */
+	array_idx = resource >> 6;
+	offset = resource & 0x3F;
+	rstruct->rs_free[array_idx] |= ((uint64_t)1 << offset);
+
+	mutex_exit(&rstruct->rs_mutex);
+}
--- a/usr/src/uts/i86pc/io/rootnex.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/io/rootnex.c	Sun Sep 14 19:52:20 2008 -0700
@@ -69,6 +69,13 @@
 #include <vm/kboot_mmu.h>
 #endif
 
+#include <sys/intel_iommu.h>
+
+/*
+ * add to support dmar fault interrupt, will change soon
+ */
+char _depends_on[] = "mach/pcplusmp";
+
 /*
  * enable/disable extra checking of function parameters. Useful for debugging
  * drivers.
@@ -399,6 +406,7 @@
 	rootnex_state->r_err_ibc = (ddi_iblock_cookie_t)ipltospl(15);
 	rootnex_state->r_reserved_msg_printed = B_FALSE;
 	rootnex_cnt = &rootnex_state->r_counters[0];
+	rootnex_state->r_intel_iommu_enabled = B_FALSE;
 
 	/*
 	 * Set minimum fm capability level for i86pc platforms and then
@@ -426,6 +434,20 @@
 	/* Initialize rootnex event handle */
 	i_ddi_rootnex_init_events(dip);
 
+#if defined(__amd64)
+	/* probe intel iommu */
+	intel_iommu_probe_and_parse();
+
+	/* attach the iommu nodes */
+	if (intel_iommu_support) {
+		if (intel_iommu_attach_dmar_nodes() == DDI_SUCCESS) {
+			rootnex_state->r_intel_iommu_enabled = B_TRUE;
+		} else {
+			intel_iommu_release_dmar_info();
+		}
+	}
+#endif
+
 	return (DDI_SUCCESS);
 }
 
@@ -1757,6 +1779,34 @@
 	/* save away the original bind info */
 	dma->dp_dma = dmareq->dmar_object;
 
+	if (rootnex_state->r_intel_iommu_enabled) {
+		e = intel_iommu_map_sgl(handle, dmareq,
+		    rootnex_state->r_prealloc_cookies);
+
+		switch (e) {
+		case IOMMU_SGL_SUCCESS:
+			goto rootnex_sgl_end;
+
+		case IOMMU_SGL_DISABLE:
+			goto rootnex_sgl_start;
+
+		case IOMMU_SGL_NORESOURCES:
+			cmn_err(CE_WARN, "iommu map sgl failed for %s",
+			    ddi_node_name(dma->dp_dip));
+			rootnex_clean_dmahdl(hp);
+			return (DDI_DMA_NORESOURCES);
+
+		default:
+			cmn_err(CE_WARN,
+			    "undefined value returned from"
+			    " intel_iommu_map_sgl: %d",
+			    e);
+			rootnex_clean_dmahdl(hp);
+			return (DDI_DMA_NORESOURCES);
+		}
+	}
+
+rootnex_sgl_start:
 	/*
 	 * Figure out a rough estimate of what maximum number of pages this
 	 * buffer could use (a high estimate of course).
@@ -1818,8 +1868,9 @@
 	 */
 	rootnex_get_sgl(&dmareq->dmar_object, dma->dp_cookies,
 	    &dma->dp_sglinfo);
+
+rootnex_sgl_end:
 	ASSERT(sinfo->si_sgl_size <= sinfo->si_max_pages);
-
 	/* if we don't need a copy buffer, we don't need to sync */
 	if (sinfo->si_copybuf_req == 0) {
 		hp->dmai_rflags |= DMP_NOSYNC;
@@ -1970,6 +2021,13 @@
 	rootnex_teardown_windows(dma);
 
 	/*
+	 * If the intel iommu is enabled, clean up page tables and free dvma
+	 */
+	if (rootnex_state->r_intel_iommu_enabled) {
+		intel_iommu_unmap_sgl(handle);
+	}
+
+	/*
 	 * If we had to allocate space to for the worse case sgl (it didn't
 	 * fit into our pre-allocate buffer), free that up now
 	 */
--- a/usr/src/uts/i86pc/os/acpi_fw.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/os/acpi_fw.h	Sun Sep 14 19:52:20 2008 -0700
@@ -26,8 +26,6 @@
 #ifndef _ACPI_FW_H
 #define	_ACPI_FW_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -163,6 +161,14 @@
  */
 extern struct slit	*slit_ptr;
 
+struct dmar {
+	struct table_header hdr;
+	uint8_t width;
+	uint8_t flags;
+	uint8_t rsvd[10];
+};
+
+
 /*
  * Arbitrary limit on number of localities we handle; if
  * this limit is raised to more than UINT16_MAX, make sure
--- a/usr/src/uts/i86pc/os/cpuid.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/os/cpuid.c	Sun Sep 14 19:52:20 2008 -0700
@@ -105,6 +105,7 @@
 uint_t x86_feature = 0;
 uint_t x86_vendor = X86_VENDOR_IntelClone;
 uint_t x86_type = X86_TYPE_OTHER;
+uint_t x86_clflush_size = 0;
 
 uint_t pentiumpro_bug4046376;
 uint_t pentiumpro_bug4064495;
@@ -780,6 +781,15 @@
 		feature |= X86_MWAIT;
 	}
 
+	/*
+	 * Only needed the first time; the rest of the cpus follow suit.
+	 * We only capture this for the boot cpu.
+	 */
+	if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
+		feature |= X86_CLFSH;
+		x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
+	}
+
 	if (feature & X86_PAE)
 		cpi->cpi_pabits = 36;
 
--- a/usr/src/uts/i86pc/os/fakebop.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/os/fakebop.c	Sun Sep 14 19:52:20 2008 -0700
@@ -57,6 +57,7 @@
 #endif
 #include <vm/kboot_mmu.h>
 #include <vm/hat_pte.h>
+#include <sys/dmar_acpi.h>
 #include "acpi_fw.h"
 
 static int have_console = 0;	/* set once primitive console is initialized */
@@ -2015,6 +2016,14 @@
 	bsetprop(SLIT_PROPNAME, strlen(SLIT_PROPNAME), &tp->entry,
 	    tp->number * tp->number);
 }
+
+static void
+process_dmar(struct dmar *tp)
+{
+	bsetprop(DMAR_TABLE_PROPNAME, strlen(DMAR_TABLE_PROPNAME),
+	    tp, tp->hdr.len);
+}
+
 #else /* __xpv */
 static void
 enumerate_xen_cpus()
@@ -2056,6 +2065,9 @@
 
 	if (slit_ptr = (struct slit *)find_fw_table("SLIT"))
 		process_slit(slit_ptr);
+
+	if (tp = find_fw_table("DMAR"))
+		process_dmar((struct dmar *)tp);
 #else /* __xpv */
 	enumerate_xen_cpus();
 #endif /* __xpv */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/sys/dmar_acpi.h	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_DMAR_ACPI_H
+#define	_SYS_DMAR_ACPI_H
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	DMAR_TABLE_PROPNAME	"dmar-table"
+
+#define	DMAR_UNIT_TYPE_DRHD	0
+#define	DMAR_UNIT_TYPE_RMRR	1
+#define	DMAR_UNIT_TYPE_ATSR	2
+
+#define	DEV_SCOPE_ENDPOINT	1
+#define	DEV_SCOPE_P2P		2
+#define	DEV_SCOPE_IOAPIC	3
+#define	DEV_SCOPE_HPET		4
+
+#define	INCLUDE_PCI_ALL		0x01
+#define	DMAR_MAX_SEGMENT	1
+
+#define	IOMMU_PAGE_SIZE_4K	(1UL << 12)
+#define	IOMMU_REG_SIZE		(1UL << 12)
+#define	PARSE_DMAR_SUCCESS	1
+#define	PARSE_DMAR_FAIL		0
+
+#define	for_each_in_list(list, node) \
+	for (node = list_head(list); node != NULL; \
+	    node = list_next(list, node))
+
+/*
+ * The following structures describe the format of
+ * the DMAR ACPI table. They are used to parse the
+ * DMAR ACPI table.
+ *
+ * Read the spec for the meaning of each member.
+ */
+
+/* DMAR ACPI table header */
+typedef struct dmar_acpi_head {
+	char		dh_sig[4];
+	uint32_t	dh_len;
+	uint8_t		dh_rev;
+	uint8_t		dh_checksum;
+	char		dh_oemid[6];
+	char		dh_oemtblid[8];
+	uint32_t	dh_oemrev;
+	char		dh_asl[4];
+	uint32_t	dh_aslrev;
+	uint8_t		dh_haw;
+	uint8_t		dh_flags;
+	uint8_t		dh_reserved[10];
+} dmar_acpi_head_t;
+
+/* Remapping structure header */
+typedef struct dmar_acpi_unit_head {
+	uint16_t	uh_type;
+	uint16_t	uh_length;
+} dmar_acpi_unit_head_t;
+
+/* DRHD unit structure */
+typedef struct dmar_acpi_drhd {
+	dmar_acpi_unit_head_t	dr_header;
+	uint8_t			dr_flags;
+	uint8_t			dr_reserved;
+	uint16_t		dr_segment;
+	uint64_t		dr_baseaddr;
+} dmar_acpi_drhd_t;
+
+/* Device scope structure */
+typedef struct dmar_acpi_dev_scope {
+	uint8_t		ds_type;
+	uint8_t		ds_length;
+	uint8_t		ds_reserved[2];
+	uint8_t		ds_enumid;
+	uint8_t		ds_sbusnum;
+} dmar_acpi_dev_scope_t;
+
+/* RMRR unit structure */
+typedef struct dmar_acpi_rmrr {
+	dmar_acpi_unit_head_t	rm_header;
+	uint8_t			rm_reserved[2];
+	uint16_t		rm_segment;
+	uint64_t		rm_baseaddr;
+	uint64_t		rm_limiaddr;
+} dmar_acpi_rmrr_t;
+
+/*
+ * The following structures describes kernel recorded
+ * information about the DRHD and RMRR.
+ */
+
+/*
+ * DRHD information structure
+ *
+ * node           - the drhd info structure is inserted in the
+ *                  list embedded in the intel_dmar_info
+ * di_segment     - the pci segment associated with this drhd
+ * di_reg_base    - base address of the register set, the size
+ *                  of this set is 4K
+ * di_include_all - is it an include_all unit
+ * di_dev_list    - the device list built from the device scope
+ *                  entries; each node is a pci_dev_scope_t,
+ *                  which represents a single pci device
+ * di_dip         - pointer to the dev_info for this drhd in the
+ *                  device tree
+ * di_iommu	  - link to the iommu state structure
+ */
+typedef struct drhd_info {
+	list_node_t 	node;
+	uint16_t 	di_segment;
+	uint64_t 	di_reg_base;
+	boolean_t	di_include_all;
+	list_t 		di_dev_list;
+	dev_info_t	*di_dip;
+	void		*di_iommu;
+} drhd_info_t;
+
+/*
+ * RMRR information structure
+ *
+ * node        - the rmrr info structure is inserted in the
+ *               list embedded in the intel_dmar_info
+ * ri_segment  - the pci segment associated with this rmrr
+ * ri_baseaddr - the low address of the reserved range
+ * ri_limiaddr - the high address of the reserved range
+ * ri_dev_list - the device list built from the device scope
+ *               entries; each node is a pci_dev_scope_t,
+ *               which represents a single pci device
+ */
+typedef struct rmrr_info {
+	list_node_t	node;
+	uint16_t	ri_segment;
+	uint64_t	ri_baseaddr;
+	uint64_t	ri_limiaddr;
+	list_t		ri_dev_list;
+} rmrr_info_t;
+
+/*
+ * Intel IOMMU information structure
+ *
+ * dmari_haw        - haw (host address width) indicates the
+ *                    maximum DMA physical addressability by
+ *                    this platform.
+ * dmari_intr_remap - does this platform support intr remapping
+ * dmari_drhd       - the list array of drhd units with the
+ *                    segment number as the index into this array
+ * dmari_rmrr       - list array for the rmrr
+ */
+typedef struct intel_dmar_info {
+	uint8_t		dmari_haw;
+	boolean_t	dmari_intr_remap;
+	list_t		dmari_drhd[DMAR_MAX_SEGMENT];
+	list_t		dmari_rmrr[DMAR_MAX_SEGMENT];
+} intel_dmar_info_t;
+
+/*
+ * The pci device node in the dev_list of drhd_info and
+ * rmrr_info
+ *
+ * node		  - list node
+ * pds_bus, pds_dev, pds_func
+ *		  - bus, device and function number of this pci device
+ * pds_type	  - type of this device, includes
+ *		    0x01 : pci endpoint
+ *		    0x02 : pci p2p bridge
+ *		    0x03 : ioapic
+ *		    0x04 : msi capable hpet
+ * pdi_sec_bus	  - record the bus number of the PCI bus
+ *		    segment to which the secondary interface
+ *		    of the bridge is connected
+ * pdi_sub_bus	  - record the bus number of the highest
+ *		    numbered PCI bus segment which is behind
+ *		    (or subordinate to) the bridge
+ */
+typedef struct pci_dev_scope {
+	list_node_t node;
+	uint8_t pds_bus;
+	uint8_t pds_dev;
+	uint8_t pds_func;
+	uint8_t pds_type;
+} pci_dev_scope_t;
+
+extern boolean_t intel_iommu_support;
+extern intel_dmar_info_t *dmar_info;
+extern void intel_iommu_release_dmar_info(void);
+extern void intel_iommu_probe_and_parse(void);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_DMAR_ACPI_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/sys/intel_iommu.h	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef	_SYS_INTEL_IOMMU_H
+#define	_SYS_INTEL_IOMMU_H
+
+/*
+ * Intel IOMMU implementation specific state
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/dmar_acpi.h>
+#include <sys/iommu_rscs.h>
+#include <sys/cpu.h>
+#include <sys/kstat.h>
+
+/* extern functions */
+extern int intel_iommu_attach_dmar_nodes(void);
+extern int intel_iommu_map_sgl(ddi_dma_handle_t handle,
+    struct ddi_dma_req *dmareq, uint_t prealloc);
+extern void intel_iommu_unmap_sgl(ddi_dma_handle_t handle);
+extern void return_instr(void);
+
+/* define the return value for iommu_map_sgl */
+#define	IOMMU_SGL_SUCCESS	0
+#define	IOMMU_SGL_DISABLE	1
+#define	IOMMU_SGL_NORESOURCES	2
+
+/* register offset */
+#define	IOMMU_REG_VERSION	(0x00)  /* Version Register, 32 bit */
+#define	IOMMU_REG_CAP		(0x08)  /* Capability Register, 64 bit */
+#define	IOMMU_REG_EXCAP		(0x10)  /* Extended Capability Reg, 64 bit */
+#define	IOMMU_REG_GLOBAL_CMD	(0x18)  /* Global Command Register, 32 bit */
+#define	IOMMU_REG_GLOBAL_STS	(0x1C)  /* Global Status Register, 32 bit */
+#define	IOMMU_REG_ROOTENTRY	(0x20)  /* Root-Entry Table Addr Reg, 64 bit */
+#define	IOMMU_REG_CONTEXT_CMD	(0x28)  /* Context Command Register, 64 bit */
+#define	IOMMU_REG_FAULT_STS	(0x34)  /* Fault Status Register, 32 bit */
+#define	IOMMU_REG_FEVNT_CON	(0x38)  /* Fault Event Control Reg, 32 bit */
+#define	IOMMU_REG_FEVNT_DATA	(0x3C)  /* Fault Event Data Register, 32 bit */
+#define	IOMMU_REG_FEVNT_ADDR	(0x40)  /* Fault Event Address Reg, 32 bit */
+#define	IOMMU_REG_FEVNT_UADDR	(0x44)  /* Fault Event Upper Addr Reg, 32 bit */
+#define	IOMMU_REG_AFAULT_LOG	(0x58)  /* Advanced Fault Log Reg, 64 bit */
+#define	IOMMU_REG_PMER		(0x64)  /* Protected Mem Enable Reg, 32 bit */
+#define	IOMMU_REG_PLMBR		(0x68)  /* Protected Low Mem Base Reg, 32 bit */
+#define	IOMMU_REG_PLMLR		(0x6C)  /* Protected Low Mem Lim Reg, 32 bit */
+#define	IOMMU_REG_PHMBR		(0x70)  /* Protected High Mem Base, 64 bit */
+#define	IOMMU_REG_PHMLR		(0x78)  /* Protected High Mem Lim Reg, 64 bit */
+#define	IOMMU_REG_INVAL_QH	(0x80)  /* Invalidation Queue Head, 64 bit */
+#define	IOMMU_REG_INVAL_QT	(0x88)  /* Invalidation Queue Tail, 64 bit */
+#define	IOMMU_REG_INVAL_QAR	(0x90)  /* Invalidation Queue Addr, 64 bit */
+#define	IOMMU_REG_INVAL_CSR	(0x9C)  /* Inval Compl Status Reg, 32 bit */
+#define	IOMMU_REG_INVAL_CECR	(0xA0)  /* Inval Compl Evnt Ctrl Reg, 32 bit */
+#define	IOMMU_REG_INVAL_CEDR	(0xA4)  /* Inval Compl Evnt Data Reg, 32 bit */
+#define	IOMMU_REG_INVAL_CEAR	(0xA8)  /* Inval Compl Event Addr Reg, 32 bit */
+#define	IOMMU_REG_INVAL_CEUAR	(0xAC)  /* Inval Comp Evnt Up Addr reg, 32bit */
+#define	IOMMU_REG_IRTAR		(0xB8)  /* INTR Remap Tbl Addr Reg, 64 bit */
+
+/* ioapic memory region */
+#define	IOAPIC_REGION_START	(0xfee00000)
+#define	IOAPIC_REGION_END	(0xfeefffff)
+
+/* iommu page */
+#define	IOMMU_LEVEL_STRIDE	(9)
+#define	IOMMU_LEVEL_SIZE	((uint64_t)1 << IOMMU_LEVEL_STRIDE)
+#define	IOMMU_LEVEL_OFFSET	(IOMMU_LEVEL_SIZE - 1)
+#define	IOMMU_PAGE_SHIFT	(12)
+#define	IOMMU_PAGE_SIZE		(uint64_t)((uint64_t)1 << IOMMU_PAGE_SHIFT)
+#define	IOMMU_PAGE_MASK		~(IOMMU_PAGE_SIZE - 1)
+#define	IOMMU_PAGE_OFFSET	(IOMMU_PAGE_SIZE - 1)
+#define	IOMMU_PAGE_ROUND(x)	(((x) + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK)
+#define	IOMMU_PTOB(x)		(((uint64_t)(x)) << IOMMU_PAGE_SHIFT)
+#define	IOMMU_BTOP(x)		((x) >> IOMMU_PAGE_SHIFT)
+#define	IOMMU_BTOPR(x)		IOMMU_BTOP((x) + IOMMU_PAGE_OFFSET)
+#define	IOMMU_LEVEL_TO_AGAW(x)	((x) * 9 + 12)
+#define	IOMMU_IOVA_MAX_4G	(((uint64_t)1 << 32) - 1)
+#define	IOMMU_SIZE_4G		((uint64_t)1 << 32)
+#define	IOMMU_SIZE_2M		((uint64_t)1 << 21)
+#define	IOMMU_2M_MASK		~(IOMMU_SIZE_2M - 1)
+#define	IOMMU_PTE_MAX		(IOMMU_PAGE_SIZE >> 3)
+
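+/*
+ * Worked examples for the page macros above (illustrative only):
+ *	IOMMU_PAGE_ROUND(0x1234) = (0x1234 + 0xfff) & ~0xfff = 0x2000
+ *	IOMMU_BTOPR(0x1001)	 = (0x1001 + 0xfff) >> 12   = 2
+ *	IOMMU_LEVEL_TO_AGAW(3)	 = 3 * 9 + 12		    = 39 bits
+ */
+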
+/* iommu page entry property */
+#define	IOMMU_PAGE_PROP_READ	(1)
+#define	IOMMU_PAGE_PROP_WRITE	(2)
+#define	IOMMU_PAGE_PROP_RW	(IOMMU_PAGE_PROP_READ | IOMMU_PAGE_PROP_WRITE)
+#define	IOMMU_PAGE_PROP_NOSYNC	(4)
+
+/* root context entry */
+#define	ROOT_ENTRY_GET_P(x)		(((x)->lo) & 0x1)
+#define	ROOT_ENTRY_SET_P(x)		((x)->lo) |= 0x1
+#define	ROOT_ENTRY_GET_CTP(x)		(((x)->lo) & IOMMU_PAGE_MASK)
+#define	ROOT_ENTRY_SET_CTP(x, p)	((x)->lo) |= ((p) & IOMMU_PAGE_MASK)
+#define	CONT_ENTRY_GET_P(x)		(((x)->lo) & 0x1)
+#define	CONT_ENTRY_SET_P(x)		((x)->lo) |= 0x1
+#define	CONT_ENTRY_SET_ASR(x, p)	((x)->lo) |= ((p) & IOMMU_PAGE_MASK)
+#define	CONT_ENTRY_GET_ASR(x)		(((x)->lo) & IOMMU_PAGE_MASK)
+#define	CONT_ENTRY_SET_AW(x, v)		((x)->hi) |= ((v) & 7)
+#define	CONT_ENTRY_SET_DID(x, v) ((x)->hi) |= (((v) & ((1 << 16) - 1)) << 8)
+
+/* fault register */
+#define	IOMMU_FAULT_STS_PPF		(2)
+#define	IOMMU_FAULT_STS_PFO		(1)
+#define	IOMMU_FAULT_STS_IQE		(1 << 4)
+#define	IOMMU_FAULT_GET_INDEX(x)	(((x) >> 8) & 0xff)
+#define	IOMMU_FRR_GET_F(x)		((x) >> 63)
+#define	IOMMU_FRR_GET_FR(x)		(((x) >> 32) & 0xff)
+#define	IOMMU_FRR_GET_FT(x)		(((x) >> 62) & 0x1)
+#define	IOMMU_FRR_GET_SID(x)		((x) & 0xffff)
+
+/* (ex)capability register */
+#define	IOMMU_CAP_GET_NFR(x)		((((x) >> 40) & 0xff) + 1)
+#define	IOMMU_CAP_GET_DWD(x)		(((x) >> 54) & 1)
+#define	IOMMU_CAP_GET_DRD(x)		(((x) >> 55) & 1)
+#define	IOMMU_CAP_GET_PSI(x)		(((x) >> 39) & 1)
+#define	IOMMU_CAP_GET_MAMV(x)		(((x) >> 48) & 0x3f)
+#define	IOMMU_CAP_GET_CM(x)		(((x) >> 7) & 1)
+#define	IOMMU_CAP_GET_RWBF(x)		(((x) >> 4) & 1)
+#define	IOMMU_CAP_GET_FRO(x)		((((x) >> 24) & 0x3ff) * 16)
+#define	IOMMU_CAP_MGAW(x)		(((((uint64_t)(x)) >> 16) & 0x3f) + 1)
+#define	IOMMU_CAP_SAGAW(x)		(((x) >> 8) & 0x1f)
+#define	IOMMU_CAP_ND(x)			((1 << ((((x) & 0x7) * 2) + 4)) - 1)
+#define	IOMMU_ECAP_GET_IRO(x)		((((x) >> 8) & 0x3ff) << 4)
+#define	IOMMU_ECAP_GET_C(x)		((x) & 0x1)
+#define	IOMMU_ECAP_GET_IR(x)		((x) & 0x8)
+#define	IOMMU_ECAP_GET_DI(x)		((x) & 0x4)
+#define	IOMMU_ECAP_GET_QI(x)		((x) & 0x2)
+
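+/*
+ * Illustrative example (not part of the interface): for a capability
+ * register whose ND field (bits 2:0) reads 2, IOMMU_CAP_ND(cap)
+ * evaluates to (1 << (2 * 2 + 4)) - 1 = 255, i.e. the hardware
+ * supports 256 domain ids (0 through 255).
+ */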
+
+/* iotlb invalidation */
+#define	TLB_INV_GLOBAL		(((uint64_t)1) << 60)
+#define	TLB_INV_DOMAIN		(((uint64_t)2) << 60)
+#define	TLB_INV_PAGE		(((uint64_t)3) << 60)
+#define	TLB_INV_GET_IAIG(x)	(((x) >> 57) & 7)
+#define	TLB_INV_DRAIN_READ	(((uint64_t)1) << 49)
+#define	TLB_INV_DRAIN_WRITE	(((uint64_t)1) << 48)
+#define	TLB_INV_DID(x)		(((uint64_t)((x) & 0xffff)) << 32)
+#define	TLB_INV_IVT		(((uint64_t)1) << 63)
+#define	TLB_IVA_HINT(x)		(((x) & 0x1) << 6)
+#define	TLB_IVA_LEAF		1
+#define	TLB_IVA_WHOLE		0
+
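+/*
+ * Illustrative composition (sketch, not a definition): a domain
+ * selective iotlb invalidation request for domain `did' would
+ * typically be built from the bits above as
+ *	TLB_INV_IVT | TLB_INV_DOMAIN | TLB_INV_DID(did) |
+ *	TLB_INV_DRAIN_READ | TLB_INV_DRAIN_WRITE
+ * with the drain bits applied only when the capability register
+ * reports support (IOMMU_CAP_GET_DRD/IOMMU_CAP_GET_DWD).
+ */
+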
+/* context invalidation */
+#define	CCMD_INV_ICC		(((uint64_t)1) << 63)
+#define	CCMD_INV_GLOBAL		(((uint64_t)1) << 61)
+#define	CCMD_INV_DOMAIN		(((uint64_t)2) << 61)
+#define	CCMD_INV_DEVICE		(((uint64_t)3) << 61)
+#define	CCMD_INV_DID(x)		((uint64_t)((x) & 0xffff))
+#define	CCMD_INV_SID(x)		(((uint64_t)((x) & 0xffff)) << 16)
+#define	CCMD_INV_FM(x)		(((uint64_t)((x) & 0x3)) << 32)
+
+/* global command register */
+#define	IOMMU_GCMD_TE		(((uint32_t)1) << 31)
+#define	IOMMU_GCMD_SRTP		(((uint32_t)1) << 30)
+#define	IOMMU_GCMD_SFL		(((uint32_t)1) << 29)
+#define	IOMMU_GCMD_EAFL		(((uint32_t)1) << 28)
+#define	IOMMU_GCMD_WBF		(((uint32_t)1) << 27)
+#define	IOMMU_GCMD_QIE		(((uint32_t)1) << 26)
+#define	IOMMU_GCMD_IRE		(((uint32_t)1) << 25)
+#define	IOMMU_GCMD_SIRTP	(((uint32_t)1) << 24)
+#define	IOMMU_GCMD_CFI		(((uint32_t)1) << 23)
+
+/* global status register */
+#define	IOMMU_GSTS_TES		(((uint32_t)1) << 31)
+#define	IOMMU_GSTS_RTPS		(((uint32_t)1) << 30)
+#define	IOMMU_GSTS_FLS		(((uint32_t)1) << 29)
+#define	IOMMU_GSTS_AFLS		(((uint32_t)1) << 28)
+#define	IOMMU_GSTS_WBFS		(((uint32_t)1) << 27)
+#define	IOMMU_GSTS_QIES		(((uint32_t)1) << 26)
+#define	IOMMU_GSTS_IRES		(((uint32_t)1) << 25)
+#define	IOMMU_GSTS_IRTPS	(((uint32_t)1) << 24)
+#define	IOMMU_GSTS_CFIS		(((uint32_t)1) << 23)
+
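+/*
+ * Illustrative handshake (sketch): DMA remapping is typically enabled
+ * by writing the saved command bits plus IOMMU_GCMD_TE to the global
+ * command register and then polling the global status register until
+ * IOMMU_GSTS_TES becomes set; the same write-then-poll pattern applies
+ * to the other command/status bit pairs above.
+ */
+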
+/* psi address mask */
+#define	ADDR_AM_MAX(m)		(((uint_t)1) << (m))
+#define	ADDR_AM_OFFSET(n, m)	((n) & (ADDR_AM_MAX(m) - 1))
+
+/* dmar fault event */
+#define	IOMMU_INTR_IPL			(8)
+#define	IOMMU_REG_FEVNT_CON_IM_SHIFT	(31)
+
+/* page entry structure */
+typedef uint64_t *iopte_t;
+
+/* root/context entry structure */
+typedef struct iorce {
+	uint64_t lo;
+	uint64_t hi;
+} *iorce_t;
+
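+/*
+ * Illustrative use of the root/context entry macros above (sketch,
+ * assuming `rce' points to a struct iorce and `ct_paddr' is the page
+ * aligned physical address of a context table):
+ *	ROOT_ENTRY_SET_CTP(rce, ct_paddr);
+ *	ROOT_ENTRY_SET_P(rce);
+ */
+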
+/* kernel maintained page table entry */
+typedef struct iovpte {
+	/*
+	 * pointer to the cpu accessible
+	 * iommu page table
+	 */
+	caddr_t vp;
+	/*
+	 * pointer to the real iommu
+	 * page table
+	 */
+	caddr_t pp;
+} *iovpte_t;
+
+/*
+ * struct iommu_kstat
+ *   kstat structure for iommu
+ */
+typedef struct iommu_kstat {
+
+	/* hardware dependent */
+	kstat_named_t is_enabled;
+	kstat_named_t is_iotlb_psi;
+	kstat_named_t is_iotlb_domain;
+	kstat_named_t is_iotlb_global;
+	kstat_named_t is_write_buffer;
+	kstat_named_t is_context_cache;
+	kstat_named_t is_wait_complete_us;
+	kstat_named_t is_domain_alloc;
+
+	/* hardware independent */
+	kstat_named_t is_page_used;
+} iommu_kstat_t;
+
+/*
+ * struct iommu_stat
+ *   statistics for iommu
+ */
+typedef struct iommu_stat {
+	uint64_t st_iotlb_psi;
+	uint64_t st_iotlb_domain;
+	uint64_t st_iotlb_global;
+	uint64_t st_write_buffer;
+	uint64_t st_context_cache;
+	uint64_t st_wait_complete_us;
+	uint64_t st_domain_alloc;
+} iommu_stat_t;
+
+struct intel_iommu_state;
+struct iommu_dvma_cookie;
+struct dmar_domain_state;
+
+/*
+ * invalidation granularity
+ */
+typedef enum {
+	TLB_INV_G_GLOBAL = 1,
+	TLB_INV_G_DOMAIN,
+	TLB_INV_G_PAGE
+} tlb_inv_g_t;
+
+typedef enum {
+	CTT_INV_G_GLOBAL = 1,
+	CTT_INV_G_DOMAIN,
+	CTT_INV_G_DEVICE
+} ctt_inv_g_t;
+
+/*
+ * struct dmar_ops
+ *   dmar hardware operation functions
+ */
+struct dmar_ops {
+	/* enable */
+	void (*do_enable)(struct intel_iommu_state *iommu);
+
+	/* page fault */
+	int (*do_fault)(struct intel_iommu_state *iommu);
+
+	/* cache related */
+	void (*do_flwb)(struct intel_iommu_state *iommu);
+	void (*do_iotlb_psi)(struct intel_iommu_state *iommu, uint_t domain_id,
+	    uint64_t dvma, uint_t count, uint_t hint);
+	void (*do_iotlb_dsi)(struct intel_iommu_state *iommu, uint_t domain_id);
+	void (*do_iotlb_gbl)(struct intel_iommu_state *iommu);
+	void (*do_context_fsi)(struct intel_iommu_state *iommu,
+	    uint8_t function_mask,
+	    uint16_t source_id, uint_t domain_id);
+	void (*do_context_dsi)(struct intel_iommu_state *iommu,
+	    uint_t domain_id);
+	void (*do_context_gbl)(struct intel_iommu_state *iommu);
+	void (*do_plant_wait)(struct intel_iommu_state *iommu,
+	    struct iommu_dvma_cookie *dcookies, uint_t count,
+	    uint_t array_size);
+	void (*do_reap_wait)(struct intel_iommu_state *iommu);
+
+	/* root entry */
+	void (*do_set_root_table)(struct intel_iommu_state *iommu);
+
+	/* cpu cache line flush */
+	void (*do_clflush)(caddr_t addr, uint_t size);
+};
+
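+/*
+ * Illustrative call pattern (hypothetical caller): after unmapping a
+ * dvma range, the translation code would typically flush the affected
+ * mappings through the per-unit ops vector, e.g.
+ *	iommu->iu_dmar_ops->do_iotlb_psi(iommu, domain_id, dvma,
+ *	    npages, TLB_IVA_WHOLE);
+ * which allows different invalidation back ends (e.g. register based
+ * vs. queued) to be selected at attach time without changing callers.
+ */
+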
+/*
+ * struct iotlb_pend_node
+ *   the pending data for iotlb flush
+ */
+typedef struct iotlb_pend_node {
+	/* node to hook into the list */
+	list_node_t			node;
+	/* ptr to dvma cookie array */
+	struct iommu_dvma_cookie	*icn_dcookies;
+	/* valid cookie count */
+	uint_t				icn_count;
+	/* array size */
+	uint_t				icn_array_size;
+} iotlb_pend_node_t;
+
+/*
+ * struct iotlb_pend_head
+ *   the pending head for the iotlb flush
+ */
+typedef struct iotlb_pend_head {
+	/* the pending iotlb list */
+	kmutex_t	ich_pend_lock;
+	list_t		ich_pend_list;
+	uint_t		ich_pend_count;
+
+	/* the pending node cache list */
+	kmutex_t	ich_mem_lock;
+	list_t		ich_mem_list;
+} iotlb_pend_head_t;
+
+/*
+ * struct intel_iommu_state
+ *   This structure describes the state information
+ *   of each iommu unit in the platform. It is created
+ *   in the dmarnex driver's attach(), and is used in
+ *   every DMA DDI call and in the iommu translation
+ *   functions
+ *
+ * node			- the list node to hook it in iommu_states
+ * iu_drhd		- the related drhd
+ * iu_reg_handle	- register access handle
+ * iu_reg_lock		- lock to protect register operation
+ * iu_reg_address	- virtual address of the register base address
+ * iu_capability	- copy of the capability register
+ * iu_excapability	- copy of the extended capability register
+ * iu_root_entry_paddr	- root entry page table
+ * iu_root_context_lock	- root context entry lock
+ * iu_gaw		- guest address width
+ * iu_agaw		- adjusted guest address width
+ * iu_level		- the page table level
+ * iu_global_cmd_reg	- global command register save place
+ * iu_max_domain	- the maximum number of domains
+ * iu_domain_id_hdl	- domain id allocator handle
+ * iu_enabled		- the soft state of the iommu
+ * iu_coherency		- hardware access is coherent
+ * iu_kstat		- kstat pointer
+ * iu_statistics	- iommu statistics
+ * iu_dmar_ops		- iommu operation functions
+ * iu_pend_head		- pending iotlb list
+ */
+typedef struct intel_iommu_state {
+	list_node_t		node;
+	drhd_info_t		*iu_drhd;
+	ddi_acc_handle_t	iu_reg_handle;
+	kmutex_t		iu_reg_lock;
+	caddr_t			iu_reg_address;
+	uint64_t		iu_capability;
+	uint64_t		iu_excapability;
+	paddr_t			iu_root_entry_paddr;
+	kmutex_t		iu_root_context_lock;
+	int			iu_gaw;
+	int			iu_agaw;
+	int			iu_level;
+	uint32_t		iu_global_cmd_reg;
+	int			iu_max_domain;
+	iommu_rscs_t		iu_domain_id_hdl;
+	boolean_t		iu_enabled;
+	boolean_t		iu_coherency;
+	kstat_t			*iu_kstat;
+	iommu_stat_t		iu_statistics;
+	struct dmar_ops		*iu_dmar_ops;
+	iotlb_pend_head_t	iu_pend_head;
+} intel_iommu_state_t;
+
+/*
+ * struct dvma_cache_node
+ *   dvma cache node
+ */
+typedef struct dvma_cache_node {
+	list_node_t		node;
+
+	/* parameters */
+	size_t			dcn_align;
+	uint64_t		dcn_dvma;
+} dvma_cache_node_t;
+
+/*
+ * struct dvma_cache_head
+ *   dvma cache head
+ */
+typedef struct dvma_cache_head {
+	/* the list of the free dvma */
+	kmutex_t	dch_free_lock;
+	list_t		dch_free_list;
+	uint_t		dch_free_count;
+
+	/* the cache for the node memory */
+	kmutex_t	dch_mem_lock;
+	list_t		dch_mem_list;
+} dvma_cache_head_t;
+
+#define	DVMA_CACHE_HEAD_CNT	64
+
+/*
+ * struct dmar_domain_state
+ *   This structure describes the state information
+ *   of an iommu domain. It is created and initialized
+ *   when a driver calls ddi_dma_bind_handle(), and is
+ *   used by the iommu translation functions
+ *
+ * dm_domain_id		- the domain id
+ * dm_iommu		- iommu pointer this domain belongs to
+ * dm_dvma_map		- dvma map
+ * dm_dvma_cache	- dvma cache lists
+ * dm_page_table_paddr	- page table address for this domain
+ * dm_pt_tree		- the kernel maintained page tables
+ * dm_identity		- is this domain identity mapped
+ */
+typedef struct dmar_domain_state {
+	uint_t			dm_domain_id;
+	intel_iommu_state_t	*dm_iommu;
+	vmem_t			*dm_dvma_map;
+	dvma_cache_head_t	dm_dvma_cache[DVMA_CACHE_HEAD_CNT];
+	paddr_t			dm_page_table_paddr;
+	struct iovpte		dm_pt_tree;
+	boolean_t		dm_identity;
+} dmar_domain_state_t;
+
+/*
+ * struct dmar_reserve_pages
+ *   This structure describes the reserved memory regions which
+ *   cannot be allocated by vmem.
+ *
+ * node		- list node
+ * rm_pfn_start	- the start page frame number
+ * rm_pfn_end	- the end page frame number
+ */
+typedef struct dmar_reserve_pages {
+	list_node_t	node;
+	uint64_t	rm_pfn_start;
+	uint64_t	rm_pfn_end;
+} dmar_reserve_pages_t;
+
+/*
+ * struct pci_dev_info
+ *   pci device info structure
+ */
+typedef struct pci_dev_info {
+	list_node_t	node;
+	int		pdi_seg;
+	int		pdi_bus;
+	int		pdi_devfn;
+	dev_info_t	*pdi_dip;
+} pci_dev_info_t;
+
+/*
+ * struct iommu_private
+ *   the intel iommu private structure hooked on the dev_info
+ */
+typedef struct iommu_private {
+	/* pci seg, bus, dev, func */
+	int		idp_seg;
+	int		idp_bus;
+	int		idp_devfn;
+
+	/* ppb information */
+	boolean_t	idp_is_bridge;
+	int		idp_bbp_type;
+	int		idp_sec;
+	int		idp_sub;
+
+	/* identifier for special devices */
+	boolean_t	idp_is_display;
+	boolean_t	idp_is_lpc;
+
+	/* domain ptr */
+	dmar_domain_state_t	*idp_domain;
+} iommu_private_t;
+
+#define		IOMMU_PPB_NONE		0
+#define		IOMMU_PPB_PCIE_PCIE	1
+#define		IOMMU_PPB_PCIE_PCI	2
+#define		IOMMU_PPB_PCI_PCI	3
+
+#define		MAX_COOKIE_CACHE_SIZE	20
+/*
+ * struct iommu_dvma_cookie
+ *   this cookie records the dvma allocated for
+ *   an individual device
+ */
+typedef struct iommu_dvma_cookie {
+	uint64_t	dc_addr;
+	uint64_t	dc_size;
+	struct dmar_domain_state	*dc_domain;
+	size_t		dc_align;
+	struct iommu_dvma_cookie	*dc_next;
+} iommu_dvma_cookie_t;
+
+/*
+ * struct dvma_cookie_head
+ *   the cookie cache head
+ */
+typedef struct dvma_cookie_head {
+	kmutex_t		dch_lock;
+	iommu_dvma_cookie_t	*dch_next;
+	uint_t			dch_count;
+} dvma_cookie_head_t;
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_INTEL_IOMMU_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/sys/iommu_rscs.h	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_IOMMU_H
+#define	_SYS_IOMMU_H
+
+/*
+ * XXX
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/modctl.h>
+#include <sys/sunddi.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+
+/*
+ * iommu_page_alloc()
+ *   allocate a 4K page and map it into KVA
+ * iommu_page_free()
+ *   unmap and free page from iommu_page_alloc()
+ * iommu_page_map()
+ *   map page into kva
+ * iommu_page_unmap()
+ *   unmap page out of kva
+ */
+paddr_t iommu_page_alloc(int kmflag);
+void iommu_page_free(paddr_t paddr);
+caddr_t iommu_page_map(paddr_t paddr);
+void iommu_page_unmap(caddr_t kva);
+
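+/*
+ * Illustrative usage (sketch; kmflag handling is assumed to follow the
+ * usual KM_SLEEP/KM_NOSLEEP convention):
+ *	paddr_t pa = iommu_page_alloc(KM_SLEEP);
+ *	caddr_t va = iommu_page_map(pa);
+ *	... fill in the page table page through va ...
+ *	iommu_page_unmap(va);
+ *	iommu_page_free(pa);
+ */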
+
+typedef struct iommu_rscs_s *iommu_rscs_t;
+
+void iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle);
+void iommu_rscs_fini(iommu_rscs_t *handle);
+int iommu_rscs_alloc(iommu_rscs_t handle, uint_t *rs);
+void iommu_rscs_free(iommu_rscs_t handle, uint_t rs);
+
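+/*
+ * Illustrative usage for a domain id allocator (sketch; `max_domain'
+ * is a placeholder and the success return value of iommu_rscs_alloc()
+ * is assumed to be DDI_SUCCESS):
+ *	iommu_rscs_t hdl;
+ *	uint_t id;
+ *	iommu_rscs_init(1, max_domain, &hdl);
+ *	if (iommu_rscs_alloc(hdl, &id) == DDI_SUCCESS) {
+ *		... use id ...
+ *		iommu_rscs_free(hdl, id);
+ *	}
+ *	iommu_rscs_fini(&hdl);
+ */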
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_IOMMU_H */
--- a/usr/src/uts/i86pc/sys/machsystm.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/sys/machsystm.h	Sun Sep 14 19:52:20 2008 -0700
@@ -27,8 +27,6 @@
 #ifndef _SYS_MACHSYSTM_H
 #define	_SYS_MACHSYSTM_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Numerous platform-dependent interfaces that don't seem to belong
  * in any other header file.
@@ -132,7 +130,8 @@
 struct memlist;
 extern void memlist_add(uint64_t, uint64_t, struct memlist *,
     struct memlist **);
-extern page_t *page_get_physical(uintptr_t);
+extern page_t *page_get_physical(int flags);
+extern void page_free_physical(page_t *);
 extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
 
--- a/usr/src/uts/i86pc/sys/rootnex.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/sys/rootnex.h	Sun Sep 14 19:52:20 2008 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ROOTNEX_H
 #define	_SYS_ROOTNEX_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * x86 root nexus implementation specific state
  */
@@ -294,6 +291,13 @@
 	 * expensive on x86.
 	 */
 	uchar_t			*dp_prealloc_buffer;
+
+	/*
+	 * intel iommu related state
+	 * dvma_cookies saves the dvma allocated for this handler, it has the
+	 * size of si_max_pages, set when bind handler and freed when unbind
+	 */
+	void			*dp_dvma_cookies;
 } rootnex_dma_t;
 
 /*
@@ -323,6 +327,7 @@
  *   r_dip - rootnex dip
  *   r_reserved_msg_printed - ctlops reserve message threshold
  *   r_counters - profile/performance counters
+ *   r_intel_iommu_enabled - intel iommu enabled
  */
 typedef struct rootnex_state_s {
 	uint_t			r_prealloc_cookies;
@@ -334,6 +339,7 @@
 	ddi_iblock_cookie_t	r_err_ibc;
 	boolean_t		r_reserved_msg_printed;
 	uint64_t		r_counters[ROOTNEX_CNT_LAST];
+	boolean_t		r_intel_iommu_enabled;
 } rootnex_state_t;
 
 
--- a/usr/src/uts/i86pc/vm/htable.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/vm/htable.c	Sun Sep 14 19:52:20 2008 -0700
@@ -24,8 +24,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
@@ -266,7 +264,7 @@
  * A wrapper around page_get_physical(), with some extra checks.
  */
 static pfn_t
-ptable_alloc(uintptr_t seed)
+ptable_alloc(void)
 {
 	pfn_t pfn;
 	page_t *pp;
@@ -300,13 +298,11 @@
 	}
 #endif /* DEBUG */
 
-	pp = page_get_physical(seed);
+	pp = page_get_physical(KM_NOSLEEP);
 	if (pp == NULL)
 		return (PFN_INVALID);
+	ASSERT(PAGE_SHARED(pp));
 	pfn = pp->p_pagenum;
-	page_downgrade(pp);
-	ASSERT(PAGE_SHARED(pp));
-
 	if (pfn == PFN_INVALID)
 		panic("ptable_alloc(): Invalid PFN!!");
 	HATSTAT_INC(hs_ptable_allocs);
@@ -330,29 +326,13 @@
 	atomic_add_32(&active_ptables, -1);
 	if (pp == NULL)
 		panic("ptable_free(): no page for pfn!");
-	ASSERT(PAGE_SHARED(pp));
 	ASSERT(pfn == pp->p_pagenum);
 	ASSERT(!IN_XPV_PANIC());
-
-	/*
-	 * Get an exclusive lock, might have to wait for a kmem reader.
-	 */
-	if (!page_tryupgrade(pp)) {
-		page_unlock(pp);
-		/*
-		 * RFE: we could change this to not loop forever
-		 * George Cameron had some idea on how to do that.
-		 * For now looping works - it's just like sfmmu.
-		 */
-		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
-			continue;
-	}
 #ifdef __xpv
 	if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0)
 		panic("failure making kpm r/w pfn=0x%lx", pfn);
 #endif
-	page_free(pp, 1);
-	page_unresv(1);
+	page_free_physical(pp);
 }
 
 /*
@@ -680,7 +660,6 @@
 	return (list);
 }
 
-
 /*
  * This is invoked from kmem when the system is low on memory.  We try
  * to free hments, htables, and ptables to improve the memory situation.
@@ -788,7 +767,7 @@
 		 */
 		if (ht != NULL && !is_bare) {
 			ht->ht_hat = hat;
-			ht->ht_pfn = ptable_alloc((uintptr_t)ht);
+			ht->ht_pfn = ptable_alloc();
 			if (ht->ht_pfn == PFN_INVALID) {
 				if (USE_HAT_RESERVES())
 					htable_put_reserve(ht);
@@ -851,7 +830,7 @@
 		for (;;) {
 			htable_t *stolen;
 
-			hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1);
+			hat->hat_user_ptable = ptable_alloc();
 			if (hat->hat_user_ptable != PFN_INVALID)
 				break;
 			stolen = htable_steal(1);
--- a/usr/src/uts/i86pc/vm/vm_machdep.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/vm/vm_machdep.c	Sun Sep 14 19:52:20 2008 -0700
@@ -31,8 +31,6 @@
  * under license from the Regents of the University of California.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * UNIX machine dependent virtual memory support.
  */
@@ -3708,34 +3706,25 @@
  * available - this would have a minimal impact on page coloring.
  */
 page_t *
-page_get_physical(uintptr_t seed)
+page_get_physical(int flags)
 {
 	page_t *pp;
-	u_offset_t offset;
+	u_offset_t offset = (u_offset_t)1 << 41;	/* in VA hole */
 	static struct seg tmpseg;
 	static uintptr_t ctr = 0;
+	static kmutex_t pgp_mutex;
 
 	/*
 	 * This code is gross, we really need a simpler page allocator.
 	 *
-	 * We need assign an offset for the page to call page_create_va().
 	 * To avoid conflicts with other pages, we get creative with the offset.
-	 * For 32 bits, we pick an offset > 4Gig
-	 * For 64 bits, pick an offset somewhere in the VA hole.
+	 * For 32 bits, we need an offset > 4Gig
+	 * For 64 bits, need an offset somewhere in the VA hole.
 	 */
-	offset = seed;
-	if (offset > kernelbase)
-		offset -= kernelbase;
-	offset <<= MMU_PAGESHIFT;
-#if defined(__amd64)
-	offset += mmu.hole_start;	/* something in VA hole */
-#else
-	offset += 1ULL << 40;		/* something > 4 Gig */
-#endif
-
-	if (page_resv(1, KM_NOSLEEP) == 0)
+	if (page_resv(1, flags & KM_NOSLEEP) == 0)
 		return (NULL);
 
+	mutex_enter(&pgp_mutex);
 #ifdef	DEBUG
 	pp = page_exists(&kvp, offset);
 	if (pp != NULL)
@@ -3744,9 +3733,32 @@
 
 	pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL,
 	    &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE));	/* changing VA usage */
-	if (pp == NULL)
-		return (NULL);
-	page_io_unlock(pp);
-	page_hashout(pp, NULL);
+	if (pp != NULL) {
+		page_io_unlock(pp);
+		page_hashout(pp, NULL);
+		page_downgrade(pp);
+	}
+	mutex_exit(&pgp_mutex);
 	return (pp);
 }
+
+void
+page_free_physical(page_t *pp)
+{
+	/*
+	 * Get an exclusive lock, might have to wait for a kmem reader.
+	 */
+	ASSERT(PAGE_SHARED(pp));
+	if (!page_tryupgrade(pp)) {
+		page_unlock(pp);
+		/*
+		 * RFE: we could change this to not loop forever
+		 * George Cameron had some idea on how to do that.
+		 * For now looping works - it's just like sfmmu.
+		 */
+		while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM))
+			continue;
+	}
+	page_free(pp, 1);
+	page_unresv(1);
+}
--- a/usr/src/uts/intel/ia32/ml/i86_subr.s	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/intel/ia32/ml/i86_subr.s	Sun Sep 14 19:52:20 2008 -0700
@@ -4212,5 +4212,67 @@
 	ret
 	SET_SIZE(ftrace_interrupt_enable)
 
-#endif	/* __i386 */	
+#endif	/* __i386 */
 #endif	/* __lint */
+
+#if defined (__lint)
+
+/*ARGSUSED*/
+void
+iommu_cpu_nop(void)
+{}
+
+#else /* __lint */
+
+	ENTRY(iommu_cpu_nop)
+	rep;	nop
+	ret
+	SET_SIZE(iommu_cpu_nop)
+
+#endif /* __lint */
+
+#if defined (__lint)
+
+/*ARGSUSED*/
+void
+clflush_insn(caddr_t addr)
+{}
+
+#else /* __lint */
+
+#if defined (__amd64)
+	ENTRY(clflush_insn)
+	clflush (%rdi)
+	ret
+	SET_SIZE(clflush_insn)
+#elif defined (__i386)
+	ENTRY(clflush_insn)
+	movl	4(%esp), %eax
+	clflush (%eax)
+	ret
+	SET_SIZE(clflush_insn)
+
+#endif /* __i386 */
+#endif /* __lint */
+
+#if defined (__lint)
+/*ARGSUSED*/
+void
+mfence_insn(void)
+{}
+
+#else /* __lint */
+
+#if defined (__amd64)
+	ENTRY(mfence_insn)
+	mfence
+	ret
+	SET_SIZE(mfence_insn)
+#elif defined (__i386)
+	ENTRY(mfence_insn)
+	mfence
+	ret
+	SET_SIZE(mfence_insn)
+
+#endif /* __i386 */
+#endif /* __lint */
--- a/usr/src/uts/intel/io/pci/pci_boot.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/intel/io/pci/pci_boot.c	Sun Sep 14 19:52:20 2008 -0700
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/sunndi.h>
@@ -43,6 +41,7 @@
 #include <io/hotplug/pciehpc/pciehpc_acpi.h>
 #include <sys/acpi/acpi.h>
 #include <sys/acpica.h>
+#include <sys/intel_iommu.h>
 
 #define	pci_getb	(*pci_getb_func)
 #define	pci_getw	(*pci_getw_func)
@@ -1374,6 +1373,7 @@
 	int pciex = 0;
 	ushort_t is_pci_bridge = 0;
 	struct pci_devfunc *devlist = NULL, *entry = NULL;
+	iommu_private_t *private;
 
 	ushort_t deviceid = pci_getw(bus, dev, func, PCI_CONF_DEVID);
 
@@ -1598,6 +1598,35 @@
 		reprogram = 0;	/* don't reprogram pci-ide bridge */
 	}
 
+	/* allocate and set up iommu private */
+	private = kmem_alloc(sizeof (iommu_private_t), KM_SLEEP);
+	private->idp_seg = 0;
+	private->idp_bus = bus;
+	private->idp_devfn = (dev << 3) | func;
+	private->idp_sec = 0;
+	private->idp_sub = 0;
+	private->idp_bbp_type = IOMMU_PPB_NONE;
+	/* record the bridge */
+	private->idp_is_bridge = ((basecl == PCI_CLASS_BRIDGE) &&
+	    (subcl == PCI_BRIDGE_PCI));
+	if (private->idp_is_bridge) {
+		private->idp_sec = pci_getb(bus, dev, func, PCI_BCNF_SECBUS);
+		private->idp_sub = pci_getb(bus, dev, func, PCI_BCNF_SUBBUS);
+		if (pciex && is_pci_bridge)
+			private->idp_bbp_type = IOMMU_PPB_PCIE_PCI;
+		else if (pciex)
+			private->idp_bbp_type = IOMMU_PPB_PCIE_PCIE;
+		else
+			private->idp_bbp_type = IOMMU_PPB_PCI_PCI;
+	}
+	/* record the special devices */
+	private->idp_is_display = (is_display(classcode) ? B_TRUE : B_FALSE);
+	private->idp_is_lpc = ((basecl == PCI_CLASS_BRIDGE) &&
+	    (subcl == PCI_BRIDGE_ISA));
+	private->idp_domain = NULL;
+	/* hook the private to dip */
+	DEVI(dip)->devi_iommu_private = private;
+
 	if (reprogram && (entry != NULL))
 		entry->reprogram = B_TRUE;
 }
--- a/usr/src/uts/intel/sys/archsystm.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/intel/sys/archsystm.h	Sun Sep 14 19:52:20 2008 -0700
@@ -45,6 +45,9 @@
 extern ulong_t getcr0(void);
 extern void setcr0(ulong_t);
 extern ulong_t getcr2(void);
+extern void iommu_cpu_nop(void);
+extern void clflush_insn(caddr_t addr);
+extern void mfence_insn(void);
 
 #if defined(__i386)
 extern uint16_t getgs(void);
--- a/usr/src/uts/intel/sys/x86_archext.h	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/intel/sys/x86_archext.h	Sun Sep 14 19:52:20 2008 -0700
@@ -337,6 +337,7 @@
 #define	X86_SSE4_1	0x04000000
 #define	X86_SSE4_2	0x08000000
 #define	X86_1GPG	0x10000000
+#define	X86_CLFSH	0x20000000
 
 /*
  * flags to patch tsc_read routine.
@@ -512,6 +513,7 @@
 extern uint_t x86_feature;
 extern uint_t x86_type;
 extern uint_t x86_vendor;
+extern uint_t x86_clflush_size;
 
 extern uint_t pentiumpro_bug4046376;
 extern uint_t pentiumpro_bug4064495;