changeset 11600:651a9a4f7b5f

6875273 Intel IOMMU needs a rewrite
6855502 iommu: Toshiba Portege R600 fails to suspend with VT enabled starting with daily.0624
6874904 Lenovo X301 - Messages spews up some stuff now. Suspend/Resume no longer work.
6885148 Huge network performance drop with multiple NICs on x86 platforms with IOMMU
6890819 slow reboot got much slower in snv_118 on my tecra M10
6808450 Fast Reboot does not work on Virgo blade
6877258 Virgo will kernel panic with VT-d enabled under heavy network traffic
6910946 Westmere Class System panics on snv_129-: Freeing a free IOMMU page: paddr=0x8379c000 under I/O load
author Vikram Hegde <Vikram.Hegde@Sun.COM>
date Sat, 30 Jan 2010 18:23:16 -0800
parents dd472370bf36
children 5a8b35d191ab
files usr/src/cmd/mdb/common/modules/rootnex/intel_iommu.c usr/src/cmd/mdb/intel/amd64/Makefile usr/src/cmd/mdb/intel/ia32/Makefile usr/src/cmd/mdb/intel/ia32/rootnex/Makefile usr/src/pkgdefs/SUNWmdb/prototype_i386 usr/src/pkgdefs/SUNWmdbr/prototype_i386 usr/src/uts/common/os/devcfg.c usr/src/uts/common/os/mem_config.c usr/src/uts/common/sys/ddi_impldefs.h usr/src/uts/common/sys/ddidmareq.h usr/src/uts/common/sys/sunddi.h usr/src/uts/i86pc/Makefile.files usr/src/uts/i86pc/io/dmar_acpi.c usr/src/uts/i86pc/io/immu.c usr/src/uts/i86pc/io/immu_dmar.c usr/src/uts/i86pc/io/immu_dvma.c usr/src/uts/i86pc/io/immu_intrmap.c usr/src/uts/i86pc/io/immu_qinv.c usr/src/uts/i86pc/io/immu_regs.c usr/src/uts/i86pc/io/intel_iommu.c usr/src/uts/i86pc/io/iommu_rscs.c usr/src/uts/i86pc/io/mp_platform_common.c usr/src/uts/i86pc/io/pcplusmp/apic.c usr/src/uts/i86pc/io/pcplusmp/apic_introp.c usr/src/uts/i86pc/io/rootnex.c usr/src/uts/i86pc/os/ddi_impl.c usr/src/uts/i86pc/os/fakebop.c usr/src/uts/i86pc/os/startup.c usr/src/uts/i86pc/rootnex/Makefile usr/src/uts/i86pc/sys/apic.h usr/src/uts/i86pc/sys/dmar_acpi.h usr/src/uts/i86pc/sys/immu.h usr/src/uts/i86pc/sys/intel_iommu.h usr/src/uts/i86pc/sys/iommu_rscs.h usr/src/uts/i86pc/sys/rootnex.h usr/src/uts/intel/ia32/ml/modstubs.s usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c usr/src/uts/intel/io/pci/pci_boot.c
diffstat 38 files changed, 9477 insertions(+), 8309 deletions(-)
line diff
--- a/usr/src/cmd/mdb/common/modules/rootnex/intel_iommu.c	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,883 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2009, Intel Corporation.
- * All rights reserved.
- */
-#include <sys/mdb_modapi.h>
-#include <sys/list.h>
-#include <sys/note.h>
-#include <sys/dditypes.h>
-#include <sys/ddi_impldefs.h>
-#include <sys/intel_iommu.h>
-#include <sys/iommulib.h>
-#include <stddef.h>
-
-/*
- * Does Intel IOMMU work on this system?
- */
-static boolean_t iommu_support = B_FALSE;
-
-static void
-iomuvtop_help(void)
-{
-	mdb_printf("print physical mapping of IO virtual address\n\n"
-	    "Usage:\n\n"
-	    "  address::iomuvtop <iova>\n\n"
-	    "Where, \"address\" is the address of the devinfo node, "
-	    "while \"iova\" is the DMA virtual address.\n");
-}
-
-static boolean_t
-iommu_supported(void)
-{
-	if (iommu_support == B_FALSE)
-		mdb_printf("No Intel IOMMU active on this system\n");
-	return (iommu_support);
-}
-
-/*
- * print_device_scope_cb()
- *   call back for print_device_scope()
- */
-static int
-print_device_scope_cb(uintptr_t addr, pci_dev_scope_t *devs, void *cbdata)
-{
-	_NOTE(ARGUNUSED(addr))
-
-	mdb_printf((char *)cbdata);
-	mdb_printf("BDF[%x:%x:%x],type[%x]\n",
-	    devs->pds_bus,
-	    devs->pds_dev,
-	    devs->pds_func,
-	    devs->pds_type);
-
-	return (WALK_NEXT);
-}
-
-/*
- * print_device_scope()
- *   a common function to print device scope of a drhd or rmrr
- */
-static void
-print_device_scope(const char *pre, uintptr_t addr)
-{
-	mdb_pwalk("list",
-	    (mdb_walk_cb_t)print_device_scope_cb, (void *)pre, addr);
-}
-
-/*
- * parse_hw_capa()
- * parse_hw_excapa()
- *
- *  Given the capability and extension capability register contents,
- *  parse and print supported features in <output>
- *
- *  Please refer to chapter 10.4.2/3 in "Intel virtualization technology
- *  for direct IO specification" for register details
- */
-static void
-parse_hw_capa(uint64_t capa)
-{
-	char string[128];
-	size_t len;
-
-	strcpy(string, "  Hardware Capability:\t\t");
-	if (IOMMU_CAP_GET_DRD(capa))
-		strcat(string, "DRD ");
-	if (IOMMU_CAP_GET_DWD(capa))
-		strcat(string, "DWD ");
-	if (IOMMU_CAP_GET_PSI(capa))
-		strcat(string, "PSI ");
-	if (IOMMU_CAP_GET_ISOCH(capa))
-		strcat(string, "ISOCH ");
-	if (IOMMU_CAP_GET_ZLR(capa))
-		strcat(string, "ZLR ");
-	if (IOMMU_CAP_GET_CM(capa))
-		strcat(string, "CM ");
-	if (IOMMU_CAP_GET_PHMR(capa))
-		strcat(string, "PHMR ");
-	if (IOMMU_CAP_GET_PLMR(capa))
-		strcat(string, "PLMR ");
-	if (IOMMU_CAP_GET_RWBF(capa))
-		strcat(string, "RWBF ");
-	if (IOMMU_CAP_GET_AFL(capa))
-		strcat(string, "AFL ");
-
-	len = strlen(string);
-	if ((len > 1) &&
-	    (string[len - 1] == ' '))
-		string[len - 1] = 0;
-
-	strcat(string, "\n");
-	mdb_printf(string);
-}
-
-static void
-parse_hw_excapa(uint64_t excapa)
-{
-	char string[128];
-	size_t len;
-
-	strcpy(string, "  Hardware Ex-Capability:\t");
-	if (IOMMU_ECAP_GET_SC(excapa))
-		strcat(string, "SC ");
-	if (IOMMU_ECAP_GET_PT(excapa))
-		strcat(string, "PT ");
-	if (IOMMU_ECAP_GET_CH(excapa))
-		strcat(string, "CH ");
-	if (IOMMU_ECAP_GET_EIM(excapa))
-		strcat(string, "EIM ");
-	if (IOMMU_ECAP_GET_IR(excapa))
-		strcat(string, "IR ");
-	if (IOMMU_ECAP_GET_DI(excapa))
-		strcat(string, "DI ");
-	if (IOMMU_ECAP_GET_QI(excapa))
-		strcat(string, "QI ");
-	if (IOMMU_ECAP_GET_C(excapa))
-		strcat(string, "C ");
-
-	len = strlen(string);
-	if ((len > 1) &&
-	    (string[len - 1] == ' '))
-		string[len - 1] = 0;
-
-	strcat(string, "\n");
-	mdb_printf(string);
-}
-
-typedef enum {
-	ERROR_SCOPE,
-	INCLUDE_ALL_SCOPE,
-	DEV_SCOPE
-} iomu_scope_t;
-
-/*
- * print_iommu_state()
- *  Given an iommu_state structure, parse and print iommu information
- *
- *  Returns:
- *   INCLUDE_ALL_SCOPE if include all is set
- *   DEV_SCOPE if not set
- *   ERROR_SCOPE on error.
- */
-static iomu_scope_t
-print_iommu_state(intel_iommu_state_t *iommu, drhd_info_t *drhd)
-{
-	if ((iommu == NULL) || (drhd == NULL)) {
-		mdb_warn("Internal error - NULL iommu state pointer passed\n");
-		return (ERROR_SCOPE);
-	}
-
-	mdb_printf("Intel DMA remapping unit\n");
-	mdb_printf("  IOMMU Status:\t\t\t%s\n",
-	    (iommu->iu_enabled & DMAR_ENABLE) ? "Enabled" : "Disabled");
-	mdb_printf("  Queued Invalid:\t\t%s\n",
-	    (iommu->iu_enabled & QINV_ENABLE) ? "Enabled" : "Disabled");
-	mdb_printf("  Interrupt remapping:\t\t%s\n",
-	    (iommu->iu_enabled & INTRR_ENABLE) ? "Enabled" : "Disabled");
-	mdb_printf("  Register Physical Address:\t%p\n",
-	    (uintptr_t)drhd->di_reg_base);
-	mdb_printf("  Register Virtual Address:\t%p\n",
-	    (uintptr_t)iommu->iu_reg_address);
-	parse_hw_capa(iommu->iu_capability);
-	parse_hw_excapa(iommu->iu_excapability);
-	mdb_printf("  Root Entry Table:\t\t%p\n",
-	    (uintptr_t)iommu->iu_root_entry_paddr);
-	mdb_printf("  Guest Address Width:\t\t%d\n", iommu->iu_gaw);
-	mdb_printf("  Adjust Guest Address Width:\t%d\n", iommu->iu_agaw);
-	mdb_printf("  Page Table Level:\t\t%d\n", iommu->iu_level);
-	mdb_printf("  Max Domain Supported:\t\t%d\n", iommu->iu_max_domain);
-	mdb_printf("  System Coherence:\t\t%s\n",
-	    iommu->iu_coherency ? "Yes" : "No");
-	mdb_printf("  Include All unit:\t\t%s\n",
-	    drhd->di_include_all ? "Yes" : "No");
-	mdb_printf("  Devinfo Node:\t\t\t%p\n",
-	    (intptr_t)drhd->di_dip);
-
-	if (iommu->iu_enabled & QINV_ENABLE) {
-		struct inv_queue_state qi_state;
-		if (iommu->iu_inv_queue &&
-		    mdb_vread(&qi_state, sizeof (qi_state),
-		    (intptr_t)iommu->iu_inv_queue) == sizeof (qi_state)) {
-			mdb_printf("  Qinv Table:\t\t\tpaddr:%p, "
-			    "vaddr:%p, size:%x\n",
-			    (uintptr_t)qi_state.iq_table.paddr,
-			    (uintptr_t)qi_state.iq_table.vaddr,
-			    qi_state.iq_table.size);
-			mdb_printf("  Sync Table:\t\t\tpaddr:%p, "
-			    "vaddr:%p, size:%x\n",
-			    (uintptr_t)qi_state.iq_sync.paddr,
-			    (uintptr_t)qi_state.iq_sync.vaddr,
-			    qi_state.iq_sync.size);
-		} else {
-			mdb_warn("failed to read iommu invalidation "
-			    "queue state at %p\n",
-			    (uintptr_t)iommu->iu_inv_queue);
-			return (ERROR_SCOPE);
-		}
-	}
-
-	return (drhd->di_include_all ? INCLUDE_ALL_SCOPE : DEV_SCOPE);
-}
-
-/*
- * dcmd: iomuprt
- */
-static int
-iomuprt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(argv))
-	intel_iommu_state_t iommu;
-	drhd_info_t drhd;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((argc != 0) || !(flags & DCMD_ADDRSPEC))
-		return (DCMD_USAGE);
-
-	if (!DCMD_HDRSPEC(flags))
-		mdb_printf("\n");
-
-	if ((mdb_vread(&iommu, sizeof (iommu), addr) == sizeof (iommu)) &&
-	    (iommu.iu_drhd != NULL) &&
-	    (mdb_vread(&drhd, sizeof (drhd),
-	    (intptr_t)iommu.iu_drhd) == sizeof (drhd))) {
-		switch (print_iommu_state(&iommu, &drhd)) {
-		case DEV_SCOPE:
-			/*
-			 * Use actual address of list_t in kernel for walker
-			 */
-			print_device_scope("  Device Scope:\t\t\t",
-			    (uintptr_t)((char *)iommu.iu_drhd +
-			    offsetof(drhd_info_t, di_dev_list)));
-			break;
-		case ERROR_SCOPE:
-			return (DCMD_ERR);
-		default:
-			break;
-		}
-	} else {
-		mdb_warn("failed to read iommu state at %p\n", addr);
-		return (DCMD_ERR);
-	}
-
-	return (DCMD_OK);
-}
-
-/*
- * print_iommu_addr()
- * callback to print addresses of IOMMU unit software structures
- */
-static int
-print_iommu_addr(uintptr_t addr, intel_iommu_state_t *ip, void *cbdata)
-{
-	_NOTE(ARGUNUSED(cbdata))
-	_NOTE(ARGUNUSED(ip))
-	intel_iommu_state_t iommu;
-
-	if (mdb_vread(&iommu, sizeof (iommu), addr) != sizeof (iommu)) {
-		mdb_warn("failed to read IOMMU structure at %p\n", addr);
-		return (WALK_ERR);
-	}
-
-	mdb_printf("%p\n", addr);
-
-	return (WALK_NEXT);
-}
-
-/*
- * dcmd: iomunits
- */
-static int
-iomunits(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(addr))
-	_NOTE(ARGUNUSED(argv))
-	GElf_Sym sym;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((flags & DCMD_ADDRSPEC) || (argc != 0)) {
-		return (DCMD_USAGE);
-	}
-
-	if (mdb_lookup_by_name("iommu_states", &sym) == -1) {
-		mdb_warn("failed to find symbol iommu_states\n");
-		return (DCMD_ERR);
-	}
-
-	addr = (uintptr_t)sym.st_value;
-	if (mdb_pwalk("list", (mdb_walk_cb_t)print_iommu_addr, NULL, addr)) {
-		mdb_warn("couldn't walk IOMMU state structures\n");
-		return (DCMD_ERR);
-	}
-	return (DCMD_OK);
-}
-
-
-
-/*
- * print_domain_state()
- *   Given a device domain structure, parse and print information
- */
-static void
-print_domain_state(dmar_domain_state_t *domain)
-{
-	if (domain == NULL) {
-		mdb_warn("Internal error: NULL domain pointer passed\n");
-		return;
-	}
-
-	mdb_printf("IOMMU device domain:\n");
-	mdb_printf("Domain ID:\t\t%d\n", domain->dm_domain_id);
-	mdb_printf("Bind IOMMU:\t\t%p\n", (uintptr_t)domain->dm_iommu);
-	mdb_printf("DVMA vmem:\t\t%p\n",
-	    (uintptr_t)domain->dm_dvma_map);
-	mdb_printf("Top Level Page Table:\t%p\n",
-	    (uintptr_t)domain->dm_page_table_paddr);
-	mdb_printf("Identity Mapping:\t\t%s\n",
-	    domain->dm_identity ? "YES" : "NO");
-}
-
-/*
- * dcmd: iomudomprt
- */
-static int
-iomudomprt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(argv))
-	dmar_domain_state_t domain;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((argc != 0) || !(flags & DCMD_ADDRSPEC))
-		return (DCMD_USAGE);
-
-	if (!DCMD_HDRSPEC(flags))
-		mdb_printf("\n");
-
-	if (mdb_vread(&domain, sizeof (domain), addr) == sizeof (domain)) {
-		print_domain_state(&domain);
-	} else {
-		mdb_warn("failed to read domain at %p\n", addr);
-		return (DCMD_ERR);
-	}
-
-	return (DCMD_OK);
-}
-
-/*
- * print_domain_addr()
- */
-static int
-print_domain_addr(uintptr_t addr, dmar_domain_state_t *domp, void *cbdata)
-{
-	_NOTE(ARGUNUSED(domp))
-	_NOTE(ARGUNUSED(cbdata))
-	dmar_domain_state_t domain;
-
-	if (iommu_supported() == B_FALSE)
-		return (WALK_NEXT);
-
-	if (mdb_vread(&domain, sizeof (domain), addr) != sizeof (domain)) {
-		mdb_warn("failed to read domain at %p\n", addr);
-		return (WALK_ERR);
-	}
-
-	mdb_printf("%p\n", addr);
-
-	return (WALK_NEXT);
-}
-
-/*
- * dcmd: iomudoms
- */
-static int
-iomudoms(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(addr))
-	_NOTE(ARGUNUSED(argv))
-	GElf_Sym sym;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((flags & DCMD_ADDRSPEC) || (argc != 0)) {
-		return (DCMD_USAGE);
-	}
-
-	if (mdb_lookup_by_name("domain_states", &sym) == -1) {
-		mdb_warn("failed to find symbol domain_states\n");
-		return (DCMD_ERR);
-	}
-
-	addr = (uintptr_t)sym.st_value;
-	if (mdb_pwalk("list", (mdb_walk_cb_t)print_domain_addr, NULL, addr))
-		return (DCMD_ERR);
-	return (DCMD_OK);
-}
-
-/*
- * print_rmrr_info()
- */
-static void
-print_rmrr_info(rmrr_info_t *rmrr)
-{
-	mdb_printf("Reserved Memory Region Reporting:\n");
-	mdb_printf("  Segment:\t%d\n", rmrr->ri_segment);
-	mdb_printf("  BaseAddr:\t%p\n", (uintptr_t)rmrr->ri_baseaddr);
-	mdb_printf("  LimiAddr:\t%p\n", (uintptr_t)rmrr->ri_limiaddr);
-}
-
-/*
- * print_rmrr_addr()
- *   list walk callback for list_rmrr
- */
-static int
-print_rmrr_addr(uintptr_t addr, rmrr_info_t *rp, void *cbdata)
-{
-	_NOTE(ARGUNUSED(rp))
-	_NOTE(ARGUNUSED(cbdata))
-	rmrr_info_t rmrr;
-
-	if (mdb_vread(&rmrr, sizeof (rmrr), addr) != sizeof (rmrr)) {
-		mdb_warn("failed to read RMRR structure at %p\n", addr);
-		return (WALK_ERR);
-	}
-
-	mdb_printf("%p\n", addr);
-
-	return (WALK_NEXT);
-}
-
-/*
- * dcmd: iomurmrrs
- */
-static int
-iomurmrrs(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(addr))
-	_NOTE(ARGUNUSED(argv))
-	GElf_Sym sym;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((flags & DCMD_ADDRSPEC) || (argc != 0)) {
-		return (DCMD_USAGE);
-	}
-
-	if (mdb_lookup_by_name("rmrr_states", &sym) == -1) {
-		mdb_warn("failed to find symbol rmrr_states\n");
-		return (DCMD_ERR);
-	}
-
-	addr = (uintptr_t)sym.st_value;
-	if (mdb_pwalk("list", (mdb_walk_cb_t)print_rmrr_addr, NULL, addr))
-		return (DCMD_ERR);
-	return (DCMD_OK);
-}
-
-/*
- * dcmd: iomurmrrprt: Given an RMRR address print the RMRR.
- */
-static int
-iomurmrrprt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(argv))
-	uintptr_t dev_list_addr;
-	rmrr_info_t rmrr;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if (!(flags & DCMD_ADDRSPEC) || (argc != 0)) {
-		return (DCMD_USAGE);
-	}
-
-	if (mdb_vread(&rmrr, sizeof (rmrr), addr) != sizeof (rmrr)) {
-		mdb_warn("failed to read RMRR structure at %p\n", addr);
-		return (DCMD_ERR);
-	}
-
-	dev_list_addr = addr + offsetof(rmrr_info_t, ri_dev_list);
-	print_rmrr_info(&rmrr);
-	print_device_scope("  DevScope:\t", dev_list_addr);
-
-	return (DCMD_OK);
-}
-
-/*
- * iova_level_to_offset()
- *   Given an iova and page table level, return the corresponding offset
- */
-static int
-iova_level_to_offset(uintptr_t iova, int level)
-{
-	int start, offset;
-
-	start = (level - 1) * IOMMU_LEVEL_STRIDE + IOMMU_PAGE_SHIFT;
-	offset = (iova >> start) & IOMMU_LEVEL_OFFSET;
-
-	return (offset);
-}
-
-/*
- * iovtp_read_table_entry()
- */
-static int
-iovtp_read_table_entry(uint64_t ptaddr, size_t offset,
-    void *ent_buf, size_t ent_size)
-{
-	if (mdb_pread(ent_buf, ent_size, ptaddr + offset * ent_size)
-	    != ent_size) {
-		return (B_FALSE);
-	} else {
-		return (B_TRUE);
-	}
-}
-
-/*
- * dcmd: iomuvtop
- */
-static int
-iomuvtop(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	iommu_private_t private;
-	dmar_domain_state_t domain;
-	struct dev_info dinfo;
-	intel_iommu_state_t iommu;
-	int i, level, offset;
-	uintptr_t iova;
-	uint64_t ptaddr, ptentr;
-	int bus, devfn;
-
-	struct root_context_entry {
-		uint64_t asr;
-		uint64_t pro;
-	} rc_entry;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if (!(flags & DCMD_ADDRSPEC) || (argc != 1)) {
-		return (DCMD_USAGE);
-	}
-
-	iova = (argv[0].a_type == MDB_TYPE_IMMEDIATE) ?
-	    (uintptr_t)argv[0].a_un.a_val :
-	    (uintptr_t)mdb_strtoull(argv->a_un.a_str);
-
-	/* read iommu private */
-	if ((mdb_vread(&dinfo, sizeof (dinfo), addr) != sizeof (dinfo)) ||
-	    (dinfo.devi_iommu_private == NULL) ||
-	    (mdb_vread(&private, sizeof (private),
-	    (uintptr_t)dinfo.devi_iommu_private) != sizeof (private))) {
-		mdb_warn("failed to read iommu private structure for "
-		    "devinfo node at address %p\n", addr);
-		return (DCMD_ERR);
-	}
-
-	bus = private.idp_bus;
-	devfn = private.idp_devfn;
-
-	/* read domain */
-	if (private.idp_intel_domain == NULL) {
-		mdb_printf("IOMMU domain for this device has not yet been "
-		    "allocated.\nNo mapped physical address for this vaddr\n");
-		return (DCMD_OK);
-	}
-
-	if (mdb_vread(&domain, sizeof (domain),
-	    (uintptr_t)private.idp_intel_domain)
-	    != sizeof (domain)) {
-		mdb_warn("failed to read domain structure at %p\n",
-		    (uintptr_t)private.idp_intel_domain);
-		return (DCMD_ERR);
-	}
-
-	/* read iommu */
-	if (mdb_vread(&iommu, sizeof (iommu), (uintptr_t)domain.dm_iommu)
-	    != sizeof (iommu)) {
-		mdb_warn("failed to read iommu structure at %p\n",
-		    (uintptr_t)domain.dm_iommu);
-		return (DCMD_ERR);
-	}
-
-	mdb_printf("Level\tPageTableAddress\tOffset\tPageTableEntry\n");
-
-	/* walk and print root context tables */
-	ptaddr = iommu.iu_root_entry_paddr;
-	if (iovtp_read_table_entry(ptaddr, bus, &rc_entry, sizeof (rc_entry))
-	    == B_FALSE) {
-		mdb_warn("failed to read root table entry for bus %x "
-		    "at %p\n", bus, (uintptr_t)ptaddr);
-		return (DCMD_ERR);
-	}
-	mdb_printf("Root\t%p\t\t%x\tlow :%p\n", (uintptr_t)ptaddr,
-	    bus, (uintptr_t)rc_entry.asr);
-	mdb_printf("Root\t%p\t\t%x\thigh:%p\n", (uintptr_t)ptaddr,
-	    bus, (uintptr_t)rc_entry.pro);
-
-	ptaddr = rc_entry.asr & IOMMU_PAGE_MASK;
-	if (iovtp_read_table_entry(ptaddr, devfn, &rc_entry, sizeof (rc_entry))
-	    == B_FALSE) {
-		mdb_warn("failed to read context table entry for "
-		    "device-function %x at %p\n", devfn, (uintptr_t)ptaddr);
-		return (DCMD_ERR);
-	}
-	mdb_printf("Context\t%p\t\t%x\tlow :%p\n", (uintptr_t)ptaddr,
-	    devfn, (uintptr_t)rc_entry.asr);
-	mdb_printf("Context\t%p\t\t%x\thigh:%p\n", (uintptr_t)ptaddr,
-	    devfn, (uintptr_t)rc_entry.pro);
-
-	/* walk and print page tables */
-	ptaddr = rc_entry.asr & IOMMU_PAGE_MASK;
-
-	/*
-	 * The top-level page table address should be the same
-	 * as that stored in the domain structure
-	 */
-	if (ptaddr != domain.dm_page_table_paddr) {
-		mdb_warn("The top level page table retrieved from context"
-		    " table doesn't match that from the domain structure."
-		    " Aborting PA lookup.\n");
-		return (DCMD_ERR);
-	}
-
-	level = iommu.iu_level;
-	for (i = level; i > 0; i--) {
-		if (!ptaddr) {
-			mdb_printf("\nNULL page table entry encountered at "
-			" page table level %d. Aborting PA lookup.\n", i);
-			return (DCMD_OK);
-		}
-		offset = iova_level_to_offset(iova, i);
-		if (iovtp_read_table_entry(ptaddr, offset, &ptentr,
-		    sizeof (ptentr)) == B_FALSE) {
-			mdb_warn("failed to read page table entry "
-			    "(level %d) at %p\n", i, (uintptr_t)ptaddr);
-			return (DCMD_ERR);
-		}
-		mdb_printf("%x\t%p\t\t%x\t%p\n", i, (uintptr_t)ptaddr,
-		    offset, (uintptr_t)ptentr);
-		ptaddr = ptentr & IOMMU_PAGE_MASK;
-	}
-
-	return (DCMD_OK);
-}
-
-typedef struct bdf_cb_data {
-	int	dc_seg;
-	int	dc_bus;
-	int	dc_devfunc;
-	int	dc_match;
-} bdf_cb_data_t;
-
-/*
- * match_bdf()
- *   call back function that matches BDF
- */
-static int
-match_bdf(uintptr_t addr, struct dev_info *dev, bdf_cb_data_t *cbdata)
-{
-	_NOTE(ARGUNUSED(addr))
-	/* if there is iommu private, get it */
-	if (dev->devi_iommu_private != NULL) {
-		iommu_private_t private;
-		if (mdb_vread((void*)&private, sizeof (private),
-		    (uintptr_t)dev->devi_iommu_private) != sizeof (private)) {
-			mdb_warn("failed to read iommu private at %p\n",
-			    (uintptr_t)dev->devi_iommu_private);
-			return (WALK_ERR);
-		}
-
-		if (private.idp_seg == cbdata->dc_seg &&
-		    private.idp_bus == cbdata->dc_bus &&
-		    private.idp_devfn == cbdata->dc_devfunc) {
-			if (cbdata->dc_match == 0) {
-				mdb_printf("%p\n", addr);
-				cbdata->dc_match = 1;
-			} else {
-				mdb_warn("More than one devinfo node matches "
-				    "a single pci device. Aborting devinfo "
-				    "lookup\n");
-				return (WALK_ERR);
-			}
-		}
-	}
-
-	return (WALK_NEXT);
-}
-
-/*
- * dcmd: bdf2devinfo
- */
-static int
-bdf2devinfo(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(addr))
-	bdf_cb_data_t cbdata;
-	uint_t i, bdf[4];
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if ((flags & DCMD_ADDRSPEC) || (argc != 4)) {
-		return (DCMD_USAGE);
-	}
-
-	for (i = 0; i < 4; i++) {
-		bdf[i] = (argv[i].a_type == MDB_TYPE_IMMEDIATE) ?
-		    (int)argv[i].a_un.a_val :
-		    (int)mdb_strtoull(argv[i].a_un.a_str);
-	}
-
-	if ((bdf[0] != 0) || (bdf[1] > 255) || (bdf[2] > 31) || (bdf[3] > 7)) {
-		mdb_warn("invalid pci segment, bus, device, function "
-		    "tuple (%x, %x, %x, %x)\n", bdf[0], bdf[1], bdf[2], bdf[3]);
-		return (DCMD_USAGE);
-	}
-
-
-	cbdata.dc_seg = bdf[0];
-	cbdata.dc_bus = bdf[1];
-	cbdata.dc_devfunc = bdf[2] << 3 | bdf[3];
-	cbdata.dc_match = 0;
-
-	if (mdb_readvar(&addr, "top_devinfo") == -1) {
-		mdb_warn("failed to read 'top_devinfo'\n");
-		return (DCMD_ERR);
-	}
-
-	if (mdb_pwalk("devinfo",
-	    (mdb_walk_cb_t)match_bdf, &cbdata, addr)) {
-		mdb_warn("couldn't walk devinfo tree\n");
-		return (DCMD_ERR);
-	}
-
-	if (cbdata.dc_match == 0)
-		mdb_printf("No devinfo node found for %x:%x:%x:%x\n",
-		    bdf[0], bdf[1], bdf[2], bdf[3]);
-
-	return (DCMD_OK);
-}
-
-/*
- * dcmd: iomudip2dom
- */
-static int
-iomudip2dom(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
-{
-	_NOTE(ARGUNUSED(argv))
-	struct dev_info dinfo;
-	iommu_private_t private;
-
-	if (iommu_supported() == B_FALSE)
-		return (DCMD_OK);
-
-	if (!(flags & DCMD_ADDRSPEC) || (argc != 0)) {
-		return (DCMD_USAGE);
-	}
-
-	/* read iommu private */
-	if ((mdb_vread(&dinfo, sizeof (dinfo), addr) != sizeof (dinfo)) ||
-	    (dinfo.devi_iommu_private == NULL) ||
-	    (mdb_vread(&private, sizeof (private),
-	    (uintptr_t)dinfo.devi_iommu_private) != sizeof (private))) {
-		mdb_warn("failed to read iommu private structure for "
-		    "devinfo node at %p\n", addr);
-		return (DCMD_ERR);
-	}
-
-	/* read domain */
-	if (private.idp_intel_domain != NULL) {
-		mdb_printf("%p\n", (uintptr_t)private.idp_intel_domain);
-	} else {
-		mdb_printf("No domain dedicated for this device\n");
-	}
-
-	return (DCMD_OK);
-}
-
-static const mdb_dcmd_t dcmds[] = {
-	{ "iomunits", NULL,
-		"list addresses of software state structure for all IOMMUs",
-		iomunits },
-	{ "iomuprt", "?",
-		"given an IOMMU's state structure address, print its contents",
-		iomuprt},
-	{ "iomudoms", NULL,
-		"list addresses of all IOMMU domain software structures",
-		iomudoms },
-	{ "iomudomprt", "?",
-		"given an IOMMU's domain struct address, print its contents",
-		iomudomprt },
-	{ "iomurmrrs", NULL,
-		"list addresses of all Intel IOMMU RMRR software structures",
-		iomurmrrs },
-	{ "iomurmrrprt", NULL,
-		"given an IOMMU RMRR structure address, print its contents",
-		iomurmrrprt },
-	{ "iomuvtop", "?<iova>",
-		"print physical address of an IO virtual address",
-		iomuvtop, iomuvtop_help },
-	{ "bdf2devinfo", "[segment] [bus] [dev] [func]",
-		"given its pci segment/bus/dev/func, print the devinfo node",
-		bdf2devinfo },
-	{ "iomudip2dom", "?",
-		"given a devinfo node, print the address of its IOMMU domain",
-		iomudip2dom },
-	{ NULL }
-};
-
-static const mdb_walker_t walkers[] = {
-	{ NULL }
-};
-
-static const mdb_modinfo_t modinfo = {
-	MDB_API_VERSION, dcmds, walkers
-};
-
-const mdb_modinfo_t *
-_mdb_init(void)
-{
-	GElf_Sym sym;
-
-	/* check to see if kernel supports iommu */
-	if (mdb_lookup_by_name("intel_iommu_support", &sym) != -1) {
-		if (mdb_vread(&iommu_support, sizeof (boolean_t),
-		    (uintptr_t)sym.st_value) != sizeof (boolean_t)) {
-			iommu_support = B_FALSE;
-		}
-	}
-
-	return (&modinfo);
-}
--- a/usr/src/cmd/mdb/intel/amd64/Makefile	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/cmd/mdb/intel/amd64/Makefile	Sat Jan 30 18:23:16 2010 -0800
@@ -19,13 +19,13 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
 include ../../Makefile.common
 
-MODULES = $(COMMON_MODULES_PROC) $(COMMON_MODULES_KVM) uhci rootnex
+MODULES = $(COMMON_MODULES_PROC) $(COMMON_MODULES_KVM) uhci
 
 $(CLOSED_BUILD)MODULES += \
 	$(CLOSED_COMMON_MODULES_KVM:%=$(CLOSED)/cmd/mdb/intel/amd64/%)
--- a/usr/src/cmd/mdb/intel/ia32/Makefile	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/cmd/mdb/intel/ia32/Makefile	Sat Jan 30 18:23:16 2010 -0800
@@ -19,14 +19,14 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
 include ../../Makefile.common
 
 MODULES = $(COMMON_MODULES_PROC) $(COMMON_MODULES_PROC_32BIT) \
-    $(COMMON_MODULES_KVM) uhci rootnex
+    $(COMMON_MODULES_KVM) uhci
 
 $(CLOSED_BUILD)MODULES += \
 	$(CLOSED_COMMON_MODULES_KVM:%=$(CLOSED)/cmd/mdb/intel/ia32/%)
--- a/usr/src/cmd/mdb/intel/ia32/rootnex/Makefile	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-
-# Copyright (c) 2009, Intel Corporation.
-# All rights reserved.
-
-MODULE = rootnex.so
-MDBTGT = kvm
-
-MODSRCS = intel_iommu.c
-
-include ../../../../Makefile.cmd
-include ../../Makefile.ia32
-include ../../../Makefile.module
-
-CPPFLAGS += -I$(SRC)/uts/i86pc
--- a/usr/src/pkgdefs/SUNWmdb/prototype_i386	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/pkgdefs/SUNWmdb/prototype_i386	Sat Jan 30 18:23:16 2010 -0800
@@ -19,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -81,7 +81,6 @@
 f none usr/lib/mdb/kvm/amd64/nfs.so 555 root sys
 f none usr/lib/mdb/kvm/amd64/ptm.so 555 root sys
 f none usr/lib/mdb/kvm/amd64/random.so 555 root sys
-f none usr/lib/mdb/kvm/amd64/rootnex.so 555 root sys
 f none usr/lib/mdb/kvm/amd64/s1394.so 555 root sys
 f none usr/lib/mdb/kvm/amd64/sata.so 555 root sys
 f none usr/lib/mdb/kvm/amd64/scsi_vhci.so 555 root sys
@@ -118,7 +117,6 @@
 f none usr/lib/mdb/kvm/nfs.so 555 root sys
 f none usr/lib/mdb/kvm/ptm.so 555 root sys
 f none usr/lib/mdb/kvm/random.so 555 root sys
-f none usr/lib/mdb/kvm/rootnex.so 555 root sys
 f none usr/lib/mdb/kvm/s1394.so 555 root sys
 f none usr/lib/mdb/kvm/sata.so 555 root sys
 f none usr/lib/mdb/kvm/scsi_vhci.so 555 root sys
--- a/usr/src/pkgdefs/SUNWmdbr/prototype_i386	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/pkgdefs/SUNWmdbr/prototype_i386	Sat Jan 30 18:23:16 2010 -0800
@@ -19,7 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
@@ -50,7 +50,6 @@
 f none kernel/kmdb/amd64/nfs 555 root sys
 f none kernel/kmdb/amd64/ptm 555 root sys
 f none kernel/kmdb/amd64/random 555 root sys
-f none kernel/kmdb/amd64/rootnex 555 root sys
 f none kernel/kmdb/amd64/s1394 555 root sys
 f none kernel/kmdb/amd64/sata 555 root sys
 f none kernel/kmdb/amd64/scsi_vhci 555 root sys
@@ -86,7 +85,6 @@
 f none kernel/kmdb/nfs 555 root sys
 f none kernel/kmdb/ptm 555 root sys
 f none kernel/kmdb/random 555 root sys
-f none kernel/kmdb/rootnex 555 root sys
 f none kernel/kmdb/s1394 555 root sys
 f none kernel/kmdb/sata 555 root sys
 f none kernel/kmdb/scsi_vhci 555 root sys
--- a/usr/src/uts/common/os/devcfg.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/common/os/devcfg.c	Sat Jan 30 18:23:16 2010 -0800
@@ -57,12 +57,12 @@
 #include <sys/sunldi_impl.h>
 #include <sys/bootprops.h>
 
-
-#if defined(__i386) || defined(__amd64)
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 #include <sys/iommulib.h>
 #endif
-#endif
+
+/* XXX remove before putback */
+boolean_t ddi_err_panic = B_TRUE;
 
 #ifdef DEBUG
 int ddidebug = DDI_AUDIT;
@@ -399,10 +399,6 @@
 {
 	struct dev_info *devi = DEVI(dip);
 	struct devi_nodeid *elem;
-#if defined(__x86) && !defined(__xpv)
-	gfx_entry_t *gfxp;
-	extern void *gfx_devinfo_list;
-#endif
 
 	ASSERT(devi->devi_ref == 0);
 	ASSERT(devi->devi_addr == NULL);
@@ -410,16 +406,6 @@
 	ASSERT(devi->devi_child == NULL);
 	ASSERT(devi->devi_hp_hdlp == NULL);
 
-#if defined(__x86) && !defined(__xpv)
-	for (gfxp = gfx_devinfo_list; gfxp; gfxp = gfxp->g_next) {
-		if (gfxp->g_dip == dip) {
-			gfxp->g_dip = NULL;
-			while (gfxp->g_ref)
-				;
-		}
-	}
-	membar_producer();
-#endif
 	/* free devi_addr_buf allocated by ddi_set_name_addr() */
 	if (devi->devi_addr_buf)
 		kmem_free(devi->devi_addr_buf, 2 * MAXNAMELEN);
@@ -1348,14 +1334,12 @@
 	DEVI_CLR_NEED_RESET(dip);
 	mutex_exit(&(DEVI(dip)->devi_lock));
 
-#if defined(__i386) || defined(__amd64)
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	/*
 	 * Close any iommulib mediated linkage to an IOMMU
 	 */
 	iommulib_nex_close(dip);
 #endif
-#endif
 
 	/* destroy the taskq */
 	if (DEVI(dip)->devi_taskq) {
@@ -8565,3 +8549,110 @@
 	if (MDI_PHCI(dip))
 		mdi_phci_retire_finalize(dip, phci_only);
 }
+
+void
+ddi_err(ddi_err_t ade, dev_info_t *rdip, const char *fmt, ...)
+{
+	va_list ap;
+	char strbuf[256];
+	char *buf;
+	size_t buflen, tlen;
+	int ce;
+	int de;
+	const char *fmtbad = "Invalid arguments to ddi_err()";
+
+	de = DER_CONT;
+	strbuf[1] = '\0';
+
+	switch (ade) {
+	case DER_CONS:
+		strbuf[0] = '^';
+		break;
+	case DER_LOG:
+		strbuf[0] = '!';
+		break;
+	case DER_VERB:
+		strbuf[0] = '?';
+		break;
+	default:
+		strbuf[0] = '\0';
+		de = ade;
+		break;
+	}
+
+	tlen = strlen(strbuf);
+	buf = strbuf + tlen;
+	buflen = sizeof (strbuf) - tlen;
+
+	if (rdip && ddi_get_instance(rdip) == -1) {
+		(void) snprintf(buf, buflen, "%s: ",
+		    ddi_driver_name(rdip));
+	} else if (rdip) {
+		(void) snprintf(buf, buflen, "%s%d: ",
+		    ddi_driver_name(rdip), ddi_get_instance(rdip));
+	}
+
+	tlen = strlen(strbuf);
+	buf = strbuf + tlen;
+	buflen = sizeof (strbuf) - tlen;
+
+	va_start(ap, fmt);
+	switch (de) {
+	case DER_CONT:
+		(void) vsnprintf(buf, buflen, fmt, ap);
+		if (ade != DER_CONT) {
+			(void) strlcat(strbuf, "\n", sizeof (strbuf));
+		}
+		ce = CE_CONT;
+		break;
+	case DER_NOTE:
+		(void) vsnprintf(buf, buflen, fmt, ap);
+		ce = CE_NOTE;
+		break;
+	case DER_WARN:
+		(void) vsnprintf(buf, buflen, fmt, ap);
+		ce = CE_WARN;
+		break;
+	case DER_MODE:
+		(void) vsnprintf(buf, buflen, fmt, ap);
+		if (ddi_err_panic == B_TRUE) {
+			ce = CE_PANIC;
+		} else {
+			ce = CE_WARN;
+		}
+		break;
+	case DER_DEBUG:
+		(void) snprintf(buf, buflen, "DEBUG: ");
+		tlen = strlen("DEBUG: ");
+		(void) vsnprintf(buf + tlen, buflen - tlen, fmt, ap);
+		ce = CE_CONT;
+		break;
+	case DER_PANIC:
+		(void) vsnprintf(buf, buflen, fmt, ap);
+		ce = CE_PANIC;
+		break;
+	case DER_INVALID:
+	default:
+		(void) snprintf(buf, buflen, fmtbad);
+		tlen = strlen(fmtbad);
+		(void) vsnprintf(buf + tlen, buflen - tlen, fmt, ap);
+		ce = CE_PANIC;
+		break;
+	}
+	va_end(ap);
+
+	cmn_err(ce, strbuf);
+}
+
+/*ARGSUSED*/
+void
+ddi_mem_update(uint64_t addr, uint64_t size)
+{
+#if defined(__x86) && !defined(__xpv)
+	extern void immu_physmem_update(uint64_t addr, uint64_t size);
+	immu_physmem_update(addr, size);
+#else
+	/*LINTED*/
+	;
+#endif
+}
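
For illustration, a minimal sketch of how a leaf driver might call the new ddi_err() interface added above; this is not part of the changeset, and the driver prefix "xx" and the register-count check are hypothetical:

#include <sys/ddi.h>
#include <sys/sunddi.h>

/* hypothetical attach-time sanity check in a driver "xx" */
static int
xx_check_regs(dev_info_t *dip, int nregs)
{
	if (nregs < 1) {
		/* warning prefixed with the driver name and instance */
		ddi_err(DER_WARN, dip, "no register sets found (nregs=%d)",
		    nregs);
		return (DDI_FAILURE);
	}

	/* verbose-only message, routed to cmn_err() with the '?' prefix */
	ddi_err(DER_VERB, dip, "found %d register sets", nregs);
	return (DDI_SUCCESS);
}
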
--- a/usr/src/uts/common/os/mem_config.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/common/os/mem_config.c	Sat Jan 30 18:23:16 2010 -0800
@@ -562,6 +562,12 @@
 	if (nlgrps == 1)
 		lgrp_config(LGRP_CONFIG_GEN_UPDATE, 0, 0);
 
+	/*
+	 * Inform DDI of update
+	 */
+	ddi_mem_update((uint64_t)(pt_base) << PAGESHIFT,
+	    (uint64_t)(tpgs) << PAGESHIFT);
+
 	delspan_unreserve(pt_base, tpgs);
 	return (KPHYSM_OK);		/* Successfully added system memory */
 
--- a/usr/src/uts/common/sys/ddi_impldefs.h	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/common/sys/ddi_impldefs.h	Sat Jan 30 18:23:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -257,8 +257,8 @@
 	struct i_ddi_prop_dyn	*devi_prop_dyn_driver;	/* prop_op */
 	struct i_ddi_prop_dyn	*devi_prop_dyn_parent;	/* bus_prop_op */
 
-	/* For intel iommu support */
-	void		*devi_iommu_private;
+	/* For x86 (Intel and AMD) IOMMU support */
+	void		*devi_iommu;
 
 	/* IOMMU handle */
 	iommulib_handle_t	devi_iommulib_handle;
@@ -596,12 +596,16 @@
 #define	DEVI_RETIRING		0x00000200 /* being evaluated for retire */
 #define	DEVI_R_CONSTRAINT	0x00000400 /* constraints have been applied  */
 #define	DEVI_R_BLOCKED		0x00000800 /* constraints block retire  */
-#define	DEVI_CT_NOP		0x00001000 /*  NOP contract event occurred */
+#define	DEVI_CT_NOP		0x00001000 /* NOP contract event occurred */
+#define	DEVI_PCI_DEVICE		0x00002000 /* dip is PCI */
 
 #define	DEVI_BUSY_CHANGING(dip)	(DEVI(dip)->devi_flags & DEVI_BUSY)
 #define	DEVI_BUSY_OWNED(dip)	(DEVI_BUSY_CHANGING(dip) &&	\
 	((DEVI(dip))->devi_busy_thread == curthread))
 
+#define	DEVI_IS_PCI(dip)	(DEVI(dip)->devi_flags & DEVI_PCI_DEVICE)
+#define	DEVI_SET_PCI(dip)	(DEVI(dip)->devi_flags |= (DEVI_PCI_DEVICE))
+
 char	*i_ddi_devi_class(dev_info_t *);
 int	i_ddi_set_devi_class(dev_info_t *, char *, int);
 
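
A small, hypothetical sketch of how the new flag is intended to be used: the changeset presumably marks nodes with DEVI_SET_PCI() when PCI children are created, so IOMMU code can later test a node cheaply. The helper name below is made up for illustration:

#include <sys/ddi_impldefs.h>

/*
 * hypothetical helper: only devinfo nodes flagged as PCI (via
 * DEVI_SET_PCI() at node-creation time) are considered for DVMA setup
 */
static boolean_t
immu_is_pci_candidate(dev_info_t *rdip)
{
	return (DEVI_IS_PCI(rdip) ? B_TRUE : B_FALSE);
}
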
--- a/usr/src/uts/common/sys/ddidmareq.h	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/common/sys/ddidmareq.h	Sat Jan 30 18:23:16 2010 -0800
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_DDIDMAREQ_H
 #define	_SYS_DDIDMAREQ_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -639,6 +637,13 @@
  */
 #define	DDI_DMA_INUSE		-9
 
+
+/*
+ * DVMA disabled or not supported; use physical DMA
+ */
+#define	DDI_DMA_USE_PHYSICAL		-10
+
+
 /*
  * In order for the access to a memory object to be consistent
  * between a device and a CPU, the function ddi_dma_sync(9F)
--- a/usr/src/uts/common/sys/sunddi.h	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/common/sys/sunddi.h	Sat Jan 30 18:23:16 2010 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -398,6 +398,26 @@
 #define	DDI_MODEL_NATIVE	DATAMODEL_NATIVE
 #define	DDI_MODEL_NONE		DATAMODEL_NONE
 
+/*
+ * Defines for ddi_err().
+ */
+typedef enum ddi_err {
+	DER_INVALID = 0,	/* must be 0 */
+	DER_CONT = 1,
+	DER_CONS,
+	DER_LOG,
+	DER_VERB,
+	DER_NOTE,
+	DER_WARN,
+	DER_PANIC,
+	DER_MODE,
+	DER_DEBUG
+} ddi_err_t;
+
+/* if set to B_TRUE, DER_MODE is equivalent to DER_PANIC */
+extern boolean_t ddi_err_panic;
+extern void ddi_err(ddi_err_t de, dev_info_t *rdip, const char *fmt, ...);
+
 extern char *ddi_strdup(const char *str, int flag);
 extern char *strdup(const char *str);
 extern void strfree(char *str);
@@ -2225,6 +2245,9 @@
 	    ddi_cb_handle_t *ret_hdlp);
 int	ddi_cb_unregister(ddi_cb_handle_t hdl);
 
+/* Notify DDI of memory added */
+void ddi_mem_update(uint64_t addr, uint64_t size);
+
 #endif	/* _KERNEL */
 
 #ifdef	__cplusplus
--- a/usr/src/uts/i86pc/Makefile.files	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/Makefile.files	Sat Jan 30 18:23:16 2010 -0800
@@ -20,7 +20,7 @@
 #
 
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #	This Makefile defines file modules in the directory uts/i86pc
@@ -204,7 +204,9 @@
 	acpidev_resource.o \
 	acpidev_util.o
 
-ROOTNEX_OBJS += rootnex.o iommu_rscs.o dmar_acpi.o intel_iommu.o
+ROOTNEX_OBJS += rootnex.o immu.o immu_dmar.o immu_dvma.o \
+		immu_intrmap.o immu_qinv.o immu_regs.o
+
 TZMON_OBJS	+= tzmon.o
 UPPC_OBJS += uppc.o psm_common.o
 XSVC_OBJS += xsvc.o
--- a/usr/src/uts/i86pc/io/dmar_acpi.c	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,829 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Portions Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2009, Intel Corporation.
- * All rights reserved.
- */
-
-
-#include <sys/debug.h>
-#include <sys/sysmacros.h>
-#include <sys/types.h>
-#include <sys/kmem.h>
-#include <sys/sunddi.h>
-#include <sys/list.h>
-#include <sys/pci.h>
-#include <sys/pci_cfgspace.h>
-#include <sys/pci_impl.h>
-#include <sys/sunndi.h>
-#include <sys/ksynch.h>
-#include <sys/cmn_err.h>
-#include <sys/bootconf.h>
-#include <sys/int_fmtio.h>
-#include <sys/dmar_acpi.h>
-#include <sys/smbios.h>
-#include <sys/iommulib.h>
-
-/*
- * the following pci config-space access function pointer
- * is defined in pci_cfgspace.h
- */
-#define	pci_getb	(*pci_getb_func)
-
-/*
- * define for debug
- */
-int intel_dmar_acpi_debug = 0;
-#define	dcmn_err	if (intel_dmar_acpi_debug) cmn_err
-
-/*
- * define for printing blacklist ID
- */
-int intel_iommu_blacklist_id;
-
-/*
- * global variables
- */
-boolean_t intel_iommu_support;
-intel_dmar_info_t *dmar_info;
-
-/*
- * global variables to save source id and drhd info for ioapic
- * to support interrupt remapping
- */
-list_t	ioapic_drhd_infos;
-
-/*
- * internal variables
- */
-static void *dmart;
-
-/*
- * helper functions to release the allocated resources
- * when failed
- */
-static void
-release_dev_scope(list_t *lp)
-{
-	pci_dev_scope_t *devs;
-
-	if (list_is_empty(lp))
-		return;
-
-	while ((devs = list_head(lp)) != NULL) {
-		list_remove(lp, devs);
-		kmem_free(devs, sizeof (pci_dev_scope_t));
-	}
-}
-
-static void
-release_drhd_info(void)
-{
-	drhd_info_t *drhd;
-	list_t *lp;
-	int i;
-
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		lp = &dmar_info->dmari_drhd[i];
-		if (list_is_empty(lp))
-			break;
-
-		while ((drhd = list_head(lp)) != NULL) {
-			list_remove(lp, drhd);
-
-			/*
-			 * release the device scope
-			 */
-			release_dev_scope(&drhd->di_dev_list);
-			list_destroy(&drhd->di_dev_list);
-			kmem_free(drhd, sizeof (drhd_info_t));
-		}
-	}
-}
-
-static void
-release_rmrr_info(void)
-{
-	rmrr_info_t *rmrr;
-	list_t *lp;
-	int i;
-
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		lp = &dmar_info->dmari_rmrr[i];
-		if (list_is_empty(lp))
-			break;
-
-		while ((rmrr = list_head(lp)) != NULL) {
-			list_remove(lp, rmrr);
-			release_dev_scope(&rmrr->ri_dev_list);
-			list_destroy(&rmrr->ri_dev_list);
-			kmem_free(rmrr, sizeof (rmrr_info_t));
-		}
-	}
-}
-
-/*
- * intel_iommu_release_dmar_info()
- *   global function, which is called to release dmar_info
- *   when the dmar info is not needed any more.
- */
-void
-intel_iommu_release_dmar_info(void)
-{
-	int i;
-
-	intel_iommu_support = B_FALSE;
-	release_drhd_info();
-	release_rmrr_info();
-
-	/*
-	 * destroy the drhd and rmrr list
-	 */
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		list_destroy(&dmar_info->dmari_drhd[i]);
-		list_destroy(&dmar_info->dmari_rmrr[i]);
-	}
-
-	kmem_free(dmar_info, sizeof (intel_dmar_info_t));
-}
-
-/*
- * create_dmar_devi()
- *
- *   create the dev_info node in the device tree,
- *   the info node is a nexus child of the root
- *   nexus
- */
-static void
-create_dmar_devi(void)
-{
-	dev_info_t *dip;
-	drhd_info_t *drhd;
-	struct regspec reg;
-	struct ddi_parent_private_data *pdptr;
-	char nodename[64];
-	int i, j;
-
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-
-		/*
-		 * ignore the empty list
-		 */
-		if (list_is_empty(&dmar_info->dmari_drhd[i]))
-			break;
-
-		/*
-		 * alloc dev_info per drhd unit
-		 */
-		j = 0;
-		for_each_in_list(&dmar_info->dmari_drhd[i], drhd) {
-			(void) snprintf(nodename, sizeof (nodename),
-			    "dmar%d,%d", drhd->di_segment, j++);
-			ndi_devi_alloc_sleep(ddi_root_node(), nodename,
-			    DEVI_SID_NODEID, &dip);
-			drhd->di_dip = dip;
-			reg.regspec_bustype = 0;
-			reg.regspec_addr = drhd->di_reg_base;
-			reg.regspec_size = IOMMU_REG_SIZE;
-
-			/*
-			 * update the reg properties
-			 *
-			 *   reg property will be used for register
-			 *   set access
-			 *
-			 * refer to the bus_map of root nexus driver
-			 * I/O or memory mapping:
-			 *
-			 * <bustype=0, addr=x, len=x>: memory
-			 * <bustype=1, addr=x, len=x>: i/o
-			 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
-			 */
-			(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
-			    dip, "reg", (int *)&reg,
-			    sizeof (struct regspec) / sizeof (int));
-
-			pdptr = (struct ddi_parent_private_data *)
-			    kmem_zalloc(sizeof (struct ddi_parent_private_data)
-			    + sizeof (struct regspec), KM_SLEEP);
-			pdptr->par_nreg = 1;
-			pdptr->par_reg = (struct regspec *)(pdptr + 1);
-			pdptr->par_reg->regspec_bustype = 0;
-			pdptr->par_reg->regspec_addr = drhd->di_reg_base;
-			pdptr->par_reg->regspec_size = IOMMU_REG_SIZE;
-			ddi_set_parent_data(dip, pdptr);
-		}
-	}
-}
-
-/*
- * parse_dmar_dev_scope()
- *   parse the device scope attached to drhd or rmrr
- */
-static int
-parse_dmar_dev_scope(dmar_acpi_dev_scope_t *scope, pci_dev_scope_t **devs)
-{
-	int depth;
-	int bus, dev, func;
-	pci_dev_scope_t *entry;
-
-	struct path_to_dev {
-		uint8_t device;
-		uint8_t function;
-	} *path;
-
-	path = (struct path_to_dev *)(scope + 1);
-	depth = (scope->ds_length - 6)/2;
-	bus = scope->ds_sbusnum;
-	dev = path->device;
-	func = path->function;
-
-	while (--depth) {
-		path++;
-		bus = pci_getb(bus, dev, func, PCI_BCNF_SECBUS);
-		dev = path->device;
-		func = path->function;
-	}
-
-	entry = (pci_dev_scope_t *)kmem_zalloc(
-	    sizeof (pci_dev_scope_t), KM_SLEEP);
-	entry->pds_bus = bus;
-	entry->pds_dev = dev;
-	entry->pds_func = func;
-	entry->pds_type = scope->ds_type;
-
-	*devs = entry;
-	return (PARSE_DMAR_SUCCESS);
-}
-
-/*
- * parse_dmar_rmrr()
- *   parse the rmrr units in dmar table
- */
-static int
-parse_dmar_rmrr(dmar_acpi_unit_head_t *head)
-{
-	dmar_acpi_rmrr_t *rmrr;
-	rmrr_info_t *rinfo;
-	dmar_acpi_dev_scope_t *scope;
-	pci_dev_scope_t *devs;
-
-	rmrr = (dmar_acpi_rmrr_t *)head;
-	ASSERT(head->uh_type == DMAR_UNIT_TYPE_RMRR);
-	ASSERT(rmrr->rm_segment <= DMAR_MAX_SEGMENT);
-
-	/*
-	 * for each rmrr, limiaddr must be > baseaddr
-	 */
-	if (rmrr->rm_baseaddr >= rmrr->rm_limiaddr) {
-		cmn_err(CE_NOTE, "Invalid BIOS RMRR: Disabling Intel IOMMU");
-		cmn_err(CE_WARN, "!invalid rmrr,"
-		    " baseaddr = 0x%" PRIx64
-		    ", limiaddr = 0x%" PRIx64 "",
-		    rmrr->rm_baseaddr, rmrr->rm_limiaddr);
-		return (PARSE_DMAR_FAIL);
-	}
-
-	/*
-	 * allocate and setup the device info structure
-	 */
-	rinfo = (rmrr_info_t *)kmem_zalloc(sizeof (rmrr_info_t),
-	    KM_SLEEP);
-	rinfo->ri_segment = rmrr->rm_segment;
-	rinfo->ri_baseaddr = rmrr->rm_baseaddr;
-	rinfo->ri_limiaddr = rmrr->rm_limiaddr;
-	list_create(&rinfo->ri_dev_list, sizeof (pci_dev_scope_t),
-	    offsetof(pci_dev_scope_t, node));
-
-	/*
-	 * parse the device scope
-	 */
-	scope = (dmar_acpi_dev_scope_t *)(rmrr + 1);
-	while ((unsigned long)scope < ((unsigned long)rmrr + head->uh_length)) {
-		if (parse_dmar_dev_scope(scope, &devs)
-		    != PARSE_DMAR_SUCCESS) {
-			return (PARSE_DMAR_FAIL);
-		}
-
-		list_insert_tail(&rinfo->ri_dev_list, devs);
-		scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope
-		    + scope->ds_length);
-	}
-
-	/*
-	 * save this info structure
-	 */
-	list_insert_tail(&dmar_info->dmari_rmrr[rinfo->ri_segment], rinfo);
-	return (PARSE_DMAR_SUCCESS);
-}
-
-/*
- * parse_dmar_drhd()
- *   parse the drhd uints in dmar table
- */
-static int
-parse_dmar_drhd(dmar_acpi_unit_head_t *head)
-{
-	dmar_acpi_drhd_t *drhd;
-	drhd_info_t *dinfo;
-	dmar_acpi_dev_scope_t *scope;
-	list_t *lp;
-	pci_dev_scope_t *devs;
-	ioapic_drhd_info_t	*ioapic_dinfo;
-
-	drhd = (dmar_acpi_drhd_t *)head;
-	ASSERT(head->uh_type == DMAR_UNIT_TYPE_DRHD);
-
-	/*
-	 * assert the segment boundary
-	 */
-	ASSERT(drhd->dr_segment <= DMAR_MAX_SEGMENT);
-
-	/*
-	 * allocate and setup the info structure
-	 */
-	dinfo = (drhd_info_t *)kmem_zalloc(sizeof (drhd_info_t), KM_SLEEP);
-	dinfo->di_segment = drhd->dr_segment;
-	dinfo->di_reg_base = drhd->dr_baseaddr;
-	dinfo->di_include_all = (drhd->dr_flags & INCLUDE_PCI_ALL) ?
-	    B_TRUE : B_FALSE;
-	list_create(&dinfo->di_dev_list, sizeof (pci_dev_scope_t),
-	    offsetof(pci_dev_scope_t, node));
-
-	/*
-	 * parse the device scope
-	 */
-	scope = (dmar_acpi_dev_scope_t *)(drhd + 1);
-	while ((unsigned long)scope < ((unsigned long)drhd +
-	    head->uh_length)) {
-
-		if (parse_dmar_dev_scope(scope, &devs)
-		    != PARSE_DMAR_SUCCESS) {
-			return (PARSE_DMAR_FAIL);
-		}
-		/* get ioapic source id for interrupt remapping */
-		if (devs->pds_type == DEV_SCOPE_IOAPIC) {
-			ioapic_dinfo = kmem_zalloc
-			    (sizeof (ioapic_drhd_info_t), KM_SLEEP);
-
-			ioapic_dinfo->ioapic_id = scope->ds_enumid;
-			ioapic_dinfo->sid =
-			    (devs->pds_bus << 8) |
-			    (devs->pds_dev << 3) |
-			    (devs->pds_func);
-			ioapic_dinfo->drhd = dinfo;
-			list_insert_tail(&ioapic_drhd_infos, ioapic_dinfo);
-		}
-
-		list_insert_tail(&dinfo->di_dev_list, devs);
-		scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope +
-		    scope->ds_length);
-	}
-
-	lp = &dmar_info->dmari_drhd[dinfo->di_segment];
-	list_insert_tail(lp, dinfo);
-	return (PARSE_DMAR_SUCCESS);
-}
-
-#define	OEMID_OFF	10
-#define	OEMID_LEN	6
-#define	OEM_TBLID_OFF	16
-#define	OEM_TBLID_LEN	8
-#define	OEMREV_OFF	24
-#define	OEMREV_LEN	4
-
-static int
-dmar_blacklisted(caddr_t dmart)
-{
-	char oemid[OEMID_LEN + 1] = {0};
-	char oem_tblid[OEM_TBLID_LEN + 1] = {0};
-	char oemrev[OEMREV_LEN + 1] = {0};
-	const char *mfgr = "?";
-	const char *product = "?";
-	const char *version = "?";
-	smbios_info_t smbios_info;
-	smbios_system_t smbios_sys;
-	id_t id;
-	char **blacklist;
-	int i;
-	uint_t n;
-
-	(void) strncpy(oemid, dmart + OEMID_OFF, OEMID_LEN);
-	(void) strncpy(oem_tblid, dmart + OEM_TBLID_OFF, OEM_TBLID_LEN);
-	(void) strncpy(oemrev, dmart + OEMREV_OFF, OEMREV_LEN);
-
-	iommulib_smbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags,
-	    NULL);
-	if (iommulib_smbios &&
-	    (id = smbios_info_system(iommulib_smbios, &smbios_sys))
-	    != SMB_ERR &&
-	    smbios_info_common(iommulib_smbios, id, &smbios_info)
-	    != SMB_ERR) {
-		mfgr = smbios_info.smbi_manufacturer;
-		product = smbios_info.smbi_product;
-		version = smbios_info.smbi_version;
-	}
-
-	if (intel_iommu_blacklist_id) {
-		cmn_err(CE_NOTE, "SMBIOS ID:");
-		cmn_err(CE_NOTE, "Manufacturer = <%s>", mfgr);
-		cmn_err(CE_NOTE, "Product = <%s>", product);
-		cmn_err(CE_NOTE, "Version = <%s>", version);
-		cmn_err(CE_NOTE, "DMAR ID:");
-		cmn_err(CE_NOTE, "oemid = <%s>", oemid);
-		cmn_err(CE_NOTE, "oemtblid = <%s>", oem_tblid);
-		cmn_err(CE_NOTE, "oemrev = <%s>", oemrev);
-	}
-
-	/*
-	 * Fake up a dev_t since searching global prop list needs it
-	 */
-	if (ddi_prop_lookup_string_array(
-	    makedevice(ddi_name_to_major("rootnex"), 0), ddi_root_node(),
-	    DDI_PROP_DONTPASS | DDI_PROP_ROOTNEX_GLOBAL,
-	    "intel-iommu-blacklist", &blacklist, &n) != DDI_PROP_SUCCESS) {
-		/* No blacklist */
-		return (0);
-	}
-
-	if (n < 4 || n % 4 != 0) {
-		cmn_err(CE_WARN,
-		    "invalid Intel IOMMU blacklist: not a multiple of four");
-		ddi_prop_free(blacklist);
-		return (0);
-	}
-
-	for (i = 0; i < n; i += 4) {
-		if (strcmp(blacklist[i], "SMBIOS") == 0 &&
-		    strcmp(blacklist[i+1], mfgr) == 0 &&
-		    (blacklist[i+2][0] == '\0' ||
-		    strcmp(blacklist[i+2], product) == 0) &&
-		    (blacklist[i+3][0] == '\0' ||
-		    strcmp(blacklist[i+3], version) == 0)) {
-			ddi_prop_free(blacklist);
-			return (1);
-		}
-		if (strcmp(blacklist[i], "DMAR") == 0 &&
-		    strcmp(blacklist[i+1], oemid) == 0 &&
-		    (blacklist[i+2][0] == '\0' ||
-		    strcmp(blacklist[i+2], oem_tblid) == 0) &&
-		    (blacklist[i+3][0] == '\0' ||
-		    strcmp(blacklist[i+3], oemrev) == 0)) {
-			ddi_prop_free(blacklist);
-			return (1);
-		}
-	}
-
-	ddi_prop_free(blacklist);
-
-	return (0);
-}
-
-/*
- * parse_dmar()
- *   parse the dmar table
- */
-static int
-parse_dmar(void)
-{
-	dmar_acpi_head_t *dmar_head;
-	dmar_acpi_unit_head_t *unit_head;
-	drhd_info_t *drhd;
-	int i;
-
-	dmar_head = (dmar_acpi_head_t *)dmart;
-
-	/*
-	 * do a sanity check
-	 */
-	if (!dmar_head || strncmp(dmar_head->dh_sig, "DMAR", 4)) {
-		dcmn_err(CE_CONT, "wrong DMAR signature: %c%c%c%c",
-		    dmar_head->dh_sig[0], dmar_head->dh_sig[1],
-		    dmar_head->dh_sig[2], dmar_head->dh_sig[3]);
-		return (PARSE_DMAR_FAIL);
-	}
-
-	if (dmar_blacklisted(dmart)) {
-		cmn_err(CE_NOTE, "Intel IOMMU is blacklisted on this platform");
-		return (PARSE_DMAR_FAIL);
-	}
-
-	dmar_info->dmari_haw = dmar_head->dh_haw + 1;
-	dmar_info->dmari_intr_remap = dmar_head->dh_flags & 0x1 ?
-	    B_TRUE : B_FALSE;
-
-	/*
-	 * parse each unit
-	 *    only DRHD and RMRR are parsed, others are ignored
-	 */
-	unit_head = (dmar_acpi_unit_head_t *)(dmar_head + 1);
-	while ((unsigned long)unit_head < (unsigned long)dmar_head +
-	    dmar_head->dh_len) {
-		switch (unit_head->uh_type) {
-		case DMAR_UNIT_TYPE_DRHD:
-			if (parse_dmar_drhd(unit_head) !=
-			    PARSE_DMAR_SUCCESS) {
-
-				/*
-				 * iommu_detect_parse() will release
-				 * all drhd info structure, just
-				 * return false here
-				 */
-				return (PARSE_DMAR_FAIL);
-			}
-			break;
-		case DMAR_UNIT_TYPE_RMRR:
-			if (parse_dmar_rmrr(unit_head) !=
-			    PARSE_DMAR_SUCCESS)
-				return (PARSE_DMAR_FAIL);
-			break;
-		default:
-			cmn_err(CE_CONT, "!DMAR ACPI table: "
-			    "unit type %d ignored\n", unit_head->uh_type);
-		}
-		unit_head = (dmar_acpi_unit_head_t *)
-		    ((unsigned long)unit_head +
-		    unit_head->uh_length);
-	}
-
-#ifdef	DEBUG
-	/*
-	 * make sure the include_all drhd is the
-	 * last drhd in the list, this is only for
-	 * debug
-	 */
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		if (list_is_empty(&dmar_info->dmari_drhd[i]))
-			break;
-
-		for_each_in_list(&dmar_info->dmari_drhd[i], drhd) {
-			if (drhd->di_include_all &&
-			    list_next(&dmar_info->dmari_drhd[i], drhd)
-			    != NULL) {
-				list_remove(&dmar_info->dmari_drhd[i], drhd);
-				list_insert_tail(&dmar_info->dmari_drhd[i],
-				    drhd);
-				dcmn_err(CE_CONT,
-				    "include_all drhd is adjusted\n");
-			}
-		}
-	}
-#endif
-
-	return (PARSE_DMAR_SUCCESS);
-}
-
-/*
- * detect_dmar()
- *   detect the dmar acpi table
- */
-static boolean_t
-detect_dmar(void)
-{
-	int len;
-	char *intel_iommu;
-	char *enable;
-
-	/*
-	 * if "intel-iommu = no" boot property is set,
-	 * ignore intel iommu
-	 */
-	if ((len = do_bsys_getproplen(NULL, "intel-iommu")) > 0) {
-		intel_iommu = kmem_alloc(len, KM_SLEEP);
-		(void) do_bsys_getprop(NULL, "intel-iommu", intel_iommu);
-		if (strcmp(intel_iommu, "no") == 0) {
-			dcmn_err(CE_CONT, "\"intel-iommu=no\" was set\n");
-			kmem_free(intel_iommu, len);
-			return (B_FALSE);
-		}
-		kmem_free(intel_iommu, len);
-	}
-
-	/*
-	 * Check rootnex.conf for enable/disable IOMMU
-	 * Fake up a dev_t since searching global prop list needs it
-	 */
-	if (ddi_prop_lookup_string(
-	    makedevice(ddi_name_to_major("rootnex"), 0), ddi_root_node(),
-	    DDI_PROP_DONTPASS | DDI_PROP_ROOTNEX_GLOBAL,
-	    "intel-iommu", &enable) == DDI_PROP_SUCCESS) {
-		if (strcmp(enable, "false") == 0 || strcmp(enable, "no") == 0) {
-			dcmn_err(CE_CONT,
-			    "\"intel-iommu=no\" set in rootnex.conf\n");
-			ddi_prop_free(enable);
-			return (B_FALSE);
-		}
-		ddi_prop_free(enable);
-	}
-
-	/*
-	 * get dmar-table from system properties
-	 */
-	if ((len = do_bsys_getproplen(NULL, DMAR_TABLE_PROPNAME)) <= 0) {
-		dcmn_err(CE_CONT, "dmar-table getprop failed\n");
-		return (B_FALSE);
-	}
-	dcmn_err(CE_CONT, "dmar-table length = %d\n", len);
-	dmart = kmem_alloc(len, KM_SLEEP);
-	(void) do_bsys_getprop(NULL, DMAR_TABLE_PROPNAME, dmart);
-
-	return (B_TRUE);
-}
-
-/*
- * print dmar_info for debug
- */
-static void
-print_dmar_info(void)
-{
-	drhd_info_t *drhd;
-	rmrr_info_t *rmrr;
-	pci_dev_scope_t *dev;
-	int i;
-
-	/* print the title */
-	cmn_err(CE_CONT, "dmar_info->:\n");
-	cmn_err(CE_CONT, "\thaw = %d\n", dmar_info->dmari_haw);
-	cmn_err(CE_CONT, "\tintr_remap = %d\n",
-	    dmar_info->dmari_intr_remap ? 1 : 0);
-
-	/* print drhd info list */
-	cmn_err(CE_CONT, "\ndrhd list:\n");
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		if (list_is_empty(&dmar_info->dmari_drhd[i]))
-			break;
-		for (drhd = list_head(&dmar_info->dmari_drhd[i]);
-		    drhd != NULL; drhd = list_next(&dmar_info->dmari_drhd[i],
-		    drhd)) {
-			cmn_err(CE_CONT, "\n\tsegment = %d\n",
-			    drhd->di_segment);
-			cmn_err(CE_CONT, "\treg_base = 0x%" PRIx64 "\n",
-			    drhd->di_reg_base);
-			cmn_err(CE_CONT, "\tinclude_all = %s\n",
-			    drhd->di_include_all ? "yes" : "no");
-			cmn_err(CE_CONT, "\tdip = 0x%p\n",
-			    (void *)drhd->di_dip);
-			cmn_err(CE_CONT, "\tdevice list:\n");
-			for (dev = list_head(&drhd->di_dev_list);
-			    dev != NULL; dev = list_next(&drhd->di_dev_list,
-			    dev)) {
-				cmn_err(CE_CONT, "\n\t\tbus = %d\n",
-				    dev->pds_bus);
-				cmn_err(CE_CONT, "\t\tdev = %d\n",
-				    dev->pds_dev);
-				cmn_err(CE_CONT, "\t\tfunc = %d\n",
-				    dev->pds_func);
-				cmn_err(CE_CONT, "\t\ttype = %d\n",
-				    dev->pds_type);
-			}
-		}
-	}
-
-	/* print rmrr info list */
-	cmn_err(CE_CONT, "\nrmrr list:\n");
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		if (list_is_empty(&dmar_info->dmari_rmrr[i]))
-			break;
-		for (rmrr = list_head(&dmar_info->dmari_rmrr[i]);
-		    rmrr != NULL; rmrr = list_next(&dmar_info->dmari_rmrr[i],
-		    rmrr)) {
-			cmn_err(CE_CONT, "\n\tsegment = %d\n",
-			    rmrr->ri_segment);
-			cmn_err(CE_CONT, "\tbaseaddr = 0x%" PRIx64 "\n",
-			    rmrr->ri_baseaddr);
-			cmn_err(CE_CONT, "\tlimiaddr = 0x%" PRIx64 "\n",
-			    rmrr->ri_limiaddr);
-			cmn_err(CE_CONT, "\tdevice list:\n");
-			for (dev = list_head(&rmrr->ri_dev_list);
-			    dev != NULL;
-			    dev = list_next(&rmrr->ri_dev_list, dev)) {
-				cmn_err(CE_CONT, "\n\t\tbus = %d\n",
-				    dev->pds_bus);
-				cmn_err(CE_CONT, "\t\tdev = %d\n",
-				    dev->pds_dev);
-				cmn_err(CE_CONT, "\t\tfunc = %d\n",
-				    dev->pds_func);
-				cmn_err(CE_CONT, "\t\ttype = %d\n",
-				    dev->pds_type);
-			}
-		}
-	}
-}
-
-/*
- * intel_iommu_probe_and_parse()
- *   called from rootnex driver
- */
-void
-intel_iommu_probe_and_parse(void)
-{
-	int i, len;
-	char *opt;
-
-	dmar_info = NULL;
-
-	/*
-	 * retrieve the print-dmar-acpi boot option
-	 */
-	if ((len = do_bsys_getproplen(NULL, "print-dmar-acpi")) > 0) {
-		opt = kmem_alloc(len, KM_SLEEP);
-		(void) do_bsys_getprop(NULL, "print-dmar-acpi", opt);
-		if (strcmp(opt, "yes") == 0 ||
-		    strcmp(opt, "true") == 0) {
-			intel_dmar_acpi_debug = 1;
-			cmn_err(CE_CONT, "\"print-dmar-acpi=true\" was set\n");
-		} else if (strcmp(opt, "no") == 0 ||
-		    strcmp(opt, "false") == 0) {
-			intel_dmar_acpi_debug = 0;
-			cmn_err(CE_CONT, "\"print-dmar-acpi=false\" was set\n");
-		}
-		kmem_free(opt, len);
-	}
-
-	/*
-	 * retrieve the print-iommu-blacklist-id boot option
-	 */
-	if ((len = do_bsys_getproplen(NULL, "print-iommu-blacklist-id")) > 0) {
-		opt = kmem_alloc(len, KM_SLEEP);
-		(void) do_bsys_getprop(NULL, "print-iommu-blacklist-id", opt);
-		if (strcmp(opt, "yes") == 0 ||
-		    strcmp(opt, "true") == 0) {
-			intel_iommu_blacklist_id = 1;
-		} else if (strcmp(opt, "no") == 0 ||
-		    strcmp(opt, "false") == 0) {
-			intel_iommu_blacklist_id = 0;
-		}
-		kmem_free(opt, len);
-	}
-
-
-	dcmn_err(CE_CONT, "intel iommu detect start\n");
-
-	if (detect_dmar() == B_FALSE) {
-		dcmn_err(CE_CONT, "no intel iommu detected\n");
-		return;
-	}
-
-	/*
-	 * the platform has intel iommu, setup globals
-	 */
-	intel_iommu_support = B_TRUE;
-	dmar_info = kmem_zalloc(sizeof (intel_dmar_info_t),
-	    KM_SLEEP);
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		list_create(&(dmar_info->dmari_drhd[i]), sizeof (drhd_info_t),
-		    offsetof(drhd_info_t, node));
-		list_create(&(dmar_info->dmari_rmrr[i]), sizeof (rmrr_info_t),
-		    offsetof(rmrr_info_t, node));
-	}
-
-	/* create ioapic - drhd map info for interrupt remapping */
-	list_create(&ioapic_drhd_infos, sizeof (ioapic_drhd_info_t),
-	    offsetof(ioapic_drhd_info_t, node));
-
-	/*
-	 * parse dmar acpi table
-	 */
-	if (parse_dmar() != PARSE_DMAR_SUCCESS) {
-		intel_iommu_release_dmar_info();
-		dcmn_err(CE_CONT, "DMAR parse failed\n");
-		return;
-	}
-
-	/*
-	 * create dev_info structure per hrhd
-	 * and prepare it for binding driver
-	 */
-	create_dmar_devi();
-
-	/*
-	 * print the dmar info if the debug
-	 * is set
-	 */
-	if (intel_dmar_acpi_debug)
-		print_dmar_info();
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,1033 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+/*
+ * Intel IOMMU implementation
+ * This file contains Intel IOMMU code exported
+ * to the rest of the system and code that deals
+ * with the Intel IOMMU as a whole.
+ */
+
+#include <sys/conf.h>
+#include <sys/modctl.h>
+#include <sys/pci.h>
+#include <sys/pci_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/ddi.h>
+#include <sys/ddidmareq.h>
+#include <sys/ddi_impldefs.h>
+#include <sys/ddifm.h>
+#include <sys/sunndi.h>
+#include <sys/debug.h>
+#include <sys/fm/protocol.h>
+#include <sys/note.h>
+#include <sys/apic.h>
+#include <vm/hat_i86.h>
+#include <sys/smp_impldefs.h>
+#include <sys/spl.h>
+#include <sys/archsystm.h>
+#include <sys/x86_archext.h>
+#include <sys/rootnex.h>
+#include <sys/avl.h>
+#include <sys/bootconf.h>
+#include <sys/bootinfo.h>
+#include <sys/atomic.h>
+#include <sys/immu.h>
+
+/* ########################### Globals and tunables ######################## */
+/*
+ * Global switches (boolean) that can be toggled either via boot options
+ * or via /etc/system or kmdb
+ */
+
+/* Various features */
+boolean_t immu_enable = B_TRUE;
+boolean_t immu_dvma_enable = B_TRUE;
+
+/* accessed in other files so not static */
+boolean_t immu_gfxdvma_enable = B_TRUE;
+boolean_t immu_intrmap_enable = B_FALSE;
+boolean_t immu_qinv_enable = B_FALSE;
+
+/* various quirks that need working around */
+
+/* XXX We always map page 0 read/write for now */
+boolean_t immu_quirk_usbpage0 = B_TRUE;
+boolean_t immu_quirk_usbrmrr = B_TRUE;
+boolean_t immu_quirk_usbfullpa;
+boolean_t immu_quirk_mobile4;
+
+boolean_t immu_mmio_safe = B_TRUE;
+
+/* debug messages */
+boolean_t immu_dmar_print;
+
+/* ############  END OPTIONS section ################ */
+
+/*
+ * Global used internally by Intel IOMMU code
+ */
+dev_info_t *root_devinfo;
+kmutex_t immu_lock;
+list_t immu_list;
+boolean_t immu_setup;
+boolean_t immu_running;
+boolean_t immu_quiesced;
+
+/* ######################## END Globals and tunables ###################### */
+/* Globals used only in this file */
+static char **black_array;
+static uint_t nblacks;
+/* ###################### Utility routines ############################# */
+
+/*
+ * Check if the device has mobile 4 chipset
+ */
+static int
+check_mobile4(dev_info_t *dip, void *arg)
+{
+	_NOTE(ARGUNUSED(arg));
+	int vendor, device;
+	int *ip = (int *)arg;
+
+	ASSERT(arg);
+
+	vendor = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "vendor-id", -1);
+	device = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
+	    "device-id", -1);
+
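+	/*
+	 * 8086:2a40 is the PCI ID of the Mobile Intel 4 Series (GM45)
+	 * memory controller hub that this quirk targets.
+	 */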
+	if (vendor == 0x8086 && device == 0x2a40) {
+		*ip = B_TRUE;
+		ddi_err(DER_NOTE, dip, "IMMU: Mobile 4 chipset detected. "
+		    "Force setting IOMMU write buffer");
+		return (DDI_WALK_TERMINATE);
+	} else {
+		return (DDI_WALK_CONTINUE);
+	}
+}
+
+static void
+map_bios_rsvd_mem(dev_info_t *dip)
+{
+	struct memlist *mp;
+	int e;
+
+	memlist_read_lock();
+
+	mp = bios_rsvd;
+	while (mp != NULL) {
+		memrng_t mrng = {0};
+
+		ddi_err(DER_LOG, dip, "IMMU: Mapping BIOS rsvd range "
+		    "[0x%" PRIx64 " - 0x%"PRIx64 "]\n", mp->ml_address,
+		    mp->ml_address + mp->ml_size);
+
+		mrng.mrng_start = IMMU_ROUNDOWN(mp->ml_address);
+		mrng.mrng_npages = IMMU_ROUNDUP(mp->ml_size) / IMMU_PAGESIZE;
+
+		e = immu_dvma_map(NULL, NULL, &mrng, 0, dip, IMMU_FLAGS_MEMRNG);
+		ASSERT(e == DDI_DMA_MAPPED || e == DDI_DMA_USE_PHYSICAL);
+
+		mp = mp->ml_next;
+	}
+
+	memlist_read_unlock();
+}
+
+/*
+ * Check if the device is USB controller
+ */
+/*ARGSUSED*/
+static void
+check_usb(dev_info_t *dip, void *arg)
+{
+	const char *drv = ddi_driver_name(dip);
+
+	if (drv == NULL ||
+	    (strcmp(drv, "uhci") != 0 && strcmp(drv, "ohci") != 0 &&
+	    strcmp(drv, "ehci") != 0)) {
+		return;
+	}
+
+	/* This must come first since it does unity mapping */
+	if (immu_quirk_usbfullpa == B_TRUE) {
+		int e;
+		ddi_err(DER_NOTE, dip, "Applying USB FULL PA quirk");
+		e = immu_dvma_map(NULL, NULL, NULL, 0, dip, IMMU_FLAGS_UNITY);
+		/* for unity mode, map will return USE_PHYSICAL */
+		ASSERT(e == DDI_DMA_USE_PHYSICAL);
+	}
+
+	if (immu_quirk_usbrmrr == B_TRUE) {
+		ddi_err(DER_LOG, dip, "Applying USB RMRR quirk");
+		map_bios_rsvd_mem(dip);
+	}
+}
+
+/*
+ * Check if the device is a LPC device
+ */
+/*ARGSUSED*/
+static void
+check_lpc(dev_info_t *dip, void *arg)
+{
+	immu_devi_t *immu_devi;
+
+	immu_devi = immu_devi_get(dip);
+	ASSERT(immu_devi);
+	if (immu_devi->imd_lpc == B_TRUE) {
+		ddi_err(DER_LOG, dip, "IMMU: Found LPC device");
+		/* This will put the immu_devi on the LPC "specials" list */
+		(void) immu_dvma_get_immu(dip, IMMU_FLAGS_SLEEP);
+	}
+}
+
+/*
+ * Check if the device is a GFX device
+ */
+/*ARGSUSED*/
+static void
+check_gfx(dev_info_t *dip, void *arg)
+{
+	immu_devi_t *immu_devi;
+	int e;
+
+	immu_devi = immu_devi_get(dip);
+	ASSERT(immu_devi);
+	if (immu_devi->imd_display == B_TRUE) {
+		ddi_err(DER_LOG, dip, "IMMU: Found GFX device");
+		/* This will put the immu_devi on the GFX "specials" list */
+		(void) immu_dvma_get_immu(dip, IMMU_FLAGS_SLEEP);
+		e = immu_dvma_map(NULL, NULL, NULL, 0, dip, IMMU_FLAGS_UNITY);
+		/* for unity mode, map will return USE_PHYSICAL */
+		ASSERT(e == DDI_DMA_USE_PHYSICAL);
+	}
+}
+
+static void
+walk_tree(int (*f)(dev_info_t *, void *), void *arg)
+{
+	int count;
+
+	ndi_devi_enter(root_devinfo, &count);
+	ddi_walk_devs(ddi_get_child(root_devinfo), f, arg);
+	ndi_devi_exit(root_devinfo, count);
+}
+
+static int
+check_pre_setup_quirks(dev_info_t *dip, void *arg)
+{
+	/* just 1 check right now */
+	return (check_mobile4(dip, arg));
+}
+
+static int
+check_pre_startup_quirks(dev_info_t *dip, void *arg)
+{
+	if (immu_devi_set(dip, IMMU_FLAGS_SLEEP) != DDI_SUCCESS) {
+		ddi_err(DER_PANIC, dip, "Failed to get immu_devi");
+	}
+
+	check_gfx(dip, arg);
+
+	check_lpc(dip, arg);
+
+	check_usb(dip, arg);
+
+	return (DDI_WALK_CONTINUE);
+}
+
+static void
+pre_setup_quirks(void)
+{
+	walk_tree(check_pre_setup_quirks, &immu_quirk_mobile4);
+}
+
+static void
+pre_startup_quirks(void)
+{
+	walk_tree(check_pre_startup_quirks, NULL);
+
+	immu_dmar_rmrr_map();
+}
+
+/*
+ * get_bootopt()
+ * 	check a boot option  (always a boolean)
+ */
+static void
+get_bootopt(char *bopt, boolean_t *kvar)
+{
+	char *val = NULL;
+
+	ASSERT(bopt);
+	ASSERT(kvar);
+
+	/*
+	 * All boot options set at the GRUB menu become
+	 * properties on the rootnex.
+	 */
+	if (ddi_prop_lookup_string(DDI_DEV_T_ANY, root_devinfo,
+	    DDI_PROP_DONTPASS, bopt, &val) == DDI_SUCCESS) {
+		ASSERT(val);
+		if (strcmp(val, "true") == 0) {
+			*kvar = B_TRUE;
+		} else if (strcmp(val, "false") == 0) {
+			*kvar = B_FALSE;
+		} else {
+			ddi_err(DER_WARN, NULL, "boot option %s=\"%s\" "
+			    "is not set to true or false. Ignoring option.",
+			    bopt, val);
+		}
+		ddi_prop_free(val);
+	}
+}
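+
+/*
+ * Illustrative example: booting with "-B immu-enable=false" on the GRUB
+ * kernel line creates an "immu-enable" string property on the root nexus,
+ * which get_bootopt() above reads to clear the immu_enable switch.
+ */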
+
+static void
+read_boot_options(void)
+{
+	/* enable/disable options */
+	get_bootopt("immu-enable", &immu_enable);
+	get_bootopt("immu-dvma-enable", &immu_dvma_enable);
+	get_bootopt("immu-gfxdvma-enable", &immu_gfxdvma_enable);
+	get_bootopt("immu-intrmap-enable", &immu_intrmap_enable);
+	get_bootopt("immu-qinv-enable", &immu_qinv_enable);
+	get_bootopt("immu-mmio-safe", &immu_mmio_safe);
+
+	/* workaround switches */
+	get_bootopt("immu-quirk-usbpage0", &immu_quirk_usbpage0);
+	get_bootopt("immu-quirk-usbfullpa", &immu_quirk_usbfullpa);
+	get_bootopt("immu-quirk-usbrmrr", &immu_quirk_usbrmrr);
+
+	/* debug printing */
+	get_bootopt("immu-dmar-print", &immu_dmar_print);
+}
+
+/*
+ * Note, this will not catch hardware not enumerated
+ * in early boot
+ */
+static boolean_t
+blacklisted_driver(void)
+{
+	char **strptr;
+	int i;
+	major_t maj;
+
+	ASSERT((black_array == NULL) ^ (nblacks != 0));
+
+	/* need at least 2 strings */
+	if (nblacks < 2) {
+		return (B_FALSE);
+	}
+
+	strptr = black_array;
+	for (i = 0; nblacks - i > 1; i++) {
+		if (strcmp(*strptr++, "DRIVER") == 0) {
+			if ((maj = ddi_name_to_major(*strptr++))
+			    != DDI_MAJOR_T_NONE) {
+				/* is there hardware bound to this drvr */
+				if (devnamesp[maj].dn_head != NULL) {
+					return (B_TRUE);
+				}
+			}
+			i += 1;   /* for loop adds 1, so add only 1 here */
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static boolean_t
+blacklisted_smbios(void)
+{
+	id_t smid;
+	smbios_hdl_t *smhdl;
+	smbios_info_t sminf;
+	smbios_system_t smsys;
+	char *mfg, *product, *version;
+	char **strptr;
+	int i;
+
+	ASSERT((black_array == NULL) ^ (nblacks != 0));
+
+	/* need at least 4 strings for this setting */
+	if (nblacks < 4) {
+		return (B_FALSE);
+	}
+
+	smhdl = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
+	if (smhdl == NULL ||
+	    (smid = smbios_info_system(smhdl, &smsys)) == SMB_ERR ||
+	    smbios_info_common(smhdl, smid, &sminf) == SMB_ERR) {
+		return (B_FALSE);
+	}
+
+	mfg = (char *)sminf.smbi_manufacturer;
+	product = (char *)sminf.smbi_product;
+	version = (char *)sminf.smbi_version;
+
+	ddi_err(DER_CONT, NULL, "?System SMBIOS information:\n");
+	ddi_err(DER_CONT, NULL, "?Manufacturer = <%s>\n", mfg);
+	ddi_err(DER_CONT, NULL, "?Product = <%s>\n", product);
+	ddi_err(DER_CONT, NULL, "?Version = <%s>\n", version);
+
+	strptr = black_array;
+	for (i = 0; nblacks - i > 3; i++) {
+		if (strcmp(strptr[0], "SMBIOS") == 0) {
+			/* empty product/version entries match anything */
+			if (strcmp(strptr[1], mfg) == 0 &&
+			    (strptr[2][0] == '\0' ||
+			    strcmp(strptr[2], product) == 0) &&
+			    (strptr[3][0] == '\0' ||
+			    strcmp(strptr[3], version) == 0)) {
+				return (B_TRUE);
+			}
+			strptr += 4;
+			i += 3;   /* for loop adds 1, so add only 3 here */
+		} else {
+			strptr++;
+		}
+	}
+
+	return (B_FALSE);
+}
+
+static boolean_t
+blacklisted_acpi(void)
+{
+	ASSERT((black_array == NULL) ^ (nblacks != 0));
+	if (nblacks == 0) {
+		return (B_FALSE);
+	}
+
+	return (immu_dmar_blacklisted(black_array, nblacks));
+}
+
+/*
+ * Check if system is blacklisted by Intel IOMMU driver
+ * i.e. should Intel IOMMU be disabled on this system
+ * Currently a system can be blacklisted based on the
+ * following criteria:
+ *
+ * 1. DMAR ACPI table information.
+ *    This information includes things like
+ *    manufacturer and revision number. If rootnex.conf
+ *    has matching info set in its blacklist property
+ *    then Intel IOMMU will be disabled.
+ *
+ * 2. SMBIOS information
+ *
+ * 3. Driver installed - useful if a particular
+ *    driver or hardware is toxic if Intel IOMMU
+ *    is turned on.
+ */
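+
+/*
+ * Illustrative rootnex.conf entry (hypothetical values):
+ *
+ *	immu-blacklist="SMBIOS","LENOVO","7417CTO","ThinkPad X301",
+ *	    "DRIVER","agpgart",
+ *	    "DMAR","INTEL","CALISTGA","1";
+ *
+ * Each criterion starts with a keyword ("SMBIOS", "DRIVER" or "DMAR")
+ * followed by its match strings; blacklisted_smbios(), blacklisted_driver()
+ * and immu_dmar_blacklisted() consume the flattened string array.
+ */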
+
+static void
+blacklist_setup(void)
+{
+	char **string_array;
+	uint_t nstrings;
+
+	/*
+	 * Check the rootnex.conf blacklist property.
+	 * Fake up a dev_t since searching the global
+	 * property list needs it
+	 */
+	if (ddi_prop_lookup_string_array(
+	    makedevice(ddi_name_to_major("rootnex"), 0), root_devinfo,
+	    DDI_PROP_DONTPASS | DDI_PROP_ROOTNEX_GLOBAL, "immu-blacklist",
+	    &string_array, &nstrings) != DDI_PROP_SUCCESS) {
+		return;
+	}
+
+	/* smallest blacklist criteria works with multiples of 2 */
+	if (nstrings % 2 != 0) {
+		ddi_err(DER_WARN, NULL, "Invalid IOMMU blacklist "
+		    "rootnex.conf: number of strings must be a "
+		    "multiple of 2");
+		ddi_prop_free(string_array);
+		return;
+	}
+
+	black_array = string_array;
+	nblacks = nstrings;
+}
+
+static void
+blacklist_destroy(void)
+{
+	if (black_array) {
+		ddi_prop_free(black_array);
+		black_array = NULL;
+		nblacks = 0;
+	}
+
+	ASSERT(black_array == NULL);
+	ASSERT(nblacks == 0);
+}
+
+
+/*
+ * Now set all the fields in the order they are defined
+ * We do this only as a defensive-coding practice, it is
+ * not a correctness issue.
+ */
+static void *
+immu_state_alloc(int seg, void *dmar_unit)
+{
+	immu_t *immu;
+
+	dmar_unit = immu_dmar_walk_units(seg, dmar_unit);
+	if (dmar_unit == NULL) {
+		/* No more IOMMUs in this segment */
+		return (NULL);
+	}
+
+	immu = kmem_zalloc(sizeof (immu_t), KM_SLEEP);
+
+	mutex_init(&(immu->immu_lock), NULL, MUTEX_DRIVER, NULL);
+
+	mutex_enter(&(immu->immu_lock));
+
+	immu->immu_dmar_unit = dmar_unit;
+	immu->immu_name = ddi_strdup(immu_dmar_unit_name(dmar_unit),
+	    KM_SLEEP);
+	immu->immu_dip = immu_dmar_unit_dip(dmar_unit);
+
+	/*
+	 * the immu_intr_lock mutex is grabbed by the IOMMU
+	 * unit's interrupt handler so we need to use an
+	 * interrupt cookie for the mutex
+	 */
+	mutex_init(&(immu->immu_intr_lock), NULL, MUTEX_DRIVER,
+	    (void *)ipltospl(IMMU_INTR_IPL));
+
+	/* IOMMU regs related */
+	mutex_init(&(immu->immu_regs_lock), NULL, MUTEX_DEFAULT, NULL);
+
+	/* DVMA related */
+	immu->immu_dvma_coherent = B_FALSE;
+
+	/* DVMA context related */
+	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
+
+	/* DVMA domain related */
+	list_create(&(immu->immu_domain_list), sizeof (domain_t),
+	    offsetof(domain_t, dom_immu_node));
+
+	/* DVMA special device lists */
+	immu->immu_dvma_gfx_only = B_FALSE;
+	list_create(&(immu->immu_dvma_lpc_list), sizeof (immu_devi_t),
+	    offsetof(immu_devi_t, imd_spc_node));
+	list_create(&(immu->immu_dvma_gfx_list), sizeof (immu_devi_t),
+	    offsetof(immu_devi_t, imd_spc_node));
+
+	/* interrupt remapping related */
+	mutex_init(&(immu->immu_intrmap_lock), NULL, MUTEX_DEFAULT, NULL);
+
+	/* qinv related */
+	mutex_init(&(immu->immu_qinv_lock), NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * insert this immu unit into the system-wide list
+	 */
+	list_insert_tail(&immu_list, immu);
+
+	mutex_exit(&(immu->immu_lock));
+
+	ddi_err(DER_LOG, immu->immu_dip, "IMMU: unit setup");
+
+	immu_dmar_set_immu(dmar_unit, immu);
+
+	return (dmar_unit);
+}
+
+static void
+immu_subsystems_setup(void)
+{
+	int seg;
+	void *unit_hdl;
+
+	ddi_err(DER_VERB, NULL,
+	    "Creating state structures for Intel IOMMU units\n");
+
+	ASSERT(immu_setup == B_FALSE);
+	ASSERT(immu_running == B_FALSE);
+
+	mutex_init(&immu_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&immu_list, sizeof (immu_t), offsetof(immu_t, immu_node));
+
+	mutex_enter(&immu_lock);
+
+	unit_hdl = NULL;
+	for (seg = 0; seg < IMMU_MAXSEG; seg++) {
+		while (unit_hdl = immu_state_alloc(seg, unit_hdl)) {
+			;
+		}
+	}
+
+	immu_regs_setup(&immu_list);	/* subsequent code needs this first */
+	immu_dvma_setup(&immu_list);
+	immu_intrmap_setup(&immu_list);
+	immu_qinv_setup(&immu_list);
+
+	mutex_exit(&immu_lock);
+}
+
+/*
+ * immu_subsystems_startup()
+ * 	startup all units that were setup
+ */
+static void
+immu_subsystems_startup(void)
+{
+	immu_t *immu;
+
+	mutex_enter(&immu_lock);
+
+	ASSERT(immu_setup == B_TRUE);
+	ASSERT(immu_running == B_FALSE);
+
+	immu_dmar_startup();
+
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+
+		mutex_enter(&(immu->immu_lock));
+
+		immu_intr_register(immu);
+		immu_dvma_startup(immu);
+		immu_intrmap_startup(immu);
+		immu_qinv_startup(immu);
+
+		/*
+		 * Set IOMMU unit's regs to do
+		 * the actual startup. This will
+		 * set immu->immu_regs_running field
+		 * if the unit is successfully
+		 * started
+		 */
+		immu_regs_startup(immu);
+
+		mutex_exit(&(immu->immu_lock));
+	}
+
+	mutex_exit(&immu_lock);
+}
+
+/* ##################  Intel IOMMU internal interfaces ###################### */
+
+/*
+ * Internal interfaces for IOMMU code (i.e. not exported to rootnex
+ * or rest of system)
+ */
+
+/*
+ * ddip can be NULL, in which case we walk up until we find the root dip
+ * NOTE: We never visit the root dip since it's not a hardware node
+ */
+int
+immu_walk_ancestor(
+	dev_info_t *rdip,
+	dev_info_t *ddip,
+	int (*func)(dev_info_t *, void *arg),
+	void *arg,
+	int *lvlp,
+	immu_flags_t immu_flags)
+{
+	dev_info_t *pdip;
+	int level;
+	int error = DDI_SUCCESS;
+
+	ASSERT(root_devinfo);
+	ASSERT(rdip);
+	ASSERT(rdip != root_devinfo);
+	ASSERT(func);
+
+	/* ddip and immu can be NULL */
+
+	/* Hold rdip so that branch is not detached */
+	ndi_hold_devi(rdip);
+	for (pdip = rdip, level = 1; pdip && pdip != root_devinfo;
+	    pdip = ddi_get_parent(pdip), level++) {
+
+		if (immu_devi_set(pdip, immu_flags) != DDI_SUCCESS) {
+			error = DDI_FAILURE;
+			break;
+		}
+		if (func(pdip, arg) == DDI_WALK_TERMINATE) {
+			break;
+		}
+		if (immu_flags & IMMU_FLAGS_DONTPASS) {
+			break;
+		}
+		if (pdip == ddip) {
+			break;
+		}
+	}
+
+	ndi_rele_devi(rdip);
+
+	if (lvlp)
+		*lvlp = level;
+
+	return (error);
+}
+
+/* ########################  Intel IOMMU entry points ####################### */
+/*
+ * immu_init()
+ *	called from rootnex_attach(). setup but don't startup the Intel IOMMU
+ *      This is the first function called in Intel IOMMU code
+ */
+void
+immu_init(void)
+{
+	char *phony_reg = "A thing of beauty is a joy forever";
+
+	/* Set some global shorthands that are needed by all of IOMMU code */
+	ASSERT(root_devinfo == NULL);
+	root_devinfo = ddi_root_node();
+
+	/*
+	 * Intel IOMMU is supported only if the MMU (CPU) page size is
+	 * equal to the IOMMU page size.
+	 */
+	/*LINTED*/
+	if (MMU_PAGESIZE != IMMU_PAGESIZE) {
+		ddi_err(DER_WARN, NULL,
+		    "MMU page size (%d) is not equal to\n"
+		    "IOMMU page size (%d). "
+		    "Disabling Intel IOMMU. ",
+		    MMU_PAGESIZE, IMMU_PAGESIZE);
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	/*
+	 * retrieve the Intel IOMMU boot options.
+	 * Do this before parsing immu ACPI table
+	 * as a boot option could potentially affect
+	 * ACPI parsing.
+	 */
+	ddi_err(DER_CONT, NULL, "?Reading Intel IOMMU boot options\n");
+	read_boot_options();
+
+	/*
+	 * Check the IOMMU enable boot-option first.
+	 * This is so that we can skip parsing the ACPI table
+	 * if necessary because that may cause problems in
+	 * systems with buggy BIOS or ACPI tables
+	 */
+	if (immu_enable == B_FALSE) {
+		return;
+	}
+
+	/*
+	 * Next, check if the system even has an Intel IOMMU
+	 * We use the presence or absence of the IOMMU ACPI
+	 * table to detect Intel IOMMU.
+	 */
+	if (immu_dmar_setup() != DDI_SUCCESS) {
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	/*
+	 * Check blacklists
+	 */
+	blacklist_setup();
+
+	if (blacklisted_smbios() == B_TRUE) {
+		blacklist_destroy();
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	if (blacklisted_driver() == B_TRUE) {
+		blacklist_destroy();
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	/*
+	 * Read the "raw" DMAR ACPI table to get information
+	 * and convert into a form we can use.
+	 */
+	if (immu_dmar_parse() != DDI_SUCCESS) {
+		blacklist_destroy();
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	/*
+	 * now that we have processed the ACPI table
+	 * check if we need to blacklist this system
+	 * based on ACPI info
+	 */
+	if (blacklisted_acpi() == B_TRUE) {
+		immu_dmar_destroy();
+		blacklist_destroy();
+		immu_enable = B_FALSE;
+		return;
+	}
+
+	blacklist_destroy();
+
+	/*
+	 * Check if system has HW quirks.
+	 */
+	pre_setup_quirks();
+
+	/* Now do the rest of the setup */
+	immu_subsystems_setup();
+
+	/*
+	 * Now that the IMMU is setup, create a phony
+	 * reg prop so that suspend/resume works
+	 */
+	if (ddi_prop_update_byte_array(DDI_DEV_T_NONE, root_devinfo, "reg",
+	    (uchar_t *)phony_reg, strlen(phony_reg) + 1) != DDI_PROP_SUCCESS) {
+		ddi_err(DER_PANIC, NULL, "Failed to create reg prop for "
+		    "rootnex node");
+		/*NOTREACHED*/
+	}
+
+	immu_setup = B_TRUE;
+}
+
+/*
+ * immu_startup()
+ * 	called directly by boot code to startup
+ *      all units of the IOMMU
+ */
+void
+immu_startup(void)
+{
+	/*
+	 * If IOMMU is disabled, do nothing
+	 */
+	if (immu_enable == B_FALSE) {
+		return;
+	}
+
+	if (immu_setup == B_FALSE) {
+		ddi_err(DER_WARN, NULL, "Intel IOMMU not setup, "
+		    "skipping IOMMU startup");
+		return;
+	}
+
+	pre_startup_quirks();
+
+	ddi_err(DER_CONT, NULL,
+	    "?Starting Intel IOMMU (dmar) units...\n");
+
+	immu_subsystems_startup();
+
+	immu_running = B_TRUE;
+}
+
+/*
+ * immu_map_sgl()
+ * 	called from rootnex_coredma_bindhdl() when Intel
+ *	IOMMU is enabled to build DVMA cookies and map them.
+ */
+int
+immu_map_sgl(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
+    int prealloc_count, dev_info_t *rdip)
+{
+	if (immu_running == B_FALSE) {
+		return (DDI_DMA_USE_PHYSICAL);
+	}
+
+	return (immu_dvma_map(hp, dmareq, NULL, prealloc_count, rdip,
+	    IMMU_FLAGS_DMAHDL));
+}
+
+/*
+ * immu_unmap_sgl()
+ * 	called from rootnex_coredma_unbindhdl(), to unmap DVMA
+ * 	cookies and free them
+ */
+int
+immu_unmap_sgl(ddi_dma_impl_t *hp, dev_info_t *rdip)
+{
+	if (immu_running == B_FALSE) {
+		return (DDI_DMA_USE_PHYSICAL);
+	}
+
+	return (immu_dvma_unmap(hp, rdip));
+}
+
+/*
+ * Hook to notify IOMMU code of device tree changes
+ */
+void
+immu_device_tree_changed(void)
+{
+	if (immu_setup == B_FALSE) {
+		return;
+	}
+
+	ddi_err(DER_WARN, NULL, "Intel IOMMU currently "
+	    "does not use device tree updates");
+}
+
+/*
+ * Hook to notify IOMMU code of memory changes
+ */
+void
+immu_physmem_update(uint64_t addr, uint64_t size)
+{
+	if (immu_setup == B_FALSE) {
+		return;
+	}
+	immu_dvma_physmem_update(addr, size);
+}
+
+/*
+ * immu_quiesce()
+ * 	quiesce all units that are running
+ */
+int
+immu_quiesce(void)
+{
+	immu_t *immu;
+	int ret = DDI_SUCCESS;
+
+	mutex_enter(&immu_lock);
+
+	if (immu_running == B_FALSE) {
+		mutex_exit(&immu_lock);
+		return (DDI_SUCCESS);
+	}
+
+	ASSERT(immu_setup == B_TRUE);
+
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+
+		/* if immu is not running, we don't quiesce */
+		if (immu->immu_regs_running == B_FALSE)
+			continue;
+
+		/* flush caches */
+		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+		immu_regs_context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL);
+		rw_exit(&(immu->immu_ctx_rwlock));
+		immu_regs_iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
+		immu_regs_wbf_flush(immu);
+
+		mutex_enter(&(immu->immu_lock));
+
+		/*
+		 * Set IOMMU unit's regs to do
+		 * the actual shutdown.
+		 */
+		immu_regs_shutdown(immu);
+		immu_regs_suspend(immu);
+
+		/* if immu is still running, we failed */
+		if (immu->immu_regs_running == B_TRUE)
+			ret = DDI_FAILURE;
+		else
+			immu->immu_regs_quiesced = B_TRUE;
+
+		mutex_exit(&(immu->immu_lock));
+	}
+	mutex_exit(&immu_lock);
+
+	if (ret == DDI_SUCCESS) {
+		immu_running = B_FALSE;
+		immu_quiesced = B_TRUE;
+	}
+
+	return (ret);
+}
+
+/*
+ * immu_unquiesce()
+ * 	unquiesce all units
+ */
+int
+immu_unquiesce(void)
+{
+	immu_t *immu;
+	int ret = DDI_SUCCESS;
+
+	mutex_enter(&immu_lock);
+
+	if (immu_quiesced == B_FALSE) {
+		mutex_exit(&immu_lock);
+		return (DDI_SUCCESS);
+	}
+
+	ASSERT(immu_setup == B_TRUE);
+	ASSERT(immu_running == B_FALSE);
+
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+
+		mutex_enter(&(immu->immu_lock));
+
+		/* if immu was not quiesced, i.e. was not running before */
+		if (immu->immu_regs_quiesced == B_FALSE) {
+			mutex_exit(&(immu->immu_lock));
+			continue;
+		}
+
+		if (immu_regs_resume(immu) != DDI_SUCCESS) {
+			ret = DDI_FAILURE;
+			mutex_exit(&(immu->immu_lock));
+			continue;
+		}
+
+		/* flush caches before unquiesce */
+		rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+		immu_regs_context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL);
+		rw_exit(&(immu->immu_ctx_rwlock));
+		immu_regs_iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
+
+		/*
+		 * Set IOMMU unit's regs to do
+		 * the actual startup. This will
+		 * set immu->immu_regs_running  field
+		 * if the unit is successfully
+		 * started
+		 */
+		immu_regs_startup(immu);
+
+		if (immu->immu_regs_running == B_FALSE) {
+			ret = DDI_FAILURE;
+		} else {
+			immu_quiesced = B_FALSE;
+			immu_running = B_TRUE;
+			immu->immu_regs_quiesced = B_FALSE;
+		}
+
+		mutex_exit(&(immu->immu_lock));
+	}
+
+	mutex_exit(&immu_lock);
+
+	return (ret);
+}
+
+/* ##############  END Intel IOMMU entry points ################## */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu_dmar.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,1289 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/list.h>
+#include <sys/pci.h>
+#include <sys/pci_cfgspace.h>
+#include <sys/pci_impl.h>
+#include <sys/sunndi.h>
+#include <sys/ksynch.h>
+#include <sys/cmn_err.h>
+#include <sys/bootconf.h>
+#include <sys/int_fmtio.h>
+#include <sys/smbios.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/iommulib.h>
+#include <sys/immu.h>
+
+static void dmar_table_destroy(dmar_table_t *tbl);
+
+/*
+ * internal global variables
+ */
+static char	*dmar_raw;		/* raw DMAR ACPI table */
+static dmar_table_t *dmar_table;	/* converted form of DMAR table */
+
+/*
+ * global variables exported outside this file
+ */
+boolean_t dmar_print = B_FALSE;
+kmutex_t ioapic_drhd_lock;
+list_t ioapic_drhd_list;
+
+/* ######################################################################### */
+
+/*
+ * helper functions to read the "raw" DMAR table
+ */
+
+static uint8_t
+get_uint8(char *cp)
+{
+	uint8_t val = *((uint8_t *)cp);
+	return (val);
+}
+
+static uint16_t
+get_uint16(char *cp)
+{
+	uint16_t val = *((uint16_t *)cp);
+	return (val);
+}
+
+static uint32_t
+get_uint32(char *cp)
+{
+	uint32_t val = *((uint32_t *)cp);
+	return (val);
+}
+
+static uint64_t
+get_uint64(char *cp)
+{
+	uint64_t val = *((uint64_t *)cp);
+	return (val);
+}
+
+static char *
+get_str(char *cp, uint_t len)
+{
+	char *str = kmem_alloc(len + 1, KM_SLEEP);
+
+	(void) strlcpy(str, cp, len + 1);
+
+	return (str);
+}
+
+static void
+scope_list_free(list_t *scope_list)
+{
+	scope_t *scope;
+
+	if (list_is_empty(scope_list)) {
+		list_destroy(scope_list);
+		return;
+	}
+
+	while ((scope = list_remove_head(scope_list)) != NULL) {
+		kmem_free(scope, sizeof (scope_t));
+	}
+
+	ASSERT(list_is_empty(scope_list));
+	list_destroy(scope_list);
+}
+
+static void
+drhd_list_destroy(list_t *drhd_list)
+{
+	drhd_t *drhd;
+
+	ASSERT(drhd_list);
+
+	if (list_is_empty(drhd_list)) {
+		list_destroy(drhd_list);
+		return;
+	}
+
+	while ((drhd = list_remove_head(drhd_list)) != NULL) {
+		scope_list_free(&(drhd->dr_scope_list));
+		kmem_free(drhd, sizeof (drhd_t));
+	}
+
+	ASSERT(list_is_empty(drhd_list));
+	list_destroy(drhd_list);
+}
+
+static void
+rmrr_list_destroy(list_t *rmrr_list)
+{
+	rmrr_t *rmrr;
+
+	ASSERT(rmrr_list);
+
+	if (list_is_empty(rmrr_list)) {
+		list_destroy(rmrr_list);
+		return;
+	}
+
+	while ((rmrr = list_remove_head(rmrr_list)) != NULL) {
+		scope_list_free(&(rmrr->rm_scope_list));
+		kmem_free(rmrr, sizeof (rmrr_t));
+	}
+
+	ASSERT(list_is_empty(rmrr_list));
+	list_destroy(rmrr_list);
+}
+
+/*
+ * parse_scope()
+ *      parse a scope structure in the "raw" table
+ */
+static scope_t *
+parse_scope(char *shead)
+{
+	scope_t *scope;
+	char *phead;
+	int bus, dev, func;
+	uint8_t startbus;
+	uint8_t len;
+	int depth;
+
+	ASSERT(shead);
+
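+	/*
+	 * Device Scope Structure layout (per the VT-d spec):
+	 *	byte 0: type, byte 1: length, bytes 2-3: reserved,
+	 *	byte 4: enumeration id, byte 5: start bus number,
+	 *	bytes 6..length-1: path as (device, function) pairs.
+	 */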
+	scope = kmem_zalloc(sizeof (scope_t), KM_SLEEP);
+	scope->scp_type = get_uint8(&shead[0]);
+	scope->scp_enumid = get_uint8(&shead[4]);
+
+	len = get_uint8(&shead[1]);
+	startbus = get_uint8(&shead[5]);
+	depth = (len - 6)/2;
+	ASSERT(depth >= 1);
+
+	phead = &shead[6];
+
+	bus = startbus;
+	dev = get_uint8(phead++);
+	func = get_uint8(phead++);
+
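+	/*
+	 * Walk any remaining path entries through intermediate PCI
+	 * bridges, reading each bridge's secondary bus number from
+	 * config space to arrive at the device's actual bus.
+	 */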
+	for (depth--; depth > 0; depth--) {
+		bus = pci_getb_func(bus, dev, func, PCI_BCNF_SECBUS);
+		dev = get_uint8(phead++);
+		func = get_uint8(phead++);
+	}
+
+	ASSERT(bus >= 0 && bus < 256);
+	ASSERT(dev >= 0 && dev < 32);
+	ASSERT(func >= 0 && func < 8);
+
+	/* ok we got the device BDF */
+	scope->scp_bus = bus;
+	scope->scp_dev = dev;
+	scope->scp_func = func;
+
+	return (scope);
+}
+
+
+/* setup the ioapic_drhd structure */
+static void
+ioapic_drhd_setup(void)
+{
+	mutex_init(&(ioapic_drhd_lock), NULL, MUTEX_DEFAULT, NULL);
+
+	mutex_enter(&(ioapic_drhd_lock));
+	list_create(&(ioapic_drhd_list), sizeof (ioapic_drhd_t),
+	    offsetof(ioapic_drhd_t, ioapic_node));
+	mutex_exit(&(ioapic_drhd_lock));
+}
+
+/* get ioapic source id for interrupt remapping */
+static void
+ioapic_drhd_insert(scope_t *scope, drhd_t *drhd)
+{
+	ioapic_drhd_t *idt;
+
+	idt = kmem_zalloc(sizeof (ioapic_drhd_t), KM_SLEEP);
+	idt->ioapic_ioapicid = scope->scp_enumid;
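+	/*
+	 * The source-id is the IOAPIC's PCI requester id in standard
+	 * BDF form: bus[15:8] | device[7:3] | function[2:0].
+	 */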
+	idt->ioapic_sid = ((scope->scp_bus << 8) | (scope->scp_dev << 3) |
+	    (scope->scp_func));
+	idt->ioapic_drhd = drhd;
+
+	mutex_enter(&ioapic_drhd_lock);
+	list_insert_tail(&ioapic_drhd_list, idt);
+	mutex_exit(&ioapic_drhd_lock);
+}
+
+static ioapic_drhd_t *
+ioapic_drhd_lookup(int ioapicid)
+{
+	ioapic_drhd_t *idt;
+
+	mutex_enter(&ioapic_drhd_lock);
+	idt = list_head(&ioapic_drhd_list);
+	for (; idt; idt = list_next(&ioapic_drhd_list, idt)) {
+		if (idt->ioapic_ioapicid == ioapicid) {
+			break;
+		}
+	}
+	mutex_exit(&ioapic_drhd_lock);
+
+	return (idt);
+}
+
+static void
+ioapic_drhd_destroy(void)
+{
+	ioapic_drhd_t *idt;
+
+	mutex_enter(&ioapic_drhd_lock);
+	while (idt = list_remove_head(&ioapic_drhd_list)) {
+		kmem_free(idt, sizeof (ioapic_drhd_t));
+	}
+	list_destroy(&ioapic_drhd_list);
+	mutex_exit(&(ioapic_drhd_lock));
+
+	mutex_destroy(&(ioapic_drhd_lock));
+}
+
+/*
+ * parse_drhd()
+ *   parse the DRHD units in the DMAR table
+ */
+static int
+parse_drhd(char *uhead, dmar_table_t *tbl)
+{
+	drhd_t *drhd;
+	int seg;
+	int len;
+	char *shead;
+	scope_t *scope;
+
+	ASSERT(uhead);
+	ASSERT(tbl);
+	ASSERT(get_uint16(&uhead[0]) == DMAR_DRHD);
+
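+	/*
+	 * DRHD unit layout (per the VT-d spec): bytes 0-1 type,
+	 * bytes 2-3 length, byte 4 flags (bit 0 = INCLUDE_PCI_ALL),
+	 * bytes 6-7 segment number, bytes 8-15 register base address,
+	 * device scope structures from byte 16 onwards.
+	 */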
+	seg = get_uint16(&uhead[6]);
+	if (seg < 0 || seg >= IMMU_MAXSEG) {
+		ddi_err(DER_WARN, NULL, "invalid segment# <%d> "
+		    "in DRHD unit in ACPI DMAR table", seg);
+		return (DDI_FAILURE);
+	}
+
+	drhd = kmem_zalloc(sizeof (drhd_t), KM_SLEEP);
+	mutex_init(&(drhd->dr_lock), NULL, MUTEX_DEFAULT, NULL);
+	list_create(&(drhd->dr_scope_list), sizeof (scope_t),
+	    offsetof(scope_t, scp_node));
+
+	len = get_uint16(&uhead[2]);
+	drhd->dr_include_all =
+	    (get_uint8(&uhead[4]) & DMAR_INCLUDE_ALL) ? B_TRUE : B_FALSE;
+	drhd->dr_seg = seg;
+	drhd->dr_regs = get_uint64(&uhead[8]);
+
+	/*
+	 * parse each scope.
+	 */
+	shead = &uhead[16];
+	while (shead < &uhead[len - 1]) {
+		scope = parse_scope(shead);
+		if (scope == NULL) {
+			return (DDI_FAILURE);
+		}
+
+		if (scope->scp_type == DMAR_IOAPIC)  {
+			ioapic_drhd_insert(scope, drhd);
+		}
+
+		list_insert_tail(&(drhd->dr_scope_list), scope);
+		shead += get_uint8(&shead[1]);
+	}
+
+	list_insert_tail(&(tbl->tbl_drhd_list[drhd->dr_seg]), drhd);
+
+	return (DDI_SUCCESS);
+}
+
+/*
+ * parse_rmrr()
+ *   parse the RMRR units in the DMAR table
+ */
+static int
+parse_rmrr(char *uhead, dmar_table_t *tbl)
+{
+	rmrr_t *rmrr;
+	int seg;
+	int len;
+	char *shead;
+	scope_t *scope;
+
+	ASSERT(uhead);
+	ASSERT(tbl);
+	ASSERT(get_uint16(&uhead[0]) == DMAR_RMRR);
+
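+	/*
+	 * RMRR unit layout (per the VT-d spec): bytes 0-1 type,
+	 * bytes 2-3 length, bytes 6-7 segment number, bytes 8-15
+	 * region base, bytes 16-23 region limit, device scope
+	 * structures from byte 24 onwards.
+	 */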
+	seg = get_uint16(&uhead[6]);
+	if (seg < 0 || seg >= IMMU_MAXSEG) {
+		ddi_err(DER_WARN, NULL, "invalid segment# <%d> "
+		    "in RMRR unit in ACPI DMAR table", seg);
+		return (DDI_FAILURE);
+	}
+
+	rmrr = kmem_zalloc(sizeof (rmrr_t), KM_SLEEP);
+	mutex_init(&(rmrr->rm_lock), NULL, MUTEX_DEFAULT, NULL);
+	list_create(&(rmrr->rm_scope_list), sizeof (scope_t),
+	    offsetof(scope_t, scp_node));
+
+	/* RMRR region is [base,limit] */
+	len = get_uint16(&uhead[2]);
+	rmrr->rm_seg = get_uint16(&uhead[6]);
+	rmrr->rm_base = get_uint64(&uhead[8]);
+	rmrr->rm_limit = get_uint64(&uhead[16]);
+
+	if (rmrr->rm_base > rmrr->rm_limit) {
+		ddi_err(DER_WARN, NULL, "IMMU: BIOS bug detected: "
+		    "RMRR: base (%lx) > limit (%lx)",
+		    rmrr->rm_base, rmrr->rm_limit);
+		list_destroy(&(rmrr->rm_scope_list));
+		mutex_destroy(&(rmrr->rm_lock));
+		kmem_free(rmrr, sizeof (rmrr_t));
+		return (DDI_SUCCESS);
+	}
+
+	/*
+	 * parse each scope in RMRR
+	 */
+	shead = &uhead[24];
+	while (shead < &uhead[len - 1]) {
+		scope = parse_scope(shead);
+		if (scope == NULL) {
+			return (DDI_FAILURE);
+		}
+		list_insert_tail(&(rmrr->rm_scope_list), scope);
+		shead += get_uint8(&shead[1]);
+	}
+
+	list_insert_tail(&(tbl->tbl_rmrr_list[rmrr->rm_seg]), rmrr);
+
+	return (DDI_SUCCESS);
+}
+
+#define	TBL_OEM_ID_SZ		(6)
+#define	TBL_OEM_TBLID_SZ	(8)
+
+/*
+ * parse the "raw" DMAR table and convert it
+ * into a useful form.
+ */
+static int
+dmar_parse(dmar_table_t **tblpp, char *raw)
+{
+	char *uhead;
+	dmar_table_t *tbl;
+	int i;
+	char *unmstr;
+
+	ASSERT(raw);
+	ASSERT(tblpp);
+
+	*tblpp = NULL;
+
+	/*
+	 * do a sanity check. make sure the raw table
+	 * has the right signature
+	 */
+	if (raw[0] != 'D' || raw[1] != 'M' ||
+	    raw[2] != 'A' || raw[3] != 'R') {
+		ddi_err(DER_WARN, NULL, "IOMMU ACPI "
+		    "signature != \"DMAR\"");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * the platform has intel iommu, create processed ACPI struct
+	 */
+	tbl = kmem_zalloc(sizeof (dmar_table_t), KM_SLEEP);
+	mutex_init(&(tbl->tbl_lock), NULL, MUTEX_DEFAULT, NULL);
+
+	tbl->tbl_raw = raw;
+
+	/*
+	 * Note we explicitly show offsets for clarity
+	 */
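+	/*
+	 * Standard ACPI header: signature [0-3], length [4-7],
+	 * OEM id [10-15], OEM table id [16-23], OEM revision [24-27].
+	 * DMAR-specific fields: host address width [36] (stored as
+	 * width - 1), flags [37] (bit 0 = interrupt remap support);
+	 * remapping structures (DRHD, RMRR, ...) start at byte 48.
+	 */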
+	tbl->tbl_rawlen = get_uint32(&raw[4]);
+
+	/* XXX TO DO verify checksum of table */
+	tbl->tbl_oem_id = get_str(&raw[10], TBL_OEM_ID_SZ);
+	tbl->tbl_oem_tblid = get_str(&raw[16], TBL_OEM_TBLID_SZ);
+	tbl->tbl_oem_rev = get_uint32(&raw[24]);
+	tbl->tbl_haw = get_uint8(&raw[36]) + 1;
+	tbl->tbl_intrmap = (get_uint8(&raw[37]) & DMAR_INTRMAP_SUPPORT)
+	    ? B_TRUE : B_FALSE;
+
+	/* create lists for DRHD and RMRR */
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		list_create(&(tbl->tbl_drhd_list[i]), sizeof (drhd_t),
+		    offsetof(drhd_t, dr_node));
+		list_create(&(tbl->tbl_rmrr_list[i]), sizeof (rmrr_t),
+		    offsetof(rmrr_t, rm_node));
+	}
+
+	ioapic_drhd_setup();
+
+	/*
+	 * parse each unit. Currently only DRHD and RMRR types
+	 * are parsed. We ignore all other types of units.
+	 */
+	uhead = &raw[48];
+	while (uhead < &raw[tbl->tbl_rawlen - 1]) {
+		unmstr = NULL;
+		switch (get_uint16(uhead)) {
+		case DMAR_DRHD:
+			if (parse_drhd(uhead, tbl) != DDI_SUCCESS) {
+				goto failed;
+			}
+			break;
+		case DMAR_RMRR:
+			if (parse_rmrr(uhead, tbl) != DDI_SUCCESS) {
+				goto failed;
+			}
+			break;
+		case DMAR_ATSR:
+			unmstr = "ATSR";
+			break;
+		case DMAR_RHSA:
+			unmstr = "RHSA";
+			break;
+		default:
+			unmstr = "unknown unit type";
+			break;
+		}
+		if (unmstr) {
+			ddi_err(DER_NOTE, NULL, "DMAR ACPI table: "
+			    "skipping unsupported unit type %s", unmstr);
+		}
+		uhead += get_uint16(&uhead[2]);
+	}
+
+	*tblpp = tbl;
+	return (DDI_SUCCESS);
+
+failed:
+	dmar_table_destroy(tbl);
+	return (DDI_FAILURE);
+}
+
+static char *
+scope_type(int devtype)
+{
+	char *typestr;
+
+	switch (devtype) {
+	case DMAR_ENDPOINT:
+		typestr = "endpoint-device";
+		break;
+	case DMAR_SUBTREE:
+		typestr = "subtree-device";
+		break;
+	case DMAR_IOAPIC:
+		typestr = "IOAPIC";
+		break;
+	case DMAR_HPET:
+		typestr = "HPET";
+		break;
+	default:
+		typestr = "Unknown device";
+		break;
+	}
+
+	return (typestr);
+}
+
+static void
+print_scope_list(list_t *scope_list)
+{
+	scope_t *scope;
+
+	if (list_is_empty(scope_list))
+		return;
+
+	ddi_err(DER_CONT, NULL, "\tdevice list:\n");
+
+	for (scope = list_head(scope_list); scope;
+	    scope = list_next(scope_list, scope)) {
+		ddi_err(DER_CONT, NULL, "\t\ttype = %s\n",
+		    scope_type(scope->scp_type));
+		ddi_err(DER_CONT, NULL, "\n\t\tbus = %d\n",
+		    scope->scp_bus);
+		ddi_err(DER_CONT, NULL, "\t\tdev = %d\n",
+		    scope->scp_dev);
+		ddi_err(DER_CONT, NULL, "\t\tfunc = %d\n",
+		    scope->scp_func);
+	}
+}
+
+static void
+print_drhd_list(list_t *drhd_list)
+{
+	drhd_t *drhd;
+
+	if (list_is_empty(drhd_list))
+		return;
+
+	ddi_err(DER_CONT, NULL, "\ndrhd list:\n");
+
+	for (drhd = list_head(drhd_list); drhd;
+	    drhd = list_next(drhd_list, drhd)) {
+
+		ddi_err(DER_CONT, NULL, "\n\tsegment = %d\n",
+		    drhd->dr_seg);
+		ddi_err(DER_CONT, NULL, "\treg_base = 0x%" PRIx64 "\n",
+		    drhd->dr_regs);
+		ddi_err(DER_CONT, NULL, "\tinclude_all = %s\n",
+		    drhd->dr_include_all == B_TRUE ? "TRUE" : "FALSE");
+		ddi_err(DER_CONT, NULL, "\tdip = 0x%p\n",
+		    (void *)drhd->dr_dip);
+
+		print_scope_list(&(drhd->dr_scope_list));
+	}
+}
+
+
+static void
+print_rmrr_list(list_t *rmrr_list)
+{
+	rmrr_t *rmrr;
+
+	if (list_is_empty(rmrr_list))
+		return;
+
+	ddi_err(DER_CONT, NULL, "\nrmrr list:\n");
+
+	for (rmrr = list_head(rmrr_list); rmrr;
+	    rmrr = list_next(rmrr_list, rmrr)) {
+
+		ddi_err(DER_CONT, NULL, "\n\tsegment = %d\n",
+		    rmrr->rm_seg);
+		ddi_err(DER_CONT, NULL, "\tbase = 0x%lx\n",
+		    rmrr->rm_base);
+		ddi_err(DER_CONT, NULL, "\tlimit = 0x%lx\n",
+		    rmrr->rm_limit);
+
+		print_scope_list(&(rmrr->rm_scope_list));
+	}
+}
+
+/*
+ * print DMAR table
+ */
+static void
+dmar_table_print(dmar_table_t *tbl)
+{
+	int i;
+
+	if (dmar_print == B_FALSE) {
+		return;
+	}
+
+	/* print the title */
+	ddi_err(DER_CONT, NULL, "#### Start of dmar_table ####\n");
+	ddi_err(DER_CONT, NULL, "\thaw = %d\n", tbl->tbl_haw);
+	ddi_err(DER_CONT, NULL, "\tintr_remap = %s\n",
+	    tbl->tbl_intrmap == B_TRUE ? "<true>" : "<false>");
+
+	/* print drhd list */
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		print_drhd_list(&(tbl->tbl_drhd_list[i]));
+	}
+
+
+	/* print rmrr list */
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		print_rmrr_list(&(tbl->tbl_rmrr_list[i]));
+	}
+
+	ddi_err(DER_CONT, NULL, "#### END of dmar_table ####\n");
+}
+
+static void
+drhd_devi_create(drhd_t *drhd, char *name)
+{
+	struct ddi_parent_private_data *pdptr;
+	struct regspec reg;
+	dev_info_t *dip;
+
+	ndi_devi_alloc_sleep(root_devinfo, name,
+	    DEVI_SID_NODEID, &dip);
+
+	drhd->dr_dip = dip;
+
+	reg.regspec_bustype = 0;
+	reg.regspec_addr = drhd->dr_regs;
+	reg.regspec_size = IMMU_REGSZ;
+
+	/*
+	 * update the reg properties
+	 *
+	 *   reg property will be used for register
+	 *   set access
+	 *
+	 * refer to the bus_map of root nexus driver
+	 * I/O or memory mapping:
+	 *
+	 * <bustype=0, addr=x, len=x>: memory
+	 * <bustype=1, addr=x, len=x>: i/o
+	 * <bustype>1, addr=0, len=x>: x86-compatibility i/o
+	 */
+	(void) ndi_prop_update_int_array(DDI_DEV_T_NONE,
+	    dip, "reg", (int *)&reg,
+	    sizeof (struct regspec) / sizeof (int));
+
+
+	pdptr = kmem_zalloc(sizeof (struct ddi_parent_private_data)
+	    + sizeof (struct regspec), KM_SLEEP);
+	pdptr->par_nreg = 1;
+	pdptr->par_reg = (struct regspec *)(pdptr + 1);
+	pdptr->par_reg->regspec_bustype = 0;
+	pdptr->par_reg->regspec_addr = drhd->dr_regs;
+	pdptr->par_reg->regspec_size = IMMU_REGSZ;
+	ddi_set_parent_data(dip, pdptr);
+}
+
+/*
+ * dmar_devinfos_create()
+ *
+ *   create a dev_info node in the device tree for each
+ *   DRHD unit; each node is a nexus child of the root
+ *   nexus
+ */
+static void
+dmar_devinfos_create(dmar_table_t *tbl)
+{
+	list_t *drhd_list;
+	drhd_t *drhd;
+	char name[IMMU_MAXNAMELEN];
+	int i, unit;
+
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+
+		drhd_list = &(tbl->tbl_drhd_list[i]);
+
+		if (list_is_empty(drhd_list))
+			continue;
+
+		drhd = list_head(drhd_list);
+		for (unit = 0; drhd;
+		    drhd = list_next(drhd_list, drhd), unit++) {
+			(void) snprintf(name, sizeof (name),
+			    "drhd%d,%d", i, unit);
+			drhd_devi_create(drhd, name);
+		}
+	}
+}
+
+static void
+drhd_devi_destroy(drhd_t *drhd)
+{
+	dev_info_t *dip;
+	int count;
+
+	dip = drhd->dr_dip;
+	ASSERT(dip);
+
+	ndi_devi_enter(root_devinfo, &count);
+	if (ndi_devi_offline(dip, NDI_DEVI_REMOVE) != DDI_SUCCESS) {
+		ddi_err(DER_WARN, dip, "Failed to destroy");
+	}
+	ndi_devi_exit(root_devinfo, count);
+	drhd->dr_dip = NULL;
+}
+
+/*
+ * dmar_devi_destroy()
+ *
+ * destroy dev_info nodes for all drhd units
+ */
+static void
+dmar_devi_destroy(dmar_table_t *tbl)
+{
+	drhd_t *drhd;
+	list_t *drhd_list;
+	int i;
+
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		drhd_list = &(tbl->tbl_drhd_list[i]);
+		if (list_is_empty(drhd_list))
+			continue;
+
+		drhd = list_head(drhd_list);
+		for (; drhd; drhd = list_next(drhd_list, drhd)) {
+			drhd_devi_destroy(drhd);
+		}
+	}
+}
+
+static int
+match_bdf(dev_info_t *ddip, void *arg)
+{
+	immu_arg_t *imarg = (immu_arg_t *)arg;
+	immu_devi_t *immu_devi;
+
+	ASSERT(ddip);
+	ASSERT(imarg);
+	ASSERT(imarg->ima_seg == 0);
+	ASSERT(imarg->ima_bus >= 0);
+	ASSERT(imarg->ima_devfunc >= 0);
+	ASSERT(imarg->ima_ddip == NULL);
+
+	/* rdip can be NULL */
+
+	mutex_enter(&(DEVI(ddip)->devi_lock));
+
+	immu_devi = IMMU_DEVI(ddip);
+	ASSERT(immu_devi);
+
+	if (immu_devi->imd_seg == imarg->ima_seg &&
+	    immu_devi->imd_bus == imarg->ima_bus &&
+	    immu_devi->imd_devfunc == imarg->ima_devfunc) {
+		imarg->ima_ddip = ddip;
+	}
+
+	mutex_exit(&(DEVI(ddip)->devi_lock));
+
+	return (imarg->ima_ddip ? DDI_WALK_TERMINATE : DDI_WALK_CONTINUE);
+}
+static void
+dmar_table_destroy(dmar_table_t *tbl)
+{
+	int i;
+
+	ASSERT(tbl);
+
+	/* destroy lists for DRHD and RMRR */
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		rmrr_list_destroy(&(tbl->tbl_rmrr_list[i]));
+		drhd_list_destroy(&(tbl->tbl_drhd_list[i]));
+	}
+
+	/* free strings */
+	kmem_free(tbl->tbl_oem_tblid, TBL_OEM_TBLID_SZ + 1);
+	kmem_free(tbl->tbl_oem_id, TBL_OEM_ID_SZ + 1);
+	tbl->tbl_raw = NULL; /* raw ACPI table doesn't have to be freed */
+	mutex_destroy(&(tbl->tbl_lock));
+	kmem_free(tbl, sizeof (dmar_table_t));
+}
+
+/*
+ * #########################################################################
+ * Functions exported by dmar.c
+ * This file deals with reading and processing the DMAR ACPI table
+ * #########################################################################
+ */
+
+/*
+ * immu_dmar_setup()
+ *	Check if the system has a DMAR ACPI table. If yes, the system
+ *	has Intel IOMMU hardware
+ */
+int
+immu_dmar_setup(void)
+{
+	if (AcpiGetTable("DMAR", 1, (ACPI_TABLE_HEADER **)&dmar_raw) != AE_OK) {
+		ddi_err(DER_LOG, NULL,
+		    "No DMAR ACPI table. No Intel IOMMU present\n");
+		dmar_raw = NULL;
+		return (DDI_FAILURE);
+	}
+	ASSERT(dmar_raw);
+	return (DDI_SUCCESS);
+}
+
+/*
+ * immu_dmar_parse()
+ *  Called by immu.c to parse and convert "raw" ACPI DMAR table
+ */
+int
+immu_dmar_parse(void)
+{
+	dmar_table_t *tbl = NULL;
+
+	/* we should already have found the "raw" table */
+	ASSERT(dmar_raw);
+
+	ddi_err(DER_CONT, NULL, "?Processing DMAR ACPI table\n");
+
+	dmar_table = NULL;
+
+	/*
+	 * parse DMAR ACPI table
+	 */
+	if (dmar_parse(&tbl, dmar_raw) != DDI_SUCCESS) {
+		ASSERT(tbl == NULL);
+		return (DDI_FAILURE);
+	}
+
+	ASSERT(tbl);
+
+	/*
+	 * create one devinfo for every drhd unit
+	 * in the DMAR table
+	 */
+	dmar_devinfos_create(tbl);
+
+	/*
+	 * print the dmar table if the debug option is set
+	 */
+	dmar_table_print(tbl);
+
+	dmar_table = tbl;
+
+	return (DDI_SUCCESS);
+}
+
+void
+immu_dmar_startup(void)
+{
+	/* nothing to do */
+}
+
+void
+immu_dmar_shutdown(void)
+{
+	/* nothing to do */
+}
+
+void
+immu_dmar_destroy(void)
+{
+	dmar_devi_destroy(dmar_table);
+	dmar_table_destroy(dmar_table);
+	ioapic_drhd_destroy();
+	dmar_table = NULL;
+	dmar_raw = NULL;
+}
+
+boolean_t
+immu_dmar_blacklisted(char **strptr, uint_t nstrs)
+{
+	dmar_table_t *tbl = dmar_table;
+	int i;
+	char oem_rev[IMMU_MAXNAMELEN];
+
+	ASSERT(tbl);
+
+	ASSERT((strptr == NULL) ^ (nstrs != 0));
+
+	/*
+	 * Must be a minimum of 4
+	 */
+	if (nstrs < 4) {
+		return (B_FALSE);
+	}
+
+	ddi_err(DER_CONT, NULL, "?System DMAR ACPI table information:\n");
+	ddi_err(DER_CONT, NULL, "?OEM-ID = <%s>\n", tbl->tbl_oem_id);
+	ddi_err(DER_CONT, NULL, "?Table-ID = <%s>\n", tbl->tbl_oem_tblid);
+	(void) snprintf(oem_rev, sizeof (oem_rev), "%d", tbl->tbl_oem_rev);
+	ddi_err(DER_CONT, NULL, "?Revision = <%s>\n", oem_rev);
+
+	for (i = 0; nstrs - i >= 4; i++) {
+		if (strcmp(strptr[0], "DMAR") == 0) {
+			/* empty table-id/revision entries match anything */
+			if (strcmp(strptr[1], tbl->tbl_oem_id) == 0 &&
+			    (strptr[2][0] == '\0' ||
+			    strcmp(strptr[2], tbl->tbl_oem_tblid) == 0) &&
+			    (strptr[3][0] == '\0' ||
+			    strcmp(strptr[3], oem_rev) == 0)) {
+				return (B_TRUE);
+			}
+			strptr += 4;
+			i += 3; /* for loop adds 1 as well, so only 3 here */
+		} else {
+			strptr++;
+		}
+	}
+	return (B_FALSE);
+}
+
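+/*
+ * immu_dmar_rmrr_map()
+ *	Map all RMRR (Reserved Memory Region Reporting) ranges for the
+ *	devices in their scope. RMRRs describe memory, typically BIOS
+ *	reserved (e.g. USB legacy-emulation buffers), that a device may
+ *	already be using for DMA when the OS takes over, so these ranges
+ *	must remain mapped.
+ */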
+void
+immu_dmar_rmrr_map(void)
+{
+	int seg;
+	int e;
+	int count;
+	dev_info_t *rdip;
+	scope_t *scope;
+	rmrr_t *rmrr;
+	dmar_table_t *tbl;
+
+	ASSERT(dmar_table);
+
+	tbl = dmar_table;
+
+	/* called during boot, when kernel is single threaded. No lock */
+
+	/*
+	 * for each segment, walk the rmrr list looking for an exact match
+	 */
+	for (seg = 0; seg < IMMU_MAXSEG; seg++) {
+		rmrr = list_head(&(tbl->tbl_rmrr_list)[seg]);
+		for (; rmrr; rmrr = list_next(&(tbl->tbl_rmrr_list)[seg],
+		    rmrr)) {
+
+			/*
+			 * try to match BDF *exactly* to a device scope.
+			 */
+			scope = list_head(&(rmrr->rm_scope_list));
+			for (; scope;
+			    scope = list_next(&(rmrr->rm_scope_list), scope)) {
+				immu_arg_t imarg = {0};
+				memrng_t mrng = {0};
+
+				/* PCI endpoint devices only */
+				if (scope->scp_type != DMAR_ENDPOINT)
+					continue;
+
+				imarg.ima_seg = seg;
+				imarg.ima_bus = scope->scp_bus;
+				imarg.ima_devfunc =
+				    IMMU_PCI_DEVFUNC(scope->scp_dev,
+				    scope->scp_func);
+				imarg.ima_ddip = NULL;
+				imarg.ima_rdip = NULL;
+
+				ASSERT(root_devinfo);
+				/* XXX should be optimized */
+				ndi_devi_enter(root_devinfo, &count);
+				ddi_walk_devs(ddi_get_child(root_devinfo),
+				    match_bdf, &imarg);
+				ndi_devi_exit(root_devinfo, count);
+
+				if (imarg.ima_ddip == NULL) {
+					ddi_err(DER_WARN, NULL,
+					    "No dip found for "
+					    "bus=0x%x, dev=0x%x, func= 0x%x",
+					    scope->scp_bus, scope->scp_dev,
+					    scope->scp_func);
+					continue;
+				}
+
+				rdip = imarg.ima_ddip;
+				/*
+				 * This address must be in the BIOS reserved
+				 * map
+				 */
+				if (!address_in_memlist(bios_rsvd,
+				    (uint64_t)rmrr->rm_base, rmrr->rm_limit -
+				    rmrr->rm_base + 1)) {
+					ddi_err(DER_WARN, rdip, "RMRR range "
+					    " [0x%" PRIx64 " - 0x%" PRIx64 "]"
+					    " is not in BIOS reserved map",
+					    rmrr->rm_base, rmrr->rm_limit);
+				}
+
+				/* XXX could be more efficient */
+				memlist_read_lock();
+				if (address_in_memlist(phys_install,
+				    (uint64_t)rmrr->rm_base, rmrr->rm_limit -
+				    rmrr->rm_base + 1)) {
+					ddi_err(DER_WARN, rdip, "RMRR range "
+					    " [0x%" PRIx64 " - 0x%" PRIx64 "]"
+					    " is in physinstall map",
+					    rmrr->rm_base, rmrr->rm_limit);
+				}
+				memlist_read_unlock();
+
+
+				ddi_err(DER_LOG, rdip,
+				    "IMMU: Mapping RMRR range "
+				    "[0x%" PRIx64 " - 0x%"PRIx64 "]",
+				    rmrr->rm_base, rmrr->rm_limit);
+
+				mrng.mrng_start =
+				    IMMU_ROUNDOWN((uintptr_t)rmrr->rm_base);
+				mrng.mrng_npages =
+				    IMMU_ROUNDUP((uintptr_t)rmrr->rm_limit -
+				    (uintptr_t)rmrr->rm_base + 1) /
+				    IMMU_PAGESIZE;
+				e = immu_dvma_map(NULL, NULL, &mrng, 0, rdip,
+				    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE |
+				    IMMU_FLAGS_MEMRNG);
+				/*
+				 * dip may have unity domain or xlate domain
+				 * If the former, PHYSICAL is returned else
+				 * MAPPED is returned.
+				 */
+				ASSERT(e == DDI_DMA_MAPPED ||
+				    e == DDI_DMA_USE_PHYSICAL);
+			}
+		}
+	}
+
+}
+
+immu_t *
+immu_dmar_get_immu(dev_info_t *rdip)
+{
+	int seg;
+	int tlevel;
+	int level;
+	drhd_t *drhd;
+	drhd_t *tdrhd;
+	scope_t *scope;
+	dmar_table_t *tbl;
+
+	ASSERT(dmar_table);
+
+	tbl = dmar_table;
+
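+	/*
+	 * DRHD units are matched to the device in decreasing order of
+	 * specificity: first an exact endpoint match on the device's own
+	 * BDF, then the closest ancestor covered by a subtree (bridge)
+	 * scope, and finally the segment's catch-all INCLUDE_ALL unit.
+	 */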
+	mutex_enter(&(tbl->tbl_lock));
+
+	/*
+	 * for each segment, walk the drhd list looking for an exact match
+	 */
+	for (seg = 0; seg < IMMU_MAXSEG; seg++) {
+		drhd = list_head(&(tbl->tbl_drhd_list)[seg]);
+		for (; drhd; drhd = list_next(&(tbl->tbl_drhd_list)[seg],
+		    drhd)) {
+
+			/*
+			 * we are currently searching for exact matches so
+			 * skip "include all" (catchall) and subtree matches
+			 */
+			if (drhd->dr_include_all == B_TRUE)
+				continue;
+
+			/*
+			 * try to match BDF *exactly* to a device scope.
+			 */
+			scope = list_head(&(drhd->dr_scope_list));
+			for (; scope;
+			    scope = list_next(&(drhd->dr_scope_list), scope)) {
+				immu_arg_t imarg = {0};
+
+				/* PCI endpoint devices only */
+				if (scope->scp_type != DMAR_ENDPOINT)
+					continue;
+
+				imarg.ima_seg = seg;
+				imarg.ima_bus = scope->scp_bus;
+				imarg.ima_devfunc =
+				    IMMU_PCI_DEVFUNC(scope->scp_dev,
+				    scope->scp_func);
+				imarg.ima_ddip = NULL;
+				imarg.ima_rdip = rdip;
+				level = 0;
+				if (immu_walk_ancestor(rdip, NULL, match_bdf,
+				    &imarg, &level, IMMU_FLAGS_DONTPASS)
+				    != DDI_SUCCESS) {
+					/* skip - nothing else we can do */
+					continue;
+				}
+
+				/* Should have walked only 1 level i.e. rdip */
+				ASSERT(level == 1);
+
+				if (imarg.ima_ddip) {
+					ASSERT(imarg.ima_ddip == rdip);
+					goto found;
+				}
+			}
+		}
+	}
+
+	/*
+	 * walk the drhd list looking for subtree match
+	 * i.e. is the device a descendant of a devscope BDF.
+	 * We want the lowest subtree.
+	 */
+	tdrhd = NULL;
+	tlevel = 0;
+	for (seg = 0; seg < IMMU_MAXSEG; seg++) {
+		drhd = list_head(&(tbl->tbl_drhd_list)[seg]);
+		for (; drhd; drhd = list_next(&(tbl->tbl_drhd_list)[seg],
+		    drhd)) {
+
+			/* looking for subtree match */
+			if (drhd->dr_include_all == B_TRUE)
+				continue;
+
+			/*
+			 * try to match the device scope
+			 */
+			scope = list_head(&(drhd->dr_scope_list));
+			for (; scope;
+			    scope = list_next(&(drhd->dr_scope_list), scope)) {
+				immu_arg_t imarg = {0};
+
+				/* PCI subtree only */
+				if (scope->scp_type != DMAR_SUBTREE)
+					continue;
+
+				imarg.ima_seg = seg;
+				imarg.ima_bus = scope->scp_bus;
+				imarg.ima_devfunc =
+				    IMMU_PCI_DEVFUNC(scope->scp_dev,
+				    scope->scp_func);
+
+				imarg.ima_ddip = NULL;
+				imarg.ima_rdip = rdip;
+				level = 0;
+				if (immu_walk_ancestor(rdip, NULL, match_bdf,
+				    &imarg, &level, 0) != DDI_SUCCESS) {
+					/* skip - nothing else we can do */
+					continue;
+				}
+
+				/* should have walked 1 level i.e. rdip */
+				ASSERT(level > 0);
+
+				/* look for lowest ancestor matching drhd */
+				if (imarg.ima_ddip && (tdrhd == NULL ||
+				    level < tlevel)) {
+					tdrhd = drhd;
+					tlevel = level;
+				}
+			}
+		}
+	}
+
+	if ((drhd = tdrhd) != NULL) {
+		goto found;
+	}
+
+	for (seg = 0; seg < IMMU_MAXSEG; seg++) {
+		drhd = list_head(&(tbl->tbl_drhd_list[seg]));
+		for (; drhd; drhd = list_next(&(tbl->tbl_drhd_list)[seg],
+		    drhd)) {
+			/* Look for include all */
+			if (drhd->dr_include_all == B_TRUE) {
+				break;
+			}
+		}
+	}
+
+	/*FALLTHRU*/
+
+found:
+	mutex_exit(&(tbl->tbl_lock));
+
+	/*
+	 * No drhd (dmar unit) found for this device in the ACPI DMAR tables.
+	 * This may happen with buggy versions of BIOSes. Just warn instead
+	 * of panic as we don't want whole system to go down because of one
+	 * device.
+	 */
+	if (drhd == NULL) {
+		ddi_err(DER_WARN, rdip, "can't find Intel IOMMU unit for "
+		    "device in ACPI DMAR table.");
+		return (NULL);
+	}
+
+	return (drhd->dr_immu);
+}
+
+char *
+immu_dmar_unit_name(void *dmar_unit)
+{
+	drhd_t *drhd = (drhd_t *)dmar_unit;
+
+	ASSERT(drhd->dr_dip);
+	return (ddi_node_name(drhd->dr_dip));
+}
+
+dev_info_t *
+immu_dmar_unit_dip(void *dmar_unit)
+{
+	drhd_t *drhd = (drhd_t *)dmar_unit;
+	return (drhd->dr_dip);
+}
+
+void *
+immu_dmar_walk_units(int seg, void *dmar_unit)
+{
+	list_t *drhd_list;
+	drhd_t *drhd = (drhd_t *)dmar_unit;
+
+	drhd_list = &(dmar_table->tbl_drhd_list[seg]);
+
+	if (drhd == NULL) {
+		return ((void *)list_head(drhd_list));
+	} else {
+		return ((void *)list_next(drhd_list, drhd));
+	}
+}
+
+void
+immu_dmar_set_immu(void *dmar_unit, immu_t *immu)
+{
+	drhd_t *drhd = (drhd_t *)dmar_unit;
+
+	ASSERT(drhd);
+	ASSERT(immu);
+
+	drhd->dr_immu = immu;
+}
+
+boolean_t
+immu_dmar_intrmap_supported(void)
+{
+	ASSERT(dmar_table);
+	return (dmar_table->tbl_intrmap);
+}
+
+/* for a given ioapicid, find the source id and immu */
+uint16_t
+immu_dmar_ioapic_sid(int ioapicid)
+{
+	ioapic_drhd_t *idt;
+
+	idt = ioapic_drhd_lookup(ioapicid);
+	if (idt == NULL) {
+		ddi_err(DER_PANIC, NULL, "cannot determine source-id for "
+		    "IOAPIC (id = %d)", ioapicid);
+		/*NOTREACHED*/
+	}
+
+	return (idt->ioapic_sid);
+}
+
+/* for a given ioapicid, find the source id and immu */
+immu_t *
+immu_dmar_ioapic_immu(int ioapicid)
+{
+	ioapic_drhd_t *idt;
+
+	idt = ioapic_drhd_lookup(ioapicid);
+	if (idt) {
+		return (idt->ioapic_drhd ? idt->ioapic_drhd->dr_immu : NULL);
+	}
+	return (NULL);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu_dvma.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,3190 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+/*
+ * DVMA code
+ * This file contains Intel IOMMU code that deals with DVMA
+ * i.e. DMA remapping.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/pcie.h>
+#include <sys/pci_cfgspace.h>
+#include <vm/hat_i86.h>
+#include <sys/memlist.h>
+#include <sys/acpi/acpi.h>
+#include <sys/acpica.h>
+#include <sys/modhash.h>
+#include <sys/immu.h>
+
+#undef	TEST
+
+/*
+ * Macros based on PCI spec
+ */
+#define	IMMU_PCI_REV2CLASS(r)   ((r) >> 8)  /* classcode from revid */
+#define	IMMU_PCI_CLASS2BASE(c)  ((c) >> 16) /* baseclass from classcode */
+#define	IMMU_PCI_CLASS2SUB(c)   (((c) >> 8) & 0xff) /* subclass */
+
+#define	IMMU_CONTIG_PADDR(d, p) \
+	((d).dck_paddr && ((d).dck_paddr + IMMU_PAGESIZE) == (p))
+
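+/*
+ * State passed to immu_walk_ancestor() callbacks (e.g. match_lpc() and
+ * get_branch_domain()) while walking a device's ancestor devinfo nodes.
+ */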
+typedef struct dvma_arg {
+	immu_t *dva_immu;
+	dev_info_t *dva_rdip;
+	dev_info_t *dva_ddip;
+	domain_t *dva_domain;
+	int dva_level;
+	immu_flags_t dva_flags;
+	list_t *dva_list;
+	int dva_error;
+} dvma_arg_t;
+
+static domain_t *domain_create(immu_t *immu, dev_info_t *ddip,
+    dev_info_t *rdip, immu_flags_t immu_flags);
+static immu_devi_t *create_immu_devi(dev_info_t *rdip, int bus,
+    int dev, int func, immu_flags_t immu_flags);
+static void destroy_immu_devi(immu_devi_t *immu_devi);
+static void dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma,
+    uint64_t spaddr, uint64_t npages, dev_info_t *rdip,
+    immu_flags_t immu_flags);
+extern struct memlist  *phys_install;
+
+
+
+/* static Globals */
+
+/*
+ * Used to setup DMA objects (memory regions)
+ * for DMA reads by IOMMU units
+ */
+static ddi_dma_attr_t immu_dma_attr = {
+	DMA_ATTR_V0,
+	0U,			/* dma_attr_addr_lo */
+	0xffffffffU,		/* dma_attr_addr_hi */
+	0xffffffffU,		/* dma_attr_count_max */
+	MMU_PAGESIZE,		/* dma_attr_align: MMU page aligned */
+	0x1,			/* dma_attr_burstsizes */
+	0x1,			/* dma_attr_minxfer */
+	0xffffffffU,		/* dma_attr_maxxfer */
+	0xffffffffU,		/* dma_attr_seg */
+	1,			/* dma_attr_sgllen */
+	4,			/* dma_attr_granular */
+	0			/* dma_attr_flags */
+};
+
+static ddi_device_acc_attr_t immu_acc_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,
+	DDI_STRICTORDER_ACC
+};
+
+
+/* globals private to this file */
+static kmutex_t immu_domain_lock;
+static list_t immu_unity_domain_list;
+static list_t immu_xlate_domain_list;
+
+/* structure used to store idx into each level of the page tables */
+typedef struct xlate {
+	int xlt_level;
+	uint_t xlt_idx;
+	pgtable_t *xlt_pgtable;
+} xlate_t;
+
+/* 0 is reserved by Vt-d spec. Solaris reserves 1 */
+#define	IMMU_UNITY_DID   1
+
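+/*
+ * bdf_domain_hash caches the domain for a PCI function, keyed by a packed
+ * seg/bus/dev/func value: (seg << 16) | (bus << 8) | devfunc, where devfunc
+ * is IMMU_PCI_DEVFUNC(dev, func) - presumably (dev << 3) | func, following
+ * the usual PCI encoding. For example (illustrative only), seg 0, bus 0x3,
+ * dev 0x1f, func 2 packs to the key 0x3fa.
+ */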
+static mod_hash_t *bdf_domain_hash;
+
+static domain_t *
+bdf_domain_lookup(immu_devi_t *immu_devi)
+{
+	domain_t *domain;
+	int16_t seg = immu_devi->imd_seg;
+	int16_t bus = immu_devi->imd_bus;
+	int16_t devfunc = immu_devi->imd_devfunc;
+	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
+
+	if (seg < 0 || bus < 0 || devfunc < 0) {
+		return (NULL);
+	}
+
+	domain = NULL;
+	if (mod_hash_find(bdf_domain_hash,
+	    (void *)bdf, (void *)&domain) == 0) {
+		ASSERT(domain);
+		ASSERT(domain->dom_did > 0);
+		return (domain);
+	} else {
+		return (NULL);
+	}
+}
+
+static void
+bdf_domain_insert(immu_devi_t *immu_devi, domain_t *domain)
+{
+	int16_t seg = immu_devi->imd_seg;
+	int16_t bus = immu_devi->imd_bus;
+	int16_t devfunc = immu_devi->imd_devfunc;
+	uintptr_t bdf = (seg << 16 | bus << 8 | devfunc);
+	int r;
+
+	if (seg < 0 || bus < 0 || devfunc < 0) {
+		return;
+	}
+
+	r = mod_hash_insert(bdf_domain_hash, (void *)bdf, (void *)domain);
+	ASSERT(r != MH_ERR_DUPLICATE);
+	ASSERT(r == 0);
+}
+
+static int
+match_lpc(dev_info_t *pdip, void *arg)
+{
+	immu_devi_t *immu_devi;
+	dvma_arg_t *dvap = (dvma_arg_t *)arg;
+
+	ASSERT(dvap->dva_error == DDI_FAILURE);
+	ASSERT(dvap->dva_ddip == NULL);
+	ASSERT(dvap->dva_list);
+
+	if (list_is_empty(dvap->dva_list)) {
+		return (DDI_WALK_TERMINATE);
+	}
+
+	immu_devi = list_head(dvap->dva_list);
+	for (; immu_devi; immu_devi = list_next(dvap->dva_list,
+	    immu_devi)) {
+		ASSERT(immu_devi->imd_dip);
+		if (immu_devi->imd_dip == pdip) {
+			dvap->dva_ddip = pdip;
+			dvap->dva_error = DDI_SUCCESS;
+			return (DDI_WALK_TERMINATE);
+		}
+	}
+
+	return (DDI_WALK_CONTINUE);
+}
+
+static void
+immu_devi_set_spclist(dev_info_t *dip, immu_t *immu)
+{
+	list_t *spclist = NULL;
+	immu_devi_t *immu_devi;
+
+	ASSERT(MUTEX_HELD(&(DEVI(dip)->devi_lock)));
+
+	immu_devi = IMMU_DEVI(dip);
+	if (immu_devi->imd_display == B_TRUE) {
+		spclist = &(immu->immu_dvma_gfx_list);
+	} else if (immu_devi->imd_lpc == B_TRUE) {
+		spclist = &(immu->immu_dvma_lpc_list);
+	}
+
+	if (spclist) {
+		mutex_enter(&(immu->immu_lock));
+		list_insert_head(spclist, immu_devi);
+		mutex_exit(&(immu->immu_lock));
+	}
+}
+
+/*
+ * Set the immu_devi struct in the immu_devi field of a devinfo node
+ */
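+/*
+ * The set is idempotent: if another thread installs an immu_devi first
+ * (checked under devi_lock below), the newly allocated one is freed.
+ */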
+int
+immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags)
+{
+	int bus, dev, func;
+	immu_devi_t *new_imd;
+	immu_devi_t *immu_devi;
+
+	ASSERT(root_devinfo);
+	ASSERT(dip);
+	ASSERT(dip != root_devinfo);
+
+	immu_devi = immu_devi_get(dip);
+	if (immu_devi != NULL) {
+		return (DDI_SUCCESS);
+	}
+
+	bus = dev = func = -1;
+
+	/*
+	 * Assume a new immu_devi struct is needed
+	 */
+	if (!DEVI_IS_PCI(dip) || acpica_get_bdf(dip, &bus, &dev, &func) != 0) {
+		/*
+		 * No BDF. Set bus = -1 to indicate this.
+		 * We still need to create an immu_devi struct,
+		 * though.
+		 */
+		bus = -1;
+		dev = 0;
+		func = 0;
+	}
+
+	new_imd = create_immu_devi(dip, bus, dev, func, immu_flags);
+	if (new_imd  == NULL) {
+		ddi_err(DER_WARN, dip, "Failed to create immu_devi "
+		    "structure");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Check if some other thread allocated an immu_devi while we
+	 * didn't own the lock.
+	 */
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	if (IMMU_DEVI(dip) == NULL) {
+		IMMU_DEVI_SET(dip, new_imd);
+	} else {
+		destroy_immu_devi(new_imd);
+	}
+	mutex_exit(&(DEVI(dip)->devi_lock));
+
+	return (DDI_SUCCESS);
+}
+
+static dev_info_t *
+get_lpc_devinfo(immu_t *immu, dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	dvma_arg_t dvarg = {0};
+	dvarg.dva_list = &(immu->immu_dvma_lpc_list);
+	dvarg.dva_rdip = rdip;
+	dvarg.dva_error = DDI_FAILURE;
+
+	if (immu_walk_ancestor(rdip, NULL, match_lpc,
+	    &dvarg, NULL, immu_flags) != DDI_SUCCESS) {
+		ddi_err(DER_MODE, rdip, "Could not walk ancestors to "
+		    "find lpc_devinfo for ISA device");
+		return (NULL);
+	}
+
+	if (dvarg.dva_error != DDI_SUCCESS || dvarg.dva_ddip == NULL) {
+		ddi_err(DER_MODE, rdip, "Could not find lpc_devinfo for "
+		    "ISA device");
+		return (NULL);
+	}
+
+	return (dvarg.dva_ddip);
+}
+
+static dev_info_t *
+get_gfx_devinfo(dev_info_t *rdip)
+{
+	immu_t *immu;
+	immu_devi_t *immu_devi;
+	list_t *list_gfx;
+
+	/*
+	 * The GFX device may not be on the same IMMU unit as "agpgart"
+	 * so search globally
+	 */
+	immu_devi = NULL;
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+		list_gfx = &(immu->immu_dvma_gfx_list);
+		if (!list_is_empty(list_gfx)) {
+			immu_devi = list_head(list_gfx);
+			break;
+		}
+	}
+
+	if (immu_devi == NULL) {
+		ddi_err(DER_WARN, rdip, "IMMU: No GFX device. "
+		    "Cannot redirect agpgart");
+		return (NULL);
+	}
+
+	/* list is not empty we checked above */
+	ASSERT(immu_devi);
+	ASSERT(immu_devi->imd_dip);
+
+	ddi_err(DER_LOG, rdip, "IMMU: GFX redirect to %s",
+	    ddi_node_name(immu_devi->imd_dip));
+
+	return (immu_devi->imd_dip);
+}
+
+static immu_flags_t
+dma_to_immu_flags(struct ddi_dma_req *dmareq)
+{
+	immu_flags_t flags = 0;
+
+	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
+		flags |= IMMU_FLAGS_SLEEP;
+	} else {
+		flags |= IMMU_FLAGS_NOSLEEP;
+	}
+
+	/*
+	 * Read and write flags need to be reversed.
+	 * DMA_READ means read from device and write
+	 * to memory. So DMA read means DVMA write.
+	 */
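+	/*
+	 * For example, a NIC receiving a packet into host memory is a
+	 * DDI_DMA_READ transfer from the driver's point of view, but the
+	 * device performs writes, so the DVMA mapping needs write permission.
+	 */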
+	if (dmareq->dmar_flags & DDI_DMA_READ)
+		flags |= IMMU_FLAGS_WRITE;
+
+	if (dmareq->dmar_flags & DDI_DMA_WRITE)
+		flags |= IMMU_FLAGS_READ;
+
+#ifdef BUGGY_DRIVERS
+	/*
+	 * Some buggy drivers specify neither READ nor WRITE.
+	 * For such drivers, set both read and write permissions.
+	 */
+	if ((dmareq->dmar_flags & (DDI_DMA_READ | DDI_DMA_WRITE)) == 0) {
+		flags |= (IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
+	}
+#endif
+
+	return (flags);
+}
+
+/*
+ * pgtable_alloc()
+ *	alloc an IOMMU pgtable structure.
+ *	The same struct is used for root and context tables as well.
+ *	This routine allocates the following:
+ *	- a pgtable_t struct
+ *	- a HW page which holds the PTEs/entries accessed by HW,
+ *	  so we set up DMA for this page
+ *	- a SW page which is only for our bookkeeping
+ *	  (for example, to hold pointers to the next level pgtable),
+ *	  so a simple kmem_alloc suffices
+ */
+static pgtable_t *
+pgtable_alloc(immu_t *immu, domain_t *domain, immu_flags_t immu_flags)
+{
+	size_t actual_size = 0;
+	pgtable_t *pgtable;
+	int (*dmafp)(caddr_t);
+	caddr_t vaddr;
+	int kmflags;
+
+	/* TODO: cache freed pgtables as they are expensive to create */
+	ASSERT(immu);
+
+	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
+	    KM_NOSLEEP : KM_SLEEP;
+
+	dmafp = (immu_flags & IMMU_FLAGS_NOSLEEP) ?
+	    DDI_DMA_DONTWAIT : DDI_DMA_SLEEP;
+
+	pgtable = kmem_zalloc(sizeof (pgtable_t), kmflags);
+	if (pgtable == NULL) {
+		return (NULL);
+	}
+
+	pgtable->swpg_next_array = kmem_zalloc(IMMU_PAGESIZE, kmflags);
+	if (pgtable->swpg_next_array == NULL) {
+		kmem_free(pgtable, sizeof (pgtable_t));
+		return (NULL);
+	}
+
+	ASSERT(root_devinfo);
+	if (ddi_dma_alloc_handle(root_devinfo, &immu_dma_attr,
+	    dmafp, NULL, &pgtable->hwpg_dmahdl) != DDI_SUCCESS) {
+		kmem_free(pgtable->swpg_next_array, IMMU_PAGESIZE);
+		kmem_free(pgtable, sizeof (pgtable_t));
+		return (NULL);
+	}
+
+	if (ddi_dma_mem_alloc(pgtable->hwpg_dmahdl, IMMU_PAGESIZE,
+	    &immu_acc_attr, DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
+	    dmafp, NULL, &vaddr, &actual_size,
+	    &pgtable->hwpg_memhdl) != DDI_SUCCESS) {
+		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
+		kmem_free((void *)(pgtable->swpg_next_array),
+		    IMMU_PAGESIZE);
+		kmem_free(pgtable, sizeof (pgtable_t));
+		return (NULL);
+	}
+
+	/*
+	 * We asked for a full IMMU page but got less: treat it as an
+	 * allocation failure. This may be a temporary condition, so
+	 * return an error rather than panic and let the caller try again.
+	 */
+	if (actual_size < IMMU_PAGESIZE) {
+		ddi_dma_mem_free(&pgtable->hwpg_memhdl);
+		ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
+		kmem_free((void *)(pgtable->swpg_next_array),
+		    IMMU_PAGESIZE);
+		kmem_free(pgtable, sizeof (pgtable_t));
+		return (NULL);
+	}
+
+	pgtable->hwpg_paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
+	pgtable->hwpg_vaddr = vaddr;
+
+	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
+
+	/* Use immu directly; domain may be NULL, so can't use dom_immu field */
+	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
+
+	rw_init(&(pgtable->swpg_rwlock), NULL, RW_DEFAULT, NULL);
+
+	if (domain) {
+		rw_enter(&(domain->dom_pgtable_rwlock), RW_WRITER);
+		list_insert_head(&(domain->dom_pglist), pgtable);
+		rw_exit(&(domain->dom_pgtable_rwlock));
+	}
+
+	return (pgtable);
+}
+
+static void
+pgtable_free(immu_t *immu, pgtable_t *pgtable, domain_t *domain)
+{
+	ASSERT(immu);
+	ASSERT(pgtable);
+
+	if (domain) {
+		rw_enter(&(domain->dom_pgtable_rwlock), RW_WRITER);
+		list_remove(&(domain->dom_pglist), pgtable);
+		rw_exit(&(domain->dom_pgtable_rwlock));
+	}
+
+	/* destroy will panic if lock is held. */
+	rw_destroy(&(pgtable->swpg_rwlock));
+
+	/* Zero out the HW page being freed to catch errors */
+	bzero(pgtable->hwpg_vaddr, IMMU_PAGESIZE);
+	immu_regs_cpu_flush(immu, pgtable->hwpg_vaddr, IMMU_PAGESIZE);
+	ddi_dma_mem_free(&pgtable->hwpg_memhdl);
+	ddi_dma_free_handle(&pgtable->hwpg_dmahdl);
+	/* don't zero out the soft pages for debugging */
+	if (pgtable->swpg_next_array)
+		kmem_free((void *)(pgtable->swpg_next_array), IMMU_PAGESIZE);
+	kmem_free(pgtable, sizeof (pgtable_t));
+}
+
+/*
+ * Function to identify a display device from the PCI class code
+ */
+static boolean_t
+device_is_display(uint_t classcode)
+{
+	static uint_t disp_classes[] = {
+		0x000100,	/* pre-classcode VGA device */
+		0x030000,	/* VGA-compatible controller */
+		0x030001	/* 8514-compatible controller */
+	};
+	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
+
+	for (i = 0; i < nclasses; i++) {
+		if (classcode == disp_classes[i])
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Function that determines if device is PCIEX and/or PCIEX bridge
+ */
+static boolean_t
+device_is_pciex(
+	uchar_t bus, uchar_t dev, uchar_t func, boolean_t *is_pcib)
+{
+	ushort_t cap;
+	ushort_t capsp;
+	ushort_t cap_count = PCI_CAP_MAX_PTR;
+	ushort_t status;
+	boolean_t is_pciex = B_FALSE;
+
+	*is_pcib = B_FALSE;
+
+	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
+	if (!(status & PCI_STAT_CAP))
+		return (B_FALSE);
+
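+	/*
+	 * Walk the PCI capability list looking for the PCI Express
+	 * capability. cap_count bounds the walk so that a corrupt
+	 * capability chain cannot loop forever.
+	 */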
+	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
+	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
+		capsp &= PCI_CAP_PTR_MASK;
+		cap = pci_getb_func(bus, dev, func, capsp);
+
+		if (cap == PCI_CAP_ID_PCI_E) {
+			status = pci_getw_func(bus, dev, func, capsp + 2);
+			/*
+			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
+			 * for Device/Port Type.
+			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
+			 * device is a PCIE2PCI bridge
+			 */
+			*is_pcib =
+			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
+			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? B_TRUE : B_FALSE;
+			is_pciex = B_TRUE;
+		}
+
+		capsp = (*pci_getb_func)(bus, dev, func,
+		    capsp + PCI_CAP_NEXT_PTR);
+	}
+
+	return (is_pciex);
+}
+
+
+/*
+ * immu_dvma_get_immu()
+ *   get the immu unit structure for a dev_info node
+ */
+immu_t *
+immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags)
+{
+	immu_devi_t *immu_devi;
+	immu_t *immu;
+
+	/*
+	 * Check if the immu unit was already found earlier.
+	 * If so, it will be stashed in the immu_devi struct.
+	 */
+	immu_devi = immu_devi_get(dip);
+	if (immu_devi == NULL) {
+		if (immu_devi_set(dip, immu_flags) != DDI_SUCCESS) {
+			/*
+			 * May fail because of low memory. Return an error
+			 * rather than panic so the driver can try again later.
+			 */
+			ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
+			    "No immu_devi structure");
+			/*NOTREACHED*/
+		}
+		immu_devi = immu_devi_get(dip);
+		ASSERT(immu_devi);
+	}
+
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	if (immu_devi->imd_immu) {
+		immu = immu_devi->imd_immu;
+		mutex_exit(&(DEVI(dip)->devi_lock));
+		return (immu);
+	}
+	mutex_exit(&(DEVI(dip)->devi_lock));
+
+	immu = immu_dmar_get_immu(dip);
+	if (immu == NULL) {
+		ddi_err(DER_PANIC, dip, "immu_dvma_get_immu: "
+		    "Cannot find immu_t for device");
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * Check if some other thread found immu
+	 * while lock was not held
+	 */
+	immu_devi = immu_devi_get(dip);
+	/* immu_devi should be present as we found it earlier */
+	if (immu_devi == NULL) {
+		ddi_err(DER_PANIC, dip,
+		    "immu_dvma_get_immu: No immu_devi structure");
+		/*NOTREACHED*/
+	}
+
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	if (immu_devi->imd_immu == NULL) {
+		/* nobody else set it, so we should do it */
+		immu_devi->imd_immu = immu;
+		immu_devi_set_spclist(dip, immu);
+	} else {
+		/*
+		 * if some other thread got immu before
+		 * us, it should get the same results
+		 */
+		if (immu_devi->imd_immu != immu) {
+			ddi_err(DER_PANIC, dip, "Multiple "
+			    "immu units found for device. Expected (%p), "
+			    "actual (%p)", (void *)immu,
+			    (void *)immu_devi->imd_immu);
+			mutex_exit(&(DEVI(dip)->devi_lock));
+			/*NOTREACHED*/
+		}
+	}
+	mutex_exit(&(DEVI(dip)->devi_lock));
+
+	return (immu);
+}
+
+
+/* ############################# IMMU_DEVI code ############################ */
+
+/*
+ * Allocate a immu_devi structure and initialize it
+ */
+static immu_devi_t *
+create_immu_devi(dev_info_t *rdip, int bus, int dev, int func,
+    immu_flags_t immu_flags)
+{
+	uchar_t baseclass, subclass;
+	uint_t classcode, revclass;
+	immu_devi_t *immu_devi;
+	boolean_t pciex = B_FALSE;
+	int kmflags;
+	boolean_t is_pcib = B_FALSE;
+
+	/* bus == -1 indicates a non-PCI device (no BDF) */
+	ASSERT(bus == -1 || bus >= 0);
+	ASSERT(dev >= 0);
+	ASSERT(func >= 0);
+
+	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+	immu_devi = kmem_zalloc(sizeof (immu_devi_t), kmflags);
+	if (immu_devi == NULL) {
+		ddi_err(DER_WARN, rdip, "Failed to allocate memory for "
+		    "Intel IOMMU immu_devi structure");
+		return (NULL);
+	}
+	immu_devi->imd_dip = rdip;
+	immu_devi->imd_seg = 0; /* Currently seg can only be 0 */
+	immu_devi->imd_bus = bus;
+	immu_devi->imd_pcib_type = IMMU_PCIB_BAD;
+
+	if (bus == -1) {
+		immu_devi->imd_pcib_type = IMMU_PCIB_NOBDF;
+		return (immu_devi);
+	}
+
+	immu_devi->imd_devfunc = IMMU_PCI_DEVFUNC(dev, func);
+	immu_devi->imd_sec = 0;
+	immu_devi->imd_sub = 0;
+
+	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
+
+	classcode = IMMU_PCI_REV2CLASS(revclass);
+	baseclass = IMMU_PCI_CLASS2BASE(classcode);
+	subclass = IMMU_PCI_CLASS2SUB(classcode);
+
+	if (baseclass == PCI_CLASS_BRIDGE && subclass == PCI_BRIDGE_PCI) {
+
+		immu_devi->imd_sec = pci_getb_func(bus, dev, func,
+		    PCI_BCNF_SECBUS);
+		immu_devi->imd_sub = pci_getb_func(bus, dev, func,
+		    PCI_BCNF_SUBBUS);
+
+		pciex = device_is_pciex(bus, dev, func, &is_pcib);
+		if (pciex  == B_TRUE && is_pcib == B_TRUE) {
+			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCI;
+		} else if (pciex == B_TRUE) {
+			immu_devi->imd_pcib_type = IMMU_PCIB_PCIE_PCIE;
+		} else {
+			immu_devi->imd_pcib_type = IMMU_PCIB_PCI_PCI;
+		}
+	} else {
+		immu_devi->imd_pcib_type = IMMU_PCIB_ENDPOINT;
+	}
+
+	/* check for certain special devices */
+	immu_devi->imd_display = device_is_display(classcode);
+
+	immu_devi->imd_lpc = ((baseclass == PCI_CLASS_BRIDGE) &&
+	    (subclass == PCI_BRIDGE_ISA)) ? B_TRUE : B_FALSE;
+
+	immu_devi->imd_domain = NULL;
+
+	return (immu_devi);
+}
+
+static void
+destroy_immu_devi(immu_devi_t *immu_devi)
+{
+	kmem_free(immu_devi, sizeof (immu_devi_t));
+}
+
+static domain_t *
+immu_devi_domain(dev_info_t *rdip, dev_info_t **ddipp)
+{
+	immu_devi_t *immu_devi;
+	domain_t *domain;
+	dev_info_t *ddip;
+
+	ASSERT(rdip);
+	ASSERT(ddipp);
+
+	*ddipp = NULL;
+
+	immu_devi = immu_devi_get(rdip);
+	if (immu_devi == NULL) {
+		return (NULL);
+	}
+
+	mutex_enter(&(DEVI(rdip)->devi_lock));
+	domain = immu_devi->imd_domain;
+	ddip = immu_devi->imd_ddip;
+	mutex_exit(&(DEVI(rdip)->devi_lock));
+
+	if (domain) {
+		ASSERT(domain->dom_did > 0);
+		ASSERT(ddip);
+		*ddipp = ddip;
+	}
+
+	return (domain);
+
+}
+
+/* ############################# END IMMU_DEVI code ######################## */
+/* ############################# DOMAIN code ############################### */
+
+/*
+ * This routine always succeeds
+ */
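+/*
+ * If the domainid arena is exhausted, the id of the unity domain is
+ * returned instead, so callers never see a domain-id of 0.
+ */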
+static int
+did_alloc(immu_t *immu, dev_info_t *rdip,
+    dev_info_t *ddip, immu_flags_t immu_flags)
+{
+	int did;
+
+	ASSERT(immu);
+	ASSERT(rdip);
+	ASSERT(rdip != root_devinfo);
+
+	did = (uintptr_t)vmem_alloc(immu->immu_did_arena, 1,
+	    (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP);
+
+	if (did == 0) {
+		ASSERT(immu->immu_unity_domain);
+		ASSERT(immu->immu_unity_domain->dom_did > 0);
+		ddi_err(DER_WARN, rdip, "device domain-id alloc error"
+		    " domain-device: %s%d. immu unit is %s. Using "
+		    "unity domain with domain-id (%d)",
+		    ddi_driver_name(ddip), ddi_get_instance(ddip),
+		    immu->immu_name, immu->immu_unity_domain->dom_did);
+		did = immu->immu_unity_domain->dom_did;
+	}
+
+	return (did);
+}
+
+static int
+get_branch_domain(dev_info_t *pdip, void *arg)
+{
+	immu_devi_t *immu_devi;
+	domain_t *domain;
+	dev_info_t *ddip;
+	immu_t *immu;
+	dvma_arg_t *dvp = (dvma_arg_t *)arg;
+
+	ASSERT(pdip);
+	ASSERT(dvp);
+	ASSERT(dvp->dva_rdip);
+
+	/*
+	 * The field dvp->dva_ddip is a work-in-progress
+	 * and gets updated as we walk up the ancestor
+	 * tree. The final ddip is set only when we reach
+	 * the top of the tree. So the dvp->dva_ddip field cannot
+	 * be relied on until the walk completes.
+	 */
+
+	/* immu_devi may not be set. */
+	immu_devi = immu_devi_get(pdip);
+	if (immu_devi == NULL) {
+		if (immu_devi_set(pdip, dvp->dva_flags) != DDI_SUCCESS) {
+			dvp->dva_error = DDI_FAILURE;
+			return (DDI_WALK_TERMINATE);
+		}
+	}
+
+	immu_devi = immu_devi_get(pdip);
+	ASSERT(immu_devi);
+	immu = immu_devi->imd_immu;
+	if (immu == NULL) {
+		immu = immu_dvma_get_immu(pdip, dvp->dva_flags);
+		ASSERT(immu);
+	}
+
+	/*
+	 * If we encounter a PCIE_PCIE bridge *ANCESTOR* we need to
+	 * terminate the walk (since the device under the PCIE bridge
+	 * is a PCIE device and has an independent entry in the
+	 * root/context table)
+	 */
+	if (dvp->dva_rdip != pdip &&
+	    immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCIE) {
+		return (DDI_WALK_TERMINATE);
+	}
+
+	/*
+	 * In order to be a domain-dip, it must be a PCI device, i.e.
+	 * must have a valid BDF. This also eliminates the root complex.
+	 */
+	if (immu_devi->imd_pcib_type != IMMU_PCIB_BAD &&
+	    immu_devi->imd_pcib_type != IMMU_PCIB_NOBDF) {
+		ASSERT(immu_devi->imd_bus >= 0);
+		ASSERT(immu_devi->imd_devfunc >= 0);
+		dvp->dva_ddip = pdip;
+	}
+
+	if (immu_devi->imd_display == B_TRUE ||
+	    (dvp->dva_flags & IMMU_FLAGS_UNITY)) {
+		dvp->dva_domain = immu->immu_unity_domain;
+		/* continue walking to find ddip */
+		return (DDI_WALK_CONTINUE);
+	}
+
+	mutex_enter(&(DEVI(pdip)->devi_lock));
+	domain = immu_devi->imd_domain;
+	ddip = immu_devi->imd_ddip;
+	mutex_exit(&(DEVI(pdip)->devi_lock));
+
+	if (domain && ddip) {
+		/* if domain is set, it must be the same */
+		if (dvp->dva_domain) {
+			ASSERT(domain == dvp->dva_domain);
+		}
+		dvp->dva_domain = domain;
+		dvp->dva_ddip = ddip;
+		return (DDI_WALK_TERMINATE);
+	}
+
+	/* immu_devi either has both set or both clear */
+	ASSERT(domain == NULL);
+	ASSERT(ddip == NULL);
+
+	/* Domain may already be set, continue walking so that ddip gets set */
+	if (dvp->dva_domain) {
+		return (DDI_WALK_CONTINUE);
+	}
+
+	/* domain is not set in either immu_devi or dvp */
+	domain = bdf_domain_lookup(immu_devi);
+	if (domain == NULL) {
+		return (DDI_WALK_CONTINUE);
+	}
+
+	/* ok, the BDF hash had a domain for this BDF. */
+
+	/* Grab lock again to check if something else set immu_devi fields */
+	mutex_enter(&(DEVI(pdip)->devi_lock));
+	if (immu_devi->imd_domain != NULL) {
+		ASSERT(immu_devi->imd_domain == domain);
+		dvp->dva_domain = domain;
+	} else {
+		dvp->dva_domain = domain;
+	}
+	mutex_exit(&(DEVI(pdip)->devi_lock));
+
+	/*
+	 * walk upwards until the topmost PCI bridge is found
+	 */
+	return (DDI_WALK_CONTINUE);
+}
+
+static void
+map_unity_domain(domain_t *domain)
+{
+	struct memlist *mp;
+	uint64_t start;
+	uint64_t npages;
+
+	ASSERT(domain);
+	ASSERT(domain->dom_did == IMMU_UNITY_DID);
+
+	/*
+	 * We call into routines that grab the lock, so we should
+	 * not be called with the lock held. This does not matter
+	 * much since no one else has a reference to this domain.
+	 */
+	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
+
+	/*
+	 * UNITY arenas are a mirror of the physical memory
+	 * installed on the system.
+	 */
+
+#ifdef BUGGY_DRIVERS
+	/*
+	 * Don't skip page0. Some broken HW/FW accesses it.
+	 */
+	dvma_map(domain->dom_immu, domain, 0, 0, 1, NULL,
+	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
+#endif
+
+	memlist_read_lock();
+
+	mp = phys_install;
+
+	if (mp->ml_address == 0) {
+		/* since we already mapped page1 above */
+		start = IMMU_PAGESIZE;
+	} else {
+		start = mp->ml_address;
+	}
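+	/*
+	 * The "+ 1" rounds the span up to whole IMMU pages; when the size is
+	 * already an exact multiple it maps one page past the end of the span.
+	 */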
+	npages = mp->ml_size/IMMU_PAGESIZE + 1;
+
+	dvma_map(domain->dom_immu, domain, start, start, npages, NULL,
+	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
+
+	ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
+	    " - 0x%" PRIx64 "]", start, start + mp->ml_size);
+
+	mp = mp->ml_next;
+	while (mp) {
+		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
+		    " - 0x%" PRIx64 "]", mp->ml_address,
+		    mp->ml_address + mp->ml_size);
+
+		start = mp->ml_address;
+		npages = mp->ml_size/IMMU_PAGESIZE + 1;
+
+		dvma_map(domain->dom_immu, domain, start, start,
+		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
+
+		mp = mp->ml_next;
+	}
+
+	mp = bios_rsvd;
+	while (mp) {
+		ddi_err(DER_LOG, NULL, "IMMU: mapping PHYS span [0x%" PRIx64
+		    " - 0x%" PRIx64 "]", mp->ml_address,
+		    mp->ml_address + mp->ml_size);
+
+		start = mp->ml_address;
+		npages = mp->ml_size/IMMU_PAGESIZE + 1;
+
+		dvma_map(domain->dom_immu, domain, start, start,
+		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
+
+		mp = mp->ml_next;
+	}
+
+	memlist_read_unlock();
+}
+
+/*
+ * create_xlate_arena()
+ * 	Create the dvma arena for a domain with translation
+ *	mapping
+ */
+static void
+create_xlate_arena(immu_t *immu, domain_t *domain,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	char *arena_name;
+	struct memlist *mp;
+	int vmem_flags;
+	uint64_t start;
+	uint_t mgaw;
+	uint64_t size;
+	uint64_t maxaddr;
+	void *vmem_ret;
+
+	arena_name = domain->dom_dvma_arena_name;
+
+	/* Note, don't do sizeof (arena_name) - it is just a pointer */
+	(void) snprintf(arena_name,
+	    sizeof (domain->dom_dvma_arena_name),
+	    "%s-domain-%d-xlate-DVMA-arena", immu->immu_name,
+	    domain->dom_did);
+
+	vmem_flags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? VM_NOSLEEP : VM_SLEEP;
+
+	/*
+	 * No one else has access to this domain.
+	 * So no domain locks needed
+	 */
+	ASSERT(!rw_lock_held(&(domain->dom_pgtable_rwlock)));
+
+	/* Restrict mgaddr (max guest addr) to MGAW */
+	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
+
+	/*
+	 * To avoid the ioapic and PCI MMIO ranges we could restrict the
+	 * arena to the physical memory address ranges of the system.
+	 * However, doing so causes the graphics device to fail on the
+	 * Lenovo X301, hence the toggle switch immu_mmio_safe.
+	 */
+	maxaddr = ((uint64_t)1 << mgaw);
+
+	if (immu_mmio_safe == B_FALSE) {
+
+		start = MMU_PAGESIZE;
+		size = maxaddr - start;
+
+		ddi_err(DER_VERB, rdip,
+		    "%s: Creating dvma vmem arena [0x%" PRIx64
+		    " - 0x%" PRIx64 "]", arena_name, start, start + size);
+
+		ASSERT(domain->dom_dvma_arena == NULL);
+
+		/*
+		 * We always allocate in quanta of IMMU_PAGESIZE
+		 */
+		domain->dom_dvma_arena = vmem_create(arena_name,
+		    (void *)(uintptr_t)start,	/* start addr */
+		    size,			/* size */
+		    IMMU_PAGESIZE,		/* quantum */
+		    NULL,			/* afunc */
+		    NULL,			/* ffunc */
+		    NULL,			/* source */
+		    0,				/* qcache_max */
+		    vmem_flags);
+
+		if (domain->dom_dvma_arena == NULL) {
+			ddi_err(DER_PANIC, rdip,
+			    "Failed to allocate DVMA arena(%s) "
+			    "for domain ID (%d)", arena_name, domain->dom_did);
+			/*NOTREACHED*/
+		}
+
+	} else {
+
+		memlist_read_lock();
+
+		mp = phys_install;
+
+		if (mp->ml_address == 0)
+			start = MMU_PAGESIZE;
+		else
+			start = mp->ml_address;
+
+		if (start + mp->ml_size > maxaddr)
+			size = maxaddr - start;
+		else
+			size = mp->ml_size;
+
+		ddi_err(DER_VERB, rdip,
+		    "%s: Creating dvma vmem arena [0x%" PRIx64
+		    " - 0x%" PRIx64 "]", arena_name, start, start + size);
+
+		ASSERT(domain->dom_dvma_arena == NULL);
+
+		/*
+		 * We always allocate in quanta of IMMU_PAGESIZE
+		 */
+		domain->dom_dvma_arena = vmem_create(arena_name,
+		    (void *)(uintptr_t)start,	/* start addr */
+		    size,			/* size */
+		    IMMU_PAGESIZE,		/* quantum */
+		    NULL,			/* afunc */
+		    NULL,			/* ffunc */
+		    NULL,			/* source */
+		    0,				/* qcache_max */
+		    vmem_flags);
+
+		if (domain->dom_dvma_arena == NULL) {
+			ddi_err(DER_PANIC, rdip,
+			    "Failed to allocate DVMA arena(%s) "
+			    "for domain ID (%d)", arena_name, domain->dom_did);
+			/*NOTREACHED*/
+		}
+
+		mp = mp->ml_next;
+		while (mp) {
+
+			if (mp->ml_address == 0)
+				start = MMU_PAGESIZE;
+			else
+				start = mp->ml_address;
+
+			if (start + mp->ml_size > maxaddr)
+				size = maxaddr - start;
+			else
+				size = mp->ml_size;
+
+			ddi_err(DER_VERB, rdip,
+			    "%s: Adding dvma vmem span [0x%" PRIx64
+			    " - 0x%" PRIx64 "]", arena_name, start,
+			    start + size);
+
+			vmem_ret = vmem_add(domain->dom_dvma_arena,
+			    (void *)(uintptr_t)start, size,  vmem_flags);
+
+			if (vmem_ret == NULL) {
+				ddi_err(DER_PANIC, rdip,
+				    "Failed to allocate DVMA arena(%s) "
+				    "for domain ID (%d)",
+				    arena_name, domain->dom_did);
+				/*NOTREACHED*/
+			}
+
+			mp = mp->ml_next;
+		}
+		memlist_read_unlock();
+	}
+}
+
+/* ################################### DOMAIN CODE ######################### */
+
+/*
+ * Set the domain and domain-dip for a dip
+ */
+static void
+set_domain(
+	dev_info_t *dip,
+	dev_info_t *ddip,
+	domain_t *domain)
+{
+	immu_devi_t *immu_devi;
+	domain_t *fdomain;
+	dev_info_t *fddip;
+
+	ASSERT(dip);
+	ASSERT(ddip);
+	ASSERT(domain);
+	ASSERT(domain->dom_did > 0); /* must be an initialized domain */
+
+	immu_devi = immu_devi_get(dip);
+	ASSERT(immu_devi);
+
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	fddip = immu_devi->imd_ddip;
+	fdomain = immu_devi->imd_domain;
+
+	if (fddip) {
+		ASSERT(fddip == ddip);
+	} else {
+		immu_devi->imd_ddip = ddip;
+	}
+
+	if (fdomain) {
+		ASSERT(fdomain == domain);
+	} else {
+		immu_devi->imd_domain = domain;
+	}
+	mutex_exit(&(DEVI(dip)->devi_lock));
+}
+
+/*
+ * device_domain()
+ * 	Get the domain for a device. The domain may be global, in which case
+ *	it is shared between all IOMMU units. Due to potential AGAW differences
+ *	between IOMMU units, such global domains *have to be* UNITY mapping
+ *	domains. Alternatively, the domain may be local to an IOMMU unit.
+ *	Local domains may be shared or device-private (per immu_devi), although
+ *	the scope of sharing is restricted to devices controlled by the IOMMU
+ *	unit to which the domain belongs. If shared, they (currently) have to
+ *	be UNITY domains. If device-private, a domain may be either a UNITY or
+ *	a translation (XLATE) domain.
+ */
+static domain_t *
+device_domain(dev_info_t *rdip, dev_info_t **ddipp, immu_flags_t immu_flags)
+{
+	dev_info_t *ddip; /* topmost dip in domain i.e. domain owner */
+	dev_info_t *edip; /* effective dip used for finding domain */
+	immu_t *immu;
+	domain_t *domain;
+	dvma_arg_t dvarg = {0};
+	int level;
+
+	ASSERT(rdip);
+
+	*ddipp = NULL;
+
+	/*
+	 * Check if the domain is already set. This is usually true
+	 * if this is not the first DVMA transaction.
+	 */
+	ddip = NULL;
+	domain = immu_devi_domain(rdip, &ddip);
+	if (domain) {
+		ASSERT(domain->dom_did > 0);
+		ASSERT(ddip);
+		*ddipp = ddip;
+		return (domain);
+	}
+
+	immu = immu_dvma_get_immu(rdip, immu_flags);
+	if (immu == NULL) {
+		/*
+		 * possible that there is no IOMMU unit for this device
+		 * - BIOS bugs are one example.
+		 */
+		return (NULL);
+	}
+
+	/*
+	 * Some devices need to be redirected
+	 */
+	edip = rdip;
+
+	/*
+	 * for isa devices attached under lpc
+	 */
+	if (strcmp(ddi_node_name(ddi_get_parent(rdip)), "isa") == 0) {
+		edip = get_lpc_devinfo(immu, rdip, immu_flags);
+	}
+
+	/*
+	 * for gart, use the real graphic devinfo
+	 */
+	if (strcmp(ddi_node_name(rdip), "agpgart") == 0) {
+		edip = get_gfx_devinfo(rdip);
+	}
+
+	if (edip == NULL) {
+		ddi_err(DER_MODE, rdip, "IMMU redirect failed");
+		return (NULL);
+	}
+
+	dvarg.dva_rdip = edip;
+	dvarg.dva_ddip = NULL;
+	dvarg.dva_domain = NULL;
+	dvarg.dva_flags = immu_flags;
+	level = 0;
+	if (immu_walk_ancestor(edip, NULL, get_branch_domain,
+	    &dvarg, &level, immu_flags) != DDI_SUCCESS) {
+		/*
+		 * maybe low memory. return error,
+		 * so driver tries again later
+		 */
+		return (NULL);
+	}
+
+	/* should have walked at least 1 dip (i.e. edip) */
+	ASSERT(level > 0);
+
+	ddip = dvarg.dva_ddip;	/* must be present */
+	domain = dvarg.dva_domain;	/* may be NULL */
+
+	/*
+	 * We may find the domain during our ancestor walk on any one of our
+	 * ancestor dips. If the domain is found, then the domain-dip
+	 * (i.e. ddip) will also be found in the same immu_devi struct.
+	 * The domain-dip is the highest ancestor dip which shares the
+	 * same domain with edip.
+	 * The domain may or may not be found, but the domain-dip must
+	 * be found.
+	 */
+	if (ddip == NULL) {
+		ddi_err(DER_MODE, rdip, "Cannot find domain dip for device. "
+		    "Effective dip (%s%d)", ddi_driver_name(edip),
+		    ddi_get_instance(edip));
+		return (NULL);
+	}
+
+	/*
+	 * Did we find a domain ?
+	 */
+	if (domain) {
+		goto found;
+	}
+
+	/* nope, so allocate */
+	domain = domain_create(immu, ddip, rdip, immu_flags);
+	if (domain == NULL) {
+		return (NULL);
+	}
+	ASSERT(domain->dom_did > 0);
+
+	/*FALLTHROUGH*/
+found:
+	/*
+	 * We know this *is* the right domain, so panic if
+	 * another domain is already set for either the request-dip or
+	 * the effective dip.
+	 */
+	set_domain(ddip, ddip, domain);
+	set_domain(edip, ddip, domain);
+	set_domain(rdip, ddip, domain);
+
+	*ddipp = ddip;
+	return (domain);
+}
+
+static void
+create_unity_domain(immu_t *immu)
+{
+	domain_t *domain;
+
+	/* 0 is reserved by Vt-d */
+	/*LINTED*/
+	ASSERT(IMMU_UNITY_DID > 0);
+
+	/* domain created during boot and always use sleep flag */
+	domain = kmem_zalloc(sizeof (domain_t), KM_SLEEP);
+
+	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
+	list_create(&(domain->dom_pglist), sizeof (pgtable_t),
+	    offsetof(pgtable_t, swpg_domain_node));
+
+	domain->dom_did = IMMU_UNITY_DID;
+	domain->dom_maptype = IMMU_MAPTYPE_UNITY;
+
+	domain->dom_immu = immu;
+	immu->immu_unity_domain = domain;
+
+	/*
+	 * Setup the domain's initial page table
+	 * should never fail.
+	 */
+	domain->dom_pgtable_root = pgtable_alloc(immu, domain,
+	    IMMU_FLAGS_SLEEP);
+
+	ASSERT(domain->dom_pgtable_root);
+
+	map_unity_domain(domain);
+
+	/*
+	 * put it on the system-wide UNITY domain list
+	 */
+	mutex_enter(&(immu_domain_lock));
+	list_insert_tail(&immu_unity_domain_list, domain);
+	mutex_exit(&(immu_domain_lock));
+}
+
+/*
+ * ddip is the domain-dip: the topmost dip in a domain.
+ * rdip is the requesting-dip: the device which is requesting DVMA setup.
+ * If the domain is a non-shared domain, rdip == ddip.
+ */
+static domain_t *
+domain_create(immu_t *immu, dev_info_t *ddip, dev_info_t *rdip,
+    immu_flags_t immu_flags)
+{
+	int kmflags;
+	domain_t *domain;
+	char mod_hash_name[128];
+	immu_devi_t *immu_devi;
+	int did;
+
+	ASSERT(immu);
+	ASSERT(ddip);
+
+	immu_devi = immu_devi_get(rdip);
+
+	ASSERT(immu_devi);
+
+	/*
+	 * First allocate a domainid.
+	 * This routine will never fail, since if we run out
+	 * of domainids the unity domain's id is used instead.
+	 */
+	did = did_alloc(immu, rdip, ddip, immu_flags);
+	ASSERT(did > 0);
+	if (did == IMMU_UNITY_DID) {
+		/* domain overflow */
+		ASSERT(immu->immu_unity_domain);
+		return (immu->immu_unity_domain);
+	}
+
+	kmflags = (immu_flags & IMMU_FLAGS_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
+	domain = kmem_zalloc(sizeof (domain_t), kmflags);
+	if (domain == NULL) {
+		ddi_err(DER_PANIC, rdip, "Failed to alloc DVMA domain "
+		    "structure for device. IOMMU unit: %s", immu->immu_name);
+		/*NOTREACHED*/
+	}
+
+	rw_init(&(domain->dom_pgtable_rwlock), NULL, RW_DEFAULT, NULL);
+	list_create(&(domain->dom_pglist), sizeof (pgtable_t),
+	    offsetof(pgtable_t, swpg_domain_node));
+
+	(void) snprintf(mod_hash_name, sizeof (mod_hash_name),
+	    "immu%s-domain%d-pava-hash", immu->immu_name, did);
+
+	domain->dom_did = did;
+	domain->dom_immu = immu;
+	domain->dom_maptype = IMMU_MAPTYPE_XLATE;
+
+	/*
+	 * Create xlate DVMA arena for this domain.
+	 */
+	create_xlate_arena(immu, domain, rdip, immu_flags);
+
+	/*
+	 * Setup the domain's initial page table
+	 */
+	domain->dom_pgtable_root = pgtable_alloc(immu, domain, immu_flags);
+	if (domain->dom_pgtable_root == NULL) {
+		ddi_err(DER_PANIC, rdip, "Failed to alloc root "
+		    "pgtable for domain (%d). IOMMU unit: %s",
+		    domain->dom_did, immu->immu_name);
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * Since this is a immu unit-specific domain, put it on
+	 * the per-immu domain list.
+	 */
+	mutex_enter(&(immu->immu_lock));
+	list_insert_head(&immu->immu_domain_list, domain);
+	mutex_exit(&(immu->immu_lock));
+
+	/*
+	 * Also put it on the system-wide xlate domain list
+	 */
+	mutex_enter(&(immu_domain_lock));
+	list_insert_head(&immu_xlate_domain_list, domain);
+	mutex_exit(&(immu_domain_lock));
+
+	bdf_domain_insert(immu_devi, domain);
+
+#ifdef BUGGY_DRIVERS
+	/*
+	 * Map page0. Some broken HW/FW access it.
+	 */
+	dvma_map(domain->dom_immu, domain, 0, 0, 1, NULL,
+	    IMMU_FLAGS_READ | IMMU_FLAGS_WRITE | IMMU_FLAGS_PAGE1);
+#endif
+
+	return (domain);
+}
+
+/*
+ * Create the domainid arena.
+ * Domainid 0 is reserved by the Vt-d spec and cannot be used by
+ * system software.
+ * Domainid 1 is reserved by Solaris and used for *all* of the following:
+ *	as the "uninitialized" domain - for devices not yet controlled
+ *	by Solaris
+ *	as the "unity" domain - for devices that will always belong
+ *	to the unity domain
+ *	as the "overflow" domain - used for any new device after we
+ *	run out of domainids
+ * All of the above domains map into a single domain with
+ * domainid 1 and a UNITY DVMA mapping.
+ * Each IMMU unit has its own unity/uninit/overflow domain.
+ */
+static void
+did_init(immu_t *immu)
+{
+	(void) snprintf(immu->immu_did_arena_name,
+	    sizeof (immu->immu_did_arena_name),
+	    "%s_domainid_arena", immu->immu_name);
+
+	ddi_err(DER_VERB, NULL, "%s: Creating domainid arena %s",
+	    immu->immu_name, immu->immu_did_arena_name);
+
+	immu->immu_did_arena = vmem_create(
+	    immu->immu_did_arena_name,
+	    (void *)(uintptr_t)(IMMU_UNITY_DID + 1),   /* start addr */
+	    immu->immu_max_domains - IMMU_UNITY_DID,
+	    1,				/* quantum */
+	    NULL,			/* afunc */
+	    NULL,			/* ffunc */
+	    NULL,			/* source */
+	    0,				/* qcache_max */
+	    VM_SLEEP);
+
+	/* Even with SLEEP flag, vmem_create() can fail */
+	if (immu->immu_did_arena == NULL) {
+		ddi_err(DER_PANIC, NULL, "%s: Failed to create Intel "
+		    "IOMMU domainid allocator: %s", immu->immu_name,
+		    immu->immu_did_arena_name);
+	}
+}
+
+/* #########################  CONTEXT CODE ################################# */
+
+static void
+context_set(immu_t *immu, domain_t *domain, pgtable_t *root_table,
+    int bus, int devfunc)
+{
+	pgtable_t *context;
+	pgtable_t *pgtable_root;
+	pgtable_t *unity_pgtable_root;
+	hw_rce_t *hw_rent;
+	hw_rce_t *hw_cent;
+	hw_rce_t *ctxp;
+
+	ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
+
+	ASSERT(immu);
+	ASSERT(domain);
+	ASSERT(root_table);
+	ASSERT(bus >= 0);
+	ASSERT(devfunc >= 0);
+	ASSERT(domain->dom_pgtable_root);
+
+	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
+	context = *(pgtable_t **)(ctxp + bus);
+	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr) + bus;
+	if (ROOT_GET_P(hw_rent)) {
+		ASSERT(ROOT_GET_CONT(hw_rent) == context->hwpg_paddr);
+	} else {
+		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
+		ROOT_SET_P(hw_rent);
+		immu_regs_cpu_flush(immu, (caddr_t)hw_rent, sizeof (hw_rce_t));
+	}
+	hw_cent = (hw_rce_t *)(context->hwpg_vaddr) + devfunc;
+
+	pgtable_root = domain->dom_pgtable_root;
+	unity_pgtable_root = immu->immu_unity_domain->dom_pgtable_root;
+	if (CONT_GET_AVAIL(hw_cent) == IMMU_CONT_UNINITED) {
+		ASSERT(CONT_GET_P(hw_cent));
+		ASSERT(CONT_GET_DID(hw_cent) ==
+		    immu->immu_unity_domain->dom_did);
+		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
+		ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
+		ASSERT(CONT_GET_ASR(hw_cent) ==
+		    unity_pgtable_root->hwpg_paddr);
+
+		/* need to disable context entry before reprogramming it */
+		bzero(hw_cent, sizeof (hw_rce_t));
+
+		/* flush caches */
+		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
+		ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
+		immu_regs_context_flush(immu, 0, 0,
+		    immu->immu_unity_domain->dom_did, CONTEXT_DSI);
+		immu_regs_context_flush(immu, 0, 0, domain->dom_did,
+		    CONTEXT_DSI);
+		immu_regs_iotlb_flush(immu, immu->immu_unity_domain->dom_did,
+		    0, 0, TLB_IVA_WHOLE, IOTLB_DSI);
+		immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0,
+		    TLB_IVA_WHOLE, IOTLB_DSI);
+		immu_regs_wbf_flush(immu);
+
+		CONT_SET_AVAIL(hw_cent, IMMU_CONT_INITED);
+		CONT_SET_DID(hw_cent, domain->dom_did);
+		CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
+		CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
+		/*LINTED*/
+		CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
+		CONT_SET_P(hw_cent);
+		immu_regs_cpu_flush(immu, (caddr_t)hw_cent, sizeof (hw_rce_t));
+	} else {
+		ASSERT(CONT_GET_AVAIL(hw_cent) == IMMU_CONT_INITED);
+		ASSERT(CONT_GET_P(hw_cent));
+		ASSERT(CONT_GET_DID(hw_cent) == domain->dom_did);
+		ASSERT(CONT_GET_AW(hw_cent) == immu->immu_dvma_agaw);
+		ASSERT(CONT_GET_TTYPE(hw_cent) == TTYPE_XLATE_ONLY);
+		ASSERT(CONT_GET_ASR(hw_cent) == pgtable_root->hwpg_paddr);
+	}
+}
+
+static pgtable_t *
+context_create(immu_t *immu)
+{
+	int	bus;
+	int	devfunc;
+	pgtable_t *root_table;
+	pgtable_t *context;
+	pgtable_t *pgtable_root;
+	hw_rce_t *ctxp;
+	hw_rce_t *hw_rent;
+	hw_rce_t *hw_cent;
+
+	/* Allocate a zeroed root table (4K page of 256 128-bit entries) */
+	root_table = pgtable_alloc(immu, NULL, IMMU_FLAGS_SLEEP);
+
+	/*
+	 * Setup context tables for all possible root table entries.
+	 * Start out with unity domains for all entries.
+	 */
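+	/*
+	 * Per the VT-d spec the root table is indexed by PCI bus number and
+	 * each context table by devfunc, i.e. (dev << 3) | func; hence
+	 * IMMU_ROOT_NUM and IMMU_CONT_NUM are presumably 256 entries each.
+	 */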
+	ctxp = (hw_rce_t *)(root_table->swpg_next_array);
+	hw_rent = (hw_rce_t *)(root_table->hwpg_vaddr);
+	for (bus = 0; bus < IMMU_ROOT_NUM; bus++, ctxp++, hw_rent++) {
+		context = pgtable_alloc(immu, NULL, IMMU_FLAGS_SLEEP);
+		ASSERT(ROOT_GET_P(hw_rent) == 0);
+		ROOT_SET_P(hw_rent);
+		ROOT_SET_CONT(hw_rent, context->hwpg_paddr);
+		hw_cent = (hw_rce_t *)(context->hwpg_vaddr);
+		for (devfunc = 0; devfunc < IMMU_CONT_NUM;
+		    devfunc++, hw_cent++) {
+			ASSERT(CONT_GET_P(hw_cent) == 0);
+			pgtable_root =
+			    immu->immu_unity_domain->dom_pgtable_root;
+			CONT_SET_DID(hw_cent,
+			    immu->immu_unity_domain->dom_did);
+			CONT_SET_AW(hw_cent, immu->immu_dvma_agaw);
+			CONT_SET_ASR(hw_cent, pgtable_root->hwpg_paddr);
+			/*LINTED*/
+			CONT_SET_TTYPE(hw_cent, TTYPE_XLATE_ONLY);
+			CONT_SET_AVAIL(hw_cent, IMMU_CONT_UNINITED);
+			CONT_SET_P(hw_cent);
+		}
+		immu_regs_cpu_flush(immu, context->hwpg_vaddr, IMMU_PAGESIZE);
+		*((pgtable_t **)ctxp) = context;
+	}
+	immu_regs_cpu_flush(immu, root_table->hwpg_vaddr, IMMU_PAGESIZE);
+
+	return (root_table);
+}
+
+/*
+ * Called during rootnex attach, so no locks needed
+ */
+static void
+context_init(immu_t *immu)
+{
+	ASSERT(immu);
+	ASSERT(immu->immu_ctx_root == NULL);
+
+	rw_init(&(immu->immu_ctx_rwlock), NULL, RW_DEFAULT, NULL);
+
+	immu_regs_wbf_flush(immu);
+
+	immu->immu_ctx_root = context_create(immu);
+
+	immu_regs_set_root_table(immu);
+
+	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+	immu_regs_context_flush(immu, 0, 0, 0, CONTEXT_GLOBAL);
+	rw_exit(&(immu->immu_ctx_rwlock));
+	immu_regs_iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
+	immu_regs_wbf_flush(immu);
+}
+
+
+/*
+ * Find top pcib
+ */
+static int
+find_top_pcib(dev_info_t *dip, void *arg)
+{
+	immu_devi_t *immu_devi;
+	dev_info_t **pcibdipp = (dev_info_t **)arg;
+
+	ASSERT(dip);
+
+	immu_devi = immu_devi_get(dip);
+	ASSERT(immu_devi);
+
+	if (immu_devi->imd_pcib_type == IMMU_PCIB_PCI_PCI) {
+		*pcibdipp = dip;
+	}
+
+	return (DDI_WALK_CONTINUE);
+}
+
+static int
+immu_context_update(immu_t *immu, domain_t *domain, dev_info_t *ddip,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	immu_devi_t *r_immu_devi;
+	immu_devi_t *d_immu_devi;
+	int r_bus;
+	int d_bus;
+	int r_devfunc;
+	int d_devfunc;
+	immu_pcib_t d_pcib_type;
+	immu_pcib_t r_pcib_type;
+	dev_info_t *pcibdip;
+
+	if (ddip == NULL || rdip == NULL ||
+	    ddip == root_devinfo || rdip == root_devinfo) {
+		ddi_err(DER_MODE, rdip, "immu_context_update: domain-dip or "
+		    "request-dip is NULL or is the root devinfo");
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * We need to set the context fields
+	 * based on what type of device rdip and ddip are.
+	 * To do that we need the immu_devi field.
+	 * Set the immu_devi field (if not already set)
+	 */
+	if (immu_devi_set(ddip, immu_flags) == DDI_FAILURE) {
+		ddi_err(DER_MODE, rdip,
+		    "immu_context_update: failed to set immu_devi for ddip");
+		return (DDI_FAILURE);
+	}
+
+	if (immu_devi_set(rdip, immu_flags) == DDI_FAILURE) {
+		ddi_err(DER_MODE, rdip,
+		    "immu_context_update: failed to set immu_devi for rdip");
+		return (DDI_FAILURE);
+	}
+
+	d_immu_devi = immu_devi_get(ddip);
+	r_immu_devi = immu_devi_get(rdip);
+	ASSERT(r_immu_devi);
+	ASSERT(d_immu_devi);
+
+	d_bus = d_immu_devi->imd_bus;
+	d_devfunc = d_immu_devi->imd_devfunc;
+	d_pcib_type = d_immu_devi->imd_pcib_type;
+	r_bus = r_immu_devi->imd_bus;
+	r_devfunc = r_immu_devi->imd_devfunc;
+	r_pcib_type = r_immu_devi->imd_pcib_type;
+
+	ASSERT(d_bus >= 0);
+
+	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+	if (rdip == ddip) {
+		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT ||
+		    d_pcib_type == IMMU_PCIB_PCIE_PCIE);
+		ASSERT(r_bus >= 0);
+		ASSERT(r_devfunc >= 0);
+		/* rdip is a PCIE device. set context for it only */
+		context_set(immu, domain, immu->immu_ctx_root, r_bus,
+		    r_devfunc);
+#ifdef BUGGY_DRIVERS
+	} else if (r_immu_devi == d_immu_devi) {
+#ifdef TEST
+		ddi_err(DER_WARN, rdip, "Driver bug: Devices 0x%lx and "
+		    "0x%lx are identical", rdip, ddip);
+#endif
+		ASSERT(d_pcib_type == IMMU_PCIB_ENDPOINT);
+		ASSERT(r_bus >= 0);
+		ASSERT(r_devfunc >= 0);
+		/* rdip is a PCIE device. set context for it only */
+		context_set(immu, domain, immu->immu_ctx_root, r_bus,
+		    r_devfunc);
+#endif
+	} else if (d_pcib_type == IMMU_PCIB_PCIE_PCI) {
+		/*
+		 * ddip is a PCIE_PCI bridge. Set context for ddip's
+		 * secondary bus. If rdip is on ddip's secondary
+		 * bus, set context for rdip. Else, set context
+		 * for rdip's PCI bridge on ddip's secondary bus.
+		 */
+		context_set(immu, domain, immu->immu_ctx_root,
+		    d_immu_devi->imd_sec, 0);
+		if (d_immu_devi->imd_sec == r_bus) {
+			context_set(immu, domain, immu->immu_ctx_root,
+			    r_bus, r_devfunc);
+		} else {
+			pcibdip = NULL;
+			if (immu_walk_ancestor(rdip, ddip, find_top_pcib,
+			    &pcibdip, NULL, immu_flags) == DDI_SUCCESS &&
+			    pcibdip != NULL) {
+				ASSERT(pcibdip);
+				r_immu_devi = immu_devi_get(pcibdip);
+				ASSERT(r_immu_devi);
+				ASSERT(r_immu_devi->imd_pcib_type ==
+				    IMMU_PCIB_PCI_PCI);
+				r_bus = r_immu_devi->imd_bus;
+				r_devfunc = r_immu_devi->imd_devfunc;
+				context_set(immu, domain, immu->immu_ctx_root,
+				    r_bus, r_devfunc);
+			} else {
+				ddi_err(DER_PANIC, rdip, "Failed to find PCI "
+				    "bridge for PCI device");
+				/*NOTREACHED*/
+			}
+		}
+	} else if (d_pcib_type == IMMU_PCIB_PCI_PCI) {
+		context_set(immu, domain, immu->immu_ctx_root, d_bus,
+		    d_devfunc);
+	} else if (d_pcib_type == IMMU_PCIB_ENDPOINT) {
+		ASSERT(r_pcib_type == IMMU_PCIB_NOBDF);
+		/*
+		 * ddip is a PCIE device which has a non-PCI device under it
+		 * i.e. it is a PCI-nonPCI bridge. Example: pciicde-ata
+		 */
+		context_set(immu, domain, immu->immu_ctx_root, d_bus,
+		    d_devfunc);
+	} else {
+		ddi_err(DER_PANIC, rdip, "unknown device type. Cannot "
+		    "set IMMU context.");
+		/*NOTREACHED*/
+	}
+	rw_exit(&(immu->immu_ctx_rwlock));
+
+	/* XXX do we need a membar_producer() here */
+	return (DDI_SUCCESS);
+}
+
+/* ##################### END CONTEXT CODE ################################## */
+/* ##################### MAPPING CODE ################################## */
+
+
+static boolean_t
+PDTE_check(immu_t *immu, hw_pdte_t pdte, pgtable_t *next, paddr_t paddr,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	if (immu_flags & IMMU_FLAGS_PAGE1) {
+		ASSERT(paddr == 0);
+	} else {
+		ASSERT((next == NULL) ^ (paddr == 0));
+	}
+
+	/* The PDTE must be set i.e. present bit is set */
+	if (!PDTE_P(pdte)) {
+		ddi_err(DER_MODE, rdip, "No present flag");
+		return (B_FALSE);
+	}
+
+	/*
+	 * Just assert to check the most significant system software field
+	 * (PDTE_SW4), as it is the same as the present bit and we
+	 * checked that above.
+	 */
+	ASSERT(PDTE_SW4(pdte));
+
+	/*
+	 * The TM field should be clear if not reserved;
+	 * it is always reserved for non-leaf entries.
+	 */
+	if (next == NULL && immu_regs_is_TM_reserved(immu) == B_FALSE) {
+		if (PDTE_TM(pdte)) {
+			ddi_err(DER_MODE, rdip, "TM flag set");
+			return (B_FALSE);
+		}
+	}
+
+	/*
+	 * The SW3 field is not used and must be clear
+	 */
+	if (PDTE_SW3(pdte)) {
+		ddi_err(DER_MODE, rdip, "SW3 set");
+		return (B_FALSE);
+	}
+
+	/*
+	 * PFN (for PTE) or next level pgtable-paddr (for PDE) must be set
+	 */
+	if (next == NULL) {
+		ASSERT(paddr % IMMU_PAGESIZE == 0);
+		if (PDTE_PADDR(pdte) != paddr) {
+			ddi_err(DER_MODE, rdip,
+			    "PTE paddr mismatch: %lx != %lx",
+			    PDTE_PADDR(pdte), paddr);
+			return (B_FALSE);
+		}
+	} else {
+		if (PDTE_PADDR(pdte) != next->hwpg_paddr) {
+			ddi_err(DER_MODE, rdip,
+			    "PDE paddr mismatch: %lx != %lx",
+			    PDTE_PADDR(pdte), next->hwpg_paddr);
+			return (B_FALSE);
+		}
+	}
+
+	/*
+	 * The SNP field should be clear if not reserved;
+	 * it is always reserved for non-leaf entries.
+	 */
+	if (next == NULL && immu_regs_is_SNP_reserved(immu) == B_FALSE) {
+		if (PDTE_SNP(pdte)) {
+			ddi_err(DER_MODE, rdip, "SNP set");
+			return (B_FALSE);
+		}
+	}
+
+	/* second field available for system software should be clear */
+	if (PDTE_SW2(pdte)) {
+		ddi_err(DER_MODE, rdip, "SW2 set");
+		return (B_FALSE);
+	}
+
+	/* Super pages field should be clear */
+	if (PDTE_SP(pdte)) {
+		ddi_err(DER_MODE, rdip, "SP set");
+		return (B_FALSE);
+	}
+
+	/*
+	 * least significant field available for
+	 * system software should be clear
+	 */
+	if (PDTE_SW1(pdte)) {
+		ddi_err(DER_MODE, rdip, "SW1 set");
+		return (B_FALSE);
+	}
+
+	if ((immu_flags & IMMU_FLAGS_READ) && !PDTE_READ(pdte)) {
+		ddi_err(DER_MODE, rdip, "READ not set");
+		return (B_FALSE);
+	}
+
+	if ((immu_flags & IMMU_FLAGS_WRITE) && !PDTE_WRITE(pdte)) {
+		ddi_err(DER_MODE, rdip, "WRITE not set");
+		return (B_FALSE);
+	}
+
+	return (B_TRUE);
+}
+/*ARGSUSED*/
+static void
+PTE_clear_one(immu_t *immu, domain_t *domain, xlate_t *xlate, uint64_t dvma,
+    dev_info_t *rdip)
+{
+	hw_pdte_t *hwp;
+	pgtable_t *pgtable;
+	int idx;
+	hw_pdte_t pte;
+
+	ASSERT(xlate->xlt_level == 1);
+
+	idx = xlate->xlt_idx;
+	pgtable = xlate->xlt_pgtable;
+
+	ASSERT(dvma % IMMU_PAGESIZE == 0);
+	ASSERT(pgtable);
+	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
+
+	/*
+	 * since we are clearing PTEs, lock the
+	 * page table write mode
+	 */
+	rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
+
+	/*
+	 * We are at the leaf - next level array must be NULL
+	 */
+	ASSERT(pgtable->swpg_next_array == NULL);
+
+	hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
+
+	pte = *hwp;
+	/* Cannot clear a HW PTE that is already clear */
+	ASSERT(PDTE_P(pte));
+	PDTE_CLEAR_P(pte);
+	*hwp = pte;
+
+	/* flush writes to HW PTE table */
+	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
+
+	rw_exit(&(xlate->xlt_pgtable->swpg_rwlock));
+}
+
+/*ARGSUSED*/
+static void
+xlate_setup(immu_t *immu, uint64_t dvma, xlate_t *xlate,
+    int nlevels, dev_info_t *rdip)
+{
+	int level;
+	uint64_t offbits;
+
+	/* level 0 is never used. Sanity check */
+	ASSERT(xlate->xlt_level == 0);
+	ASSERT(xlate->xlt_idx == 0);
+	ASSERT(xlate->xlt_pgtable == NULL);
+	ASSERT(dvma % IMMU_PAGESIZE == 0);
+
+	/*
+	 * Skip the first 12 bits which is the offset into
+	 * 4K PFN (phys page frame based on IMMU_PAGESIZE)
+	 */
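+	/*
+	 * Illustrative example (assuming the usual VT-d layout of 512-entry
+	 * 4K pgtables, i.e. a 9-bit IMMU_PGTABLE_LEVEL_STRIDE): with
+	 * nlevels == 3, dvma bits 20:12 index level 1 (the leaf PTE),
+	 * bits 29:21 index level 2 and bits 38:30 index level 3 (the root).
+	 */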
+	offbits = dvma >> IMMU_PAGESHIFT;
+
+	/* skip to level 1 i.e. leaf PTE */
+	for (level = 1, xlate++; level <= nlevels; level++, xlate++) {
+		xlate->xlt_level = level;
+		xlate->xlt_idx = (offbits & IMMU_PGTABLE_LEVEL_MASK);
+		ASSERT(xlate->xlt_idx <= IMMU_PGTABLE_MAXIDX);
+		xlate->xlt_pgtable = NULL;
+		offbits >>= IMMU_PGTABLE_LEVEL_STRIDE;
+	}
+}
+
+/*
+ * Read the pgtables
+ */
+static void
+PDE_lookup(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
+    dev_info_t *rdip)
+{
+	pgtable_t *pgtable;
+	pgtable_t *next;
+	hw_pdte_t pde;
+	uint_t idx;
+
+	/* xlate should be at level 0 */
+	ASSERT(xlate->xlt_level == 0);
+	ASSERT(xlate->xlt_idx == 0);
+
+	/* start with highest level pgtable i.e. root */
+	xlate += nlevels;
+	ASSERT(xlate->xlt_level == nlevels);
+
+	if (xlate->xlt_pgtable == NULL) {
+		xlate->xlt_pgtable = domain->dom_pgtable_root;
+	}
+
+	for (; xlate->xlt_level > 1; xlate--) {
+
+		idx = xlate->xlt_idx;
+		pgtable = xlate->xlt_pgtable;
+
+		ASSERT(pgtable);
+		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
+
+		if ((xlate - 1)->xlt_pgtable) {
+			continue;
+		}
+
+		/* xlate's leafier level is not set, set it now */
+
+		/* Lock the pgtable in read mode */
+		rw_enter(&(pgtable->swpg_rwlock), RW_READER);
+
+		/*
+		 * since we are unmapping, the pgtable should
+		 * already point to a leafier pgtable.
+		 */
+		next = *(pgtable->swpg_next_array + idx);
+		ASSERT(next);
+
+		pde = *((hw_pdte_t *)(pgtable->hwpg_vaddr) + idx);
+
+		ASSERT(PDTE_check(immu, pde, next, 0, rdip, 0) == B_TRUE);
+
+		(xlate - 1)->xlt_pgtable = next;
+
+		rw_exit(&(pgtable->swpg_rwlock));
+	}
+}
+
+static void
+PTE_set_one(immu_t *immu, hw_pdte_t *hwp, paddr_t paddr,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	hw_pdte_t pte;
+
+	pte = *hwp;
+
+	if (PDTE_P(pte)) {
+		if (PDTE_PADDR(pte) != paddr) {
+			ddi_err(DER_MODE, rdip, "PTE paddr %lx != paddr %lx",
+			    PDTE_PADDR(pte), paddr);
+		}
+		goto out;
+	}
+
+
+	/* Don't touch SW4. It is the present field */
+
+	/* clear TM field if not reserved */
+	if (immu_regs_is_TM_reserved(immu) == B_FALSE) {
+		PDTE_CLEAR_TM(pte);
+	}
+
+	/* Clear 3rd field for system software  - not used */
+	PDTE_CLEAR_SW3(pte);
+
+	/* Set paddr */
+	ASSERT(paddr % IMMU_PAGESIZE == 0);
+	PDTE_CLEAR_PADDR(pte);
+	PDTE_SET_PADDR(pte, paddr);
+
+	/*  clear SNP field if not reserved. */
+	if (immu_regs_is_SNP_reserved(immu) == B_FALSE) {
+		PDTE_CLEAR_SNP(pte);
+	}
+
+	/* Clear SW2 field available for software */
+	PDTE_CLEAR_SW2(pte);
+
+	/* SP is don't care for PTEs. Clear it for cleanliness */
+	PDTE_CLEAR_SP(pte);
+
+	/* Clear SW1 field available for software */
+	PDTE_CLEAR_SW1(pte);
+
+	/*
+	 * Now that we are done writing the PTE
+	 * set the "present" flag. Note this present
+	 * flag is a bit in the PDE/PTE that the
+	 * spec says is available for system software.
+	 * This is an implementation detail of Solaris
+	 * bare-metal Intel IOMMU.
+	 * The present field in a PDE/PTE is not defined
+	 * by the Vt-d spec
+	 */
+
+	PDTE_SET_P(pte);
+
+out:
+	if (immu_flags & IMMU_FLAGS_READ)
+		PDTE_SET_READ(pte);
+	if (immu_flags & IMMU_FLAGS_WRITE)
+		PDTE_SET_WRITE(pte);
+
+#ifdef BUGGY_DRIVERS
+	PDTE_SET_READ(pte);
+	PDTE_SET_WRITE(pte);
+#endif
+
+	*hwp = pte;
+}
+
+/*ARGSUSED*/
+static void
+PTE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate,
+    uint64_t *dvma_ptr, paddr_t *paddr_ptr, uint64_t *npages_ptr,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	paddr_t paddr;
+	uint64_t npages;
+	uint64_t dvma;
+	pgtable_t *pgtable;
+	hw_pdte_t *hwp;
+	hw_pdte_t *shwp;
+	int idx;
+
+	ASSERT(xlate->xlt_level == 1);
+
+	pgtable = xlate->xlt_pgtable;
+	idx = xlate->xlt_idx;
+
+	ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
+	ASSERT(pgtable);
+
+	dvma = *dvma_ptr;
+	paddr = *paddr_ptr;
+	npages = *npages_ptr;
+
+	ASSERT(paddr || (immu_flags & IMMU_FLAGS_PAGE1));
+	ASSERT(dvma || (immu_flags & IMMU_FLAGS_PAGE1));
+	ASSERT(npages);
+
+	/*
+	 * since we are setting PTEs, lock the page table in
+	 * write mode
+	 */
+	rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
+
+	/*
+	 * we are at the leaf pgtable - no further levels.
+	 * The next_array field should be NULL.
+	 */
+	ASSERT(pgtable->swpg_next_array == NULL);
+
+	shwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
+
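+	/*
+	 * Fill PTEs until we run out of pages or fall off the end of this
+	 * leaf pgtable. dvma/paddr/npages are passed back to the caller so
+	 * it can continue in the next leaf pgtable if npages is nonzero.
+	 */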
+	hwp = shwp;
+	for (; npages > 0 && idx <= IMMU_PGTABLE_MAXIDX; idx++, hwp++) {
+
+		PTE_set_one(immu, hwp, paddr, rdip, immu_flags);
+
+		ASSERT(PDTE_check(immu, *hwp, NULL, paddr, rdip, immu_flags)
+		    == B_TRUE);
+
+		paddr += IMMU_PAGESIZE;
+		dvma += IMMU_PAGESIZE;
+		npages--;
+	}
+
+	/* flush writes to HW PTE table */
+	immu_regs_cpu_flush(immu, (caddr_t)shwp, (hwp - shwp) *
+	    sizeof (hw_pdte_t));
+
+	*dvma_ptr = dvma;
+	*paddr_ptr = paddr;
+	*npages_ptr = npages;
+	xlate->xlt_idx = idx;
+
+	rw_exit(&(pgtable->swpg_rwlock));
+}
+
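+/*
+ * PDE_set_one()
+ *   Point a PDE at the next-level page table: record the next pgtable's
+ *   paddr, clear the software fields, apply the read/write permissions,
+ *   mark the entry present and flush the write to memory.
+ */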
+/*ARGSUSED*/
+static void
+PDE_set_one(immu_t *immu, hw_pdte_t *hwp, pgtable_t *next,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	hw_pdte_t pde;
+
+	pde = *hwp;
+
+	/* if PDE is already set, make sure it is correct */
+	if (PDTE_P(pde)) {
+		ASSERT(PDTE_PADDR(pde) == next->hwpg_paddr);
+		goto out;
+	}
+
+	/* Dont touch SW4, it is the present bit */
+
+	/* don't touch TM field it is reserved for PDEs */
+
+	/* 3rd field available for system software is not used */
+	PDTE_CLEAR_SW3(pde);
+
+	/* Set next level pgtable-paddr for PDE */
+	ASSERT(next->hwpg_paddr % IMMU_PAGESIZE == 0);
+	PDTE_CLEAR_PADDR(pde);
+	PDTE_SET_PADDR(pde, next->hwpg_paddr);
+
+	/* don't touch SNP field it is reserved for PDEs */
+
+	/* Clear second field available for system software */
+	PDTE_CLEAR_SW2(pde);
+
+	/* No super pages for PDEs */
+	PDTE_CLEAR_SP(pde);
+
+	/* Clear SW1 for software */
+	PDTE_CLEAR_SW1(pde);
+
+	/*
+	 * Now that we are done writing the PDE
+	 * set the "present" flag. Note this present
+	 * flag is a bit in the PDE/PTE that the
+	 * spec says is available for system software.
+	 * This is an implementation detail of Solaris
+	 * bare-metal Intel IOMMU.
+	 * The present field in a PDE/PTE is not defined
+	 * by the VT-d spec.
+	 */
+out:
+
+	if (immu_flags & IMMU_FLAGS_READ)
+		PDTE_SET_READ(pde);
+	if (immu_flags & IMMU_FLAGS_WRITE)
+		PDTE_SET_WRITE(pde);
+
+#ifdef  BUGGY_DRIVERS
+	PDTE_SET_READ(pde);
+	PDTE_SET_WRITE(pde);
+#endif
+
+	PDTE_SET_P(pde);
+
+	*hwp = pde;
+
+	immu_regs_cpu_flush(immu, (caddr_t)hwp, sizeof (hw_pdte_t));
+}
+
+/*
+ * Used to set PDEs
+ */
+static void
+PDE_set_all(immu_t *immu, domain_t *domain, xlate_t *xlate, int nlevels,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	pgtable_t *pgtable;
+	pgtable_t *new;
+	pgtable_t *next;
+	hw_pdte_t *hwp;
+	int level;
+	uint_t idx;
+
+	/* xlate should be at level 0 */
+	ASSERT(xlate->xlt_level == 0);
+	ASSERT(xlate->xlt_idx == 0);
+
+	/* start with highest level pgtable i.e. root */
+	xlate += nlevels;
+	ASSERT(xlate->xlt_level == nlevels);
+
+	new = NULL;
+	xlate->xlt_pgtable = domain->dom_pgtable_root;
+	for (level = nlevels; level > 1; level--, xlate--) {
+
+		ASSERT(xlate->xlt_level == level);
+
+		idx = xlate->xlt_idx;
+		pgtable = xlate->xlt_pgtable;
+
+		ASSERT(pgtable);
+		ASSERT(idx <= IMMU_PGTABLE_MAXIDX);
+
+		/* speculative alloc */
+		if (new == NULL) {
+			new = pgtable_alloc(immu, domain, immu_flags);
+			if (new == NULL) {
+				ddi_err(DER_PANIC, rdip, "pgtable alloc err");
+			}
+
+		}
+
+		/* Always lock the pgtable in write mode */
+		rw_enter(&(pgtable->swpg_rwlock), RW_WRITER);
+
+		hwp = (hw_pdte_t *)(pgtable->hwpg_vaddr) + idx;
+
+		ASSERT(pgtable->swpg_next_array);
+
+		next = (pgtable->swpg_next_array)[idx];
+
+		/*
+		 * check if the leafier level already has a pgtable;
+		 * if not, install the speculatively allocated one,
+		 * otherwise just update its access permissions
+		 */
+		if (next == NULL) {
+			next = new;
+			new = NULL;
+			if (level == 2) {
+				/* leaf cannot have next_array */
+				kmem_free(next->swpg_next_array,
+				    IMMU_PAGESIZE);
+				next->swpg_next_array = NULL;
+			}
+			(pgtable->swpg_next_array)[idx] = next;
+			PDE_set_one(immu, hwp, next, rdip, immu_flags);
+		} else {
+			hw_pdte_t pde = *hwp;
+
+			if (immu_flags & IMMU_FLAGS_READ)
+				PDTE_SET_READ(pde);
+			if (immu_flags & IMMU_FLAGS_WRITE)
+				PDTE_SET_WRITE(pde);
+
+#ifdef  BUGGY_DRIVERS
+/* With buggy drivers, READ+WRITE permissions were already set, so nothing to do */
+#endif
+
+			*hwp = pde;
+		}
+
+		ASSERT(PDTE_check(immu, *hwp, next, 0, rdip, immu_flags)
+		    == B_TRUE);
+
+		(xlate - 1)->xlt_pgtable = next;
+
+		rw_exit(&(pgtable->swpg_rwlock));
+	}
+
+	if (new) {
+		pgtable_free(immu, new, domain);
+	}
+}
+
+/*
+ * dvma_map()
+ *     map a contiguous range of DVMA pages
+ *
+ *     immu: IOMMU unit for which we are generating DVMA cookies
+ *   domain: domain of the requesting device
+ *    sdvma: Starting dvma
+ *   spaddr: Starting paddr
+ *   npages: Number of pages
+ *     rdip: requesting device
+ *     immu_flags: flags
+ */
+static void
+dvma_map(immu_t *immu, domain_t *domain, uint64_t sdvma, uint64_t spaddr,
+    uint64_t npages, dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	uint64_t dvma;
+	paddr_t paddr;
+	uint64_t n;
+	int nlevels = immu->immu_dvma_nlevels;
+	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
+
+	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
+	ASSERT(spaddr % IMMU_PAGESIZE == 0);
+	ASSERT(sdvma % IMMU_PAGESIZE == 0);
+	ASSERT(npages);
+
+	n = npages;
+	dvma = sdvma;
+	paddr = spaddr;
+
+	while (n > 0) {
+		xlate_setup(immu, dvma, xlate, nlevels, rdip);
+
+		/* Lookup or allocate PGDIRs and PGTABLEs if necessary */
+		PDE_set_all(immu, domain, xlate, nlevels, rdip, immu_flags);
+
+		/* set all matching ptes that fit into this leaf pgtable */
+		PTE_set_all(immu, domain, &xlate[1], &dvma, &paddr, &n, rdip,
+		    immu_flags);
+	}
+}
+
+/*
+ * dvma_unmap()
+ *   unmap a range of DVMAs
+ *
+ * immu: IOMMU unit state
+ * domain: domain for requesting device
+ * dvma: starting DVMA
+ * snpages: Number of IMMU pages to be unmapped
+ * rdip: requesting device
+ */
+static void
+dvma_unmap(immu_t *immu, domain_t *domain, uint64_t dvma, uint64_t snpages,
+    dev_info_t *rdip)
+{
+	int nlevels = immu->immu_dvma_nlevels;
+	xlate_t xlate[IMMU_PGTABLE_MAX_LEVELS + 1] = {0};
+	uint64_t npages;
+
+	ASSERT(nlevels <= IMMU_PGTABLE_MAX_LEVELS);
+	ASSERT(dvma != 0);
+	ASSERT(dvma % IMMU_PAGESIZE == 0);
+	ASSERT(snpages);
+
+	for (npages = snpages; npages > 0; npages--) {
+		/* setup the xlate array */
+		xlate_setup(immu, dvma, xlate, nlevels, rdip);
+
+		/* just lookup existing pgtables. Should never fail */
+		PDE_lookup(immu, domain, xlate, nlevels, rdip);
+
+		/* XXX should be more efficient - batch clear */
+		PTE_clear_one(immu, domain, &xlate[1], dvma, rdip);
+
+		dvma += IMMU_PAGESIZE;
+	}
+}
+
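+/*
+ * dvma_alloc()
+ *   Carve a DVMA range for 'npages' pages out of the domain's vmem arena,
+ *   honoring the alignment, segment boundary and address range constraints
+ *   from the DMA handle's attributes.
+ */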
+static uint64_t
+dvma_alloc(ddi_dma_impl_t *hp, domain_t *domain, uint_t npages)
+{
+	ddi_dma_attr_t *dma_attr;
+	uint64_t dvma;
+	size_t xsize, align, nocross;
+	uint64_t minaddr, maxaddr;
+
+	ASSERT(domain->dom_maptype != IMMU_MAPTYPE_UNITY);
+
+	/* shortcuts */
+	dma_attr = &(hp->dmai_attr);
+
+	/* parameters */
+	xsize = npages * IMMU_PAGESIZE;
+	align = MAX((size_t)(dma_attr->dma_attr_align), IMMU_PAGESIZE);
+	nocross = (size_t)(dma_attr->dma_attr_seg + 1);
+	minaddr = dma_attr->dma_attr_addr_lo;
+	maxaddr = dma_attr->dma_attr_addr_hi + 1;
+
+	/* handle the rollover cases */
+	if (maxaddr < dma_attr->dma_attr_addr_hi) {
+		maxaddr = dma_attr->dma_attr_addr_hi;
+	}
+
+	/*
+	 * allocate from vmem arena.
+	 */
+	dvma = (uint64_t)(uintptr_t)vmem_xalloc(domain->dom_dvma_arena,
+	    xsize, align, 0, nocross, (void *)(uintptr_t)minaddr,
+	    (void *)(uintptr_t)maxaddr, VM_NOSLEEP);
+
+	ASSERT(dvma);
+	ASSERT(dvma >= minaddr);
+	ASSERT(dvma + xsize - 1 < maxaddr);
+
+	return (dvma);
+}
+
+static void
+dvma_free(domain_t *domain, uint64_t dvma, uint64_t npages)
+{
+	uint64_t size = npages * IMMU_PAGESIZE;
+
+	ASSERT(domain);
+	ASSERT(domain->dom_did > 0);
+	ASSERT(dvma);
+	ASSERT(npages);
+
+	if (domain->dom_maptype != IMMU_MAPTYPE_XLATE) {
+		ASSERT(domain->dom_maptype == IMMU_MAPTYPE_UNITY);
+		return;
+	}
+
+	vmem_free(domain->dom_dvma_arena, (void *)(uintptr_t)dvma, size);
+}
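+
+/*
+ * cookie_free()
+ *   Unmap and free every DVMA range recorded in the handle's dvcookie
+ *   array, then release the dvcookie, dcookie and DMA cookie arrays.
+ */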
+/*ARGSUSED*/
+static void
+cookie_free(rootnex_dma_t *dma, immu_t *immu, domain_t *domain,
+    dev_info_t *ddip, dev_info_t *rdip)
+{
+	int i;
+	uint64_t dvma;
+	uint64_t npages;
+	dvcookie_t  *dvcookies = dma->dp_dvcookies;
+	uint64_t dvmax =  dma->dp_dvmax;
+
+	ASSERT(dma->dp_max_cookies);
+	ASSERT(dma->dp_max_dcookies);
+	ASSERT(dma->dp_dvmax < dma->dp_max_cookies);
+	ASSERT(dma->dp_dmax < dma->dp_max_dcookies);
+
+	for (i = 0; i <= dvmax; i++) {
+		dvma = dvcookies[i].dvck_dvma;
+		npages = dvcookies[i].dvck_npages;
+		dvma_unmap(immu, domain, dvma, npages, rdip);
+		dvma_free(domain, dvma, npages);
+	}
+
+	kmem_free(dma->dp_dvcookies, sizeof (dvcookie_t) * dma->dp_max_cookies);
+	dma->dp_dvcookies = NULL;
+	kmem_free(dma->dp_dcookies, sizeof (dcookie_t) * dma->dp_max_dcookies);
+	dma->dp_dcookies = NULL;
+	if (dma->dp_need_to_free_cookie == B_TRUE) {
+		kmem_free(dma->dp_cookies, sizeof (ddi_dma_cookie_t) *
+		    dma->dp_max_cookies);
+		dma->dp_cookies = NULL;
+		dma->dp_need_to_free_cookie = B_FALSE;
+	}
+
+	dma->dp_max_cookies = 0;
+	dma->dp_max_dcookies = 0;
+	dma->dp_cookie_size = 0;
+	dma->dp_dvmax = 0;
+	dma->dp_dmax = 0;
+}
+
+/*
+ * cookie_alloc()
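+ *   Allocate the three per-handle arrays used below: dvcookies record the
+ *   DVMA ranges handed to the device, dcookies record the runs of
+ *   physically contiguous pages backing them, and cookies is the
+ *   ddi_dma_cookie_t array eventually returned to the caller.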
+ */
+static int
+cookie_alloc(rootnex_dma_t *dma, struct ddi_dma_req *dmareq,
+    ddi_dma_attr_t *attr, uint_t prealloc)
+{
+	int kmflag;
+	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
+	dvcookie_t *dvcookies = dma->dp_dvcookies;
+	dcookie_t *dcookies = dma->dp_dcookies;
+	ddi_dma_cookie_t *cookies = dma->dp_cookies;
+	uint64_t max_cookies;
+	uint64_t max_dcookies;
+	uint64_t cookie_size;
+
+	/* we need to allocate new array */
+	if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
+		kmflag =  KM_SLEEP;
+	} else {
+		kmflag =  KM_NOSLEEP;
+	}
+
+	/*
+	 * XXX make sure cookies size doesn't exceed sinfo->si_max_cookie_size;
+	 */
+
+	/*
+	 * figure out the rough estimate of array size
+	 * At a minimum, each cookie must hold 1 page.
+	 * At a maximum, it cannot exceed dma_attr_sgllen
+	 */
+	max_dcookies = dmareq->dmar_object.dmao_size + IMMU_PAGEOFFSET;
+	max_dcookies /= IMMU_PAGESIZE;
+	max_dcookies++;
+	max_cookies = MIN(max_dcookies, attr->dma_attr_sgllen);
+
+	/* allocate the dvma cookie array */
+	dvcookies = kmem_zalloc(sizeof (dvcookie_t) * max_cookies, kmflag);
+	if (dvcookies == NULL) {
+		return (DDI_FAILURE);
+	}
+
+	/* allocate the "phys" cookie array */
+	dcookies = kmem_zalloc(sizeof (dcookie_t) * max_dcookies, kmflag);
+	if (dcookies == NULL) {
+		kmem_free(dvcookies, sizeof (dvcookie_t) * max_cookies);
+		dvcookies = NULL;
+		return (DDI_FAILURE);
+	}
+
+	/* allocate the "real" cookie array  - the one given to users */
+	cookie_size = sizeof (ddi_dma_cookie_t) * max_cookies;
+	if (max_cookies > prealloc) {
+		cookies = kmem_zalloc(cookie_size, kmflag);
+		if (cookies == NULL) {
+			kmem_free(dvcookies, sizeof (dvcookie_t) *
+			    max_cookies);
+			kmem_free(dcookies, sizeof (dcookie_t) *
+			    max_dcookies);
+			goto fail;
+		}
+		dma->dp_need_to_free_cookie = B_TRUE;
+	} else {
+		/* the preallocated buffer fits this size */
+		cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
+		bzero(cookies, sizeof (ddi_dma_cookie_t) * max_cookies);
+		dma->dp_need_to_free_cookie = B_FALSE;
+	}
+
+	dma->dp_dvcookies = dvcookies;
+	dma->dp_dcookies = dcookies;
+	dma->dp_cookies = cookies;
+	dma->dp_cookie_size = cookie_size;
+	dma->dp_max_cookies = max_cookies;
+	dma->dp_max_dcookies = max_dcookies;
+	dma->dp_dvmax = 0;
+	dma->dp_dmax = 0;
+
+	sinfo->si_max_pages = dma->dp_max_cookies;
+
+	return (DDI_SUCCESS);
+
+fail:
+	dma->dp_dvcookies = NULL;
+	dma->dp_dcookies = NULL;
+	dma->dp_cookies = NULL;
+	dma->dp_cookie_size = 0;
+	dma->dp_max_cookies = 0;
+	dma->dp_max_dcookies = 0;
+	dma->dp_dvmax = 0;
+	dma->dp_dmax = 0;
+	dma->dp_need_to_free_cookie = B_FALSE;
+	sinfo->si_max_pages = 0;
+	return (DDI_FAILURE);
+}
+
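+/*
+ * cookie_update()
+ *   Fold one page [paddr, paddr + psize) into the cookie arrays. A dvcookie
+ *   covers a run of pages that will later get one contiguous DVMA
+ *   allocation; the dcookies between its sidx and eidx record the
+ *   physically contiguous sub-runs within that range. A new dvcookie is
+ *   started whenever adding this page would exceed maxseg.
+ */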
+/*ARGSUSED*/
+static void
+cookie_update(domain_t *domain, rootnex_dma_t *dma, paddr_t paddr,
+    int64_t psize, uint64_t maxseg)
+{
+	dvcookie_t *dvcookies = dma->dp_dvcookies;
+	dcookie_t *dcookies = dma->dp_dcookies;
+	ddi_dma_cookie_t *cookies = dma->dp_cookies;
+	uint64_t dvmax = dma->dp_dvmax;
+	uint64_t dmax = dma->dp_dmax;
+
+	ASSERT(dvmax < dma->dp_max_cookies);
+	ASSERT(dmax < dma->dp_max_dcookies);
+
+	paddr &= IMMU_PAGEMASK;
+
+	ASSERT(paddr);
+	ASSERT(psize);
+	ASSERT(maxseg);
+
+	/*
+	 * check to see if this page would put us
+	 * over the max cookie size
+	 */
+	if (cookies[dvmax].dmac_size + psize > maxseg) {
+		dvcookies[dvmax].dvck_eidx = dmax;
+		dvmax++;    /* use the next dvcookie */
+		dmax++;    /* also means we use the next dcookie */
+		dvcookies[dvmax].dvck_sidx = dmax;
+
+		ASSERT(dvmax < dma->dp_max_cookies);
+		ASSERT(dmax < dma->dp_max_dcookies);
+	}
+
+	/*
+	 * If the cookie is mapped or empty
+	 */
+	if (dvcookies[dvmax].dvck_dvma != 0 ||
+	    dvcookies[dvmax].dvck_npages == 0) {
+		/* if mapped, we need a new empty one */
+		if (dvcookies[dvmax].dvck_dvma != 0) {
+			dvcookies[dvmax].dvck_eidx = dmax;
+			dvmax++;
+			dmax++;
+			dvcookies[dvmax].dvck_sidx = dma->dp_dmax;
+			ASSERT(dvmax < dma->dp_max_cookies);
+			ASSERT(dmax < dma->dp_max_dcookies);
+		}
+
+		/* ok, we have an empty cookie */
+		ASSERT(cookies[dvmax].dmac_size == 0);
+		ASSERT(dvcookies[dvmax].dvck_dvma == 0);
+		ASSERT(dvcookies[dvmax].dvck_npages
+		    == 0);
+		ASSERT(dcookies[dmax].dck_paddr == 0);
+		ASSERT(dcookies[dmax].dck_npages == 0);
+
+		dvcookies[dvmax].dvck_dvma = 0;
+		dvcookies[dvmax].dvck_npages = 1;
+		dcookies[dmax].dck_paddr = paddr;
+		dcookies[dmax].dck_npages = 1;
+		cookies[dvmax].dmac_size = psize;
+	} else {
+		/* Unmapped cookie but not empty. Add to it */
+		cookies[dma->dp_dvmax].dmac_size += psize;
+		ASSERT(dvcookies[dma->dp_dvmax].dvck_dvma == 0);
+		dvcookies[dma->dp_dvmax].dvck_npages++;
+		ASSERT(dcookies[dmax].dck_paddr != 0);
+		ASSERT(dcookies[dmax].dck_npages != 0);
+
+		/* Check if this paddr is contiguous */
+		if (IMMU_CONTIG_PADDR(dcookies[dmax], paddr)) {
+			dcookies[dmax].dck_npages++;
+		} else {
+			/* No, we need a new dcookie */
+			dmax++;
+			ASSERT(dcookies[dmax].dck_paddr == 0);
+			ASSERT(dcookies[dmax].dck_npages == 0);
+			dcookies[dmax].dck_paddr = paddr;
+			dcookies[dmax].dck_npages = 1;
+		}
+	}
+
+	dma->dp_dvmax = dvmax;
+	dma->dp_dmax = dmax;
+}
+
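+/*
+ * cookie_finalize()
+ *   For each dvcookie that is not already mapped, allocate a DVMA range
+ *   covering its pages, record it in the "real" DDI cookie and map every
+ *   physically contiguous dcookie run into that range.
+ */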
+static void
+cookie_finalize(ddi_dma_impl_t *hp, immu_t *immu, domain_t *domain,
+    dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	int i;
+	int j;
+	rootnex_dma_t *dma = (rootnex_dma_t *)hp->dmai_private;
+	dvcookie_t *dvcookies = dma->dp_dvcookies;
+	dcookie_t *dcookies = dma->dp_dcookies;
+	ddi_dma_cookie_t *cookies = dma->dp_cookies;
+	paddr_t paddr;
+	uint64_t npages;
+	uint64_t dvma;
+
+	for (i = 0; i <= dma->dp_dvmax; i++) {
+		/* Finish up the last cookie */
+		if (i == dma->dp_dvmax) {
+			dvcookies[i].dvck_eidx = dma->dp_dmax;
+		}
+		if ((dvma = dvcookies[i].dvck_dvma) != 0) {
+			cookies[i].dmac_laddress = dvma;
+			ASSERT(cookies[i].dmac_size != 0);
+			cookies[i].dmac_type = 0;
+			for (j = dvcookies[i].dvck_sidx;
+			    j <= dvcookies[i].dvck_eidx; j++) {
+				ASSERT(dcookies[j].dck_paddr != 0);
+				ASSERT(dcookies[j].dck_npages != 0);
+			}
+			continue;
+		}
+
+		dvma = dvma_alloc(hp, domain, dvcookies[i].dvck_npages);
+
+		dvcookies[i].dvck_dvma = dvma;
+
+		/* Set "real" cookies addr, cookie size already set */
+		cookies[i].dmac_laddress = dvma;
+		ASSERT(cookies[i].dmac_size != 0);
+		cookies[i].dmac_type = 0;
+
+		for (j = dvcookies[i].dvck_sidx;
+		    j <= dvcookies[i].dvck_eidx; j++) {
+
+			paddr = dcookies[j].dck_paddr;
+			npages = dcookies[j].dck_npages;
+
+			ASSERT(paddr);
+			ASSERT(npages);
+
+			dvma_map(immu, domain, dvma, paddr, npages,
+			    rdip, immu_flags);
+			dvma += npages * IMMU_PAGESIZE;
+		}
+	}
+}
+
+/*
+ * cookie_create()
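+ *   Build the cookie arrays for a DMA bind: allocate them, walk the
+ *   request object page by page feeding each paddr to cookie_update(),
+ *   then call cookie_finalize() to allocate DVMA and program the IOMMU
+ *   page tables.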
+ */
+static int
+cookie_create(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
+    ddi_dma_attr_t *a, immu_t *immu, domain_t *domain, dev_info_t *rdip,
+    uint_t prealloc_count, immu_flags_t immu_flags)
+{
+
+	ddi_dma_atyp_t buftype;
+	uint64_t offset;
+	page_t **pparray;
+	uint64_t paddr;
+	uint_t psize;
+	uint_t size;
+	uint64_t maxseg;
+	caddr_t vaddr;
+	uint_t pcnt;
+	page_t *page;
+	rootnex_sglinfo_t *sglinfo;
+	ddi_dma_obj_t *dmar_object;
+	rootnex_dma_t *dma;
+
+	dma = (rootnex_dma_t *)hp->dmai_private;
+	sglinfo = &(dma->dp_sglinfo);
+	dmar_object = &(dmareq->dmar_object);
+	maxseg = sglinfo->si_max_cookie_size;
+	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
+	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
+	buftype = dmar_object->dmao_type;
+	size = dmar_object->dmao_size;
+
+	/*
+	 * Allocate cookie, dvcookie and dcookie
+	 */
+	if (cookie_alloc(dma, dmareq, a, prealloc_count) != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+	hp->dmai_cookie = dma->dp_cookies;
+
+	pcnt = 0;
+
+	/* retrieve paddr, psize, offset from dmareq */
+	if (buftype == DMA_OTYP_PAGES) {
+		page = dmar_object->dmao_obj.pp_obj.pp_pp;
+		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
+		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
+		    MMU_PAGEOFFSET;
+		paddr = pfn_to_pa(page->p_pagenum) + offset;
+		psize = MIN((MMU_PAGESIZE - offset), size);
+		sglinfo->si_asp = NULL;
+		page = page->p_next;
+	} else {
+		ASSERT((buftype == DMA_OTYP_VADDR) ||
+		    (buftype == DMA_OTYP_BUFVADDR));
+		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
+		if (sglinfo->si_asp == NULL) {
+			sglinfo->si_asp = &kas;
+		}
+		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
+		if (pparray != NULL) {
+			ASSERT(!PP_ISFREE(pparray[pcnt]));
+			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
+			psize = MIN((MMU_PAGESIZE - offset), size);
+			pcnt++;
+		} else {
+			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
+			    vaddr)) + offset;
+			psize = MIN(size, (MMU_PAGESIZE - offset));
+			vaddr += psize;
+		}
+	}
+
+	/* save the iommu page offset */
+	sglinfo->si_buf_offset = offset & IMMU_PAGEOFFSET;
+
+	/*
+	 * setup dvcookie and dcookie for [paddr, paddr+psize)
+	 */
+	cookie_update(domain, dma, paddr, psize, maxseg);
+
+	size -= psize;
+	while (size > 0) {
+		/* get the size for this page (i.e. partial or full page) */
+		psize = MIN(size, MMU_PAGESIZE);
+		if (buftype == DMA_OTYP_PAGES) {
+			/* get the paddr from the page_t */
+			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
+			paddr = pfn_to_pa(page->p_pagenum);
+			page = page->p_next;
+		} else if (pparray != NULL) {
+			/* index into the array of page_t's to get the paddr */
+			ASSERT(!PP_ISFREE(pparray[pcnt]));
+			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
+			pcnt++;
+		} else {
+			/* call into the VM to get the paddr */
+			paddr = pfn_to_pa(hat_getpfnum
+			    (sglinfo->si_asp->a_hat, vaddr));
+			vaddr += psize;
+		}
+		/*
+		 * set dvcookie and dcookie for [paddr, paddr+psize)
+		 */
+		cookie_update(domain, dma, paddr, psize, maxseg);
+		size -= psize;
+	}
+
+	cookie_finalize(hp, immu, domain, rdip, immu_flags);
+
+	/* account for the offset into the first page */
+	dma->dp_cookies[0].dmac_laddress += sglinfo->si_buf_offset;
+
+	/* save away how many cookies we have */
+	sglinfo->si_sgl_size = dma->dp_dvmax + 1;
+
+	return (DDI_SUCCESS);
+}
+
+/* ############################# Functions exported ######################## */
+
+/*
+ * setup the DVMA subsystem
+ * this code runs only for the first IOMMU unit
+ */
+void
+immu_dvma_setup(list_t *listp)
+{
+	immu_t *immu;
+	uint_t kval;
+	size_t nchains;
+
+	/* locks */
+	mutex_init(&immu_domain_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* Create lists */
+	list_create(&immu_unity_domain_list, sizeof (domain_t),
+	    offsetof(domain_t, dom_maptype_node));
+	list_create(&immu_xlate_domain_list, sizeof (domain_t),
+	    offsetof(domain_t, dom_maptype_node));
+
+	/* Setup BDF domain hash */
+	nchains = 0xff;
+	kval = mod_hash_iddata_gen(nchains);
+
+	bdf_domain_hash = mod_hash_create_extended("BDF-DOMAIN_HASH",
+	    nchains, mod_hash_null_keydtor, mod_hash_null_valdtor,
+	    mod_hash_byid, (void *)(uintptr_t)kval, mod_hash_idkey_cmp,
+	    KM_NOSLEEP);
+	ASSERT(bdf_domain_hash);
+
+	immu = list_head(listp);
+	for (; immu; immu = list_next(listp, immu)) {
+		create_unity_domain(immu);
+		did_init(immu);
+		context_init(immu);
+		immu->immu_dvma_setup = B_TRUE;
+	}
+}
+
+/*
+ * Start up one DVMA unit
+ */
+void
+immu_dvma_startup(immu_t *immu)
+{
+	ASSERT(immu);
+	ASSERT(immu->immu_dvma_running == B_FALSE);
+
+	if (immu_gfxdvma_enable == B_FALSE &&
+	    immu->immu_dvma_gfx_only == B_TRUE) {
+		return;
+	}
+
+	/*
+	 * DVMA will start once IOMMU is "running"
+	 */
+	ASSERT(immu->immu_dvma_running == B_FALSE);
+	immu->immu_dvma_running = B_TRUE;
+}
+
+/*
+ * immu_dvma_physmem_update()
+ *       called when the installed memory on a
+ *       system increases, to expand domain DVMA
+ *       for domains with UNITY mapping
+ */
+void
+immu_dvma_physmem_update(uint64_t addr, uint64_t size)
+{
+	uint64_t start;
+	uint64_t npages;
+	domain_t *domain;
+
+	/*
+	 * Just walk the system-wide list of domains with
+	 * UNITY mapping. Both the list of *all* domains
+	 * and *UNITY* domains is protected by the same
+	 * single lock
+	 */
+	mutex_enter(&immu_domain_lock);
+	domain = list_head(&immu_unity_domain_list);
+	for (; domain; domain = list_next(&immu_unity_domain_list, domain)) {
+
+		/* There is no vmem_arena for unity domains. Just map it */
+		ddi_err(DER_LOG, NULL, "IMMU: unity-domain: Adding map "
+		    "[0x%" PRIx64 " - 0x%" PRIx64 "]", addr, addr + size);
+
+		start = IMMU_ROUNDOWN(addr);
+		npages = (IMMU_ROUNDUP(size) / IMMU_PAGESIZE) + 1;
+
+		dvma_map(domain->dom_immu, domain, start, start,
+		    npages, NULL, IMMU_FLAGS_READ | IMMU_FLAGS_WRITE);
+
+	}
+	mutex_exit(&immu_domain_lock);
+}
+
+int
+immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *mrng,
+    uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags)
+{
+	ddi_dma_attr_t *attr;
+	dev_info_t *ddip;
+	domain_t *domain;
+	immu_t *immu;
+	int r = DDI_FAILURE;
+
+	ASSERT(immu_enable == B_TRUE);
+	ASSERT(immu_running == B_TRUE || !(immu_flags & IMMU_FLAGS_DMAHDL));
+	ASSERT(hp || !(immu_flags & IMMU_FLAGS_DMAHDL));
+
+	/*
+	 * Intel IOMMU will only be turned on if the MMU
+	 * page size is a multiple of the IOMMU page size
+	 */
+
+	/*LINTED*/
+	ASSERT(MMU_PAGESIZE % IMMU_PAGESIZE == 0);
+
+	/* Can only do DVMA if dip is attached */
+	if (rdip == NULL) {
+		ddi_err(DER_PANIC, rdip, "DVMA map: No device specified");
+		/*NOTREACHED*/
+	}
+
+	immu_flags |= dma_to_immu_flags(dmareq);
+
+
+	/*
+	 * Setup DVMA domain for the device. This does
+	 * work only the first time we do DVMA for a
+	 * device.
+	 */
+	ddip = NULL;
+	domain = device_domain(rdip, &ddip, immu_flags);
+	if (domain == NULL) {
+		ASSERT(ddip == NULL);
+		ddi_err(DER_MODE, rdip, "Intel IOMMU setup failed for device");
+		return (DDI_DMA_NORESOURCES);
+	}
+
+	/*
+	 * If a domain is found, we must also have a domain dip
+	 * which is the topmost ancestor dip of rdip that shares
+	 * the same domain with rdip.
+	 */
+	if (domain->dom_did == 0 || ddip == NULL) {
+		ddi_err(DER_MODE, rdip, "domain did 0(%d) or ddip NULL(%p)",
+		    domain->dom_did, ddip);
+		return (DDI_DMA_NORESOURCES);
+	}
+
+	immu = domain->dom_immu;
+	ASSERT(immu);
+	if (domain->dom_did == IMMU_UNITY_DID) {
+		ASSERT(domain == immu->immu_unity_domain);
+
+		/* mapping already done. Let rootnex create cookies */
+		r = DDI_DMA_USE_PHYSICAL;
+	} else  if (immu_flags & IMMU_FLAGS_DMAHDL) {
+
+		/* if we have a DMA handle, the IOMMUs must be running */
+		ASSERT(immu->immu_regs_running == B_TRUE);
+		ASSERT(immu->immu_dvma_running == B_TRUE);
+
+		attr = &hp->dmai_attr;
+		if (attr == NULL) {
+			ddi_err(DER_PANIC, rdip,
+			    "DMA handle (%p): NULL attr", hp);
+			/*NOTREACHED*/
+		}
+		if (cookie_create(hp, dmareq, attr, immu, domain, rdip,
+		    prealloc_count, immu_flags) != DDI_SUCCESS) {
+			ddi_err(DER_MODE, rdip, "dvcookie_alloc: failed");
+			return (DDI_DMA_NORESOURCES);
+		}
+
+		/* flush write buffer */
+		immu_regs_wbf_flush(immu);
+		r = DDI_DMA_MAPPED;
+	} else if (immu_flags & IMMU_FLAGS_MEMRNG) {
+		dvma_map(immu, domain, mrng->mrng_start, mrng->mrng_start,
+		    mrng->mrng_npages, rdip, immu_flags);
+		r = DDI_DMA_MAPPED;
+	} else {
+		ddi_err(DER_PANIC, rdip, "invalid flags for immu_dvma_map()");
+		/*NOTREACHED*/
+	}
+
+	/*
+	 * Update the root and context entries
+	 */
+	if (immu_context_update(immu, domain, ddip, rdip, immu_flags)
+	    != DDI_SUCCESS) {
+		ddi_err(DER_MODE, rdip, "DVMA map: context update failed");
+		return (DDI_DMA_NORESOURCES);
+	}
+
+	/* flush caches */
+	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+	immu_regs_context_flush(immu, 0, 0, domain->dom_did, CONTEXT_DSI);
+	rw_exit(&(immu->immu_ctx_rwlock));
+	immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0, TLB_IVA_WHOLE,
+	    IOTLB_DSI);
+	immu_regs_wbf_flush(immu);
+
+	return (r);
+}
+
+int
+immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip)
+{
+	ddi_dma_attr_t *attr;
+	rootnex_dma_t *dma;
+	domain_t *domain;
+	immu_t *immu;
+	dev_info_t *ddip;
+	immu_flags_t immu_flags;
+
+	ASSERT(immu_enable == B_TRUE);
+	ASSERT(immu_running == B_TRUE);
+	ASSERT(hp);
+
+	/*
+	 * Intel IOMMU will only be turned on if the IOMMU
+	 * page size is the same as the MMU page size
+	 */
+	/*LINTED*/
+	ASSERT(MMU_PAGESIZE == IMMU_PAGESIZE);
+
+	/* rdip need not be attached */
+	if (rdip == NULL) {
+		ddi_err(DER_PANIC, rdip, "DVMA unmap: No device specified");
+		return (DDI_DMA_NORESOURCES);
+	}
+
+	/*
+	 * Get the device domain, this should always
+	 * succeed since there had to be a domain to
+	 * setup DVMA.
+	 */
+	dma = (rootnex_dma_t *)hp->dmai_private;
+	attr = &hp->dmai_attr;
+	if (attr == NULL) {
+		ddi_err(DER_PANIC, rdip, "DMA handle (%p) has NULL attr", hp);
+		/*NOTREACHED*/
+	}
+	immu_flags = dma->dp_sleep_flags;
+
+	ddip = NULL;
+	domain = device_domain(rdip, &ddip, immu_flags);
+	if (domain == NULL || domain->dom_did == 0 || ddip == NULL) {
+		ddi_err(DER_MODE, rdip, "Attempt to unmap DVMA for "
+		    "a device without domain or with an uninitialized "
+		    "domain");
+		return (DDI_DMA_NORESOURCES);
+	}
+
+	/*
+	 * immu must be set in the domain.
+	 */
+	immu = domain->dom_immu;
+	ASSERT(immu);
+	if (domain->dom_did == IMMU_UNITY_DID) {
+		ASSERT(domain == immu->immu_unity_domain);
+		/*
+		 * domain is unity, nothing to do here, let the rootnex
+		 * code free the cookies.
+		 */
+		return (DDI_DMA_USE_PHYSICAL);
+	}
+
+	dma = hp->dmai_private;
+	if (dma == NULL) {
+		ddi_err(DER_PANIC, rdip, "DVMA unmap: DMA handle (%p) has "
+		    "no private dma structure", hp);
+		/*NOTREACHED*/
+	}
+
+	/* free all cookies */
+	cookie_free(dma, immu, domain, ddip, rdip);
+
+	/* flush caches */
+	rw_enter(&(immu->immu_ctx_rwlock), RW_WRITER);
+	immu_regs_context_flush(immu, 0, 0, domain->dom_did, CONTEXT_DSI);
+	rw_exit(&(immu->immu_ctx_rwlock));
+	immu_regs_iotlb_flush(immu, domain->dom_did, 0, 0, TLB_IVA_WHOLE,
+	    IOTLB_DSI);
+	immu_regs_wbf_flush(immu);
+
+	return (DDI_SUCCESS);
+}
+
+immu_devi_t *
+immu_devi_get(dev_info_t *rdip)
+{
+	immu_devi_t *immu_devi;
+
+	mutex_enter(&DEVI(rdip)->devi_lock);
+	immu_devi = DEVI(rdip)->devi_iommu;
+	mutex_exit(&DEVI(rdip)->devi_lock);
+
+	return (immu_devi);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu_intrmap.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,1000 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+
+#include <sys/apic.h>
+#include <vm/hat_i86.h>
+#include <sys/sysmacros.h>
+#include <sys/smp_impldefs.h>
+#include <sys/immu.h>
+
+
+typedef struct intrmap_private {
+	immu_t		*ir_immu;
+	uint16_t	ir_idx;
+	uint32_t	ir_sid_svt_sq;
+} intrmap_private_t;
+
+#define	INTRMAP_PRIVATE(airq) ((intrmap_private_t *)airq->airq_intrmap_private)
+#define	AIRQ_PRIVATE(airq) (airq->airq_intrmap_private)
+
+/* interrupt remapping table entry */
+typedef struct intrmap_rte {
+	uint64_t	lo;
+	uint64_t	hi;
+} intrmap_rte_t;
+
+#define	IRTE_HIGH(sid_svt_sq) (sid_svt_sq)
+#define	IRTE_LOW(dst, vector, dlm, tm, rh, dm, fpd, p)	\
+	    (((uint64_t)(dst) << 32) |  \
+	    ((uint64_t)(vector) << 16) | \
+	    ((uint64_t)(dlm) << 5) | \
+	    ((uint64_t)(tm) << 4) | \
+	    ((uint64_t)(rh) << 3) | \
+	    ((uint64_t)(dm) << 2) | \
+	    ((uint64_t)(fpd) << 1) | \
+	    (p))
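+
+/*
+ * Bit layout encoded by the IRTE_LOW/IRTE_HIGH macros above: in the low
+ * 64 bits, P is bit 0, FPD bit 1, DM bit 2, RH bit 3, TM bit 4, DLM
+ * bits 5-7, the vector bits 16-23 and the destination bits 32-63; the
+ * high 64 bits carry the SID/SQ/SVT source-id verification fields as
+ * packed by get_sid() below.
+ */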
+
+typedef enum {
+	SVT_NO_VERIFY = 0, 	/* no verification */
+	SVT_ALL_VERIFY,		/* using sid and sq to verify */
+	SVT_BUS_VERIFY,		/* verify #startbus and #endbus */
+	SVT_RSVD
+} intrmap_svt_t;
+
+typedef enum {
+	SQ_VERIFY_ALL = 0,	/* verify all 16 bits */
+	SQ_VERIFY_IGR_1,	/* ignore bit 3 */
+	SQ_VERIFY_IGR_2,	/* ignore bit 2-3 */
+	SQ_VERIFY_IGR_3		/* ignore bit 1-3 */
+} intrmap_sq_t;
+
+/*
+ * S field of the Interrupt Remapping Table Address Register
+ * the size of the interrupt remapping table is 1 << (intrmap_irta_s + 1)
+ */
+static uint_t intrmap_irta_s = INTRMAP_MAX_IRTA_SIZE;
+
+/*
+ * If true, arrange to suppress broadcast EOI by setting edge-triggered mode
+ * even for level-triggered interrupts in the interrupt-remapping engine.
+ * If false, broadcast EOI can still be suppressed if the CPU supports the
+ * APIC_SVR_SUPPRESS_BROADCAST_EOI bit.  In both cases, the IOAPIC is still
+ * programmed with the correct trigger mode, and pcplusmp must send an EOI
+ * to the IOAPIC by writing to the IOAPIC's EOI register to make up for the
+ * missing broadcast EOI.
+ */
+static int intrmap_suppress_brdcst_eoi = 0;
+
+/*
+ * whether to verify the source id of an interrupt request
+ */
+static int intrmap_enable_sid_verify = 0;
+
+/* fault types for DVMA remapping */
+static char *immu_dvma_faults[] = {
+	"Reserved",
+	"The present field in root-entry is Clear",
+	"The present field in context-entry is Clear",
+	"Hardware detected invalid programming of a context-entry",
+	"The DMA request attempted to access an address beyond max support",
+	"The Write field in a page-table entry is Clear when DMA write",
+	"The Read field in a page-table entry is Clear when DMA read",
+	"Access the next level page table resulted in error",
+	"Access the root-entry table resulted in error",
+	"Access the context-entry table resulted in error",
+	"Reserved field not initialized to zero in a present root-entry",
+	"Reserved field not initialized to zero in a present context-entry",
+	"Reserved field not initialized to zero in a present page-table entry",
+	"DMA blocked due to the Translation Type field in context-entry",
+	"Incorrect fault event reason number",
+};
+#define	DVMA_MAX_FAULTS	((sizeof (immu_dvma_faults) / (sizeof (char *))) - 1)
+
+/* fault types for interrupt remapping */
+static char *immu_intrmap_faults[] = {
+	"reserved field set in IRTE",
+	"interrupt_index exceed the intr-remap table size",
+	"present field in IRTE is clear",
+	"hardware access intr-remap table address resulted in error",
+	"reserved field set in IRTE, include various conditional",
+	"hardware blocked an interrupt request in Compatibility format",
+	"remappable interrupt request blocked due to verification failure"
+};
+#define	INTRMAP_MAX_FAULTS \
+	((sizeof (immu_intrmap_faults) / (sizeof (char *))) - 1)
+
+/* Function prototypes */
+static int immu_intrmap_init(int apic_mode);
+static void immu_intrmap_switchon(int suppress_brdcst_eoi);
+static void immu_intrmap_alloc(apic_irq_t *irq_ptr);
+static void immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data);
+static void immu_intrmap_free(apic_irq_t *irq_ptr);
+static void immu_intrmap_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt);
+static void immu_intrmap_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs);
+
+static struct apic_intrmap_ops intrmap_ops = {
+	immu_intrmap_init,
+	immu_intrmap_switchon,
+	immu_intrmap_alloc,
+	immu_intrmap_map,
+	immu_intrmap_free,
+	immu_intrmap_rdt,
+	immu_intrmap_msi,
+};
+
+/* apic mode, APIC/X2APIC */
+static int intrmap_apic_mode = LOCAL_APIC;
+
+
+/*
+ * helper functions
+ */
+static uint_t
+bitset_find_free(bitset_t *b, uint_t post)
+{
+	uint_t	i;
+	uint_t	cap = bitset_capacity(b);
+
+	if (post == cap)
+		post = 0;
+
+	ASSERT(post < cap);
+
+	for (i = post; i < cap; i++) {
+		if (!bitset_in_set(b, i))
+			return (i);
+	}
+
+	for (i = 0; i < post; i++) {
+		if (!bitset_in_set(b, i))
+			return (i);
+	}
+
+	return (INTRMAP_IDX_FULL);	/* no free index */
+}
+
+/*
+ * helper function to find 'count' contiguous free
+ * interrupt remapping table entries
+ */
+static uint_t
+bitset_find_multi_free(bitset_t *b, uint_t post, uint_t count)
+{
+	uint_t  i, j;
+	uint_t	cap = bitset_capacity(b);
+
+	if (post == INTRMAP_IDX_FULL) {
+		return (INTRMAP_IDX_FULL);
+	}
+
+	if (count > cap)
+		return (INTRMAP_IDX_FULL);
+
+	ASSERT(post < cap);
+
+	for (i = post; (i + count) <= cap; i++) {
+		for (j = 0; j < count; j++) {
+			if (bitset_in_set(b, (i + j))) {
+				i = i + j;
+				break;
+			}
+			if (j == count - 1)
+				return (i);
+		}
+	}
+
+	for (i = 0; (i < post) && ((i + count) <= cap); i++) {
+		for (j = 0; j < count; j++) {
+			if (bitset_in_set(b, (i + j))) {
+				i = i + j;
+				break;
+			}
+			if (j == count - 1)
+				return (i);
+		}
+	}
+
+	return (INTRMAP_IDX_FULL);  		/* no free index */
+}
+
+/* alloc one interrupt remapping table entry */
+static int
+alloc_tbl_entry(intrmap_t *intrmap)
+{
+	uint32_t idx;
+
+	for (;;) {
+		mutex_enter(&intrmap->intrmap_lock);
+		idx = intrmap->intrmap_free;
+		if (idx != INTRMAP_IDX_FULL) {
+			bitset_add(&intrmap->intrmap_map, idx);
+			intrmap->intrmap_free =
+			    bitset_find_free(&intrmap->intrmap_map, idx + 1);
+			mutex_exit(&intrmap->intrmap_lock);
+			break;
+		}
+
+		/* no free intr entry, use compatible format intr */
+		mutex_exit(&intrmap->intrmap_lock);
+
+		if (intrmap_apic_mode != LOCAL_X2APIC) {
+			break;
+		}
+
+		/*
+		 * x2apic mode does not allow compatibility
+		 * format interrupts, so wait and retry
+		 */
+		delay(IMMU_ALLOC_RESOURCE_DELAY);
+	}
+
+	return (idx);
+}
+
+/* alloc 'cnt' contiguous interrupt remapping table entries */
+static int
+alloc_tbl_multi_entries(intrmap_t *intrmap, uint_t cnt)
+{
+	uint_t idx, pos, i;
+
+	for (; ; ) {
+		mutex_enter(&intrmap->intrmap_lock);
+		pos = intrmap->intrmap_free;
+		idx = bitset_find_multi_free(&intrmap->intrmap_map, pos, cnt);
+
+		if (idx != INTRMAP_IDX_FULL) {
+			if (idx <= pos && pos < (idx + cnt)) {
+				intrmap->intrmap_free = bitset_find_free(
+				    &intrmap->intrmap_map, idx + cnt);
+			}
+			for (i = 0; i < cnt; i++) {
+				bitset_add(&intrmap->intrmap_map, idx + i);
+			}
+			mutex_exit(&intrmap->intrmap_lock);
+			break;
+		}
+
+		mutex_exit(&intrmap->intrmap_lock);
+
+		if (intrmap_apic_mode != LOCAL_X2APIC) {
+			break;
+		}
+
+		/* x2apic mode does not allow compatibility format interrupts */
+		delay(IMMU_ALLOC_RESOURCE_DELAY);
+	}
+
+	return (idx);
+}
+
+/* init interrupt remapping table */
+static int
+init_unit(immu_t *immu)
+{
+	intrmap_t *intrmap;
+	size_t size;
+
+	ddi_dma_attr_t intrmap_dma_attr = {
+		DMA_ATTR_V0,
+		0U,		/* dma_attr_addr_lo */
+		0xffffffffU,	/* dma_attr_addr_hi */
+		0xffffffffU,	/* dma_attr_count_max */
+		MMU_PAGESIZE,	/* dma_attr_align: page aligned */
+		0x1,		/* dma_attr_burstsizes */
+		0x1,		/* dma_attr_minxfer */
+		0xffffffffU,	/* dma_attr_maxxfer */
+		0xffffffffU,	/* dma_attr_seg */
+		1,		/* dma_attr_sgllen */
+		4,		/* dma_attr_granular */
+		0		/* dma_attr_flags */
+	};
+
+	ddi_device_acc_attr_t intrmap_acc_attr = {
+		DDI_DEVICE_ATTR_V0,
+		DDI_NEVERSWAP_ACC,
+		DDI_STRICTORDER_ACC
+	};
+
+	if (intrmap_apic_mode == LOCAL_X2APIC) {
+		if (!IMMU_ECAP_GET_EIM(immu->immu_regs_excap)) {
+			return (DDI_FAILURE);
+		}
+	}
+
+	if (intrmap_irta_s > INTRMAP_MAX_IRTA_SIZE) {
+		intrmap_irta_s = INTRMAP_MAX_IRTA_SIZE;
+	}
+
+	intrmap =  kmem_zalloc(sizeof (intrmap_t), KM_SLEEP);
+
+	if (ddi_dma_alloc_handle(immu->immu_dip,
+	    &intrmap_dma_attr,
+	    DDI_DMA_SLEEP,
+	    NULL,
+	    &(intrmap->intrmap_dma_hdl)) != DDI_SUCCESS) {
+		kmem_free(intrmap, sizeof (intrmap_t));
+		return (DDI_FAILURE);
+	}
+
+	intrmap->intrmap_size = 1 << (intrmap_irta_s + 1);
+	size = intrmap->intrmap_size * INTRMAP_RTE_SIZE;
+	if (ddi_dma_mem_alloc(intrmap->intrmap_dma_hdl,
+	    size,
+	    &intrmap_acc_attr,
+	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
+	    DDI_DMA_SLEEP,
+	    NULL,
+	    &(intrmap->intrmap_vaddr),
+	    &size,
+	    &(intrmap->intrmap_acc_hdl)) != DDI_SUCCESS) {
+		ddi_dma_free_handle(&(intrmap->intrmap_dma_hdl));
+		kmem_free(intrmap, sizeof (intrmap_t));
+		return (DDI_FAILURE);
+	}
+
+	ASSERT(!((uintptr_t)intrmap->intrmap_vaddr & MMU_PAGEOFFSET));
+	bzero(intrmap->intrmap_vaddr, size);
+	intrmap->intrmap_paddr = pfn_to_pa(
+	    hat_getpfnum(kas.a_hat, intrmap->intrmap_vaddr));
+
+	mutex_init(&(intrmap->intrmap_lock), NULL, MUTEX_DRIVER, NULL);
+	bitset_init(&intrmap->intrmap_map);
+	bitset_resize(&intrmap->intrmap_map, intrmap->intrmap_size);
+	intrmap->intrmap_free = 0;
+
+	immu->immu_intrmap = intrmap;
+
+	return (DDI_SUCCESS);
+}
+
+static void
+get_immu(apic_irq_t *irq_ptr)
+{
+	immu_t	*immu = NULL;
+
+	ASSERT(INTRMAP_PRIVATE(irq_ptr)->ir_immu == NULL);
+
+	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
+		immu = immu_dmar_ioapic_immu(irq_ptr->airq_ioapicindex);
+	} else {
+		if (irq_ptr->airq_dip != NULL) {
+			immu = immu_dmar_get_immu(irq_ptr->airq_dip);
+		}
+	}
+
+	if (immu && (immu->immu_intrmap_running == B_TRUE)) {
+		INTRMAP_PRIVATE(irq_ptr)->ir_immu = immu;
+	}
+}
+
+static int
+get_top_pcibridge(dev_info_t *dip, void *arg)
+{
+	dev_info_t **topdipp = arg;
+	immu_devi_t *immu_devi;
+
+	mutex_enter(&(DEVI(dip)->devi_lock));
+	immu_devi = DEVI(dip)->devi_iommu;
+	mutex_exit(&(DEVI(dip)->devi_lock));
+
+	if (immu_devi == NULL || immu_devi->imd_pcib_type == IMMU_PCIB_BAD ||
+	    immu_devi->imd_pcib_type == IMMU_PCIB_ENDPOINT) {
+		return (DDI_WALK_CONTINUE);
+	}
+
+	*topdipp = dip;
+
+	return (DDI_WALK_CONTINUE);
+}
+
+static dev_info_t *
+intrmap_top_pcibridge(dev_info_t *rdip)
+{
+	dev_info_t *top_pcibridge = NULL;
+
+	if (immu_walk_ancestor(rdip, NULL, get_top_pcibridge,
+	    &top_pcibridge, NULL, 0) != DDI_SUCCESS) {
+		return (NULL);
+	}
+
+	return (top_pcibridge);
+}
+
+/* function to get interrupt request source id */
+static void
+get_sid(apic_irq_t *irq_ptr)
+{
+	dev_info_t	*dip, *pdip;
+	immu_devi_t	*immu_devi;
+	uint16_t	sid;
+	uchar_t		svt, sq;
+
+	if (!intrmap_enable_sid_verify) {
+		return;
+	}
+
+	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
+		/* for interrupt through I/O APIC */
+		sid = immu_dmar_ioapic_sid(irq_ptr->airq_ioapicindex);
+		svt = SVT_ALL_VERIFY;
+		sq = SQ_VERIFY_ALL;
+	} else {
+		/* MSI/MSI-X interrupt */
+		dip = irq_ptr->airq_dip;
+		ASSERT(dip);
+		pdip = intrmap_top_pcibridge(dip);
+		ASSERT(pdip);
+		immu_devi = DEVI(pdip)->devi_iommu;
+		ASSERT(immu_devi);
+		if (immu_devi->imd_pcib_type == IMMU_PCIB_PCIE_PCI) {
+			/* device behind pcie to pci bridge */
+			sid = (immu_devi->imd_bus << 8) | immu_devi->imd_sec;
+			svt = SVT_BUS_VERIFY;
+			sq = SQ_VERIFY_ALL;
+		} else {
+			/* pcie device or device behind pci to pci bridge */
+			sid = (immu_devi->imd_bus << 8) |
+			    immu_devi->imd_devfunc;
+			svt = SVT_ALL_VERIFY;
+			sq = SQ_VERIFY_ALL;
+		}
+	}
+
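+	/*
+	 * Pack the verification fields into one 32-bit value: bits 0-15
+	 * hold the source-id, bits 16-17 the SQ qualifier and bits 18-19
+	 * the SVT type, matching the layout consumed by IRTE_HIGH().
+	 */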
+	INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq =
+	    sid | (svt << 18) | (sq << 16);
+}
+
+static void
+intrmap_enable(immu_t *immu)
+{
+	intrmap_t *intrmap;
+	uint64_t irta_reg;
+
+	intrmap = immu->immu_intrmap;
+
+	irta_reg = intrmap->intrmap_paddr | intrmap_irta_s;
+	if (intrmap_apic_mode == LOCAL_X2APIC) {
+		irta_reg |= (0x1 << 11);
+	}
+
+	immu_regs_intrmap_enable(immu, irta_reg);
+}
+
+/* ####################################################################### */
+
+/*
+ * immu_intr_handler()
+ * 	the fault event handler for a single immu unit
+ */
+int
+immu_intr_handler(immu_t *immu)
+{
+	uint32_t status;
+	int index, fault_reg_offset;
+	int max_fault_index;
+	boolean_t found_fault;
+	dev_info_t *idip;
+
+	mutex_enter(&(immu->immu_intr_lock));
+	mutex_enter(&(immu->immu_regs_lock));
+
+	/* read the fault status */
+	status = immu_regs_get32(immu, IMMU_REG_FAULT_STS);
+
+	idip = immu->immu_dip;
+	ASSERT(idip);
+
+	/* check if we have a pending fault for this immu unit */
+	if ((status & IMMU_FAULT_STS_PPF) == 0) {
+		mutex_exit(&(immu->immu_regs_lock));
+		mutex_exit(&(immu->immu_intr_lock));
+		return (DDI_INTR_UNCLAIMED);
+	}
+
+	/*
+	 * handle all primary pending faults
+	 */
+	index = IMMU_FAULT_GET_INDEX(status);
+	max_fault_index =  IMMU_CAP_GET_NFR(immu->immu_regs_cap) - 1;
+	fault_reg_offset = IMMU_CAP_GET_FRO(immu->immu_regs_cap);
+
+	found_fault = B_FALSE;
+	_NOTE(CONSTCOND)
+	while (1) {
+		uint64_t val;
+		uint8_t fault_reason;
+		uint8_t fault_type;
+		uint16_t sid;
+		uint64_t pg_addr;
+		uint64_t idx;
+
+		/* read the higher 64bits */
+		val = immu_regs_get64(immu, fault_reg_offset + index * 16 + 8);
+
+		/* check if this fault register has pending fault */
+		if (!IMMU_FRR_GET_F(val)) {
+			break;
+		}
+
+		found_fault = B_TRUE;
+
+		/* get the fault reason, fault type and sid */
+		fault_reason = IMMU_FRR_GET_FR(val);
+		fault_type = IMMU_FRR_GET_FT(val);
+		sid = IMMU_FRR_GET_SID(val);
+
+		/* read the first 64bits */
+		val = immu_regs_get64(immu, fault_reg_offset + index * 16);
+		pg_addr = val & IMMU_PAGEMASK;
+		idx = val >> 48;
+
+		/* clear the fault */
+		immu_regs_put32(immu, fault_reg_offset + index * 16 + 12,
+		    (((uint32_t)1) << 31));
+
+		/* report the fault info */
+		if (fault_reason < 0x20) {
+			/* immu-remapping fault */
+			ddi_err(DER_WARN, idip,
+			    "generated a fault event when translating DMA %s\n"
+			    "\t on address 0x%" PRIx64 " for PCI(%d, %d, %d), "
+			    "the reason is:\n\t %s",
+			    fault_type ? "read" : "write", pg_addr,
+			    (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
+			    immu_dvma_faults[MIN(fault_reason,
+			    DVMA_MAX_FAULTS)]);
+		} else if (fault_reason < 0x27) {
+			/* intr-remapping fault */
+			ddi_err(DER_WARN, idip,
+			    "generated a fault event when translating "
+			    "interrupt request\n"
+			    "\t on index 0x%" PRIx64 " for PCI(%d, %d, %d), "
+			    "the reason is:\n\t %s",
+			    idx,
+			    (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
+			    immu_intrmap_faults[MIN((fault_reason - 0x20),
+			    INTRMAP_MAX_FAULTS)]);
+		} else {
+			ddi_err(DER_WARN, idip, "Unknown fault reason: 0x%x",
+			    fault_reason);
+		}
+
+		index++;
+		if (index > max_fault_index)
+			index = 0;
+	}
+
+	/* Clear the fault */
+	if (!found_fault) {
+		ddi_err(DER_MODE, idip,
+		    "Fault register set but no fault present");
+	}
+	immu_regs_put32(immu, IMMU_REG_FAULT_STS, 1);
+	mutex_exit(&(immu->immu_regs_lock));
+	mutex_exit(&(immu->immu_intr_lock));
+	return (DDI_INTR_CLAIMED);
+}
+/* ######################################################################### */
+
+/*
+ * Interrupt remap entry points
+ */
+
+/* initialize interrupt remapping */
+static int
+immu_intrmap_init(int apic_mode)
+{
+	immu_t *immu;
+	int error = DDI_FAILURE;
+
+	if (immu_intrmap_enable == B_FALSE) {
+		return (DDI_SUCCESS);
+	}
+
+	intrmap_apic_mode = apic_mode;
+
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+		if ((immu->immu_intrmap_running == B_TRUE) &&
+		    IMMU_ECAP_GET_IR(immu->immu_regs_excap)) {
+			if (init_unit(immu) == DDI_SUCCESS) {
+				error = DDI_SUCCESS;
+			}
+		}
+	}
+
+	/*
+	 * return FAILURE if interrupt remapping could not be
+	 * initialized on any IOMMU unit
+	 */
+	return (error);
+}
+
+
+
+/* enable interrupt remapping */
+static void
+immu_intrmap_switchon(int suppress_brdcst_eoi)
+{
+	immu_t *immu;
+
+
+	intrmap_suppress_brdcst_eoi = suppress_brdcst_eoi;
+
+	immu = list_head(&immu_list);
+	for (; immu; immu = list_next(&immu_list, immu)) {
+		if (immu->immu_intrmap_setup == B_TRUE) {
+			intrmap_enable(immu);
+		}
+	}
+}
+
+/* alloc remapping entry for the interrupt */
+static void
+immu_intrmap_alloc(apic_irq_t *irq_ptr)
+{
+	immu_t	*immu;
+	intrmap_t *intrmap;
+	uint32_t		idx, cnt, i;
+	uint_t			vector, irqno;
+	uint32_t		sid_svt_sq;
+
+	if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE ||
+	    AIRQ_PRIVATE(irq_ptr) != NULL) {
+		return;
+	}
+
+	AIRQ_PRIVATE(irq_ptr) =
+	    kmem_zalloc(sizeof (intrmap_private_t), KM_SLEEP);
+
+	get_immu(irq_ptr);
+
+	immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu;
+	if (immu == NULL) {
+		goto intrmap_disable;
+	}
+
+	intrmap = immu->immu_intrmap;
+
+	if (irq_ptr->airq_mps_intr_index == MSI_INDEX) {
+		cnt = irq_ptr->airq_intin_no;
+	} else {
+		cnt = 1;
+	}
+
+	if (cnt == 1) {
+		idx = alloc_tbl_entry(intrmap);
+	} else {
+		idx = alloc_tbl_multi_entries(intrmap, cnt);
+	}
+
+	if (idx == INTRMAP_IDX_FULL) {
+		goto intrmap_disable;
+	}
+
+	INTRMAP_PRIVATE(irq_ptr)->ir_idx = idx;
+
+	get_sid(irq_ptr);
+
+	if (cnt == 1) {
+		if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) {
+			immu_qinv_intr_one_cache(immu, idx);
+		} else {
+			immu_regs_wbf_flush(immu);
+		}
+		return;
+	}
+
+	sid_svt_sq = INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq;
+
+	vector = irq_ptr->airq_vector;
+
+	for (i = 1; i < cnt; i++) {
+		irqno = apic_vector_to_irq[vector + i];
+		irq_ptr = apic_irq_table[irqno];
+
+		ASSERT(irq_ptr);
+
+		AIRQ_PRIVATE(irq_ptr) =
+		    kmem_zalloc(sizeof (intrmap_private_t), KM_SLEEP);
+
+		INTRMAP_PRIVATE(irq_ptr)->ir_immu = immu;
+		INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq = sid_svt_sq;
+		INTRMAP_PRIVATE(irq_ptr)->ir_idx = idx + i;
+	}
+
+	if (IMMU_CAP_GET_CM(immu->immu_regs_cap)) {
+		immu_qinv_intr_caches(immu, idx, cnt);
+	} else {
+		immu_regs_wbf_flush(immu);
+	}
+
+	return;
+
+intrmap_disable:
+	kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof (intrmap_private_t));
+	AIRQ_PRIVATE(irq_ptr) = INTRMAP_DISABLE;
+}
+
+
+/* remapping the interrupt */
+static void
+immu_intrmap_map(apic_irq_t *irq_ptr, void *intrmap_data)
+{
+	immu_t	*immu;
+	intrmap_t	*intrmap;
+	ioapic_rdt_t	*irdt = (ioapic_rdt_t *)intrmap_data;
+	msi_regs_t	*mregs = (msi_regs_t *)intrmap_data;
+	intrmap_rte_t	irte;
+	uint_t		idx, i, cnt;
+	uint32_t	dst, sid_svt_sq;
+	uchar_t		vector, dlm, tm, rh, dm;
+
+	if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE) {
+		return;
+	}
+
+	if (irq_ptr->airq_mps_intr_index == MSI_INDEX) {
+		cnt = irq_ptr->airq_intin_no;
+	} else {
+		cnt = 1;
+	}
+
+	idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx;
+	immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu;
+	intrmap = immu->immu_intrmap;
+	sid_svt_sq = INTRMAP_PRIVATE(irq_ptr)->ir_sid_svt_sq;
+	vector = irq_ptr->airq_vector;
+
+	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
+		dm = RDT_DM(irdt->ir_lo);
+		rh = 0;
+		tm = RDT_TM(irdt->ir_lo);
+		dlm = RDT_DLM(irdt->ir_lo);
+		dst = irdt->ir_hi;
+
+		/*
+		 * Mark the IRTE's TM as Edge to suppress broadcast EOI.
+		 */
+		if (intrmap_suppress_brdcst_eoi) {
+			tm = TRIGGER_MODE_EDGE;
+		}
+	} else {
+		dm = MSI_ADDR_DM_PHYSICAL;
+		rh = MSI_ADDR_RH_FIXED;
+		tm = TRIGGER_MODE_EDGE;
+		dlm = 0;
+		dst = mregs->mr_addr;
+	}
+
+	if (intrmap_apic_mode == LOCAL_APIC)
+		dst = (dst & 0xFF) << 8;
+
+	if (cnt == 1) {
+		irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1);
+		irte.hi = IRTE_HIGH(sid_svt_sq);
+
+		/* set interrupt remapping table entry */
+		bcopy(&irte, intrmap->intrmap_vaddr +
+		    idx * INTRMAP_RTE_SIZE,
+		    INTRMAP_RTE_SIZE);
+
+		immu_qinv_intr_one_cache(immu, idx);
+
+	} else {
+		vector = irq_ptr->airq_vector;
+		for (i = 0; i < cnt; i++) {
+			irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1);
+			irte.hi = IRTE_HIGH(sid_svt_sq);
+
+			/* set interrupt remapping table entry */
+			bcopy(&irte, intrmap->intrmap_vaddr +
+			    idx * INTRMAP_RTE_SIZE,
+			    INTRMAP_RTE_SIZE);
+			vector++;
+			idx++;
+		}
+
+		immu_qinv_intr_caches(immu, idx, cnt);
+	}
+}
+
+/* free the remapping entry */
+static void
+immu_intrmap_free(apic_irq_t *irq_ptr)
+{
+	immu_t *immu;
+	intrmap_t *intrmap;
+	uint32_t idx;
+
+	if (AIRQ_PRIVATE(irq_ptr) == INTRMAP_DISABLE) {
+		AIRQ_PRIVATE(irq_ptr) = NULL;
+		return;
+	}
+
+	immu = INTRMAP_PRIVATE(irq_ptr)->ir_immu;
+	intrmap = immu->immu_intrmap;
+	idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx;
+
+	bzero(intrmap->intrmap_vaddr + idx * INTRMAP_RTE_SIZE,
+	    INTRMAP_RTE_SIZE);
+
+	immu_qinv_intr_one_cache(immu, idx);
+
+	mutex_enter(&intrmap->intrmap_lock);
+	bitset_del(&intrmap->intrmap_map, idx);
+	if (intrmap->intrmap_free == INTRMAP_IDX_FULL) {
+		intrmap->intrmap_free = idx;
+	}
+	mutex_exit(&intrmap->intrmap_lock);
+
+	kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof (intrmap_private_t));
+	AIRQ_PRIVATE(irq_ptr) = NULL;
+}
+
+/* record the ioapic rdt entry */
+static void
+immu_intrmap_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt)
+{
+	uint32_t rdt_entry, tm, pol, idx, vector;
+
+	rdt_entry = irdt->ir_lo;
+
+	if (INTRMAP_PRIVATE(irq_ptr) != NULL) {
+		idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx;
+		tm = RDT_TM(rdt_entry);
+		pol = RDT_POL(rdt_entry);
+		vector = irq_ptr->airq_vector;
+		irdt->ir_lo = (tm << INTRMAP_IOAPIC_TM_SHIFT) |
+		    (pol << INTRMAP_IOAPIC_POL_SHIFT) |
+		    ((idx >> 15) << INTRMAP_IOAPIC_IDX15_SHIFT) |
+		    vector;
+		irdt->ir_hi = (idx << INTRMAP_IOAPIC_IDX_SHIFT) |
+		    (1 << INTRMAP_IOAPIC_FORMAT_SHIFT);
+	} else {
+		irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
+	}
+}
+
+/* record the msi interrupt structure */
+/*ARGSUSED*/
+static void
+immu_intrmap_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs)
+{
+	uint_t	idx;
+
+	if (INTRMAP_PRIVATE(irq_ptr) != NULL) {
+		idx = INTRMAP_PRIVATE(irq_ptr)->ir_idx;
+
+		mregs->mr_data = 0;
+		mregs->mr_addr = MSI_ADDR_HDR |
+		    ((idx & 0x7fff) << INTRMAP_MSI_IDX_SHIFT) |
+		    (1 << INTRMAP_MSI_FORMAT_SHIFT) |
+		    (1 << INTRMAP_MSI_SHV_SHIFT) |
+		    ((idx >> 15) << INTRMAP_MSI_IDX15_SHIFT);
+	} else {
+		mregs->mr_addr = MSI_ADDR_HDR |
+		    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
+		    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
+		    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
+		mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
+		    mregs->mr_data;
+	}
+}
+
+/* ######################################################################### */
+/*
+ * Functions exported by immu_intr.c
+ */
+void
+immu_intrmap_setup(list_t *listp)
+{
+	immu_t *immu;
+
+	/*
+	 * Check if ACPI DMAR tables say that
+	 * interrupt remapping is supported
+	 */
+	if (immu_dmar_intrmap_supported() == B_FALSE) {
+		return;
+	}
+
+	/*
+	 * Check if interrupt remapping is disabled.
+	 */
+	if (immu_intrmap_enable == B_FALSE) {
+		return;
+	}
+
+	psm_vt_ops = &intrmap_ops;
+
+	immu = list_head(listp);
+	for (; immu; immu = list_next(listp, immu)) {
+		mutex_init(&(immu->immu_intrmap_lock), NULL,
+		    MUTEX_DEFAULT, NULL);
+		mutex_enter(&(immu->immu_intrmap_lock));
+		immu->immu_intrmap_setup = B_TRUE;
+		mutex_exit(&(immu->immu_intrmap_lock));
+	}
+}
+
+void
+immu_intrmap_startup(immu_t *immu)
+{
+	/* mark the unit as running if interrupt remapping was set up */
+	mutex_enter(&(immu->immu_intrmap_lock));
+	if (immu->immu_intrmap_setup == B_TRUE) {
+		immu->immu_intrmap_running = B_TRUE;
+	}
+	mutex_exit(&(immu->immu_intrmap_lock));
+}
+
+/*
+ * Register an Intel IOMMU unit's (i.e. DMAR unit's)
+ * interrupt handler
+ */
+void
+immu_intr_register(immu_t *immu)
+{
+	int irq, vect;
+	char intr_handler_name[IMMU_MAXNAMELEN];
+	uint32_t msi_data;
+	uint32_t uaddr;
+	uint32_t msi_addr;
+
+	msi_addr = (MSI_ADDR_HDR |
+	    ((apic_cpus[0].aci_local_id & 0xFF) << MSI_ADDR_DEST_SHIFT) |
+	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
+	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT));
+
+	if (intrmap_apic_mode == LOCAL_X2APIC) {
+		uaddr = (apic_cpus[0].aci_local_id & 0xFFFFFF00);
+	} else {
+		uaddr = 0;
+	}
+
+	/* Don't need to hold immu_intr_lock since we are in boot */
+	irq = psm_get_ipivect(IMMU_INTR_IPL, -1);
+	vect = apic_irq_table[irq]->airq_vector;
+	msi_data = ((MSI_DATA_DELIVERY_FIXED <<
+	    MSI_DATA_DELIVERY_SHIFT) | vect);
+
+	(void) snprintf(intr_handler_name, sizeof (intr_handler_name),
+	    "%s-intr-handler", immu->immu_name);
+
+	(void) add_avintr((void *)NULL, IMMU_INTR_IPL,
+	    (avfunc)(immu_intr_handler), intr_handler_name, irq,
+	    (caddr_t)immu, NULL, NULL, NULL);
+
+	immu_regs_intr_enable(immu, msi_addr, msi_data, uaddr);
+
+	(void) immu_intr_handler(immu);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu_qinv.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,918 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2009, Intel Corporation.
+ * All rights reserved.
+ */
+
+#include <sys/ddi.h>
+#include <sys/archsystm.h>
+#include <vm/hat_i86.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/immu.h>
+
+/* invalidation queue table entry size */
+#define	QINV_ENTRY_SIZE		0x10
+
+/* max value of Queue Size field of Invalidation Queue Address Register */
+#define	QINV_MAX_QUEUE_SIZE	0x7
+
+/* status data size of invalidation wait descriptor */
+#define	QINV_SYNC_DATA_SIZE	0x4
+
+/* status data value of invalidation wait descriptor */
+#define	QINV_SYNC_DATA_FENCE	1
+#define	QINV_SYNC_DATA_UNFENCE	2
+
+/* invalidation queue head and tail */
+#define	QINV_IQA_HEAD(QH)	BITX((QH), 18, 4)
+#define	QINV_IQA_TAIL_SHIFT	4
+
+/* invalidation queue entry structure */
+typedef struct qinv_inv_dsc {
+	uint64_t	lo;
+	uint64_t	hi;
+} qinv_dsc_t;
+
+/*
+ * struct iotlb_pend_node
+ *   the pending data for iotlb flush
+ */
+typedef struct iotlb_pend_node {
+	dvcookie_t	*icn_dvcookies;  /* ptr to dvma cookie array */
+	uint_t		icn_count;  /* valid cookie count */
+	uint_t		icn_array_size;  /* array size */
+	list_node_t	node;
+} qinv_iotlb_pend_node_t;
+
+/*
+ * struct iotlb_pend_head
+ *   the pending head for the iotlb flush
+ */
+typedef struct iotlb_pend_head {
+	/* the pending node cache list */
+	kmutex_t	ich_mem_lock;
+	list_t		ich_mem_list;
+} qinv_iotlb_pend_head_t;
+
+/*
+ * qinv_iotlb_t
+ *   pending data for queued invalidation iotlb flush
+ */
+typedef struct qinv_iotlb {
+	dvcookie_t	*qinv_iotlb_dvcookies;
+	uint_t		qinv_iotlb_count;
+	uint_t		qinv_iotlb_size;
+	list_node_t	qinv_iotlb_node;
+} qinv_iotlb_t;
+
+/* physically contiguous pages for the invalidation queue */
+typedef struct qinv_mem {
+	kmutex_t	   qinv_mem_lock;
+	ddi_dma_handle_t   qinv_mem_dma_hdl;
+	ddi_acc_handle_t   qinv_mem_acc_hdl;
+	caddr_t		   qinv_mem_vaddr;
+	paddr_t		   qinv_mem_paddr;
+	uint_t		   qinv_mem_size;
+	uint16_t	   qinv_mem_head;
+	uint16_t	   qinv_mem_tail;
+} qinv_mem_t;
+
+
+/*
+ * invalidation queue state
+ *   This structure describes the state information of the
+ *   invalidation queue table and related status memory for
+ *   invalidation wait descriptor
+ *
+ * qinv_table		- invalidation queue table
+ * qinv_sync		- sync status memory for invalidation wait descriptor
+ * qinv_iotlb_pend_node	- pending iotlb node
+ */
+typedef struct qinv {
+	qinv_mem_t		qinv_table;
+	qinv_mem_t		qinv_sync;
+	qinv_iotlb_pend_head_t qinv_pend_head;
+	qinv_iotlb_pend_node_t  **qinv_iotlb_pend_node;
+} qinv_t;
+
+
+/* helper macro for making queue invalidation descriptor */
+#define	INV_DSC_TYPE(dsc)	((dsc)->lo & 0xF)
+#define	CC_INV_DSC_HIGH		(0)
+#define	CC_INV_DSC_LOW(fm, sid, did, g)	(((uint64_t)(fm) << 48) | \
+	((uint64_t)(sid) << 32) | \
+	((uint64_t)(did) << 16) | \
+	((uint64_t)(g) << 4) | \
+	1)
+
+#define	IOTLB_INV_DSC_HIGH(addr, ih, am) (((uint64_t)(addr)) | \
+	((uint64_t)(ih) << 6) |	\
+	((uint64_t)(am)))
+
+#define	IOTLB_INV_DSC_LOW(did, dr, dw, g) (((uint64_t)(did) << 16) | \
+	((uint64_t)(dr) << 7) | \
+	((uint64_t)(dw) << 6) | \
+	((uint64_t)(g) << 4) | \
+	2)
+
+#define	DEV_IOTLB_INV_DSC_HIGH(addr, s) (((uint64_t)(addr)) | (s))
+
+#define	DEV_IOTLB_INV_DSC_LOW(sid, max_invs_pd) ( \
+	((uint64_t)(sid) << 32) | \
+	((uint64_t)(max_invs_pd) << 16) | \
+	3)
+
+#define	IEC_INV_DSC_HIGH (0)
+#define	IEC_INV_DSC_LOW(idx, im, g) (((uint64_t)(idx) << 32) | \
+	((uint64_t)(im) << 27) | \
+	((uint64_t)(g) << 4) | \
+	4)
+
+#define	INV_WAIT_DSC_HIGH(saddr) ((uint64_t)(saddr))
+
+#define	INV_WAIT_DSC_LOW(sdata, fn, sw, iflag) (((uint64_t)(sdata) << 32) | \
+	((uint64_t)(fn) << 6) | \
+	((uint64_t)(sw) << 5) | \
+	((uint64_t)(iflag) << 4) | \
+	5)
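+
+/*
+ * The low 4 bits of each descriptor above carry the descriptor type:
+ * 1 = context-cache, 2 = IOTLB, 3 = device-IOTLB, 4 = interrupt entry
+ * cache, 5 = invalidation wait (see qinv_dsc_type[] below).
+ */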
+
+/*
+ * QS field of Invalidation Queue Address Register
+ * the size of invalidation queue is 1 << (qinv_iqa_qs + 8)
+ */
+static uint_t qinv_iqa_qs = 6;
+
+/*
+ * the invalidation descriptor types of the queued invalidation interface
+ */
+static char *qinv_dsc_type[] = {
+	"Reserved",
+	"Context Cache Invalidate Descriptor",
+	"IOTLB Invalidate Descriptor",
+	"Device-IOTLB Invalidate Descriptor",
+	"Interrupt Entry Cache Invalidate Descriptor",
+	"Invalidation Wait Descriptor",
+	"Incorrect queue invalidation type"
+};
+
+#define	QINV_MAX_DSC_TYPE	(sizeof (qinv_dsc_type) / sizeof (char *))
+
+/*
+ * the queued invalidation interface functions
+ */
+static void qinv_submit_inv_dsc(immu_t *immu, qinv_dsc_t *dsc);
+static void qinv_context_common(immu_t *immu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type);
+static void qinv_iotlb_common(immu_t *immu, uint_t domain_id,
+    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type);
+static void qinv_iec_common(immu_t *immu, uint_t iidx,
+    uint_t im, uint_t g);
+static uint_t qinv_alloc_sync_mem_entry(immu_t *immu);
+static void qinv_wait_async_unfence(immu_t *immu,
+    qinv_iotlb_pend_node_t *node);
+static void qinv_wait_sync(immu_t *immu);
+static int qinv_wait_async_finish(immu_t *immu, int *count);
+/*LINTED*/
+static void qinv_wait_async_fence(immu_t *immu);
+/*LINTED*/
+static void qinv_dev_iotlb_common(immu_t *immu, uint16_t sid,
+    uint64_t addr, uint_t size, uint_t max_invs_pd);
+
+
+/* submit invalidation request descriptor to invalidation queue */
+static void
+qinv_submit_inv_dsc(immu_t *immu, qinv_dsc_t *dsc)
+{
+	qinv_t *qinv;
+	qinv_mem_t *qinv_table;
+	uint_t tail;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	qinv_table = &(qinv->qinv_table);
+
+	mutex_enter(&qinv_table->qinv_mem_lock);
+	tail = qinv_table->qinv_mem_tail;
+	qinv_table->qinv_mem_tail++;
+
+	if (qinv_table->qinv_mem_tail == qinv_table->qinv_mem_size)
+		qinv_table->qinv_mem_tail = 0;
+
+	while (qinv_table->qinv_mem_head == qinv_table->qinv_mem_tail) {
+		/*
+		 * inv queue table exhausted, wait for hardware to fetch
+		 * the next descriptor
+		 */
+		qinv_table->qinv_mem_head = QINV_IQA_HEAD(
+		    immu_regs_get64(immu, IMMU_REG_INVAL_QH));
+	}
+
+	bcopy(dsc, qinv_table->qinv_mem_vaddr + tail * QINV_ENTRY_SIZE,
+	    QINV_ENTRY_SIZE);
+
+	immu_regs_put64(immu, IMMU_REG_INVAL_QT,
+	    qinv_table->qinv_mem_tail << QINV_IQA_TAIL_SHIFT);
+
+	mutex_exit(&qinv_table->qinv_mem_lock);
+}
+
+/* queued invalidation interface -- invalidate context cache */
+static void
+qinv_context_common(immu_t *immu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type)
+{
+	qinv_dsc_t dsc;
+
+	dsc.lo = CC_INV_DSC_LOW(function_mask, source_id, domain_id, type);
+	dsc.hi = CC_INV_DSC_HIGH;
+
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/* queued invalidation interface -- invalidate iotlb */
+static void
+qinv_iotlb_common(immu_t *immu, uint_t domain_id,
+    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type)
+{
+	qinv_dsc_t dsc;
+	uint8_t dr = 0;
+	uint8_t dw = 0;
+
+	if (IMMU_CAP_GET_DRD(immu->immu_regs_cap))
+		dr = 1;
+	if (IMMU_CAP_GET_DWD(immu->immu_regs_cap))
+		dw = 1;
+
+	switch (type) {
+	case TLB_INV_G_PAGE:
+		if (!IMMU_CAP_GET_PSI(immu->immu_regs_cap) ||
+		    am > IMMU_CAP_GET_MAMV(immu->immu_regs_cap) ||
+		    addr & IMMU_PAGEOFFSET) {
+			type = TLB_INV_G_DOMAIN;
+			goto qinv_ignore_psi;
+		}
+		dsc.lo = IOTLB_INV_DSC_LOW(domain_id, dr, dw, type);
+		dsc.hi = IOTLB_INV_DSC_HIGH(addr, hint, am);
+		break;
+
+	qinv_ignore_psi:
+	case TLB_INV_G_DOMAIN:
+		dsc.lo = IOTLB_INV_DSC_LOW(domain_id, dr, dw, type);
+		dsc.hi = 0;
+		break;
+
+	case TLB_INV_G_GLOBAL:
+		dsc.lo = IOTLB_INV_DSC_LOW(0, dr, dw, type);
+		dsc.hi = 0;
+		break;
+	default:
+		ddi_err(DER_WARN, NULL, "incorrect iotlb flush type");
+		return;
+	}
+
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/* queued invalidation interface -- invalidate dev_iotlb */
+static void
+qinv_dev_iotlb_common(immu_t *immu, uint16_t sid,
+    uint64_t addr, uint_t size, uint_t max_invs_pd)
+{
+	qinv_dsc_t dsc;
+
+	dsc.lo = DEV_IOTLB_INV_DSC_LOW(sid, max_invs_pd);
+	dsc.hi = DEV_IOTLB_INV_DSC_HIGH(addr, size);
+
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/* queued invalidation interface -- invalidate interrupt entry cache */
+static void
+qinv_iec_common(immu_t *immu, uint_t iidx, uint_t im, uint_t g)
+{
+	qinv_dsc_t dsc;
+
+	dsc.lo = IEC_INV_DSC_LOW(iidx, im, g);
+	dsc.hi = IEC_INV_DSC_HIGH;
+
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/*
+ * allocate a free entry from the sync status table
+ */
+static uint_t
+qinv_alloc_sync_mem_entry(immu_t *immu)
+{
+	qinv_mem_t *sync_mem;
+	uint_t tail;
+	qinv_t *qinv;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	sync_mem = &qinv->qinv_sync;
+
+sync_mem_exhausted:
+	mutex_enter(&sync_mem->qinv_mem_lock);
+	tail = sync_mem->qinv_mem_tail;
+	sync_mem->qinv_mem_tail++;
+	if (sync_mem->qinv_mem_tail == sync_mem->qinv_mem_size)
+		sync_mem->qinv_mem_tail = 0;
+
+	if (sync_mem->qinv_mem_head == sync_mem->qinv_mem_tail) {
+		/* should never happen */
+		ddi_err(DER_WARN, NULL, "sync mem exhausted");
+		sync_mem->qinv_mem_tail = tail;
+		mutex_exit(&sync_mem->qinv_mem_lock);
+		delay(IMMU_ALLOC_RESOURCE_DELAY);
+		goto sync_mem_exhausted;
+	}
+	mutex_exit(&sync_mem->qinv_mem_lock);
+
+	return (tail);
+}
+
+/*
+ * queued invalidation interface -- invalidation wait descriptor
+ *   fence flag not set; status data is used to indicate completion of
+ *   the invalidation wait descriptor
+ */
+static void
+qinv_wait_async_unfence(immu_t *immu, qinv_iotlb_pend_node_t *node)
+{
+	qinv_dsc_t dsc;
+	qinv_mem_t *sync_mem;
+	uint64_t saddr;
+	uint_t tail;
+	qinv_t *qinv;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	sync_mem = &qinv->qinv_sync;
+	tail = qinv_alloc_sync_mem_entry(immu);
+
+	/* plant an iotlb pending node */
+	qinv->qinv_iotlb_pend_node[tail] = node;
+
+	saddr = sync_mem->qinv_mem_paddr + tail * QINV_SYNC_DATA_SIZE;
+
+	/*
+	 * sdata = QINV_SYNC_DATA_UNFENCE, fence = 0, sw = 1, if = 0
+	 * indicate the invalidation wait descriptor completion by
+	 * performing a coherent DWORD write to the status address,
+	 * not by generating an invalidation completion event
+	 */
+	dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_UNFENCE, 0, 1, 0);
+	dsc.hi = INV_WAIT_DSC_HIGH(saddr);
+
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/*
+ * queued invalidation interface -- invalidation wait descriptor
+ *   fence flag set; descriptors following the invalidation
+ *   wait descriptor must be processed by hardware only after the
+ *   invalidation wait descriptor completes.
+ */
+static void
+qinv_wait_async_fence(immu_t *immu)
+{
+	qinv_dsc_t dsc;
+
+	/* sw = 0, fence = 1, iflag = 0 */
+	dsc.lo = INV_WAIT_DSC_LOW(0, 1, 0, 0);
+	dsc.hi = 0;
+	qinv_submit_inv_dsc(immu, &dsc);
+}
+
+/*
+ * queued invalidation interface -- invalidation wait descriptor
+ *   wait until the invalidation request has finished
+ */
+static void
+qinv_wait_sync(immu_t *immu)
+{
+	qinv_dsc_t dsc;
+	qinv_mem_t *sync_mem;
+	uint64_t saddr;
+	uint_t tail;
+	qinv_t *qinv;
+	volatile uint32_t *status;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	sync_mem = &qinv->qinv_sync;
+	tail = qinv_alloc_sync_mem_entry(immu);
+	saddr = sync_mem->qinv_mem_paddr + tail * QINV_SYNC_DATA_SIZE;
+	status = (uint32_t *)(sync_mem->qinv_mem_vaddr + tail *
+	    QINV_SYNC_DATA_SIZE);
+
+	/*
+	 * sdata = QINV_SYNC_DATA_FENCE, fence = 1, sw = 1, if = 0
+	 * indicate the invalidation wait descriptor completion by
+	 * performing a coherent DWORD write to the status address,
+	 * not by generating an invalidation completion event
+	 */
+	dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_FENCE, 1, 1, 0);
+	dsc.hi = INV_WAIT_DSC_HIGH(saddr);
+
+	qinv_submit_inv_dsc(immu, &dsc);
+
+	while ((*status) != QINV_SYNC_DATA_FENCE)
+		iommu_cpu_nop();
+	*status = QINV_SYNC_DATA_UNFENCE;
+}
+
+/* get already completed invalidation wait requests */
+static int
+qinv_wait_async_finish(immu_t *immu, int *cnt)
+{
+	qinv_mem_t *sync_mem;
+	int index;
+	qinv_t *qinv;
+	volatile uint32_t *value;
+
+	ASSERT((*cnt) == 0);
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	sync_mem = &qinv->qinv_sync;
+
+	mutex_enter(&sync_mem->qinv_mem_lock);
+	index = sync_mem->qinv_mem_head;
+	value = (uint32_t *)(sync_mem->qinv_mem_vaddr + index
+	    * QINV_SYNC_DATA_SIZE);
+	while (*value == QINV_SYNC_DATA_UNFENCE) {
+		*value = 0;
+		(*cnt)++;
+		sync_mem->qinv_mem_head++;
+		if (sync_mem->qinv_mem_head == sync_mem->qinv_mem_size) {
+			sync_mem->qinv_mem_head = 0;
+			value = (uint32_t *)(sync_mem->qinv_mem_vaddr);
+		} else
+			value = (uint32_t *)((char *)value +
+			    QINV_SYNC_DATA_SIZE);
+	}
+
+	mutex_exit(&sync_mem->qinv_mem_lock);
+	if ((*cnt) > 0)
+		return (index);
+	else
+		return (-1);
+}
+
+/*
+ * call ddi_dma_mem_alloc to allocate physically contiguous
+ * pages for the invalidation queue table
+ */
+static int
+qinv_setup(immu_t *immu)
+{
+	qinv_t *qinv;
+	size_t size;
+
+	ddi_dma_attr_t qinv_dma_attr = {
+		DMA_ATTR_V0,
+		0U,
+		0xffffffffU,
+		0xffffffffU,
+		MMU_PAGESIZE, /* page aligned */
+		0x1,
+		0x1,
+		0xffffffffU,
+		0xffffffffU,
+		1,
+		4,
+		0
+	};
+
+	ddi_device_acc_attr_t qinv_acc_attr = {
+		DDI_DEVICE_ATTR_V0,
+		DDI_NEVERSWAP_ACC,
+		DDI_STRICTORDER_ACC
+	};
+
+	mutex_init(&(immu->immu_qinv_lock), NULL, MUTEX_DRIVER, NULL);
+
+
+	mutex_enter(&(immu->immu_qinv_lock));
+
+	immu->immu_qinv = NULL;
+	if (!IMMU_ECAP_GET_QI(immu->immu_regs_excap) ||
+	    immu_qinv_enable == B_FALSE) {
+		mutex_exit(&(immu->immu_qinv_lock));
+		return (DDI_SUCCESS);
+	}
+
+	if (qinv_iqa_qs > QINV_MAX_QUEUE_SIZE)
+		qinv_iqa_qs = QINV_MAX_QUEUE_SIZE;
+
+	qinv = kmem_zalloc(sizeof (qinv_t), KM_SLEEP);
+
+	if (ddi_dma_alloc_handle(root_devinfo,
+	    &qinv_dma_attr, DDI_DMA_SLEEP, NULL,
+	    &(qinv->qinv_table.qinv_mem_dma_hdl)) != DDI_SUCCESS) {
+		ddi_err(DER_WARN, root_devinfo,
+		    "alloc invalidation queue table handle failed");
+		goto queue_table_handle_failed;
+	}
+
+	if (ddi_dma_alloc_handle(root_devinfo,
+	    &qinv_dma_attr, DDI_DMA_SLEEP, NULL,
+	    &(qinv->qinv_sync.qinv_mem_dma_hdl)) != DDI_SUCCESS) {
+		ddi_err(DER_WARN, root_devinfo,
+		    "alloc invalidation queue sync mem handle failed");
+		goto sync_table_handle_failed;
+	}
+
+	qinv->qinv_table.qinv_mem_size = (1 << (qinv_iqa_qs + 8));
+	size = qinv->qinv_table.qinv_mem_size * QINV_ENTRY_SIZE;
+
+	/* alloc physically contiguous pages for the invalidation queue */
+	if (ddi_dma_mem_alloc(qinv->qinv_table.qinv_mem_dma_hdl,
+	    size,
+	    &qinv_acc_attr,
+	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
+	    DDI_DMA_SLEEP,
+	    NULL,
+	    &(qinv->qinv_table.qinv_mem_vaddr),
+	    &size,
+	    &(qinv->qinv_table.qinv_mem_acc_hdl)) != DDI_SUCCESS) {
+		ddi_err(DER_WARN, root_devinfo,
+		    "alloc invalidation queue table failed");
+		goto queue_table_mem_failed;
+	}
+
+	ASSERT(!((uintptr_t)qinv->qinv_table.qinv_mem_vaddr & MMU_PAGEOFFSET));
+	bzero(qinv->qinv_table.qinv_mem_vaddr, size);
+
+	/* get the base physical address of invalidation request queue */
+	qinv->qinv_table.qinv_mem_paddr = pfn_to_pa(
+	    hat_getpfnum(kas.a_hat, qinv->qinv_table.qinv_mem_vaddr));
+
+	qinv->qinv_table.qinv_mem_head = qinv->qinv_table.qinv_mem_tail = 0;
+
+	qinv->qinv_sync.qinv_mem_size = qinv->qinv_table.qinv_mem_size;
+	size = qinv->qinv_sync.qinv_mem_size * QINV_SYNC_DATA_SIZE;
+
+	/* alloc status memory for invalidation wait descriptor */
+	if (ddi_dma_mem_alloc(qinv->qinv_sync.qinv_mem_dma_hdl,
+	    size,
+	    &qinv_acc_attr,
+	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
+	    DDI_DMA_SLEEP,
+	    NULL,
+	    &(qinv->qinv_sync.qinv_mem_vaddr),
+	    &size,
+	    &(qinv->qinv_sync.qinv_mem_acc_hdl)) != DDI_SUCCESS) {
+		ddi_err(DER_WARN, root_devinfo,
+		    "alloc invalidation queue sync mem failed");
+		goto sync_table_mem_failed;
+	}
+
+	ASSERT(!((uintptr_t)qinv->qinv_sync.qinv_mem_vaddr & MMU_PAGEOFFSET));
+	bzero(qinv->qinv_sync.qinv_mem_vaddr, size);
+	qinv->qinv_sync.qinv_mem_paddr = pfn_to_pa(
+	    hat_getpfnum(kas.a_hat, qinv->qinv_sync.qinv_mem_vaddr));
+
+	qinv->qinv_sync.qinv_mem_head = qinv->qinv_sync.qinv_mem_tail = 0;
+
+	mutex_init(&(qinv->qinv_table.qinv_mem_lock), NULL, MUTEX_DRIVER, NULL);
+	mutex_init(&(qinv->qinv_sync.qinv_mem_lock), NULL, MUTEX_DRIVER, NULL);
+
+	/*
+	 * init the iotlb pending node array used when submitting
+	 * iotlb invalidation queue requests
+	 */
+	qinv->qinv_iotlb_pend_node = (qinv_iotlb_pend_node_t **)
+	    kmem_zalloc(qinv->qinv_sync.qinv_mem_size
+	    * sizeof (qinv_iotlb_pend_node_t *), KM_SLEEP);
+
+	/* set invalidation queue structure */
+	immu->immu_qinv = qinv;
+
+	mutex_exit(&(immu->immu_qinv_lock));
+
+	return (DDI_SUCCESS);
+
+sync_table_mem_failed:
+	ddi_dma_mem_free(&(qinv->qinv_table.qinv_mem_acc_hdl));
+
+queue_table_mem_failed:
+	ddi_dma_free_handle(&(qinv->qinv_sync.qinv_mem_dma_hdl));
+
+sync_table_handle_failed:
+	ddi_dma_free_handle(&(qinv->qinv_table.qinv_mem_dma_hdl));
+
+queue_table_handle_failed:
+	kmem_free(qinv, sizeof (qinv_t));
+
+	mutex_exit(&(immu->immu_qinv_lock));
+
+	return (DDI_FAILURE);
+}
+
+/*
+ * ###########################################################################
+ *
+ * Functions exported by immu_qinv.c
+ *
+ * ###########################################################################
+ */
+
+/*
+ * initialize invalidation request queue structure.
+ */
+void
+immu_qinv_setup(list_t *listp)
+{
+	immu_t *immu;
+
+	if (immu_qinv_enable == B_FALSE) {
+		return;
+	}
+
+	immu = list_head(listp);
+	for (; immu; immu = list_next(listp, immu)) {
+		if (qinv_setup(immu) == DDI_SUCCESS) {
+			immu->immu_qinv_setup = B_TRUE;
+		}
+	}
+}
+
+void
+immu_qinv_startup(immu_t *immu)
+{
+	qinv_t *qinv;
+	uint64_t qinv_reg_value;
+
+	if (immu->immu_qinv_setup == B_FALSE) {
+		return;
+	}
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	qinv_reg_value = qinv->qinv_table.qinv_mem_paddr | qinv_iqa_qs;
+	immu_regs_qinv_enable(immu, qinv_reg_value);
+	immu->immu_qinv_running = B_TRUE;
+}
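+
+/*
+ * A note on the register value computed above (the base address in this
+ * example is hypothetical): qinv_mem_paddr is page aligned, so ORing in
+ * qinv_iqa_qs is safe.  A queue at physical address 0x12340000 with
+ * qinv_iqa_qs = 6 yields an IQA register value of 0x12340006.
+ */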
+
+/*
+ * queued invalidation interface
+ *   function based context cache invalidation
+ */
+void
+immu_qinv_context_fsi(immu_t *immu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id)
+{
+	qinv_context_common(immu, function_mask, source_id,
+	    domain_id, CTT_INV_G_DEVICE);
+	qinv_wait_sync(immu);
+}
+
+/*
+ * queued invalidation interface
+ *   domain based context cache invalidation
+ */
+void
+immu_qinv_context_dsi(immu_t *immu, uint_t domain_id)
+{
+	qinv_context_common(immu, 0, 0, domain_id, CTT_INV_G_DOMAIN);
+	qinv_wait_sync(immu);
+}
+
+/*
+ * queued invalidation interface
+ *   global context cache invalidation
+ */
+void
+immu_qinv_context_gbl(immu_t *immu)
+{
+	qinv_context_common(immu, 0, 0, 0, CTT_INV_G_GLOBAL);
+	qinv_wait_sync(immu);
+}
+
+/*
+ * queued invalidation interface
+ *   page based iotlb invalidation
+ */
+void
+immu_inv_iotlb_psi(immu_t *immu, uint_t domain_id,
+	uint64_t dvma, uint_t count, uint_t hint)
+{
+	uint_t am = 0;
+	uint_t max_am;
+
+	max_am = IMMU_CAP_GET_MAMV(immu->immu_regs_cap);
+
+	/* choose page selective invalidation */
+	if (IMMU_CAP_GET_PSI(immu->immu_regs_cap)) {
+		while (am <= max_am) {
+			if ((ADDR_AM_OFFSET(IMMU_BTOP(dvma), am) + count)
+			    <= ADDR_AM_MAX(am)) {
+				qinv_iotlb_common(immu, domain_id,
+				    dvma, am, hint, TLB_INV_G_PAGE);
+				break;
+			}
+			am++;
+		}
+		if (am > max_am) {
+			qinv_iotlb_common(immu, domain_id,
+			    dvma, 0, hint, TLB_INV_G_DOMAIN);
+		}
+
+	/* choose domain invalidation */
+	} else {
+		qinv_iotlb_common(immu, domain_id, dvma,
+		    0, hint, TLB_INV_G_DOMAIN);
+	}
+}
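+
+/*
+ * A worked example of the address-mask selection above (a sketch; it
+ * assumes ADDR_AM_OFFSET() yields the page's offset within a 2^am page
+ * aligned region and ADDR_AM_MAX() that region's page count): for a dvma
+ * whose page number is a multiple of 4, count = 4 and max_am >= 2, the
+ * am = 0 and am = 1 regions are too small, so am = 2 is selected and a
+ * single TLB_INV_G_PAGE descriptor covers all four pages.
+ */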
+
+/*
+ * queued invalidation interface
+ *   domain based iotlb invalidation
+ */
+void
+immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id)
+{
+	qinv_iotlb_common(immu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN);
+	qinv_wait_sync(immu);
+}
+
+/*
+ * queued invalidation interface
+ *    global iotlb invalidation
+ */
+void
+immu_qinv_iotlb_gbl(immu_t *immu)
+{
+	qinv_iotlb_common(immu, 0, 0, 0, 0, TLB_INV_G_GLOBAL);
+	qinv_wait_sync(immu);
+}
+
+
+
+/*
+ * the plant wait operation for queued invalidation interface
+ */
+void
+immu_qinv_plant(immu_t *immu, dvcookie_t *dvcookies,
+	uint_t count, uint_t array_size)
+{
+	qinv_t *qinv;
+	qinv_iotlb_pend_node_t *node = NULL;
+	qinv_iotlb_pend_head_t *head;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+
+	head = &(qinv->qinv_pend_head);
+	mutex_enter(&(head->ich_mem_lock));
+	node = list_head(&(head->ich_mem_list));
+	if (node) {
+		list_remove(&(head->ich_mem_list), node);
+	}
+	mutex_exit(&(head->ich_mem_lock));
+
+	/* no cache, alloc one */
+	if (node == NULL) {
+		node = kmem_zalloc(sizeof (qinv_iotlb_pend_node_t), KM_SLEEP);
+	}
+	node->icn_dvcookies = dvcookies;
+	node->icn_count = count;
+	node->icn_array_size = array_size;
+
+	/* plant an invalidation wait descriptor, do not wait for completion */
+	qinv_wait_async_unfence(immu, node);
+}
+
+/*
+ * the reap wait operation for queued invalidation interface
+ */
+void
+immu_qinv_reap(immu_t *immu)
+{
+	int index, cnt = 0;
+	qinv_iotlb_pend_node_t *node;
+	qinv_iotlb_pend_head_t *head;
+	qinv_t *qinv;
+
+	qinv = (qinv_t *)immu->immu_qinv;
+	head = &(qinv->qinv_pend_head);
+
+	index = qinv_wait_async_finish(immu, &cnt);
+
+	while (cnt--) {
+		node = qinv->qinv_iotlb_pend_node[index];
+		if (node == NULL)
+			continue;
+		mutex_enter(&(head->ich_mem_lock));
+		list_insert_head(&(head->ich_mem_list), node);
+		mutex_exit(&(head->ich_mem_lock));
+		qinv->qinv_iotlb_pend_node[index] = NULL;
+		index++;
+		if (index == qinv->qinv_sync.qinv_mem_size)
+			index = 0;
+	}
+}
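+
+/*
+ * Plant/reap flow, in brief: immu_qinv_plant() queues an unfenced wait
+ * descriptor whose status slot the hardware later sets to
+ * QINV_SYNC_DATA_UNFENCE, and records the caller's dvcookie array in
+ * qinv_iotlb_pend_node[].  immu_qinv_reap() then walks the completed
+ * status slots from the head, recycling each pending node onto the
+ * ich_mem_list cache for reuse by a later plant.
+ */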
+
+
+/* queued invalidation interface -- global invalidate interrupt entry cache */
+void
+immu_qinv_intr_global(immu_t *immu)
+{
+	qinv_iec_common(immu, 0, 0, IEC_INV_GLOBAL);
+	qinv_wait_sync(immu);
+}
+
+/* queued invalidation interface -- invalidate single interrupt entry cache */
+void
+immu_qinv_intr_one_cache(immu_t *immu, uint_t iidx)
+{
+	qinv_iec_common(immu, iidx, 0, IEC_INV_INDEX);
+	qinv_wait_sync(immu);
+}
+
+/* queued invalidation interface -- invalidate interrupt entry caches */
+void
+immu_qinv_intr_caches(immu_t *immu, uint_t iidx, uint_t cnt)
+{
+	uint_t	i, mask = 0;
+
+	ASSERT(cnt != 0);
+
+	/* requested interrupt count is not a power of 2 */
+	if (!ISP2(cnt)) {
+		for (i = 0; i < cnt; i++) {
+			qinv_iec_common(immu, iidx + i, 0, IEC_INV_INDEX);
+		}
+		qinv_wait_sync(immu);
+		return;
+	}
+
+	while ((2 << mask) < cnt) {
+		mask++;
+	}
+
+	if (mask > IMMU_ECAP_GET_MHMV(immu->immu_regs_excap)) {
+		for (i = 0; i < cnt; i++) {
+			qinv_iec_common(immu, iidx + i, 0, IEC_INV_INDEX);
+		}
+		qinv_wait_sync(immu);
+		return;
+	}
+
+	qinv_iec_common(immu, iidx, mask, IEC_INV_INDEX);
+
+	qinv_wait_sync(immu);
+}
+
+void
+immu_qinv_report_fault(immu_t *immu)
+{
+	uint16_t head;
+	qinv_dsc_t *dsc;
+	qinv_t *qinv;
+
+	/* access qinv data */
+	mutex_enter(&(immu->immu_qinv_lock));
+
+	qinv = (qinv_t *)(immu->immu_qinv);
+
+	head = QINV_IQA_HEAD(
+	    immu_regs_get64(immu, IMMU_REG_INVAL_QH));
+
+	dsc = (qinv_dsc_t *)(qinv->qinv_table.qinv_mem_vaddr
+	    + (head * QINV_ENTRY_SIZE));
+
+	/* report the error */
+	ddi_err(DER_WARN, immu->immu_dip,
+	    "generated a fault when fetching a descriptor from the "
+	    "invalidation queue, or the fetched descriptor is invalid. "
+	    "The head register is 0x%x, the type is %s",
+	    head,
+	    qinv_dsc_type[MIN(INV_DSC_TYPE(dsc), QINV_MAX_DSC_TYPE - 1)]);
+
+	mutex_exit(&(immu->immu_qinv_lock));
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/io/immu_regs.c	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,851 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * immu_regs.c  - File that operates on an IMMU unit's registers
+ */
+#include <sys/dditypes.h>
+#include <sys/ddi.h>
+#include <sys/archsystm.h>
+#include <sys/x86_archext.h>
+#include <sys/spl.h>
+#include <sys/immu.h>
+
+#define	get_reg32(immu, offset)	ddi_get32((immu)->immu_regs_handle, \
+		(uint32_t *)(immu->immu_regs_addr + (offset)))
+#define	get_reg64(immu, offset)	ddi_get64((immu)->immu_regs_handle, \
+		(uint64_t *)(immu->immu_regs_addr + (offset)))
+#define	put_reg32(immu, offset, val)	ddi_put32\
+		((immu)->immu_regs_handle, \
+		(uint32_t *)(immu->immu_regs_addr + (offset)), val)
+#define	put_reg64(immu, offset, val)	ddi_put64\
+		((immu)->immu_regs_handle, \
+		(uint64_t *)(immu->immu_regs_addr + (offset)), val)
+
+/*
+ * wait max 60s for the hardware completion
+ */
+#define	IMMU_MAX_WAIT_TIME		60000000
+#define	wait_completion(immu, offset, getf, completion, status) \
+{ \
+	clock_t stick = ddi_get_lbolt(); \
+	clock_t ntick; \
+	_NOTE(CONSTCOND) \
+	while (1) { \
+		status = getf(immu, offset); \
+		ntick = ddi_get_lbolt(); \
+		if (completion) { \
+			break; \
+		} \
+		if (ntick - stick >= drv_usectohz(IMMU_MAX_WAIT_TIME)) { \
+			ddi_err(DER_PANIC, NULL, \
+			    "immu wait completion time out");		\
+			/*NOTREACHED*/   \
+		} else { \
+			iommu_cpu_nop();\
+		}\
+	}\
+}
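+
+/*
+ * Typical usage (a sketch; immu_regs_startup() below is a real caller):
+ *
+ *	uint32_t status;
+ *
+ *	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+ *	    immu->immu_regs_cmdval | IMMU_GCMD_TE);
+ *	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+ *	    get_reg32, (status & IMMU_GSTS_TES), status);
+ *
+ * The "completion" expression is re-evaluated against the freshly read
+ * "status" on every iteration; the macro panics via ddi_err(DER_PANIC)
+ * if the condition does not become true within IMMU_MAX_WAIT_TIME.
+ */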
+
+static ddi_device_acc_attr_t immu_regs_attr = {
+	DDI_DEVICE_ATTR_V0,
+	DDI_NEVERSWAP_ACC,
+	DDI_STRICTORDER_ACC,
+};
+
+/*
+ * iotlb_flush()
+ *   flush the iotlb cache
+ */
+static void
+iotlb_flush(immu_t *immu, uint_t domain_id,
+    uint64_t addr, uint_t am, uint_t hint, immu_iotlb_inv_t type)
+{
+	uint64_t command = 0, iva = 0;
+	uint_t iva_offset, iotlb_offset;
+	uint64_t status = 0;
+
+	ASSERT(MUTEX_HELD(&(immu->immu_regs_lock)));
+
+	/* no lock needed since cap and excap fields are RDONLY */
+	iva_offset = IMMU_ECAP_GET_IRO(immu->immu_regs_excap);
+	iotlb_offset = iva_offset + 8;
+
+	/*
+	 * prepare drain read/write command
+	 */
+	if (IMMU_CAP_GET_DWD(immu->immu_regs_cap)) {
+		command |= TLB_INV_DRAIN_WRITE;
+	}
+
+	if (IMMU_CAP_GET_DRD(immu->immu_regs_cap)) {
+		command |= TLB_INV_DRAIN_READ;
+	}
+
+	/*
+	 * if the hardware doesn't support page selective invalidation,
+	 * fall back to domain selective invalidation
+	 */
+	switch (type) {
+	case IOTLB_PSI:
+		if (!IMMU_CAP_GET_PSI(immu->immu_regs_cap) ||
+		    (am > IMMU_CAP_GET_MAMV(immu->immu_regs_cap)) ||
+		    (addr & IMMU_PAGEOFFSET)) {
+			goto ignore_psi;
+		}
+		command |= TLB_INV_PAGE | TLB_INV_IVT |
+		    TLB_INV_DID(domain_id);
+		iva = addr | am | TLB_IVA_HINT(hint);
+		break;
+ignore_psi:
+	case IOTLB_DSI:
+		command |= TLB_INV_DOMAIN | TLB_INV_IVT |
+		    TLB_INV_DID(domain_id);
+		break;
+	case IOTLB_GLOBAL:
+		command |= TLB_INV_GLOBAL | TLB_INV_IVT;
+		break;
+	default:
+		ddi_err(DER_MODE, NULL, "%s: incorrect iotlb flush type",
+		    immu->immu_name);
+		return;
+	}
+
+	/* verify there is no pending command */
+	wait_completion(immu, iotlb_offset, get_reg64,
+	    (!(status & TLB_INV_IVT)), status);
+	if (iva)
+		put_reg64(immu, iva_offset, iva);
+	put_reg64(immu, iotlb_offset, command);
+	wait_completion(immu, iotlb_offset, get_reg64,
+	    (!(status & TLB_INV_IVT)), status);
+}
+
+/*
+ * iotlb_psi()
+ *   iotlb page specific invalidation
+ */
+static void
+iotlb_psi(immu_t *immu, uint_t domain_id,
+    uint64_t dvma, uint_t count, uint_t hint)
+{
+	uint_t am = 0;
+	uint_t max_am = 0;
+	uint64_t align = 0;
+	uint64_t dvma_pg = 0;
+	uint_t used_count = 0;
+
+	mutex_enter(&(immu->immu_regs_lock));
+
+	/* choose page selective invalidation */
+	if (IMMU_CAP_GET_PSI(immu->immu_regs_cap)) {
+		/* MAMV is valid only if PSI is set */
+		max_am = IMMU_CAP_GET_MAMV(immu->immu_regs_cap);
+		while (count != 0) {
+			/* First calculate alignment of DVMA */
+			dvma_pg = IMMU_BTOP(dvma);
+			ASSERT(dvma_pg != NULL);
+			ASSERT(count >= 1);
+			for (align = 1; (dvma_pg & align) == 0; align <<= 1)
+				;
+			/* truncate count to the nearest power of 2 */
+			for (used_count = 1, am = 0; count >> used_count != 0;
+			    used_count <<= 1, am++)
+				;
+			if (am > max_am) {
+				am = max_am;
+				used_count = 1 << am;
+			}
+			if (align >= used_count) {
+				iotlb_flush(immu, domain_id,
+				    dvma, am, hint, IOTLB_PSI);
+			} else {
+				/* align < used_count */
+				used_count = align;
+				for (am = 0; (1 << am) != used_count; am++)
+					;
+				iotlb_flush(immu, domain_id,
+				    dvma, am, hint, IOTLB_PSI);
+			}
+			count -= used_count;
+			dvma = (dvma_pg + used_count) << IMMU_PAGESHIFT;
+		}
+	} else {
+		/* choose domain invalidation */
+		iotlb_flush(immu, domain_id, dvma, 0, 0, IOTLB_DSI);
+	}
+
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+/*
+ * iotlb_dsi()
+ *	domain specific invalidation
+ */
+static void
+iotlb_dsi(immu_t *immu, uint_t domain_id)
+{
+	mutex_enter(&(immu->immu_regs_lock));
+	iotlb_flush(immu, domain_id, 0, 0, 0, IOTLB_DSI);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+/*
+ * iotlb_global()
+ *     global iotlb invalidation
+ */
+static void
+iotlb_global(immu_t *immu)
+{
+	mutex_enter(&(immu->immu_regs_lock));
+	iotlb_flush(immu, 0, 0, 0, 0, IOTLB_GLOBAL);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+
+static int
+gaw2agaw(int gaw)
+{
+	int r, agaw;
+
+	r = (gaw - 12) % 9;
+
+	if (r == 0)
+		agaw = gaw;
+	else
+		agaw = gaw + 9 - r;
+
+	if (agaw > 64)
+		agaw = 64;
+
+	return (agaw);
+}
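+
+/*
+ * For example (a sketch of the rounding): gaw2agaw(48) returns 48 since
+ * (48 - 12) is a multiple of 9, while gaw2agaw(36) rounds up to 39, the
+ * next width that corresponds to a whole number of 9-bit page-table
+ * levels above the 12-bit page offset.
+ */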
+
+/*
+ * set_agaw()
+ * 	calculate the agaw for an IOMMU unit
+ */
+static int
+set_agaw(immu_t *immu)
+{
+	int mgaw, magaw, agaw;
+	uint_t bitpos;
+	int max_sagaw_mask, sagaw_mask, mask;
+	int nlevels;
+
+	/*
+	 * mgaw is the maximum guest address width.
+	 * Addresses above this value will be
+	 * blocked by the IOMMU unit.
+	 * sagaw is a bitmask that lists all the
+	 * AGAWs supported by this IOMMU unit.
+	 */
+	mgaw = IMMU_CAP_MGAW(immu->immu_regs_cap);
+	sagaw_mask = IMMU_CAP_SAGAW(immu->immu_regs_cap);
+
+	magaw = gaw2agaw(mgaw);
+
+	/*
+	 * The maximum SAGAW bitmask is specified by the VT-d spec.
+	 */
+	max_sagaw_mask = ((1 << 5) - 1);
+
+	if (sagaw_mask > max_sagaw_mask) {
+		ddi_err(DER_WARN, NULL, "%s: SAGAW bitmask (%x) "
+		    "is larger than the maximum SAGAW bitmask "
+		    "(%x) specified by the Intel VT-d spec",
+		    immu->immu_name, sagaw_mask, max_sagaw_mask);
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * Find a supported AGAW <= magaw
+	 *
+	 *	sagaw_mask    bitpos   AGAW (bits)  nlevels
+	 *	==============================================
+	 *	0 0 0 0 1	0	30		2
+	 *	0 0 0 1 0	1	39		3
+	 *	0 0 1 0 0	2	48		4
+	 *	0 1 0 0 0	3	57		5
+	 *	1 0 0 0 0	4	64(66)		6
+	 */
+	mask = 1;
+	nlevels = 0;
+	agaw = 0;
+	for (mask = 1, bitpos = 0; bitpos < 5;
+	    bitpos++, mask <<= 1) {
+		if (mask & sagaw_mask) {
+			nlevels = bitpos + 2;
+			agaw = 30 + (bitpos * 9);
+		}
+	}
+
+	/* calculated agaw can be > 64 */
+	agaw = (agaw > 64) ? 64 : agaw;
+
+	if (agaw < 30 || agaw > magaw) {
+		ddi_err(DER_WARN, NULL, "%s: Calculated AGAW (%d) "
+		    "is outside valid limits [30,%d] specified by VT-d spec "
+		    "and magaw", immu->immu_name, agaw, magaw);
+		return (DDI_FAILURE);
+	}
+
+	if (nlevels < 2 || nlevels > 6) {
+		ddi_err(DER_WARN, NULL, "%s: Calculated pagetable "
+		    "level (%d) is outside valid limits [2,6]",
+		    immu->immu_name, nlevels);
+		return (DDI_FAILURE);
+	}
+
+	ddi_err(DER_LOG, NULL, "Calculated pagetable "
+	    "level (%d), agaw = %d", nlevels, agaw);
+
+	immu->immu_dvma_nlevels = nlevels;
+	immu->immu_dvma_agaw = agaw;
+
+	return (DDI_SUCCESS);
+}
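+
+/*
+ * For example (a sketch; the mask value is hypothetical): with
+ * sagaw_mask = 0x4 (only the 48-bit AGAW supported) and mgaw = 48,
+ * magaw is 48, the selection loop in set_agaw() picks bitpos 2, and
+ * the unit is set up with agaw = 48 and a 4-level page table.
+ */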
+
+static int
+setup_regs(immu_t *immu)
+{
+	int error;
+
+	ASSERT(immu);
+	ASSERT(immu->immu_name);
+
+	/*
+	 * This lock may be acquired by the IOMMU interrupt handler
+	 */
+	mutex_init(&(immu->immu_regs_lock), NULL, MUTEX_DRIVER,
+	    (void *)ipltospl(IMMU_INTR_IPL));
+
+	/*
+	 * map the register address space
+	 */
+	error = ddi_regs_map_setup(immu->immu_dip, 0,
+	    (caddr_t *)&(immu->immu_regs_addr), (offset_t)0,
+	    (offset_t)IMMU_REGSZ, &immu_regs_attr,
+	    &(immu->immu_regs_handle));
+
+	if (error == DDI_FAILURE) {
+		ddi_err(DER_WARN, NULL, "%s: Intel IOMMU register map failed",
+		    immu->immu_name);
+		mutex_destroy(&(immu->immu_regs_lock));
+		return (DDI_FAILURE);
+	}
+
+	/*
+	 * get the register value
+	 */
+	immu->immu_regs_cap = get_reg64(immu, IMMU_REG_CAP);
+	immu->immu_regs_excap = get_reg64(immu, IMMU_REG_EXCAP);
+
+	/*
+	 * if the hardware access is non-coherent, we need clflush
+	 */
+	if (IMMU_ECAP_GET_C(immu->immu_regs_excap)) {
+		immu->immu_dvma_coherent = B_TRUE;
+	} else {
+		immu->immu_dvma_coherent = B_FALSE;
+		if (!(x86_feature & X86_CLFSH)) {
+			ddi_err(DER_WARN, NULL,
+			    "immu unit %s can't be enabled due to "
+			    "missing clflush functionality", immu->immu_name);
+			ddi_regs_map_free(&(immu->immu_regs_handle));
+			mutex_destroy(&(immu->immu_regs_lock));
+			return (DDI_FAILURE);
+		}
+	}
+
+	/*
+	 * Check for Mobile 4 series chipset
+	 */
+	if (immu_quirk_mobile4 == B_TRUE &&
+	    !IMMU_CAP_GET_RWBF(immu->immu_regs_cap)) {
+		ddi_err(DER_LOG, NULL,
+		    "IMMU: Mobile 4 chipset quirk detected. "
+		    "Force-setting RWBF");
+		IMMU_CAP_SET_RWBF(immu->immu_regs_cap);
+		ASSERT(IMMU_CAP_GET_RWBF(immu->immu_regs_cap));
+	}
+
+	/*
+	 * retrieve the maximum number of domains
+	 */
+	immu->immu_max_domains = IMMU_CAP_ND(immu->immu_regs_cap);
+
+	/*
+	 * calculate the agaw
+	 */
+	if (set_agaw(immu) != DDI_SUCCESS) {
+		ddi_regs_map_free(&(immu->immu_regs_handle));
+		mutex_destroy(&(immu->immu_regs_lock));
+		return (DDI_FAILURE);
+	}
+	immu->immu_regs_cmdval = 0;
+
+	return (DDI_SUCCESS);
+}
+
+/* ############### Functions exported ################## */
+
+/*
+ * immu_regs_setup()
+ *       Set up mappings to an IMMU unit's registers
+ *       so that they can be read/written
+ */
+void
+immu_regs_setup(list_t *listp)
+{
+	int i;
+	immu_t *immu;
+
+	for (i = 0; i < IMMU_MAXSEG; i++) {
+		immu = list_head(listp);
+		for (; immu; immu = list_next(listp, immu)) {
+			/* do your best, continue on error */
+			if (setup_regs(immu) != DDI_SUCCESS) {
+				immu->immu_regs_setup = B_FALSE;
+			} else {
+				immu->immu_regs_setup = B_TRUE;
+			}
+		}
+	}
+}
+
+/*
+ * immu_regs_resume()
+ */
+int
+immu_regs_resume(immu_t *immu)
+{
+	int error;
+
+	/*
+	 * remap the register address space
+	 */
+	error = ddi_regs_map_setup(immu->immu_dip, 0,
+	    (caddr_t *)&(immu->immu_regs_addr), (offset_t)0,
+	    (offset_t)IMMU_REGSZ, &immu_regs_attr,
+	    &(immu->immu_regs_handle));
+	if (error != DDI_SUCCESS) {
+		return (DDI_FAILURE);
+	}
+
+	immu_regs_set_root_table(immu);
+
+	immu_regs_intr_enable(immu, immu->immu_regs_intr_msi_addr,
+	    immu->immu_regs_intr_msi_data, immu->immu_regs_intr_uaddr);
+
+	(void) immu_intr_handler(immu);
+
+	immu_regs_intrmap_enable(immu, immu->immu_intrmap_irta_reg);
+
+	immu_regs_qinv_enable(immu, immu->immu_qinv_reg_value);
+
+
+	return (error);
+}
+
+/*
+ * immu_regs_suspend()
+ */
+void
+immu_regs_suspend(immu_t *immu)
+{
+
+	immu->immu_intrmap_running = B_FALSE;
+
+	/* Finally, unmap the regs */
+	ddi_regs_map_free(&(immu->immu_regs_handle));
+}
+
+/*
+ * immu_regs_startup()
+ *	set an IMMU unit's registers to start up the unit
+ */
+void
+immu_regs_startup(immu_t *immu)
+{
+	uint32_t status;
+
+	if (immu->immu_regs_setup == B_FALSE) {
+		return;
+	}
+
+	ASSERT(immu->immu_regs_running == B_FALSE);
+
+	ASSERT(MUTEX_HELD(&(immu->immu_lock)));
+
+	mutex_enter(&(immu->immu_regs_lock));
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_TE);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_TES), status);
+	immu->immu_regs_cmdval |= IMMU_GCMD_TE;
+	immu->immu_regs_running = B_TRUE;
+	mutex_exit(&(immu->immu_regs_lock));
+
+	ddi_err(DER_NOTE, NULL, "IMMU %s running", immu->immu_name);
+}
+
+/*
+ * immu_regs_shutdown()
+ *	shutdown a unit
+ */
+void
+immu_regs_shutdown(immu_t *immu)
+{
+	uint32_t status;
+
+	if (immu->immu_regs_running == B_FALSE) {
+		return;
+	}
+
+	ASSERT(immu->immu_regs_setup == B_TRUE);
+
+	ASSERT(MUTEX_HELD(&(immu->immu_lock)));
+
+	mutex_enter(&(immu->immu_regs_lock));
+	immu->immu_regs_cmdval &= ~IMMU_GCMD_TE;
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, !(status & IMMU_GSTS_TES), status);
+	immu->immu_regs_running = B_FALSE;
+	mutex_exit(&(immu->immu_regs_lock));
+
+	ddi_err(DER_NOTE, NULL, "IOMMU %s stopped", immu->immu_name);
+}
+
+/*
+ * immu_regs_intr_enable()
+ *        Set an IMMU unit's registers to set up the unit's
+ *        fault-event interrupt
+ */
+void
+immu_regs_intr_enable(immu_t *immu, uint32_t msi_addr, uint32_t msi_data,
+    uint32_t uaddr)
+{
+	mutex_enter(&(immu->immu_regs_lock));
+	immu->immu_regs_intr_msi_addr = msi_addr;
+	immu->immu_regs_intr_uaddr = uaddr;
+	immu->immu_regs_intr_msi_data = msi_data;
+	put_reg32(immu, IMMU_REG_FEVNT_ADDR, msi_addr);
+	put_reg32(immu, IMMU_REG_FEVNT_UADDR, uaddr);
+	put_reg32(immu, IMMU_REG_FEVNT_DATA, msi_data);
+	put_reg32(immu, IMMU_REG_FEVNT_CON, 0);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+/*
+ * immu_regs_passthru_supported()
+ *       Returns B_TRUE if passthru is supported
+ */
+boolean_t
+immu_regs_passthru_supported(immu_t *immu)
+{
+	if (IMMU_ECAP_GET_PT(immu->immu_regs_excap)) {
+		return (B_TRUE);
+	}
+
+	ddi_err(DER_WARN, NULL, "Passthru not supported");
+	return (B_FALSE);
+}
+
+/*
+ * immu_regs_is_TM_reserved()
+ *       Returns B_TRUE if TM field is reserved
+ */
+boolean_t
+immu_regs_is_TM_reserved(immu_t *immu)
+{
+	if (IMMU_ECAP_GET_DI(immu->immu_regs_excap) ||
+	    IMMU_ECAP_GET_CH(immu->immu_regs_excap)) {
+		return (B_FALSE);
+	}
+	return (B_TRUE);
+}
+
+/*
+ * immu_regs_is_SNP_reserved()
+ *       Returns B_TRUE if SNP field is reserved
+ */
+boolean_t
+immu_regs_is_SNP_reserved(immu_t *immu)
+{
+
+	return (IMMU_ECAP_GET_SC(immu->immu_regs_excap) ? B_FALSE : B_TRUE);
+}
+
+/*
+ * immu_regs_wbf_flush()
+ *     If required and supported, write to IMMU
+ *     unit's regs to flush DMA write buffer(s)
+ */
+void
+immu_regs_wbf_flush(immu_t *immu)
+{
+	uint32_t status;
+
+	if (!IMMU_CAP_GET_RWBF(immu->immu_regs_cap)) {
+		return;
+	}
+
+	mutex_enter(&(immu->immu_regs_lock));
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_WBF);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (!(status & IMMU_GSTS_WBFS)), status);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+/*
+ * immu_regs_cpu_flush()
+ * 	flush the cpu cache lines after CPU memory writes, so
+ *      the IOMMU can see the writes
+ */
+void
+immu_regs_cpu_flush(immu_t *immu, caddr_t addr, uint_t size)
+{
+	uint_t i;
+
+	ASSERT(immu);
+
+	if (immu->immu_dvma_coherent == B_TRUE)
+		return;
+
+	for (i = 0; i < size; i += x86_clflush_size) {
+		clflush_insn(addr+i);
+	}
+
+	mfence_insn();
+}
+
+void
+immu_regs_iotlb_flush(immu_t *immu, uint_t domainid, uint64_t dvma,
+    uint64_t count, uint_t hint, immu_iotlb_inv_t type)
+{
+	ASSERT(immu);
+
+	switch (type) {
+	case IOTLB_PSI:
+		ASSERT(domainid > 0);
+		ASSERT(dvma > 0);
+		ASSERT(count > 0);
+		iotlb_psi(immu, domainid, dvma, count, hint);
+		break;
+	case IOTLB_DSI:
+		ASSERT(domainid > 0);
+		ASSERT(dvma == 0);
+		ASSERT(count == 0);
+		ASSERT(hint == 0);
+		iotlb_dsi(immu, domainid);
+		break;
+	case IOTLB_GLOBAL:
+		ASSERT(domainid == 0);
+		ASSERT(dvma == 0);
+		ASSERT(count == 0);
+		ASSERT(hint == 0);
+		iotlb_global(immu);
+		break;
+	default:
+		ddi_err(DER_PANIC, NULL, "invalid IOTLB invalidation type: %d",
+		    type);
+		/*NOTREACHED*/
+	}
+}
+
+/*
+ * immu_regs_context_flush()
+ *   flush the context cache
+ */
+void
+immu_regs_context_flush(immu_t *immu, uint8_t function_mask,
+    uint16_t sid, uint_t did, immu_context_inv_t type)
+{
+	uint64_t command = 0;
+	uint64_t status;
+
+	ASSERT(immu);
+	ASSERT(rw_write_held(&(immu->immu_ctx_rwlock)));
+
+	/*
+	 * define the command
+	 */
+	switch (type) {
+	case CONTEXT_FSI:
+		command |= CCMD_INV_ICC | CCMD_INV_DEVICE
+		    | CCMD_INV_DID(did)
+		    | CCMD_INV_SID(sid) | CCMD_INV_FM(function_mask);
+		break;
+	case CONTEXT_DSI:
+		ASSERT(function_mask == 0);
+		ASSERT(sid == 0);
+		command |= CCMD_INV_ICC | CCMD_INV_DOMAIN
+		    | CCMD_INV_DID(did);
+		break;
+	case CONTEXT_GLOBAL:
+		ASSERT(function_mask == 0);
+		ASSERT(sid == 0);
+		ASSERT(did == 0);
+		command |= CCMD_INV_ICC | CCMD_INV_GLOBAL;
+		break;
+	default:
+		ddi_err(DER_PANIC, NULL,
+		    "%s: incorrect context cache flush type",
+		    immu->immu_name);
+		/*NOTREACHED*/
+	}
+
+	mutex_enter(&(immu->immu_regs_lock));
+	/* verify there is no pending command */
+	wait_completion(immu, IMMU_REG_CONTEXT_CMD, get_reg64,
+	    (!(status & CCMD_INV_ICC)), status);
+	put_reg64(immu, IMMU_REG_CONTEXT_CMD, command);
+	wait_completion(immu, IMMU_REG_CONTEXT_CMD, get_reg64,
+	    (!(status & CCMD_INV_ICC)), status);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+void
+immu_regs_set_root_table(immu_t *immu)
+{
+	uint32_t status;
+
+	mutex_enter(&(immu->immu_regs_lock));
+	put_reg64(immu, IMMU_REG_ROOTENTRY,
+	    immu->immu_ctx_root->hwpg_paddr);
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_SRTP);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_RTPS), status);
+	mutex_exit(&(immu->immu_regs_lock));
+}
+
+
+/* enable queued invalidation interface */
+void
+immu_regs_qinv_enable(immu_t *immu, uint64_t qinv_reg_value)
+{
+	uint32_t status;
+
+	if (immu_qinv_enable == B_FALSE)
+		return;
+
+	mutex_enter(&immu->immu_regs_lock);
+	immu->immu_qinv_reg_value = qinv_reg_value;
+	/* Initialize the Invalidation Queue Tail register to zero */
+	put_reg64(immu, IMMU_REG_INVAL_QT, 0);
+
+	/* set invalidation queue base address register */
+	put_reg64(immu, IMMU_REG_INVAL_QAR, qinv_reg_value);
+
+	/* enable queued invalidation interface */
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_QIE);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_QIES), status);
+	mutex_exit(&immu->immu_regs_lock);
+
+	immu->immu_regs_cmdval |= IMMU_GCMD_QIE;
+	immu->immu_qinv_running = B_TRUE;
+
+}
+
+/* enable interrupt remapping hardware unit */
+void
+immu_regs_intrmap_enable(immu_t *immu, uint64_t irta_reg)
+{
+	uint32_t status;
+
+	if (immu_intrmap_enable == B_FALSE)
+		return;
+
+	/* set interrupt remap table pointer */
+	mutex_enter(&(immu->immu_regs_lock));
+	immu->immu_intrmap_irta_reg = irta_reg;
+	put_reg64(immu, IMMU_REG_IRTAR, irta_reg);
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_SIRTP);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_IRTPS), status);
+	mutex_exit(&(immu->immu_regs_lock));
+
+	/* global flush intr entry cache */
+	if (immu_qinv_enable == B_TRUE)
+		immu_qinv_intr_global(immu);
+
+	/* enable interrupt remapping */
+	mutex_enter(&(immu->immu_regs_lock));
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_IRE);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_IRES),
+	    status);
+	immu->immu_regs_cmdval |= IMMU_GCMD_IRE;
+
+	/* set compatible mode */
+	put_reg32(immu, IMMU_REG_GLOBAL_CMD,
+	    immu->immu_regs_cmdval | IMMU_GCMD_CFI);
+	wait_completion(immu, IMMU_REG_GLOBAL_STS,
+	    get_reg32, (status & IMMU_GSTS_CFIS),
+	    status);
+	immu->immu_regs_cmdval |= IMMU_GCMD_CFI;
+	mutex_exit(&(immu->immu_regs_lock));
+
+	immu->immu_intrmap_running = B_TRUE;
+}
+
+uint64_t
+immu_regs_get64(immu_t *immu, uint_t reg)
+{
+	return (get_reg64(immu, reg));
+}
+
+uint32_t
+immu_regs_get32(immu_t *immu, uint_t reg)
+{
+	return (get_reg32(immu, reg));
+}
+
+void
+immu_regs_put64(immu_t *immu, uint_t reg, uint64_t val)
+{
+	put_reg64(immu, reg, val);
+}
+
+void
+immu_regs_put32(immu_t *immu, uint_t reg, uint32_t val)
+{
+	put_reg32(immu, reg, val);
+}
--- a/usr/src/uts/i86pc/io/intel_iommu.c	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4939 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-/*
- * Copyright (c) 2009, Intel Corporation.
- * All rights reserved.
- */
-
-/*
- * Intel IOMMU implementation
- */
-#include <sys/conf.h>
-#include <sys/modctl.h>
-#include <sys/pci.h>
-#include <sys/pci_impl.h>
-#include <sys/sysmacros.h>
-#include <sys/ddi.h>
-#include <sys/ddidmareq.h>
-#include <sys/ddi_impldefs.h>
-#include <sys/ddifm.h>
-#include <sys/sunndi.h>
-#include <sys/debug.h>
-#include <sys/fm/protocol.h>
-#include <sys/note.h>
-#include <sys/apic.h>
-#include <vm/hat_i86.h>
-#include <sys/smp_impldefs.h>
-#include <sys/spl.h>
-#include <sys/archsystm.h>
-#include <sys/x86_archext.h>
-#include <sys/rootnex.h>
-#include <sys/avl.h>
-#include <sys/bootconf.h>
-#include <sys/bootinfo.h>
-#include <sys/intel_iommu.h>
-#include <sys/atomic.h>
-#include <sys/iommulib.h>
-#include <sys/memlist.h>
-#include <sys/pcie.h>
-#include <sys/pci_cfgspace.h>
-
-/*
- * Macros based on PCI spec
- */
-#define	GET_DEV(devfn)	(devfn >> 3)		/* get device from devicefunc */
-#define	GET_FUNC(devfn)	(devfn & 7)		/* get func from devicefunc */
-#define	GET_DEVFUNC(d, f)	(((d) << 3) | (f)) /* create devicefunc */
-#define	REV2CLASS(r)	((r) >> 8)		/* Get classcode from revid */
-#define	CLASS2BASE(c)	((c) >> 16)		/* baseclass from classcode */
-#define	CLASS2SUB(c)	(((c) >> 8) & 0xff);	/* subclass from classcode */
-
-static boolean_t drhd_only_for_gfx(intel_iommu_state_t *iommu);
-static void iommu_bringup_unit(intel_iommu_state_t *iommu);
-
-/*
- * Are we on a Mobile 4 Series Chipset
- */
-static int mobile4_cs = 0;
-
-/*
- * Activate usb workaround for some Mobile 4 Series Chipset based platforms
- * On Toshiba laptops, its observed that usb devices appear to
- * read physical page 0. If we enable RW access via iommu, system doesnt
- * hang, otherwise the system hangs when the last include-all engine is
- * enabled for translation.
- * This happens only when enabling legacy emulation mode.
- */
-static int usb_page0_quirk = 1;
-static int usb_fullpa_quirk = 0;
-static int usb_rmrr_quirk = 1;
-
-/*
- * internal variables
- *   iommu_states	- the list of iommu
- *   domain_states	- the list of domain
- *   rmrr_states	- the list of rmrr
- *   page_num		- the count of pages for iommu page tables
- */
-static list_t iommu_states;
-static list_t domain_states;
-static list_t rmrr_states;
-static uint_t page_num;
-
-/*
- * record some frequently used dips
- */
-static dev_info_t *root_devinfo = NULL;
-static dev_info_t *lpc_devinfo = NULL;
-
-/*
- * A single element in the BDF based cache of private structs
- */
-typedef struct bdf_private_entry {
-	int			bpe_seg;
-	int			bpe_bus;
-	int			bpe_devfcn;
-	iommu_private_t		*bpe_private;
-	struct bdf_private_entry	*bpe_next;
-} bdf_private_entry_t;
-
-/*
- * Head of the BDF based cache of private structs
- */
-typedef struct bdf_private_cache {
-	kmutex_t		bpc_lock;
-	bdf_private_entry_t	*bpc_cache;
-} bdf_private_cache_t;
-
-static bdf_private_cache_t bdf_private_cache;
-
-/*
- * dvma cache related variables
- */
-static uint_t dvma_cache_high = 64;
-static dvma_cookie_head_t cookie_cache[MAX_COOKIE_CACHE_SIZE];
-
-/* ioapic info for interrupt remapping */
-static ioapic_iommu_info_t *ioapic_iommu_infos[MAX_IO_APIC];
-
-/*
- * switch to turn on/off the gfx dma remapping unit,
- * this is used when there is a dedicated drhd for the
- * gfx
- */
-int gfx_drhd_disable = 0;
-static dev_info_t *gfx_devinfo = NULL;
-
-/*
- * switch to disable dmar remapping unit, even the initiation work has
- * been finished
- */
-int dmar_drhd_disable = 0;
-
-/*
- * switch to disable queued invalidation interface/interrupt remapping
- */
-int qinv_disable = 0;
-int intrr_disable = 0;
-
-static char *dmar_fault_reason[] = {
-	"Reserved",
-	"The present field in root-entry is Clear",
-	"The present field in context-entry is Clear",
-	"Hardware detected invalid programming of a context-entry",
-	"The DMA request attempted to access an address beyond max support",
-	"The Write field in a page-table entry is Clear when DMA write",
-	"The Read field in a page-table entry is Clear when DMA read",
-	"Access the next level page table resulted in error",
-	"Access the root-entry table resulted in error",
-	"Access the context-entry table resulted in error",
-	"Reserved field not initialized to zero in a present root-entry",
-	"Reserved field not initialized to zero in a present context-entry",
-	"Reserved field not initialized to zero in a present page-table entry",
-	"DMA blocked due to the Translation Type field in context-entry",
-	"Incorrect fault event reason number"
-};
-
-#define	DMAR_MAX_REASON_NUMBER	(14)
-
-#define	IOMMU_IOVPTE_TABLE_SIZE	(IOMMU_LEVEL_SIZE * sizeof (struct iovpte))
-
-/*
- * Check if the device has mobile 4 chipset quirk
- */
-static int
-check_hwquirk_walk(dev_info_t *dip, void *arg)
-{
-	_NOTE(ARGUNUSED(arg))
-	int vendor_id, device_id;
-
-	vendor_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
-	    "vendor-id", -1);
-	device_id = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
-	    "device-id", -1);
-
-	if (vendor_id == 0x8086 && device_id == 0x2a40) {
-		mobile4_cs = 1;
-		return (DDI_WALK_TERMINATE);
-	} else {
-		return (DDI_WALK_CONTINUE);
-	}
-}
-
-static void
-check_hwquirk(void)
-{
-	int count;
-
-	/*
-	 * walk through the entire device tree
-	 */
-	ndi_devi_enter(root_devinfo, &count);
-	ddi_walk_devs(ddi_get_child(root_devinfo), check_hwquirk_walk, NULL);
-	ndi_devi_exit(root_devinfo, count);
-}
-
-#define	IOMMU_ALLOC_RESOURCE_DELAY	drv_usectohz(5000)
-
-/*
- * QS field of Invalidation Queue Address Register
- * the size of invalidation queue is 1 << (qinv_iqa_qs + 8)
- */
-static uint_t qinv_iqa_qs = 6;
-
-/*
- * the invalidate desctiptor type of queued invalidation interface
- */
-static char *qinv_dsc_type[] = {
-	"Reserved",
-	"Context Cache Invalidate Descriptor",
-	"IOTLB Invalidate Descriptor",
-	"Device-IOTLB Invalidate Descriptor",
-	"Interrupt Entry Cache Invalidate Descriptor",
-	"Invalidation Wait Descriptor",
-	"Incorrect queue invalidation type"
-};
-
-#define	QINV_MAX_DSC_TYPE	(6)
-
-/*
- * S field of the Interrupt Remapping Table Address Register
- * the size of the interrupt remapping table is 1 << (intrr_irta_s + 1)
- */
-static uint_t intrr_irta_s = INTRR_MAX_IRTA_SIZE;
-
-/*
- * If true, arrange to suppress broadcast EOI by setting edge-triggered mode
- * even for level-triggered interrupts in the interrupt-remapping engine.
- * If false, broadcast EOI can still be suppressed if the CPU supports the
- * APIC_SVR_SUPPRESS_BROADCAST_EOI bit.  In both cases, the IOAPIC is still
- * programmed with the correct trigger mode, and pcplusmp must send an EOI
- * to the IOAPIC by writing to the IOAPIC's EOI register to make up for the
- * missing broadcast EOI.
- */
-static int intrr_suppress_brdcst_eoi = 0;
-
-/*
- * whether verify the source id of interrupt request
- */
-static int intrr_enable_sid_verify = 0;
-
-/* the fault reason for interrupt remapping */
-static char *intrr_fault_reason[] = {
-	"reserved field set in IRTE",
-	"interrupt_index exceed the intr-remap table size",
-	"present field in IRTE is clear",
-	"hardware access intr-remap table address resulted in error",
-	"reserved field set in IRTE, inlcude various conditional",
-	"hardware blocked an interrupt request in Compatibility format",
-	"remappable interrupt request blocked due to verification failure"
-};
-
-#define	INTRR_MAX_REASON_NUMBER	(6)
-
-/*
- * the queued invalidation interface functions
- */
-static int iommu_qinv_init(intel_iommu_state_t *iommu);
-static void iommu_qinv_fini(intel_iommu_state_t *iommu);
-static void iommu_qinv_enable(intel_iommu_state_t *iommu);
-static void qinv_submit_inv_dsc(intel_iommu_state_t *iommu, inv_dsc_t *dsc);
-static void qinv_cc_common(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type);
-static void qinv_iotlb_common(intel_iommu_state_t *iommu, uint_t domain_id,
-    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type);
-static void qinv_iec_common(intel_iommu_state_t *iommu, uint_t iidx,
-    uint_t im, uint_t g);
-static void qinv_iec_global(intel_iommu_state_t *iommu);
-static void qinv_iec_single(intel_iommu_state_t *iommu, uint_t iidx);
-static void qinv_iec(intel_iommu_state_t *iommu, uint_t iidx, uint_t cnt);
-static uint_t qinv_alloc_sync_mem_entry(intel_iommu_state_t *iommu);
-static void qinv_wait_async_unfence(intel_iommu_state_t *iommu,
-    iotlb_pend_node_t *node);
-static void qinv_wait_sync(intel_iommu_state_t *iommu);
-static int qinv_wait_async_finish(intel_iommu_state_t *iommu, int *count);
-static void qinv_cc_fsi(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id);
-static void qinv_cc_dsi(intel_iommu_state_t *iommu, uint_t domain_id);
-static void qinv_cc_gbl(intel_iommu_state_t *iommu);
-static void qinv_iotlb_psi(intel_iommu_state_t *iommu, uint_t domain_id,
-    uint64_t dvma, uint_t count, uint_t hint);
-static void qinv_iotlb_dsi(intel_iommu_state_t *iommu, uint_t domain_id);
-static void qinv_iotlb_gbl(intel_iommu_state_t *iommu);
-static void qinv_plant_wait(intel_iommu_state_t *iommu,
-    iommu_dvma_cookie_t *dcookies, uint_t count, uint_t array_size);
-static void qinv_reap_wait(intel_iommu_state_t *iommu);
-
-/*LINTED*/
-static void qinv_wait_async_fence(intel_iommu_state_t *iommu);
-/*LINTED*/
-static void qinv_dev_iotlb_common(intel_iommu_state_t *iommu, uint16_t sid,
-    uint64_t addr, uint_t size, uint_t max_invs_pd);
-
-/* interrupt remapping related functions */
-static int intr_remap_init_unit(intel_iommu_state_t *iommu);
-static void intr_remap_fini_unit(intel_iommu_state_t *iommu);
-static void intr_remap_enable_unit(intel_iommu_state_t *iommu);
-static uint_t bitset_find_free(bitset_t *, uint_t);
-static uint_t bitset_find_multi_free(bitset_t *, uint_t, uint_t);
-static int intrr_tbl_alloc_entry(intr_remap_tbl_state_t *);
-static int intrr_tbl_alloc_multi_entries(intr_remap_tbl_state_t *, uint_t);
-static void get_ioapic_iommu_info(void);
-static void intr_remap_get_iommu(apic_irq_t *);
-static void intr_remap_get_sid(apic_irq_t *);
-
-static int intr_remap_init(int);
-static void intr_remap_enable(int);
-static void intr_remap_alloc_entry(apic_irq_t *);
-static void intr_remap_map_entry(apic_irq_t *, void *);
-static void intr_remap_free_entry(apic_irq_t *);
-static void intr_remap_record_rdt(apic_irq_t *, ioapic_rdt_t *);
-static void intr_remap_record_msi(apic_irq_t *, msi_regs_t *);
-
-static struct apic_intrr_ops intr_remap_ops = {
-	intr_remap_init,
-	intr_remap_enable,
-	intr_remap_alloc_entry,
-	intr_remap_map_entry,
-	intr_remap_free_entry,
-	intr_remap_record_rdt,
-	intr_remap_record_msi,
-};
-
-/* apic mode, APIC/X2APIC */
-static int intrr_apic_mode = LOCAL_APIC;
-
-/*
- * cpu_clflush()
- *   flush the cpu cache line
- */
-static void
-cpu_clflush(caddr_t addr, uint_t size)
-{
-	uint_t i;
-
-	for (i = 0; i < size; i += x86_clflush_size) {
-		clflush_insn(addr+i);
-	}
-
-	mfence_insn();
-}
-
-/*
- * iommu_page_init()
- *   do some init work for the iommu page allocator
- */
-static void
-iommu_page_init(void)
-{
-	page_num = 0;
-}
-
-/*
- * iommu_get_page()
- *   get a 4k iommu page, and zero out it
- */
-static paddr_t
-iommu_get_page(intel_iommu_state_t *iommu, int kmflag)
-{
-	iommu_pghdl_t *pghdl;
-	caddr_t vaddr;
-
-	pghdl = iommu_page_alloc(iommu, kmflag);
-	vaddr = pghdl->vaddr;
-	bzero(vaddr, IOMMU_PAGE_SIZE);
-	iommu->iu_dmar_ops->do_clflush(vaddr, IOMMU_PAGE_SIZE);
-
-	page_num++;
-
-	return (pghdl->paddr);
-}
-
-/*
- * iommu_free_page()
- *   free the iommu page allocated with iommu_get_page
- */
-static void
-iommu_free_page(intel_iommu_state_t *iommu, paddr_t paddr)
-{
-	iommu_page_free(iommu, paddr);
-	page_num--;
-}
-
-#define	iommu_get_reg32(iommu, offset)	ddi_get32((iommu)->iu_reg_handle, \
-		(uint32_t *)(iommu->iu_reg_address + (offset)))
-#define	iommu_get_reg64(iommu, offset)	ddi_get64((iommu)->iu_reg_handle, \
-		(uint64_t *)(iommu->iu_reg_address + (offset)))
-#define	iommu_put_reg32(iommu, offset, val)	ddi_put32\
-		((iommu)->iu_reg_handle, \
-		(uint32_t *)(iommu->iu_reg_address + (offset)), val)
-#define	iommu_put_reg64(iommu, offset, val)	ddi_put64\
-		((iommu)->iu_reg_handle, \
-		(uint64_t *)(iommu->iu_reg_address + (offset)), val)
-
-/*
- * calculate_agaw()
- *   calculate agaw from gaw
- */
-static int
-calculate_agaw(int gaw)
-{
-	int r, agaw;
-
-	r = (gaw - 12) % 9;
-
-	if (r == 0)
-		agaw = gaw;
-	else
-		agaw = gaw + 9 - r;
-
-	if (agaw > 64)
-		agaw = 64;
-
-	return (agaw);
-}
-
-/*
- * destroy_iommu_state()
- *   destory an iommu state
- */
-static void
-destroy_iommu_state(intel_iommu_state_t *iommu)
-{
-	iommu_free_page(iommu, iommu->iu_root_entry_paddr);
-	iommu_rscs_fini(&(iommu->iu_domain_id_hdl));
-	mutex_destroy(&(iommu->iu_reg_lock));
-	mutex_destroy(&(iommu->iu_root_context_lock));
-	ddi_regs_map_free(&(iommu->iu_reg_handle));
-	kmem_free(iommu->iu_dmar_ops, sizeof (struct dmar_ops));
-
-	if (iommu->iu_inv_queue) {
-		iommu_qinv_fini(iommu);
-	}
-
-	if (iommu->iu_intr_remap_tbl) {
-		intr_remap_fini_unit(iommu);
-	}
-
-	kmem_free(iommu, sizeof (intel_iommu_state_t));
-}
-
-/*
- * iommu_update_stats - update iommu private kstat counters
- *
- * This routine will dump and reset the iommu's internal
- * statistics counters. The current stats dump values will
- * be sent to the kernel status area.
- */
-static int
-iommu_update_stats(kstat_t *ksp, int rw)
-{
-	intel_iommu_state_t *iommu;
-	iommu_kstat_t *iommu_ksp;
-	const char *state;
-
-	if (rw == KSTAT_WRITE)
-		return (EACCES);
-
-	iommu = (intel_iommu_state_t *)ksp->ks_private;
-	ASSERT(iommu != NULL);
-	iommu_ksp = (iommu_kstat_t *)ksp->ks_data;
-	ASSERT(iommu_ksp != NULL);
-
-	state = (iommu->iu_enabled & DMAR_ENABLE) ? "enabled" : "disabled";
-	(void) strcpy(iommu_ksp->is_dmar_enabled.value.c, state);
-	state = (iommu->iu_enabled & QINV_ENABLE) ? "enabled" : "disabled";
-	(void) strcpy(iommu_ksp->is_qinv_enabled.value.c, state);
-	state = (iommu->iu_enabled & INTRR_ENABLE) ?
-	    "enabled" : "disabled";
-	(void) strcpy(iommu_ksp->is_intrr_enabled.value.c, state);
-	iommu_ksp->is_iotlb_psi.value.ui64 =
-	    iommu->iu_statistics.st_iotlb_psi;
-	iommu_ksp->is_iotlb_domain.value.ui64 =
-	    iommu->iu_statistics.st_iotlb_domain;
-	iommu_ksp->is_iotlb_global.value.ui64 =
-	    iommu->iu_statistics.st_iotlb_global;
-	iommu_ksp->is_write_buffer.value.ui64 =
-	    iommu->iu_statistics.st_write_buffer;
-	iommu_ksp->is_context_cache.value.ui64 =
-	    iommu->iu_statistics.st_context_cache;
-	iommu_ksp->is_wait_complete_us.value.ui64 =
-	    drv_hztousec(iommu->iu_statistics.st_wait_complete_us);
-	iommu_ksp->is_domain_alloc.value.ui64 =
-	    iommu->iu_statistics.st_domain_alloc;
-	iommu_ksp->is_page_used.value.ui64 = page_num;
-
-	return (0);
-}
-
-/*
- * iommu_init_stats - initialize kstat data structures
- *
- * This routine will create and initialize the iommu private
- * statistics counters.
- */
-int
-iommu_init_stats(intel_iommu_state_t *iommu)
-{
-	kstat_t *ksp;
-	iommu_kstat_t *iommu_ksp;
-
-	/*
-	 * Create and init kstat
-	 */
-	ksp = kstat_create("rootnex", 0,
-	    ddi_node_name(iommu->iu_drhd->di_dip),
-	    "misc", KSTAT_TYPE_NAMED,
-	    sizeof (iommu_kstat_t) / sizeof (kstat_named_t), 0);
-
-	if (ksp == NULL) {
-		cmn_err(CE_WARN,
-		    "Could not create kernel statistics for %s",
-		    ddi_node_name(iommu->iu_drhd->di_dip));
-		return (DDI_FAILURE);
-	}
-
-	iommu->iu_kstat = ksp;
-	iommu_ksp = (iommu_kstat_t *)ksp->ks_data;
-
-	/*
-	 * Initialize all the statistics
-	 */
-	kstat_named_init(&(iommu_ksp->is_dmar_enabled), "dmar_enable",
-	    KSTAT_DATA_CHAR);
-	kstat_named_init(&(iommu_ksp->is_qinv_enabled), "qinv_enable",
-	    KSTAT_DATA_CHAR);
-	kstat_named_init(&(iommu_ksp->is_intrr_enabled), "intrr_enable",
-	    KSTAT_DATA_CHAR);
-	kstat_named_init(&(iommu_ksp->is_iotlb_psi), "iotlb_psi",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_iotlb_domain), "iotlb_domain",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_iotlb_global), "iotlb_global",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_write_buffer), "write_buffer",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_context_cache), "context_cache",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_wait_complete_us), "wait_complete_us",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_page_used), "physical_page_used",
-	    KSTAT_DATA_UINT64);
-	kstat_named_init(&(iommu_ksp->is_domain_alloc), "domain_allocated",
-	    KSTAT_DATA_UINT64);
-
-	/*
-	 * Function to provide kernel stat update on demand
-	 */
-	ksp->ks_update = iommu_update_stats;
-
-	/*
-	 * Pointer into provider's raw statistics
-	 */
-	ksp->ks_private = (void *)iommu;
-
-	/*
-	 * Add kstat to systems kstat chain
-	 */
-	kstat_install(ksp);
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * iommu_intr_handler()
- *   the fault event handler for a single drhd
- */
-static int
-iommu_intr_handler(intel_iommu_state_t *iommu)
-{
-	uint32_t status;
-	int index, fault_reg_offset;
-	int max_fault_index;
-	int any_fault = 0;
-
-	mutex_enter(&(iommu->iu_reg_lock));
-
-	/* read the fault status */
-	status = iommu_get_reg32(iommu, IOMMU_REG_FAULT_STS);
-
-	/* check if we have a pending fault for this IOMMU */
-	if (!(status & IOMMU_FAULT_STS_PPF)) {
-		goto no_primary_faults;
-	}
-
-	/*
-	 * handle all primary pending faults
-	 */
-	any_fault = 1;
-	index = IOMMU_FAULT_GET_INDEX(status);
-	max_fault_index =  IOMMU_CAP_GET_NFR(iommu->iu_capability) - 1;
-	fault_reg_offset = IOMMU_CAP_GET_FRO(iommu->iu_capability);
-
-	_NOTE(CONSTCOND)
-	while (1) {
-		uint64_t val;
-		uint8_t fault_reason;
-		uint8_t fault_type;
-		uint16_t sid;
-		uint64_t pg_addr;
-		uint64_t iidx;
-
-		/* read the higher 64bits */
-		val = iommu_get_reg64(iommu,
-		    fault_reg_offset + index * 16 + 8);
-
-		/* check if pending fault */
-		if (!IOMMU_FRR_GET_F(val))
-			break;
-
-		/* get the fault reason, fault type and sid */
-		fault_reason = IOMMU_FRR_GET_FR(val);
-		fault_type = IOMMU_FRR_GET_FT(val);
-		sid = IOMMU_FRR_GET_SID(val);
-
-		/* read the first 64bits */
-		val = iommu_get_reg64(iommu,
-		    fault_reg_offset + index * 16);
-		pg_addr = val & IOMMU_PAGE_MASK;
-		iidx = val >> 48;
-
-		/* clear the fault */
-		iommu_put_reg32(iommu, fault_reg_offset + index * 16 + 12,
-		    (((uint32_t)1) << 31));
-
-		/* report the fault info */
-		if (fault_reason < 0x20) {
-			/* dmar-remapping fault */
-			cmn_err(CE_WARN,
-			    "%s generated a fault event when translating "
-			    "DMA %s\n"
-			    "\t on address 0x%" PRIx64 " for PCI(%d, %d, %d), "
-			    "the reason is:\n\t %s",
-			    ddi_node_name(iommu->iu_drhd->di_dip),
-			    fault_type ? "read" : "write", pg_addr,
-			    (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
-			    dmar_fault_reason[MIN(fault_reason,
-			    DMAR_MAX_REASON_NUMBER)]);
-		} else if (fault_reason < 0x27) {
-			/* intr-remapping fault */
-			cmn_err(CE_WARN,
-			    "%s generated a fault event when translating "
-			    "interrupt request\n"
-			    "\t on index 0x%" PRIx64 " for PCI(%d, %d, %d), "
-			    "the reason is:\n\t %s",
-			    ddi_node_name(iommu->iu_drhd->di_dip),
-			    iidx,
-			    (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7,
-			    intrr_fault_reason[MIN((fault_reason - 0x20),
-			    INTRR_MAX_REASON_NUMBER)]);
-		}
-
-		index++;
-		if (index > max_fault_index)
-			index = 0;
-	}
-
-no_primary_faults:
-
-	/*
-	 * handle queued invalidation interface errors
-	 */
-	if (status & IOMMU_FAULT_STS_IQE) {
-		uint64_t	head;
-		inv_dsc_t	*dsc;
-
-		head = QINV_IQA_HEAD(
-		    iommu_get_reg64(iommu, IOMMU_REG_INVAL_QH));
-		dsc = (inv_dsc_t *)(iommu->iu_inv_queue->iq_table.vaddr
-		    + (head * QINV_ENTRY_SIZE));
-
-		/* report the error */
-		cmn_err(CE_WARN,
-		    "%s generated a fault when fetching a descriptor from the\n"
-		    "\tinvalidation queue, or detected that the fetched\n"
-		    "\tdescriptor is invalid. The head register is "
-		    "0x%" PRIx64 ",\n"
-		    "\tthe type is %s\n",
-		    ddi_node_name(iommu->iu_drhd->di_dip), head,
-		    qinv_dsc_type[MIN(INV_DSC_TYPE(dsc),
-		    QINV_MAX_DSC_TYPE)]);
-	}
-
-	/*
-	 * Hardware received an unexpected or invalid Device-IOTLB
-	 * invalidation completion
-	 */
-	if (status & IOMMU_FAULT_STS_ICE) {
-		cmn_err(CE_WARN,
-		    "Hardware received an unexpected or invalid "
-		    "Device-IOTLB invalidation completion.\n");
-	}
-
-	/*
-	 * Hardware detected a Device-IOTLB invalidation
-	 * completion time-out
-	 */
-	if (status & IOMMU_FAULT_STS_ITE) {
-		cmn_err(CE_WARN,
-		    "Hardware detected a Device-IOTLB invalidation "
-		    "completion time-out.\n");
-	}
-
-	/* clear the fault */
-	iommu_put_reg32(iommu, IOMMU_REG_FAULT_STS, 1);
-
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	return (any_fault ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
-}
-
-/*
- * Function to identify a display device from the PCI class code
- */
-static int
-device_is_display(uint_t classcode)
-{
-	static uint_t disp_classes[] = {
-		0x000100,	/* pre-class-code VGA device */
-		0x030000,	/* VGA-compatible controller */
-		0x030001	/* 8514-compatible controller */
-	};
-	int i, nclasses = sizeof (disp_classes) / sizeof (uint_t);
-
-	for (i = 0; i < nclasses; i++) {
-		if (classcode == disp_classes[i])
-			return (1);
-	}
-	return (0);
-}
-
-/*
- * Function that determines if device is PCIEX and/or PCIEX bridge
- */
-static int
-device_is_pciex(uchar_t bus, uchar_t dev, uchar_t func, int *is_pci_bridge)
-{
-	ushort_t cap;
-	ushort_t capsp;
-	ushort_t cap_count = PCI_CAP_MAX_PTR;
-	ushort_t status;
-	int is_pciex = 0;
-
-	*is_pci_bridge = 0;
-
-	status = pci_getw_func(bus, dev, func, PCI_CONF_STAT);
-	if (!(status & PCI_STAT_CAP))
-		return (0);
-
-	capsp = pci_getb_func(bus, dev, func, PCI_CONF_CAP_PTR);
-	while (cap_count-- && capsp >= PCI_CAP_PTR_OFF) {
-		capsp &= PCI_CAP_PTR_MASK;
-		cap = pci_getb_func(bus, dev, func, capsp);
-
-		if (cap == PCI_CAP_ID_PCI_E) {
-			status = pci_getw_func(bus, dev, func, capsp + 2);
-			/*
-			 * See section 7.8.2 of PCI-Express Base Spec v1.0a
-			 * for Device/Port Type.
-			 * PCIE_PCIECAP_DEV_TYPE_PCIE2PCI implies that the
-			 * device is a PCIe2PCI bridge
-			 */
-			*is_pci_bridge =
-			    ((status & PCIE_PCIECAP_DEV_TYPE_MASK) ==
-			    PCIE_PCIECAP_DEV_TYPE_PCIE2PCI) ? 1 : 0;
-
-			is_pciex = 1;
-		}
-
-		capsp = (*pci_getb_func)(bus, dev, func,
-		    capsp + PCI_CAP_NEXT_PTR);
-	}
-
-	return (is_pciex);
-}
-
-/*
- * Allocate a private structure and initialize it
- */
-static iommu_private_t *
-iommu_create_private(int bus, int dev, int func)
-{
-	uchar_t basecl, subcl;
-	uint_t classcode, revclass;
-	iommu_private_t *private;
-	int pciex = 0;
-	int is_pci_bridge = 0;
-
-	/* No cached private struct. Create one */
-	private = kmem_zalloc(sizeof (iommu_private_t), KM_SLEEP);
-	private->idp_seg = 0; /* Currently seg can only be 0 */
-	private->idp_bus = bus;
-	private->idp_devfn = GET_DEVFUNC(dev, func);
-	private->idp_sec = 0;
-	private->idp_sub = 0;
-	private->idp_bbp_type = IOMMU_PPB_NONE;
-
-	/* record the bridge */
-	revclass = pci_getl_func(bus, dev, func, PCI_CONF_REVID);
-
-	classcode = REV2CLASS(revclass);
-	basecl = CLASS2BASE(classcode);
-	subcl = CLASS2SUB(classcode);
-
-	private->idp_is_bridge = ((basecl == PCI_CLASS_BRIDGE) &&
-	    (subcl == PCI_BRIDGE_PCI));
-
-	if (private->idp_is_bridge) {
-		private->idp_sec = pci_getb_func(bus, dev, func,
-		    PCI_BCNF_SECBUS);
-		private->idp_sub = pci_getb_func(bus, dev, func,
-		    PCI_BCNF_SUBBUS);
-
-		pciex = device_is_pciex(bus, dev, func, &is_pci_bridge);
-		if (pciex && is_pci_bridge)
-			private->idp_bbp_type = IOMMU_PPB_PCIE_PCI;
-		else if (pciex)
-			private->idp_bbp_type = IOMMU_PPB_PCIE_PCIE;
-		else
-			private->idp_bbp_type = IOMMU_PPB_PCI_PCI;
-	}
-
-	/* record the special devices */
-	private->idp_is_display =
-	    (device_is_display(classcode) ? B_TRUE : B_FALSE);
-
-	private->idp_is_lpc = ((basecl == PCI_CLASS_BRIDGE) &&
-	    (subcl == PCI_BRIDGE_ISA));
-	private->idp_intel_domain = NULL;
-
-	return (private);
-}
-
-/*
- * Set the private struct in the private field of a devinfo node
- */
-static int
-iommu_set_private(dev_info_t *dip)
-{
-	bdf_private_entry_t *bpe, *new;
-	int bus, device, func, seg;
-	iommu_private_t *pvt;
-	dmar_domain_state_t *domain;
-
-	seg = 0; /* NOTE: Currently seg always = 0 */
-	bus = device = func = -1;
-
-	if (acpica_get_bdf(dip, &bus, &device, &func) != DDI_SUCCESS) {
-		/* probably not PCI device */
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * We always need a private structure, whether it was cached
-	 * or not previously, since a hotplug may change the type of
-	 * device - for example we may have had a bridge here before,
-	 * and now we could have a leaf device
-	 */
-	pvt = iommu_create_private(bus, device, func);
-	ASSERT(pvt);
-
-	/* assume new cache entry needed */
-	new = kmem_zalloc(sizeof (*new), KM_SLEEP);
-
-	mutex_enter(&bdf_private_cache.bpc_lock);
-
-	for (bpe = bdf_private_cache.bpc_cache; bpe; bpe = bpe->bpe_next) {
-		if (bpe->bpe_seg == seg &&
-		    bpe->bpe_bus == bus &&
-		    bpe->bpe_devfcn == GET_DEVFUNC(device, func)) {
-			break;
-		}
-	}
-
-	if (bpe) {
-		/* entry exists, new not needed */
-		kmem_free(new, sizeof (*new));
-		ASSERT(bpe->bpe_private);
-		domain = bpe->bpe_private->idp_intel_domain;
-		/* domain may be NULL */
-		kmem_free(bpe->bpe_private, sizeof (iommu_private_t));
-		bpe->bpe_private = pvt;
-		pvt->idp_intel_domain = domain;
-	} else {
-		new->bpe_seg = pvt->idp_seg;
-		new->bpe_bus = pvt->idp_bus;
-		new->bpe_devfcn = pvt->idp_devfn;
-		new->bpe_private = pvt;
-		new->bpe_next =  bdf_private_cache.bpc_cache;
-		bdf_private_cache.bpc_cache = new;
-	}
-	DEVI(dip)->devi_iommu_private = pvt;
-
-	mutex_exit(&bdf_private_cache.bpc_lock);
-	return (DDI_SUCCESS);
-}
-
-
-/*
- * intel_iommu_init()
- *   the interface to setup interrupt handlers and init the DMAR units
- */
-static void
-intel_iommu_init(void)
-{
-	int ipl, irq, vect;
-	intel_iommu_state_t *iommu;
-	char intr_name[64];
-	uint32_t msi_addr, msi_data;
-	uint32_t iommu_instance = 0;
-	ipl = IOMMU_INTR_IPL;
-
-	msi_addr = (MSI_ADDR_HDR |
-	    ((apic_cpus[0].aci_local_id & 0xFF) << MSI_ADDR_DEST_SHIFT) |
-	    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
-	    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT));
-
-	for_each_in_list(&iommu_states, iommu) {
-		irq = psm_get_ipivect(ipl, -1);
-		vect = apic_irq_table[irq]->airq_vector;
-		msi_data =
-		    ((MSI_DATA_DELIVERY_FIXED << MSI_DATA_DELIVERY_SHIFT) |
-		    vect);
-		(void) snprintf(intr_name, sizeof (intr_name),
-		    "iommu intr%d", iommu_instance++);
-		(void) add_avintr((void *)NULL, ipl,
-		    (avfunc)(iommu_intr_handler),
-		    intr_name, irq, (caddr_t)iommu,
-		    NULL, NULL, NULL);
-		(void) iommu_intr_handler(iommu);
-		mutex_enter(&(iommu->iu_reg_lock));
-		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_ADDR, msi_addr);
-		if (intrr_apic_mode == LOCAL_X2APIC) {
-			iommu_put_reg32(iommu, IOMMU_REG_FEVNT_UADDR,
-			    apic_cpus[0].aci_local_id & 0xFFFFFF00);
-		} else {
-			iommu_put_reg32(iommu, IOMMU_REG_FEVNT_UADDR, 0);
-		}
-		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_DATA, msi_data);
-		iommu_put_reg32(iommu, IOMMU_REG_FEVNT_CON, 0);
-		mutex_exit(&(iommu->iu_reg_lock));
-	}
-
-	/*
-	 * enable dma remapping
-	 */
-	cmn_err(CE_CONT, "?Starting to enable the dmar units\n");
-	if (!dmar_drhd_disable) {
-		for_each_in_list(&iommu_states, iommu) {
-			if (gfx_drhd_disable &&
-			    drhd_only_for_gfx(iommu))
-				continue;
-			iommu_bringup_unit(iommu);
-		}
-	}
-}
-
-/*
- * wait max 60s for the hardware completion
- */
-#define	IOMMU_WAIT_TIME		60000000
-#define	iommu_wait_completion(iommu, offset, getf, completion, status) \
-{ \
-	clock_t stick = ddi_get_lbolt(); \
-	clock_t ntick; \
-	_NOTE(CONSTCOND) \
-	while (1) { \
-		status = getf(iommu, offset); \
-		ntick = ddi_get_lbolt(); \
-		if (completion) {\
-			atomic_add_64\
-			    (&(iommu->iu_statistics.st_wait_complete_us),\
-			    ntick - stick);\
-			break; \
-		} \
-		if (ntick - stick >= drv_usectohz(IOMMU_WAIT_TIME)) { \
-			cmn_err(CE_PANIC, \
-			    "iommu wait completion time out\n"); \
-		} else { \
-			iommu_cpu_nop();\
-		}\
-	}\
-}
-
-/*
- * dmar_flush_write_buffer()
- *   flush the write buffer
- */
-static void
-dmar_flush_write_buffer(intel_iommu_state_t *iommu)
-{
-	uint32_t status;
-
-	mutex_enter(&(iommu->iu_reg_lock));
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_WBF);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, !(status & IOMMU_GSTS_WBFS), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	/* record the statistics */
-	atomic_inc_64(&(iommu->iu_statistics.st_write_buffer));
-}
-
-/*
- * dmar_flush_iotlb_common()
- *   flush the iotlb cache
- */
-static void
-dmar_flush_iotlb_common(intel_iommu_state_t *iommu, uint_t domain_id,
-    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type)
-{
-	uint64_t command = 0, iva = 0, status;
-	uint_t iva_offset, iotlb_offset;
-
-	iva_offset = IOMMU_ECAP_GET_IRO(iommu->iu_excapability);
-	iotlb_offset = iva_offset + 8;
-
-	/*
-	 * prepare drain read/write command
-	 */
-	if (IOMMU_CAP_GET_DWD(iommu->iu_capability)) {
-		command |= TLB_INV_DRAIN_WRITE;
-	}
-
-	if (IOMMU_CAP_GET_DRD(iommu->iu_capability)) {
-		command |= TLB_INV_DRAIN_READ;
-	}
-
-	/*
-	 * if the hardware doesn't support page selective invalidation, or
-	 * this request isn't suitable for it, fall back to domain type
-	 * invalidation
-	 */
-	switch (type) {
-	case TLB_INV_G_PAGE:
-		if (!IOMMU_CAP_GET_PSI(iommu->iu_capability) ||
-		    am > IOMMU_CAP_GET_MAMV(iommu->iu_capability) ||
-		    addr & IOMMU_PAGE_OFFSET) {
-			goto ignore_psi;
-		}
-		command |= TLB_INV_PAGE | TLB_INV_IVT |
-		    TLB_INV_DID(domain_id);
-		iva = addr | am | TLB_IVA_HINT(hint);
-		break;
-ignore_psi:
-	case TLB_INV_G_DOMAIN:
-		command |= TLB_INV_DOMAIN | TLB_INV_IVT |
-		    TLB_INV_DID(domain_id);
-		break;
-	case TLB_INV_G_GLOBAL:
-		command |= TLB_INV_GLOBAL | TLB_INV_IVT;
-		break;
-	default:
-		cmn_err(CE_WARN, "incorrect iotlb flush type");
-		return;
-	}
-
-	/*
-	 * do the actual flush
-	 */
-	mutex_enter(&(iommu->iu_reg_lock));
-	/* verify there is no pending command */
-	iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64,
-	    !(status & TLB_INV_IVT), status);
-	if (iva)
-		iommu_put_reg64(iommu, iva_offset, iva);
-	iommu_put_reg64(iommu, iotlb_offset, command);
-	iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64,
-	    !(status & TLB_INV_IVT), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	/*
-	 * check the result and record the statistics
-	 */
-	switch (TLB_INV_GET_IAIG(status)) {
-	/* global */
-	case 1:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_global));
-		break;
-	/* domain */
-	case 2:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_domain));
-		break;
-	/* psi */
-	case 3:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_psi));
-		break;
-	default:
-		break;
-	}
-}
-
-/*
- * dmar_flush_iotlb_psi()
- *   register based iotlb psi invalidation
- */
-static void
-dmar_flush_iotlb_psi(intel_iommu_state_t *iommu, uint_t domain_id,
-    uint64_t dvma, uint_t count, uint_t hint)
-{
-	uint_t am = 0;
-	uint_t max_am = 0;
-	uint64_t align = 0;
-	uint64_t dvma_pg = 0;
-	uint_t used_count = 0;
-
-	/* choose page selective invalidation */
-	if (IOMMU_CAP_GET_PSI(iommu->iu_capability)) {
-		/* MAMV is valid only if PSI is set */
-		max_am = IOMMU_CAP_GET_MAMV(iommu->iu_capability);
-		while (count != 0) {
-			/* First calculate alignment of DVMA */
-			dvma_pg = IOMMU_BTOP(dvma);
-			ASSERT(dvma_pg != NULL);
-			ASSERT(count >= 1);
-			for (align = 1; (dvma_pg & align) == 0; align <<= 1)
-				;
-			/* truncate count to the nearest power of 2 */
-			for (used_count = 1, am = 0; count >> used_count != 0;
-			    used_count <<= 1, am++)
-				;
-			if (am > max_am) {
-				am = max_am;
-				used_count = 1 << am;
-			}
-			if (align >= used_count) {
-				dmar_flush_iotlb_common(iommu, domain_id,
-				    dvma, am, hint, TLB_INV_G_PAGE);
-			} else {
-				/* align < used_count */
-				used_count = align;
-				for (am = 0; (1 << am) != used_count; am++)
-					;
-				dmar_flush_iotlb_common(iommu, domain_id,
-				    dvma, am, hint, TLB_INV_G_PAGE);
-			}
-			count -= used_count;
-			dvma = (dvma_pg + used_count) << IOMMU_PAGE_SHIFT;
-		}
-	/* choose domain invalidation */
-	} else {
-		dmar_flush_iotlb_common(iommu, domain_id, dvma,
-		    0, 0, TLB_INV_G_DOMAIN);
-	}
-}
-
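-/*
- * Worked example (illustrative, assuming 4KB IOMMU pages and a large
- * MAMV): a request for dvma 0x6000 with count 8 is split on alignment
- * boundaries into three PSI flushes: 2 pages at 0x6000 (am = 1),
- * 4 pages at 0x8000 (am = 2) and 2 pages at 0xc000 (am = 1), so every
- * flush covers a naturally aligned power-of-two range as required by
- * the address-mask encoding.
- */
-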
-/*
- * dmar_flush_iotlb_dsi()
- *   flush dsi iotlb
- */
-static void
-dmar_flush_iotlb_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
-{
-	dmar_flush_iotlb_common(iommu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN);
-}
-
-/*
- * dmar_flush_iotlb_glb()
- *   flush global iotlb
- */
-static void
-dmar_flush_iotlb_glb(intel_iommu_state_t *iommu)
-{
-	dmar_flush_iotlb_common(iommu, 0, 0, 0, 0, TLB_INV_G_GLOBAL);
-}
-
-
-/*
- * dmar_flush_context_cache()
- *   flush the context cache
- */
-static void
-dmar_flush_context_cache(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type)
-{
-	uint64_t command = 0, status;
-
-	/*
-	 * define the command
-	 */
-	switch (type) {
-	case CTT_INV_G_DEVICE:
-		command |= CCMD_INV_ICC | CCMD_INV_DEVICE
-		    | CCMD_INV_DID(domain_id)
-		    | CCMD_INV_SID(source_id) | CCMD_INV_FM(function_mask);
-		break;
-	case CTT_INV_G_DOMAIN:
-		command |= CCMD_INV_ICC | CCMD_INV_DOMAIN
-		    | CCMD_INV_DID(domain_id);
-		break;
-	case CTT_INV_G_GLOBAL:
-		command |= CCMD_INV_ICC | CCMD_INV_GLOBAL;
-		break;
-	default:
-		cmn_err(CE_WARN, "incorrect context cache flush type");
-		return;
-	}
-
-	mutex_enter(&(iommu->iu_reg_lock));
-	/* verify there is no pending command */
-	iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64,
-	    !(status & CCMD_INV_ICC), status);
-	iommu_put_reg64(iommu, IOMMU_REG_CONTEXT_CMD, command);
-	iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64,
-	    !(status & CCMD_INV_ICC), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	/* record the context cache statistics */
-	atomic_inc_64(&(iommu->iu_statistics.st_context_cache));
-}
-
-/*
- * dmar_flush_context_fsi()
- *   function based context cache flush
- */
-static void
-dmar_flush_context_fsi(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id)
-{
-	dmar_flush_context_cache(iommu, function_mask, source_id,
-	    domain_id, CTT_INV_G_DEVICE);
-}
-
-/*
- * dmar_flush_context_dsi()
- *   domain based context cache flush
- */
-static void
-dmar_flush_context_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
-{
-	dmar_flush_context_cache(iommu, 0, 0, domain_id, CTT_INV_G_DOMAIN);
-}
-
-/*
- * dmar_flush_context_gbl()
- *   flush global context cache
- */
-static void
-dmar_flush_context_gbl(intel_iommu_state_t *iommu)
-{
-	dmar_flush_context_cache(iommu, 0, 0, 0, CTT_INV_G_GLOBAL);
-}
-
-/*
- * dmar_set_root_table()
- *   set the root entry table
- */
-static void
-dmar_set_root_table(intel_iommu_state_t *iommu)
-{
-	uint32_t status;
-
-	mutex_enter(&(iommu->iu_reg_lock));
-	iommu_put_reg64(iommu, IOMMU_REG_ROOTENTRY,
-	    iommu->iu_root_entry_paddr);
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_SRTP);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_RTPS), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-}
-
-/*
- * dmar_enable_unit()
- *   enable the dmar unit
- */
-static void
-dmar_enable_unit(intel_iommu_state_t *iommu)
-{
-	uint32_t status;
-
-	mutex_enter(&(iommu->iu_reg_lock));
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    IOMMU_GCMD_TE);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_TES), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-	iommu->iu_global_cmd_reg |= IOMMU_GCMD_TE;
-	iommu->iu_enabled |= DMAR_ENABLE;
-	cmn_err(CE_CONT, "?\t%s enabled\n",
-	    ddi_node_name(iommu->iu_drhd->di_dip));
-}
-
-/*
- * iommu_bringup_unit()
- *   the processes to bring up a dmar unit
- */
-static void
-iommu_bringup_unit(intel_iommu_state_t *iommu)
-{
-	/*
-	 * flush the iommu write buffer
-	 */
-	iommu->iu_dmar_ops->do_flwb(iommu);
-
-	/*
-	 * set root entry table
-	 */
-	iommu->iu_dmar_ops->do_set_root_table(iommu);
-
-	/*
-	 * flush the context cache
-	 */
-	iommu->iu_dmar_ops->do_context_gbl(iommu);
-
-	/*
-	 * flush the iotlb cache
-	 */
-	iommu->iu_dmar_ops->do_iotlb_gbl(iommu);
-
-	/*
-	 * at last enable the unit
-	 */
-	iommu->iu_dmar_ops->do_enable(iommu);
-
-	/* enable queued invalidation */
-	if (iommu->iu_inv_queue)
-		iommu_qinv_enable(iommu);
-}
-
-/*
- * iommu_dvma_cache_get()
- *   get a dvma from the cache
- */
-static uint64_t
-iommu_dvma_cache_get(dmar_domain_state_t *domain,
-    size_t size, size_t align, size_t nocross)
-{
-	dvma_cache_node_t *cache_node = NULL;
-	dvma_cache_head_t *cache_head;
-	uint_t index = IOMMU_BTOP(size) - 1;
-	uint64_t ioaddr;
-
-	if (index >= DVMA_CACHE_HEAD_CNT)
-		return (0);
-
-	cache_head = &(domain->dm_dvma_cache[index]);
-	mutex_enter(&(cache_head->dch_free_lock));
-	for_each_in_list(&(cache_head->dch_free_list), cache_node) {
-		if ((cache_node->dcn_align >= align) &&
-		    ((nocross == 0) ||
-		    ((cache_node->dcn_dvma ^ (cache_node->dcn_dvma + size - 1))
-		    < (nocross - 1)))) {
-			list_remove(&(cache_head->dch_free_list),
-			    cache_node);
-			cache_head->dch_free_count--;
-			break;
-		}
-	}
-	mutex_exit(&(cache_head->dch_free_lock));
-
-	if (cache_node) {
-		ioaddr = cache_node->dcn_dvma;
-		mutex_enter(&(cache_head->dch_mem_lock));
-		list_insert_head(&(cache_head->dch_mem_list), cache_node);
-		mutex_exit(&(cache_head->dch_mem_lock));
-		return (ioaddr);
-	}
-
-	return (0);
-}
-
-/*
- * iommu_dvma_cache_put()
- *   put a dvma to the cache after use
- */
-static void
-iommu_dvma_cache_put(dmar_domain_state_t *domain, uint64_t dvma,
-    size_t size, size_t align)
-{
-	dvma_cache_node_t *cache_node = NULL;
-	dvma_cache_head_t *cache_head;
-	uint_t index = IOMMU_BTOP(size) - 1;
-	boolean_t shrink = B_FALSE;
-
-	/* out of cache range */
-	if (index >= DVMA_CACHE_HEAD_CNT) {
-		vmem_xfree(domain->dm_dvma_map,
-		    (void *)(intptr_t)dvma, size);
-		return;
-	}
-
-	cache_head = &(domain->dm_dvma_cache[index]);
-
-	/* get a node block */
-	mutex_enter(&(cache_head->dch_mem_lock));
-	cache_node = list_head(&(cache_head->dch_mem_list));
-	if (cache_node) {
-		list_remove(&(cache_head->dch_mem_list), cache_node);
-	}
-	mutex_exit(&(cache_head->dch_mem_lock));
-
-	/* no cache, alloc one */
-	if (cache_node == NULL) {
-		cache_node = kmem_alloc(sizeof (dvma_cache_node_t), KM_SLEEP);
-	}
-
-	/* initialize this node */
-	cache_node->dcn_align = align;
-	cache_node->dcn_dvma = dvma;
-
-	/* insert into the free list */
-	mutex_enter(&(cache_head->dch_free_lock));
-	list_insert_head(&(cache_head->dch_free_list), cache_node);
-
-	/* shrink the cache list */
-	if (cache_head->dch_free_count++ > dvma_cache_high) {
-		cache_node = list_tail(&(cache_head->dch_free_list));
-		list_remove(&(cache_head->dch_free_list), cache_node);
-		shrink = B_TRUE;
-		cache_head->dch_free_count--;
-	}
-	mutex_exit(&(cache_head->dch_free_lock));
-
-	if (shrink) {
-		ASSERT(cache_node);
-		vmem_xfree(domain->dm_dvma_map,
-		    (void *)(intptr_t)(cache_node->dcn_dvma), size);
-		kmem_free(cache_node, sizeof (dvma_cache_node_t));
-	}
-}
-
-/*
- * iommu_dvma_cache_flush()
- *   flush the dvma caches when vmem_xalloc() failed
- */
-static void
-iommu_dvma_cache_flush(dmar_domain_state_t *domain, dev_info_t *dip)
-{
-	dvma_cache_node_t *cache_node;
-	dvma_cache_head_t *cache_head;
-	uint_t index;
-
-	cmn_err(CE_NOTE, "domain dvma cache for %s flushed",
-	    ddi_node_name(dip));
-
-	for (index = 0; index < DVMA_CACHE_HEAD_CNT; index++) {
-		cache_head = &(domain->dm_dvma_cache[index]);
-		mutex_enter(&(cache_head->dch_free_lock));
-		cache_node = list_head(&(cache_head->dch_free_list));
-		while (cache_node) {
-			list_remove(&(cache_head->dch_free_list), cache_node);
-			vmem_xfree(domain->dm_dvma_map,
-			    (void *)(intptr_t)(cache_node->dcn_dvma),
-			    IOMMU_PTOB(index + 1));
-			kmem_free(cache_node, sizeof (dvma_cache_node_t));
-			cache_head->dch_free_count--;
-			cache_node = list_head(&(cache_head->dch_free_list));
-		}
-		ASSERT(cache_head->dch_free_count == 0);
-		mutex_exit(&(cache_head->dch_free_lock));
-	}
-}
-
-/*
- * get_dvma_cookie_array()
- *   get a dvma cookie array from the cache or allocate
- */
-static iommu_dvma_cookie_t *
-get_dvma_cookie_array(uint_t array_size)
-{
-	dvma_cookie_head_t *cache_head;
-	iommu_dvma_cookie_t *cookie = NULL;
-
-	if (array_size > MAX_COOKIE_CACHE_SIZE) {
-		return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size,
-		    KM_SLEEP));
-	}
-
-	cache_head = &(cookie_cache[array_size - 1]);
-	mutex_enter(&(cache_head->dch_lock));
-	/* LINTED E_EQUALITY_NOT_ASSIGNMENT */
-	if (cookie = cache_head->dch_next) {
-		cache_head->dch_next = cookie->dc_next;
-		cache_head->dch_count--;
-	}
-	mutex_exit(&(cache_head->dch_lock));
-
-	if (cookie) {
-		return (cookie);
-	} else {
-		return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size,
-		    KM_SLEEP));
-	}
-}
-
-/*
- * put_dvma_cookie_array()
- *   put a dvma cookie array to the cache or free
- */
-static void
-put_dvma_cookie_array(iommu_dvma_cookie_t *dcookies, uint_t array_size)
-{
-	dvma_cookie_head_t *cache_head;
-
-	if (array_size > MAX_COOKIE_CACHE_SIZE) {
-		kmem_free(dcookies, sizeof (iommu_dvma_cookie_t) * array_size);
-		return;
-	}
-
-	cache_head = &(cookie_cache[array_size - 1]);
-	mutex_enter(&(cache_head->dch_lock));
-	dcookies->dc_next = cache_head->dch_next;
-	cache_head->dch_next = dcookies;
-	cache_head->dch_count++;
-	mutex_exit(&(cache_head->dch_lock));
-}
-
-/*
- * dmar_reg_plant_wait()
- *   the plant wait operation for register based cache invalidation
- */
-static void
-dmar_reg_plant_wait(intel_iommu_state_t *iommu, iommu_dvma_cookie_t *dcookies,
-    uint_t count, uint_t array_size)
-{
-	iotlb_pend_node_t *node = NULL;
-	iotlb_pend_head_t *head;
-
-	head = &(iommu->iu_pend_head);
-
-	/* get a node */
-	mutex_enter(&(head->ich_mem_lock));
-	node = list_head(&(head->ich_mem_list));
-	if (node) {
-		list_remove(&(head->ich_mem_list), node);
-	}
-	mutex_exit(&(head->ich_mem_lock));
-
-	/* no cache, alloc one */
-	if (node == NULL) {
-		node = kmem_alloc(sizeof (iotlb_pend_node_t), KM_SLEEP);
-	}
-
-	/* initialize this node */
-	node->icn_dcookies = dcookies;
-	node->icn_count = count;
-	node->icn_array_size = array_size;
-
-	/* insert into the pend list */
-	mutex_enter(&(head->ich_pend_lock));
-	list_insert_tail(&(head->ich_pend_list), node);
-	head->ich_pend_count++;
-	mutex_exit(&(head->ich_pend_lock));
-}
-
-/*
- * dmar_release_dvma_cookie()
- *   release the dvma cookie
- */
-static void
-dmar_release_dvma_cookie(iommu_dvma_cookie_t *dcookies,
-    uint_t count, uint_t array_size)
-{
-	uint_t i;
-
-	/* free dvma */
-	for (i = 0; i < count; i++) {
-		iommu_dvma_cache_put(dcookies[i].dc_domain,
-		    dcookies[i].dc_addr, dcookies[i].dc_size,
-		    dcookies[i].dc_align);
-	}
-
-	/* free the cookie array */
-	put_dvma_cookie_array(dcookies, array_size);
-}
-
-/*
- * dmar_reg_reap_wait()
- *   the reap wait operation for register based cache invalidation
- */
-static void
-dmar_reg_reap_wait(intel_iommu_state_t *iommu)
-{
-	iotlb_pend_node_t *node;
-	iotlb_pend_head_t *head;
-
-	head = &(iommu->iu_pend_head);
-	mutex_enter(&(head->ich_pend_lock));
-	node = list_head(&(head->ich_pend_list));
-	if (node) {
-		list_remove(&(head->ich_pend_list), node);
-		head->ich_pend_count--;
-	}
-	mutex_exit(&(head->ich_pend_lock));
-
-	if (node) {
-		dmar_release_dvma_cookie(node->icn_dcookies,
-		    node->icn_count, node->icn_array_size);
-		/* put the node into the node cache */
-		mutex_enter(&(head->ich_mem_lock));
-		list_insert_head(&(head->ich_mem_list), node);
-		mutex_exit(&(head->ich_mem_lock));
-	}
-}
-
-/*
- * dmar_init_ops()
- *   init dmar ops
- */
-static void
-dmar_init_ops(intel_iommu_state_t *iommu)
-{
-	struct dmar_ops *ops;
-
-	ASSERT(iommu);
-	ops = kmem_alloc(sizeof (struct dmar_ops), KM_SLEEP);
-
-	/* initialize the dmar operations */
-	ops->do_enable = dmar_enable_unit;
-	ops->do_fault = iommu_intr_handler;
-
-	/* cpu clflush */
-	if (iommu->iu_coherency) {
-		ops->do_clflush = (void (*)(caddr_t, uint_t))return_instr;
-	} else {
-		ASSERT(x86_feature & X86_CLFSH);
-		ops->do_clflush = cpu_clflush;
-	}
-
-	/* Check for Mobile 4 Series Chipset */
-	if (mobile4_cs && !IOMMU_CAP_GET_RWBF(iommu->iu_capability)) {
-		cmn_err(CE_WARN,
-		    "Mobile 4 Series chipset present, activating quirks\n");
-		iommu->iu_capability |= (1 << 4);
-		if (IOMMU_CAP_GET_RWBF(iommu->iu_capability))
-			cmn_err(CE_WARN, "Setting RWBF forcefully\n");
-	}
-
-	/* write buffer */
-	if (IOMMU_CAP_GET_RWBF(iommu->iu_capability)) {
-		ops->do_flwb = dmar_flush_write_buffer;
-	} else {
-		ops->do_flwb = (void (*)(intel_iommu_state_t *))return_instr;
-	}
-
-	/* cache related functions */
-	ops->do_iotlb_psi = dmar_flush_iotlb_psi;
-	ops->do_iotlb_dsi = dmar_flush_iotlb_dsi;
-	ops->do_iotlb_gbl = dmar_flush_iotlb_glb;
-	ops->do_context_fsi = dmar_flush_context_fsi;
-	ops->do_context_dsi = dmar_flush_context_dsi;
-	ops->do_context_gbl = dmar_flush_context_gbl;
-	ops->do_plant_wait = dmar_reg_plant_wait;
-	ops->do_reap_wait = dmar_reg_reap_wait;
-
-	ops->do_set_root_table = dmar_set_root_table;
-
-
-	iommu->iu_dmar_ops = ops;
-}
-
-/*
- * create_iommu_state()
- *   alloc and setup the iommu state
- */
-static int
-create_iommu_state(drhd_info_t *drhd)
-{
-	intel_iommu_state_t *iommu;
-	int mgaw, sagaw, agaw;
-	int bitnum;
-	int ret;
-
-	static ddi_device_acc_attr_t ioattr = {
-		DDI_DEVICE_ATTR_V0,
-		DDI_NEVERSWAP_ACC,
-		DDI_STRICTORDER_ACC,
-	};
-
-	iommu = kmem_zalloc(sizeof (intel_iommu_state_t), KM_SLEEP);
-	drhd->di_iommu = (void *)iommu;
-	iommu->iu_drhd = drhd;
-
-	/*
-	 * map the register address space
-	 */
-	ret = ddi_regs_map_setup(iommu->iu_drhd->di_dip, 0,
-	    (caddr_t *)&(iommu->iu_reg_address), (offset_t)0,
-	    (offset_t)IOMMU_REG_SIZE, &ioattr,
-	    &(iommu->iu_reg_handle));
-
-	if (ret != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "iommu register map failed: %d", ret);
-		kmem_free(iommu, sizeof (intel_iommu_state_t));
-		return (DDI_FAILURE);
-	}
-
-	mutex_init(&(iommu->iu_reg_lock), NULL, MUTEX_DRIVER,
-	    (void *)ipltospl(IOMMU_INTR_IPL));
-	mutex_init(&(iommu->iu_root_context_lock), NULL, MUTEX_DRIVER, NULL);
-
-	/*
-	 * get the register value
-	 */
-	iommu->iu_capability = iommu_get_reg64(iommu, IOMMU_REG_CAP);
-	iommu->iu_excapability = iommu_get_reg64(iommu, IOMMU_REG_EXCAP);
-
-	/*
-	 * if the hardware access is non-coherent, we need clflush
-	 */
-	if (IOMMU_ECAP_GET_C(iommu->iu_excapability)) {
-		iommu->iu_coherency = B_TRUE;
-	} else {
-		iommu->iu_coherency = B_FALSE;
-		if (!(x86_feature & X86_CLFSH)) {
-			cmn_err(CE_WARN, "drhd can't be enabled due to "
-			    "missing clflush functionality");
-			ddi_regs_map_free(&(iommu->iu_reg_handle));
-			kmem_free(iommu, sizeof (intel_iommu_state_t));
-			return (DDI_FAILURE);
-		}
-	}
-
-	/*
-	 * retrieve the maximum number of domains
-	 */
-	iommu->iu_max_domain = IOMMU_CAP_ND(iommu->iu_capability);
-
-	/*
-	 * setup the domain id allocator
-	 *  domain id 0 is reserved by the architecture
-	 */
-	iommu_rscs_init(1, iommu->iu_max_domain, &(iommu->iu_domain_id_hdl));
-
-	/*
-	 * calculate the agaw
-	 */
-	mgaw = IOMMU_CAP_MGAW(iommu->iu_capability);
-	sagaw = IOMMU_CAP_SAGAW(iommu->iu_capability);
-	iommu->iu_gaw = mgaw;
-	agaw = calculate_agaw(iommu->iu_gaw);
-	bitnum = (agaw - 30) / 9;
-
-	while (bitnum < 5) {
-		if (sagaw & (1 << bitnum))
-			break;
-		else
-			bitnum++;
-	}
-
-	if (bitnum >= 5) {
-		cmn_err(CE_PANIC, "can't determine agaw");
-		/*NOTREACHED*/
-		return (DDI_FAILURE);
-	} else {
-		iommu->iu_agaw = 30 + bitnum * 9;
-		if (iommu->iu_agaw > 64)
-			iommu->iu_agaw = 64;
-		iommu->iu_level = bitnum + 2;
-	}
-
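-	/*
-	 * Example of the width selection above (illustrative): with
-	 * mgaw = 48, calculate_agaw() yields agaw = 48 and bitnum = 2;
-	 * if SAGAW has bit 2 set, the unit runs with iu_agaw = 48 and
-	 * iu_level = 4 (a 4-level page table). An agaw of 39 would give
-	 * bitnum = 1, iu_agaw = 39 and a 3-level table.
-	 */
-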
-	/*
-	 * the iommu is originally disabled
-	 */
-	iommu->iu_enabled = 0;
-	iommu->iu_global_cmd_reg = 0;
-
-	/*
-	 * init kstat
-	 */
-	(void) iommu_init_stats(iommu);
-	bzero(&(iommu->iu_statistics), sizeof (iommu_stat_t));
-
-	/*
-	 * init dmar ops
-	 */
-	dmar_init_ops(iommu);
-
-	/*
-	 * allocate the root entry table; this must be done after
-	 * dmar_init_ops() has set up the dmar ops
-	 */
-	iommu->iu_root_entry_paddr = iommu_get_page(iommu, KM_SLEEP);
-
-	/*
-	 * init queued invalidation interface
-	 */
-	iommu->iu_inv_queue = NULL;
-	if (IOMMU_ECAP_GET_QI(iommu->iu_excapability) && !qinv_disable) {
-		if (iommu_qinv_init(iommu) != DDI_SUCCESS) {
-			cmn_err(CE_WARN,
-			    "%s init queued invalidation interface failed\n",
-			    ddi_node_name(iommu->iu_drhd->di_dip));
-		}
-	}
-
-	/*
-	 * init intr remapping table state pointer
-	 */
-	iommu->iu_intr_remap_tbl = NULL;
-
-	/*
-	 * initialize the iotlb pending list and cache
-	 */
-	mutex_init(&(iommu->iu_pend_head.ich_pend_lock), NULL,
-	    MUTEX_DRIVER, NULL);
-	list_create(&(iommu->iu_pend_head.ich_pend_list),
-	    sizeof (iotlb_pend_node_t),
-	    offsetof(iotlb_pend_node_t, node));
-	iommu->iu_pend_head.ich_pend_count = 0;
-
-	mutex_init(&(iommu->iu_pend_head.ich_mem_lock), NULL,
-	    MUTEX_DRIVER, NULL);
-	list_create(&(iommu->iu_pend_head.ich_mem_list),
-	    sizeof (iotlb_pend_node_t),
-	    offsetof(iotlb_pend_node_t, node));
-
-	/*
-	 * insert this iommu into the list
-	 */
-	list_insert_tail(&iommu_states, iommu);
-
-	/*
-	 * report this unit
-	 */
-	cmn_err(CE_CONT, "?\t%s state structure created\n",
-	    ddi_node_name(iommu->iu_drhd->di_dip));
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * match_dip_sbdf()
- *   walk function for get_dip_from_info()
- */
-static int
-match_dip_sbdf(dev_info_t *dip, void *arg)
-{
-	iommu_private_t *private;
-	pci_dev_info_t *info = arg;
-
-	if (DEVI(dip)->devi_iommu_private == NULL &&
-	    iommu_set_private(dip) != DDI_SUCCESS) {
-		return (DDI_WALK_CONTINUE);
-	}
-
-	private = DEVI(dip)->devi_iommu_private;
-
-	ASSERT(private);
-
-	if ((info->pdi_seg == private->idp_seg) &&
-	    (info->pdi_bus == private->idp_bus) &&
-	    (info->pdi_devfn == private->idp_devfn)) {
-		info->pdi_dip = dip;
-		return (DDI_WALK_TERMINATE);
-	}
-	return (DDI_WALK_CONTINUE);
-}
-
-/*
- * get_dip_from_info()
- *   get the dev_info structure by passing a bus/dev/func
- */
-static int
-get_dip_from_info(pci_dev_info_t *info)
-{
-	int count;
-	info->pdi_dip = NULL;
-
-	ndi_devi_enter(root_devinfo, &count);
-	ddi_walk_devs(ddi_get_child(root_devinfo),
-	    match_dip_sbdf, info);
-	ndi_devi_exit(root_devinfo, count);
-
-	if (info->pdi_dip)
-		return (DDI_SUCCESS);
-	else
-		return (DDI_FAILURE);
-}
-
-/*
- * iommu_get_pci_top_bridge()
- *   get the top level bridge for a pci device
- */
-static dev_info_t *
-iommu_get_pci_top_bridge(dev_info_t *dip)
-{
-	iommu_private_t *private;
-	dev_info_t *tmp, *pdip;
-
-	tmp = NULL;
-	pdip = ddi_get_parent(dip);
-	for (; pdip && pdip != root_devinfo; pdip = ddi_get_parent(pdip)) {
-		if (DEVI(pdip)->devi_iommu_private == NULL &&
-		    iommu_set_private(pdip) != DDI_SUCCESS)
-			continue;
-
-		private = DEVI(pdip)->devi_iommu_private;
-		ASSERT(private);
-
-		if ((private->idp_bbp_type == IOMMU_PPB_PCIE_PCI) ||
-		    (private->idp_bbp_type == IOMMU_PPB_PCI_PCI))
-			tmp = pdip;
-	}
-
-	return (tmp);
-}
-
-/*
- * domain_vmem_init()
- *   initialize the domain vmem
- */
-static void
-domain_vmem_init(dmar_domain_state_t *domain)
-{
-	char vmem_name[64];
-	static uint_t vmem_instance = 0;
-	struct memlist *mp;
-	uint64_t start, end;
-	void *vmem_ret;
-
-	(void) snprintf(vmem_name, sizeof (vmem_name),
-	    "domain_vmem_%d", vmem_instance++);
-
-	memlist_read_lock();
-	mp = phys_install;
-	end = (mp->ml_address + mp->ml_size);
-
-	/*
-	 * Skip page 0: vmem_create won't like it for obvious
-	 * reasons.
-	 */
-	if (mp->ml_address == 0) {
-		start = IOMMU_PAGE_SIZE;
-	} else {
-		start = mp->ml_address;
-	}
-
-	cmn_err(CE_CONT, "?Adding iova [0x%" PRIx64
-	    " - 0x%" PRIx64 "] to %s\n", start, end,
-	    vmem_name);
-
-	domain->dm_dvma_map = vmem_create(vmem_name,
-	    (void *)(uintptr_t)start,	/* base */
-	    end - start,		/* size */
-	    IOMMU_PAGE_SIZE,		/* quantum */
-	    NULL,			/* afunc */
-	    NULL,			/* ffunc */
-	    NULL,			/* source */
-	    0,				/* qcache_max */
-	    VM_SLEEP);
-
-	if (domain->dm_dvma_map == NULL) {
-		cmn_err(CE_PANIC, "Unable to initialize vmem map\n");
-	}
-
-	mp = mp->ml_next;
-	while (mp) {
-		vmem_ret = vmem_add(domain->dm_dvma_map,
-		    (void *)((uintptr_t)mp->ml_address),
-		    mp->ml_size, VM_NOSLEEP);
-		cmn_err(CE_CONT, "?Adding iova [0x%" PRIx64
-		    " - 0x%" PRIx64 "] to %s\n", mp->ml_address,
-		    mp->ml_address + mp->ml_size, vmem_name);
-		if (!vmem_ret)
-			cmn_err(CE_PANIC, "Unable to initialize vmem map\n");
-		mp = mp->ml_next;
-	}
-
-	memlist_read_unlock();
-}
-
-/*
- * iommu_domain_init()
- *   initialize a domain
- */
-static int
-iommu_domain_init(dmar_domain_state_t *domain)
-{
-	uint_t i;
-
-	/*
-	 * allocate the domain id
-	 */
-	if (iommu_rscs_alloc(domain->dm_iommu->iu_domain_id_hdl,
-	    &(domain->dm_domain_id)) != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "domain id exhausted %p, assign 1",
-		    (void *)domain->dm_iommu);
-		domain->dm_domain_id = 1;
-	}
-
-	/*
-	 * record the domain statistics
-	 */
-	atomic_inc_64(&(domain->dm_iommu->iu_statistics.st_domain_alloc));
-
-	/*
-	 * create vmem map
-	 */
-	domain_vmem_init(domain);
-
-	/*
-	 * create the first level page table
-	 */
-	domain->dm_page_table_paddr = iommu_get_page(domain->dm_iommu,
-	    KM_SLEEP);
-
-	mutex_init(&(domain->dm_pgtable_lock), NULL, MUTEX_DRIVER, NULL);
-
-	/*
-	 * init the CPU available page tables
-	 */
-	domain->dm_pt_tree.vp = kmem_zalloc(IOMMU_IOVPTE_TABLE_SIZE, KM_SLEEP);
-	domain->dm_pt_tree.pp = iommu_get_vaddr(domain->dm_iommu,
-	    domain->dm_page_table_paddr);
-	domain->dm_identity = B_FALSE;
-
-	/*
-	 * init the dvma cache
-	 */
-	for (i = 0; i < DVMA_CACHE_HEAD_CNT; i++) {
-		/* init the free list */
-		mutex_init(&(domain->dm_dvma_cache[i].dch_free_lock),
-		    NULL, MUTEX_DRIVER, NULL);
-		list_create(&(domain->dm_dvma_cache[i].dch_free_list),
-		    sizeof (dvma_cache_node_t),
-		    offsetof(dvma_cache_node_t, node));
-		domain->dm_dvma_cache[i].dch_free_count = 0;
-
-		/* init the memory cache list */
-		mutex_init(&(domain->dm_dvma_cache[i].dch_mem_lock),
-		    NULL, MUTEX_DRIVER, NULL);
-		list_create(&(domain->dm_dvma_cache[i].dch_mem_list),
-		    sizeof (dvma_cache_node_t),
-		    offsetof(dvma_cache_node_t, node));
-	}
-
-	list_insert_tail(&domain_states, domain);
-
-	return (DDI_SUCCESS);
-}
-
-/*
- *  Get first ancestor with a non-NULL private struct
- */
-static dev_info_t *
-iommu_get_ancestor_private(dev_info_t *dip)
-{
-	dev_info_t *pdip;
-
-	pdip = ddi_get_parent(dip);
-	for (; pdip && pdip != root_devinfo; pdip = ddi_get_parent(pdip)) {
-		if (DEVI(pdip)->devi_iommu_private == NULL &&
-		    iommu_set_private(pdip) != DDI_SUCCESS)
-			continue;
-		ASSERT(DEVI(pdip)->devi_iommu_private);
-		return (pdip);
-	}
-
-	return (NULL);
-}
-
-/*
- * dmar_check_sub()
- *   check to see if the device is under scope of a p2p bridge
- */
-static boolean_t
-dmar_check_sub(dev_info_t *dip, int seg, pci_dev_scope_t *devs)
-{
-	dev_info_t *pdip;
-	iommu_private_t *private;
-	int bus = devs->pds_bus;
-	int devfn = ((devs->pds_dev << 3) | devs->pds_func);
-
-	ASSERT(dip != root_devinfo);
-
-	pdip = ddi_get_parent(dip);
-	for (; pdip && pdip != root_devinfo; pdip = ddi_get_parent(pdip)) {
-		if (DEVI(pdip)->devi_iommu_private == NULL &&
-		    iommu_set_private(pdip) != DDI_SUCCESS)
-			continue;
-		private = DEVI(pdip)->devi_iommu_private;
-		ASSERT(private);
-		if ((private->idp_seg == seg) &&
-		    (private->idp_bus == bus) &&
-		    (private->idp_devfn == devfn))
-			return (B_TRUE);
-	}
-
-	return (B_FALSE);
-}
-
-/*
- * iommu_get_dmar()
- *   get the iommu structure for a device
- */
-static intel_iommu_state_t *
-iommu_get_dmar(dev_info_t *dip)
-{
-	iommu_private_t *private = NULL;
-	int seg, bus, dev, func;
-	pci_dev_scope_t *devs;
-	drhd_info_t *drhd;
-
-	bus = dev = func = -1;
-
-	seg = 0;
-	if (DEVI(dip)->devi_iommu_private ||
-	    iommu_set_private(dip) == DDI_SUCCESS) {
-		private = DEVI(dip)->devi_iommu_private;
-		ASSERT(private);
-		seg = private->idp_seg;
-		bus = private->idp_bus;
-		dev = GET_DEV(private->idp_devfn);
-		func = GET_FUNC(private->idp_devfn);
-	}
-
-	/*
-	 * walk the drhd list for a match
-	 */
-	for_each_in_list(&(dmar_info->dmari_drhd[seg]), drhd) {
-
-		/*
-		 * match the include all
-		 */
-		if (drhd->di_include_all)
-			return ((intel_iommu_state_t *)
-			    drhd->di_iommu);
-
-		/*
-		 * try to match the device scope
-		 */
-		for_each_in_list(&(drhd->di_dev_list), devs) {
-
-			/*
-			 * get a perfect match
-			 */
-			if (private &&
-			    devs->pds_bus == bus &&
-			    devs->pds_dev == dev &&
-			    devs->pds_func == func) {
-				return ((intel_iommu_state_t *)
-				    (drhd->di_iommu));
-			}
-
-			/*
-			 * maybe under a scope of a p2p
-			 */
-			if (devs->pds_type == 0x2 &&
-			    dmar_check_sub(dip, seg, devs))
-				return ((intel_iommu_state_t *)
-				    (drhd->di_iommu));
-		}
-	}
-
-	/*
-	 * This may happen with buggy BIOS versions. Just warn instead of
-	 * panicking, as we don't want the whole system to go down because
-	 * of one device.
-	 */
-	cmn_err(CE_WARN, "can't match iommu for %s\n",
-	    ddi_node_name(dip));
-
-	return (NULL);
-}
-
-/*
- * domain_set_root_context
- *   set root context for a single device
- */
-static void
-domain_set_root_context(dmar_domain_state_t *domain,
-    pci_dev_info_t *info, uint_t agaw)
-{
-	caddr_t root, context;
-	paddr_t paddr;
-	iorce_t rce;
-	uint_t bus, devfn;
-	intel_iommu_state_t *iommu;
-	uint_t aw_code;
-
-	ASSERT(domain);
-	iommu = domain->dm_iommu;
-	ASSERT(iommu);
-	bus = info->pdi_bus;
-	devfn = info->pdi_devfn;
-	aw_code = (agaw - 30) / 9;
-
-	/*
-	 * set root entry
-	 */
-	root = iommu_get_vaddr(iommu, iommu->iu_root_entry_paddr);
-	rce = (iorce_t)root + bus;
-	mutex_enter(&(iommu->iu_root_context_lock));
-	if (!ROOT_ENTRY_GET_P(rce)) {
-		paddr = iommu_get_page(iommu, KM_SLEEP);
-		ROOT_ENTRY_SET_P(rce);
-		ROOT_ENTRY_SET_CTP(rce, paddr);
-		iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce));
-		context = iommu_get_vaddr(iommu, paddr);
-	} else {
-		paddr = ROOT_ENTRY_GET_CTP(rce);
-		context = iommu_get_vaddr(iommu, paddr);
-	}
-
-	/* set context entry */
-	rce = (iorce_t)context + devfn;
-	if (!CONT_ENTRY_GET_P(rce)) {
-		paddr = domain->dm_page_table_paddr;
-		CONT_ENTRY_SET_P(rce);
-		CONT_ENTRY_SET_ASR(rce, paddr);
-		CONT_ENTRY_SET_AW(rce, aw_code);
-		CONT_ENTRY_SET_DID(rce, domain->dm_domain_id);
-		iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce));
-	} else if (CONT_ENTRY_GET_ASR(rce) !=
-	    domain->dm_page_table_paddr) {
-		cmn_err(CE_PANIC, "root context entry for"
-		    " %d, %d, %d has already been set", bus,
-		    devfn >> 3, devfn & 0x7);
-		/*NOTREACHED*/
-	}
-
-	mutex_exit(&(iommu->iu_root_context_lock));
-
-	/* cache mode set, flush context cache */
-	if (IOMMU_CAP_GET_CM(iommu->iu_capability)) {
-		iommu->iu_dmar_ops->do_context_fsi(iommu, 0,
-		    (bus << 8) | devfn, domain->dm_domain_id);
-		iommu->iu_dmar_ops->do_iotlb_dsi(iommu, domain->dm_domain_id);
-	/* cache mode not set, flush write buffer */
-	} else {
-		iommu->iu_dmar_ops->do_flwb(iommu);
-	}
-}
-
-/*
- * setup_single_context()
- *   setup the root context entry
- */
-static void
-setup_single_context(dmar_domain_state_t *domain,
-    int seg, int bus, int devfn)
-{
-	pci_dev_info_t info;
-
-	info.pdi_seg = seg;
-	info.pdi_bus = bus;
-	info.pdi_devfn = devfn;
-
-	domain_set_root_context(domain, &info,
-	    domain->dm_iommu->iu_agaw);
-}
-
-/*
- * setup_context_walk()
- *   the walk function to set up the possible context entries
- */
-static int
-setup_context_walk(dev_info_t *dip, void *arg)
-{
-	dmar_domain_state_t *domain = arg;
-	iommu_private_t *private;
-
-	private = DEVI(dip)->devi_iommu_private;
-	if (private == NULL && iommu_set_private(dip) != DDI_SUCCESS) {
-		cmn_err(CE_PANIC, "setup_context_walk: cannot find private");
-		/*NOTREACHED*/
-	}
-	private = DEVI(dip)->devi_iommu_private;
-	ASSERT(private);
-
-	setup_single_context(domain, private->idp_seg,
-	    private->idp_bus, private->idp_devfn);
-
-	return (DDI_WALK_PRUNECHILD);
-}
-
-/*
- * setup_possible_contexts()
- *   set up all the possible context entries for a device under ppb
- */
-static void
-setup_possible_contexts(dmar_domain_state_t *domain, dev_info_t *dip)
-{
-	int count;
-	iommu_private_t *private;
-	private = DEVI(dip)->devi_iommu_private;
-
-	ASSERT(private);
-
-	/* for pci-pci bridge */
-	if (private->idp_bbp_type == IOMMU_PPB_PCI_PCI) {
-		setup_single_context(domain, private->idp_seg,
-		    private->idp_bus, private->idp_devfn);
-		return;
-	}
-
-	/* for pcie-pci bridge */
-	setup_single_context(domain, private->idp_seg,
-	    private->idp_bus, private->idp_devfn);
-	setup_single_context(domain, private->idp_seg,
-	    private->idp_sec, 0);
-
-	/* for functions under pcie-pci bridge */
-	ndi_devi_enter(dip, &count);
-	ddi_walk_devs(ddi_get_child(dip), setup_context_walk, domain);
-	ndi_devi_exit(dip, count);
-}
-
-/*
- * iommu_alloc_domain()
- *   allocate a domain for the device; the result is returned in the
- *   domain parameter
- */
-static int
-iommu_alloc_domain(dev_info_t *dip, dmar_domain_state_t **domain)
-{
-	iommu_private_t *private, *b_private;
-	dev_info_t *bdip = NULL, *ldip = NULL;
-	dmar_domain_state_t *new;
-	pci_dev_info_t info;
-	uint_t need_to_set_parent = 0;
-	int count;
-
-	private = DEVI(dip)->devi_iommu_private;
-	if (private == NULL) {
-		cmn_err(CE_PANIC, "iommu private is NULL (%s)\n",
-		    ddi_node_name(dip));
-	}
-
-	/*
-	 * check if the domain has already allocated without lock held.
-	 * check (without the lock held) whether the domain has already
-	 * been allocated.
-	if (private->idp_intel_domain) {
-		*domain = INTEL_IOMMU_PRIVATE(private->idp_intel_domain);
-		return (DDI_SUCCESS);
-	}
-
-	/*
-	 * lock strategy for dip->devi_iommu_private->idp_intel_domain field:
-	 * 1) read access is allowed without lock held.
-	 * 2) write access is protected by ndi_devi_enter(dip, &count). Lock
-	 *    on dip will protect itself and all descendants.
-	 * 3) the lock is released once the in-kernel and iommu hardware
-	 *    data structures have been synchronized.
-	 */
-	ndi_hold_devi(dip);
-	bdip = iommu_get_pci_top_bridge(dip);
-	ASSERT(bdip == NULL || DEVI(bdip)->devi_iommu_private);
-	ldip = (bdip != NULL) ? bdip : dip;
-	ndi_devi_enter(ldip, &count);
-
-	/*
-	 * double check whether the domain has already been created by
-	 * another thread.
-	 */
-	if (private->idp_intel_domain) {
-		ndi_devi_exit(ldip, count);
-		ndi_rele_devi(dip);
-		*domain = INTEL_IOMMU_PRIVATE(private->idp_intel_domain);
-		return (DDI_SUCCESS);
-	}
-
-	/*
-	 * check to see if it is under a pci bridge
-	 */
-	if (bdip != NULL) {
-		b_private = DEVI(bdip)->devi_iommu_private;
-		ASSERT(b_private);
-		if (b_private->idp_intel_domain) {
-			new = INTEL_IOMMU_PRIVATE(b_private->idp_intel_domain);
-			goto get_domain_finish;
-		} else {
-			need_to_set_parent = 1;
-		}
-	}
-
-	/*
-	 * OK, we have to allocate a new domain
-	 */
-	new = kmem_alloc(sizeof (dmar_domain_state_t), KM_SLEEP);
-	new->dm_iommu = iommu_get_dmar(dip);
-	if (new->dm_iommu == NULL || iommu_domain_init(new) != DDI_SUCCESS) {
-		ndi_devi_exit(ldip, count);
-		ndi_rele_devi(dip);
-		kmem_free(new, sizeof (dmar_domain_state_t));
-		*domain = NULL;
-		return (DDI_FAILURE);
-	}
-
-get_domain_finish:
-	/*
-	 * setup root context entries
-	 */
-	if (bdip == NULL) {
-		info.pdi_seg = private->idp_seg;
-		info.pdi_bus = private->idp_bus;
-		info.pdi_devfn = private->idp_devfn;
-		domain_set_root_context(new, &info, new->dm_iommu->iu_agaw);
-	} else if (need_to_set_parent) {
-		setup_possible_contexts(new, bdip);
-		membar_producer();
-		b_private->idp_intel_domain = (void *)new;
-	}
-	membar_producer();
-	private->idp_intel_domain = (void *)new;
-
-	ndi_devi_exit(ldip, count);
-	ndi_rele_devi(dip);
-	*domain = new;
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * iommu_get_domain()
- *   get an iommu domain for dip; the result is returned in domain
- */
-static int
-iommu_get_domain(dev_info_t *dip, dmar_domain_state_t **domain)
-{
-	iommu_private_t *private = DEVI(dip)->devi_iommu_private;
-	dev_info_t *pdip;
-
-	ASSERT(domain);
-
-	/*
-	 * for isa devices attached under lpc
-	 */
-	pdip = ddi_get_parent(dip);
-	if (strcmp(ddi_node_name(pdip), "isa") == 0) {
-		if (lpc_devinfo) {
-			return (iommu_alloc_domain(lpc_devinfo, domain));
-		} else {
-			*domain = NULL;
-			return (DDI_FAILURE);
-		}
-	}
-
-	/*
-	 * for gart, use the real graphic devinfo
-	 */
-	if (strcmp(ddi_node_name(dip), "agpgart") == 0) {
-		if (gfx_devinfo) {
-			return (iommu_alloc_domain(gfx_devinfo, domain));
-		} else {
-			*domain = NULL;
-			return (DDI_FAILURE);
-		}
-	}
-
-	/*
-	 * if iommu private is NULL:
-	 *	1. try to find a cached private
-	 *	2. if that fails try to create a new one
-	 *	3. if this fails as well, device is probably not
-	 *	   PCI and shares domain with an ancestor.
-	 */
-	if (private == NULL && iommu_set_private(dip) != DDI_SUCCESS) {
-		if (pdip = iommu_get_ancestor_private(dip)) {
-			return (iommu_alloc_domain(pdip, domain));
-		}
-		cmn_err(CE_WARN, "Cannot find ancestor private for "
-		    "devinfo %s%d", ddi_node_name(dip),
-		    ddi_get_instance(dip));
-		*domain = NULL;
-		return (DDI_FAILURE);
-	}
-
-	/*
-	 * check if the domain has already been allocated
-	 */
-	private = DEVI(dip)->devi_iommu_private;
-	ASSERT(private);
-	if (private->idp_intel_domain) {
-		*domain = INTEL_IOMMU_PRIVATE(private->idp_intel_domain);
-		return (DDI_SUCCESS);
-	}
-
-	/*
-	 * allocate a domain for this device
-	 */
-	return (iommu_alloc_domain(dip, domain));
-}
-
-/*
- * helper functions to manipulate iommu pte
- */
-static void
-set_pte(iopte_t pte, uint_t rw, paddr_t addr)
-{
-	*pte |= (rw & 0x3);
-	*pte |= (addr & IOMMU_PAGE_MASK);
-}
-
-static paddr_t
-pte_get_paddr(iopte_t pte)
-{
-	return (*pte & IOMMU_PAGE_MASK);
-}
-
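-/*
- * A sketch of the pte layout used by the helpers above: bits 0-1 of a
- * pte carry the read/write permission bits and the bits covered by
- * IOMMU_PAGE_MASK carry the page frame address; pte_get_paddr() simply
- * masks the low permission/attribute bits back out.
- */
-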
-/*
- * dvma_level_offset()
- *   get the page table offset by specifying a dvma and level
- */
-static uint_t
-dvma_level_offset(uint64_t dvma_pn, uint_t level)
-{
-	uint_t start_bit, offset;
-
-	start_bit = (level - 1) * IOMMU_LEVEL_STRIDE;
-	offset = (dvma_pn >> start_bit) & IOMMU_LEVEL_OFFSET;
-
-	return (offset);
-}
-
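-/*
- * Illustrative example, assuming the usual VT-d stride of 9 bits per
- * level (IOMMU_LEVEL_STRIDE == 9, IOMMU_LEVEL_OFFSET == 0x1ff):
- * level 1 takes bits 0-8 of the dvma page number, level 2 takes bits
- * 9-17, and so on, so each level indexes a 512-entry page table.
- */
-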
-/*
- * iommu_setup_level_table()
- *   setup the page table for a level
- */
-static iovpte_t
-iommu_setup_level_table(dmar_domain_state_t *domain,
-    iovpte_t pvpte, uint_t offset)
-{
-	iopte_t pte;
-	iovpte_t vpte;
-	paddr_t child;
-	caddr_t vp;
-
-	vpte = (iovpte_t)(pvpte->vp) + offset;
-	pte = (iopte_t)(pvpte->pp) + offset;
-
-	/*
-	 * check whether the pde already exists, without the lock held.
-	 */
-	if (vpte->pp != NULL) {
-		return (vpte);
-	}
-
-	/* Speculatively allocate resources needed. */
-	child = iommu_get_page(domain->dm_iommu, KM_SLEEP);
-	vp = kmem_zalloc(IOMMU_IOVPTE_TABLE_SIZE, KM_SLEEP);
-	mutex_enter(&(domain->dm_pgtable_lock));
-
-	/*
-	 * double check whether the pde already exists, with the lock held.
-	 */
-	if (vpte->pp != NULL) {
-		mutex_exit(&(domain->dm_pgtable_lock));
-		kmem_free(vp, IOMMU_IOVPTE_TABLE_SIZE);
-		iommu_free_page(domain->dm_iommu, child);
-		return (vpte);
-	}
-	set_pte(pte, IOMMU_PAGE_PROP_RW, child);
-	domain->dm_iommu->iu_dmar_ops->do_clflush((caddr_t)pte, sizeof (*pte));
-	vpte->vp = vp;
-
-	/* make previous changes visible to other threads. */
-	membar_producer();
-	vpte->pp = iommu_get_vaddr(domain->dm_iommu, child);
-	mutex_exit(&(domain->dm_pgtable_lock));
-
-	return (vpte);
-}
-
-/*
- * iommu_setup_page_table()
- *   setup the page table for a dvma
- */
-static caddr_t
-iommu_setup_page_table(dmar_domain_state_t *domain, uint64_t dvma)
-{
-	iovpte_t vpte;
-	uint_t level;
-	uint_t offset;
-	int i;
-
-	level = domain->dm_iommu->iu_level;
-	vpte = &(domain->dm_pt_tree);
-
-	for (i = level; i > 1; i--) {
-		offset = dvma_level_offset(IOMMU_BTOP(dvma), i);
-		vpte = iommu_setup_level_table(domain, vpte, offset);
-	}
-
-	return (vpte->pp);
-}
-
-/*
- * iommu_map_page_range()
- *   map a range of pages for iommu translation
- *
- * domain: the device domain
- * dvma: the start dvma for mapping
- * start: the start physical address
- * end: the end physical address
- * flags: misc flag
- */
-static int
-iommu_map_page_range(dmar_domain_state_t *domain, uint64_t dvma,
-    uint64_t start, uint64_t end, int flags)
-{
-	uint_t offset;
-	iopte_t pte;
-	caddr_t vaddr, dirt;
-	uint64_t paddr = start & IOMMU_PAGE_MASK;
-	uint64_t epaddr = end & IOMMU_PAGE_MASK;
-	uint64_t ioaddr = dvma & IOMMU_PAGE_MASK;
-	uint_t count;
-
-	while (paddr <= epaddr) {
-		vaddr = iommu_setup_page_table(domain, ioaddr);
-		offset = dvma_level_offset(IOMMU_BTOP(ioaddr), 1);
-
-		count = 0;
-		dirt = (caddr_t)((iopte_t)vaddr + offset);
-		while ((paddr <= epaddr) && (offset < IOMMU_PTE_MAX)) {
-			pte = (iopte_t)vaddr + offset;
-			if (*pte != NULL) {
-				if (pte_get_paddr(pte) != paddr) {
-					cmn_err(CE_WARN, "pte already set "
-					    "with a different paddr");
-				}
-			} else {
-				set_pte(pte, IOMMU_PAGE_PROP_RW, paddr);
-			}
-			paddr += IOMMU_PAGE_SIZE;
-			offset++;
-			count++;
-		}
-
-		/* flush cpu and iotlb cache */
-		domain->dm_iommu->iu_dmar_ops->do_clflush(dirt,
-		    count * sizeof (uint64_t));
-
-		if (!(flags & IOMMU_PAGE_PROP_NOSYNC)) {
-			/* cache mode set, flush iotlb */
-			if (IOMMU_CAP_GET_CM(domain->dm_iommu->iu_capability)) {
-				domain->dm_iommu->iu_dmar_ops->
-				    do_iotlb_psi(domain->dm_iommu,
-				    0, ioaddr, count, TLB_IVA_WHOLE);
-			/* cache mode not set, flush write buffer */
-			} else {
-				domain->dm_iommu->iu_dmar_ops->
-				    do_flwb(domain->dm_iommu);
-			}
-		}
-
-		ioaddr += IOMMU_PTOB(count);
-	}
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * iommu_vmem_walker()
- */
-static void
-iommu_vmem_walker(void *arg, void *base, size_t size)
-{
-	vmem_walk_arg_t *warg = (vmem_walk_arg_t *)arg;
-	rmrr_info_t *rmrr = warg->vwa_rmrr;
-	dmar_domain_state_t *domain = warg->vwa_domain;
-	dev_info_t *dip = warg->vwa_dip;
-	uint64_t start, end;
-
-	start = MAX(rmrr->ri_baseaddr, (uint64_t)(intptr_t)base);
-	end = MIN(rmrr->ri_limiaddr + 1, (uint64_t)(intptr_t)base + size);
-	if (start < end) {
-		cmn_err(CE_WARN, "rmrr overlap with physmem [0x%"
-		    PRIx64 " - 0x%" PRIx64 "] for %s", start, end,
-		    ddi_node_name(dip));
-
-		(void) vmem_xalloc(domain->dm_dvma_map,
-		    end - start,	/* size */
-		    IOMMU_PAGE_SIZE,	/* align/quantum */
-		    0,			/* phase */
-		    0,			/* nocross */
-		    (void *)(uintptr_t)start,	/* minaddr */
-		    (void *)(uintptr_t)end,	/* maxaddr */
-		    VM_NOSLEEP);
-	}
-}
-
-/*
- * build_single_rmrr_identity_map()
- *   build identity map for a single rmrr unit
- */
-static void
-build_single_rmrr_identity_map(rmrr_info_t *rmrr)
-{
-	pci_dev_scope_t *devs;
-	pci_dev_info_t info;
-	uint64_t start, end, size;
-	dmar_domain_state_t *domain;
-	vmem_walk_arg_t warg;
-
-	info.pdi_seg = rmrr->ri_segment;
-	for_each_in_list(&(rmrr->ri_dev_list), devs) {
-		info.pdi_bus = devs->pds_bus;
-		info.pdi_devfn = (devs->pds_dev << 3) |
-		    devs->pds_func;
-
-		if (get_dip_from_info(&info) != DDI_SUCCESS) {
-			cmn_err(CE_NOTE, "RMRR: device [%x,%x,%x] listed in "
-			    "ACPI DMAR table does not exist, ignoring",
-			    info.pdi_bus, GET_DEV(info.pdi_devfn),
-			    GET_FUNC(info.pdi_devfn));
-			continue;
-		}
-
-		if (iommu_get_domain(info.pdi_dip, &domain) != DDI_SUCCESS) {
-			cmn_err(CE_WARN, "rmrr: get domain for %s failed",
-			    ddi_node_name(info.pdi_dip));
-			continue;
-		}
-
-		start = rmrr->ri_baseaddr;
-		end = rmrr->ri_limiaddr;
-		size = end - start + 1;
-
-		if (!address_in_memlist(bios_rsvd, start, size)) {
-			cmn_err(CE_WARN, "bios issue: "
-			    "rmrr [0x%" PRIx64 " - 0x%" PRIx64 "]\n"
-			    "is not in reserved memory range\n",
-			    start, end);
-		}
-
-		(void) iommu_map_page_range(domain,
-		    start, start, end,
-		    DDI_DMA_READ | DDI_DMA_WRITE |
-		    IOMMU_PAGE_PROP_NOSYNC);
-
-		/*
-		 * rmrr should never overlap phy_mem
-		 */
-		warg.vwa_rmrr = rmrr;
-		warg.vwa_domain = domain;
-		warg.vwa_dip = info.pdi_dip;
-		vmem_walk(domain->dm_dvma_map, VMEM_SPAN | VMEM_REENTRANT,
-		    iommu_vmem_walker, &warg);
-	}
-}
-
-/*
- * build_rmrr_identity_map()
- *   build identity mapping for devices under rmrr scopes
- */
-static void
-build_rmrr_identity_map(void)
-{
-	rmrr_info_t *rmrr;
-	int i;
-
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		if (list_is_empty(&(dmar_info->dmari_rmrr[i])))
-			break;
-		for_each_in_list(&(dmar_info->dmari_rmrr[i]), rmrr) {
-			list_insert_tail(&rmrr_states, rmrr);
-			build_single_rmrr_identity_map(rmrr);
-		}
-	}
-}
-
-/*
- * drhd_only_for_gfx()
- *   return B_TRUE if the drhd is only for gfx
- */
-static boolean_t
-drhd_only_for_gfx(intel_iommu_state_t *iommu)
-{
-	drhd_info_t *drhd = iommu->iu_drhd;
-	pci_dev_scope_t *devs;
-	pci_dev_info_t info;
-	int dev_num;
-
-	if (drhd->di_include_all)
-		return (B_FALSE);
-
-	/* get the device number attached to this drhd */
-	dev_num = 0;
-	for_each_in_list(&(drhd->di_dev_list), devs) {
-		dev_num++;
-	}
-
-	if (dev_num == 1) {
-		iommu_private_t *private;
-		devs = list_head(&(drhd->di_dev_list));
-		info.pdi_seg = drhd->di_segment;
-		info.pdi_bus = devs->pds_bus;
-		info.pdi_devfn = (devs->pds_dev << 3) +
-		    (devs->pds_func & 0x7);
-
-		if (get_dip_from_info(&info) != DDI_SUCCESS) {
-			return (B_FALSE);
-		}
-
-		private = DEVI(info.pdi_dip)->devi_iommu_private;
-		ASSERT(private);
-		if (private->idp_is_display)
-			return (B_TRUE);
-	}
-
-	return (B_FALSE);
-}
-
-/*
- * build_dev_identity_map()
- *   build identity map for a device
- */
-static void
-build_dev_identity_map(dev_info_t *dip)
-{
-	struct memlist *mp;
-	dmar_domain_state_t *domain;
-
-	if (iommu_get_domain(dip, &domain) != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "build identity map for %s failed, "
-		    "this device may not be functional",
-		    ddi_node_name(dip));
-		return;
-	}
-
-	memlist_read_lock();
-	mp = phys_install;
-	while (mp != NULL) {
-		(void) iommu_map_page_range(domain,
-		    mp->ml_address & IOMMU_PAGE_MASK,
-		    mp->ml_address & IOMMU_PAGE_MASK,
-		    (mp->ml_address + mp->ml_size - 1) & IOMMU_PAGE_MASK,
-		    DDI_DMA_READ | DDI_DMA_WRITE |
-		    IOMMU_PAGE_PROP_NOSYNC);
-		mp = mp->ml_next;
-	}
-
-	memlist_read_unlock();
-
-	/*
-	 * record that this domain is identity mapped; any device
-	 * which uses this domain will not need any further
-	 * mapping
-	 */
-	domain->dm_identity = B_TRUE;
-}
-
-/*
- * build dma map for bios reserved memspace
- */
-static void
-map_bios_rsvd_mem_pool(dev_info_t *dip)
-{
-	struct memlist *mp;
-	dmar_domain_state_t *domain;
-
-	if (iommu_get_domain(dip, &domain) != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "get domain for %s failed",
-		    ddi_node_name(dip));
-		return;
-	}
-
-	mp = bios_rsvd;
-	while (mp != 0) {
-		(void) iommu_map_page_range(domain,
-		    mp->ml_address & IOMMU_PAGE_MASK,
-		    mp->ml_address & IOMMU_PAGE_MASK,
-		    (mp->ml_address + mp->ml_size - 1) & IOMMU_PAGE_MASK,
-		    DDI_DMA_READ | DDI_DMA_WRITE |
-		    IOMMU_PAGE_PROP_NOSYNC);
-		cmn_err(CE_CONT, "?Mapping Reserved [0x%" PRIx64
-		    " - 0x%" PRIx64 "]\n", mp->ml_address,
-		    (mp->ml_address + mp->ml_size));
-		mp = mp->ml_next;
-	}
-}
-
-/*
- * build_isa_gfx_identity_walk()
- *   the walk function for build_isa_gfx_identity_map()
- */
-static int
-build_isa_gfx_identity_walk(dev_info_t *dip, void *arg)
-{
-	dmar_domain_state_t *domain;
-	_NOTE(ARGUNUSED(arg))
-
-	iommu_private_t *private;
-
-	if (DEVI(dip)->devi_iommu_private == NULL &&
-	    iommu_set_private(dip) != DDI_SUCCESS) {
-		/* ignore devices which cannot have private struct */
-		return (DDI_WALK_CONTINUE);
-	}
-
-	private = DEVI(dip)->devi_iommu_private;
-
-	ASSERT(private);
-
-	/* fix the gfx and fd */
-	if (private->idp_is_display) {
-		gfx_devinfo = dip;
-		build_dev_identity_map(dip);
-		return (DDI_WALK_CONTINUE);
-	} else if (private->idp_is_lpc) {
-		lpc_devinfo = dip;
-		return (DDI_WALK_CONTINUE);
-	}
-
-	if (!(usb_rmrr_quirk || usb_page0_quirk || usb_fullpa_quirk)) {
-		return (DDI_WALK_CONTINUE);
-	}
-
-	if (!((strcmp(ddi_driver_name(dip), "uhci") == 0) ||
-	    (strcmp(ddi_driver_name(dip), "ehci") == 0) ||
-	    (strcmp(ddi_driver_name(dip), "ohci") == 0))) {
-		return (DDI_WALK_CONTINUE);
-	}
-
-	/* workaround for usb legacy emulation mode */
-	if (usb_rmrr_quirk) {
-		map_bios_rsvd_mem_pool(dip);
-		cmn_err(CE_CONT,
-		    "?Workaround for %s USB rmrr\n",
-		    ddi_node_name(dip));
-	}
-
-	/*
-	 * Identify usb ehci and uhci controllers
-	 */
-	if (usb_fullpa_quirk) {
-		build_dev_identity_map(dip);
-		cmn_err(CE_CONT,
-		    "?Workaround for %s USB phys install mem\n",
-		    ddi_node_name(dip));
-		return (DDI_WALK_CONTINUE);
-	}
-
-	if (usb_page0_quirk) {
-		if (iommu_get_domain(dip, &domain) != DDI_SUCCESS) {
-			cmn_err(CE_WARN,
-			    "Unable to set up usb-quirk for %s, "
-			    "this device may not be functional",
-			    ddi_node_name(dip));
-			return (DDI_WALK_CONTINUE);
-		}
-		(void) iommu_map_page_range(domain,
-		    0, 0, 0, DDI_DMA_READ | DDI_DMA_WRITE |
-		    IOMMU_PAGE_PROP_NOSYNC);
-		cmn_err(CE_CONT, "?Workaround for %s USB [0-4k]\n",
-		    ddi_node_name(dip));
-	}
-
-	return (DDI_WALK_CONTINUE);
-}
-
-/*
- * build_isa_gfx_identity_map()
- *   build identity map for isa and gfx devices
- */
-static void
-build_isa_gfx_identity_map(void)
-{
-	int count;
-
-	/*
-	 * walk through the entire device tree
-	 */
-	ndi_devi_enter(root_devinfo, &count);
-	ddi_walk_devs(ddi_get_child(root_devinfo),
-	    build_isa_gfx_identity_walk, NULL);
-	ndi_devi_exit(root_devinfo, count);
-}
-
-/*
- * dmar_check_boot_option()
- *   check the intel iommu boot option
- */
-static void
-dmar_check_boot_option(char *opt, int *var)
-{
-	int len;
-	char *boot_option;
-
-	if ((len = do_bsys_getproplen(NULL, opt)) > 0) {
-		boot_option = kmem_alloc(len, KM_SLEEP);
-		(void) do_bsys_getprop(NULL, opt, boot_option);
-		if (strcmp(boot_option, "yes") == 0 ||
-		    strcmp(boot_option, "true") == 0) {
-			cmn_err(CE_CONT, "\"%s=true\" was set\n",
-			    opt);
-			*var = 1;
-		} else if (strcmp(boot_option, "no") == 0 ||
-		    strcmp(boot_option, "false") == 0) {
-			cmn_err(CE_CONT, "\"%s=false\" was set\n",
-			    opt);
-			*var = 0;
-		}
-		kmem_free(boot_option, len);
-	}
-}
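/*
 * Illustrative usage (assumed syntax, not part of this changeset): the
 * options read here are boot properties, so they would typically be set on
 * the kernel boot line or with eeprom(1M), e.g.
 *
 *	-B dmar-drhd-disable=true
 *	eeprom usb-page0-quirk=true
 */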
-
-extern void (*rootnex_iommu_init)(void);
-
-/*
- * intel_iommu_attach_dmar_nodes()
- *   attach intel iommu nodes
- */
-int
-intel_iommu_attach_dmar_nodes(void)
-{
-	drhd_info_t *drhd;
-	intel_iommu_state_t *iommu;
-	int i;
-
-	/*
-	 * retrieve the dmar boot options
-	 */
-	cmn_err(CE_CONT, "?Checking dmar-related boot options\n");
-	dmar_check_boot_option("dmar-gfx-disable", &gfx_drhd_disable);
-	dmar_check_boot_option("dmar-drhd-disable", &dmar_drhd_disable);
-	dmar_check_boot_option("usb-page0-quirk", &usb_page0_quirk);
-	dmar_check_boot_option("usb-fullpa-quirk", &usb_fullpa_quirk);
-	dmar_check_boot_option("usb-rmrr-quirk", &usb_rmrr_quirk);
-	dmar_check_boot_option("qinv-disable", &qinv_disable);
-	dmar_check_boot_option("intrr-disable", &intrr_disable);
-
-	/*
-	 * init the lists
-	 */
-	list_create(&iommu_states, sizeof (intel_iommu_state_t),
-	    offsetof(intel_iommu_state_t, node));
-	list_create(&domain_states, sizeof (dmar_domain_state_t),
-	    offsetof(dmar_domain_state_t, node));
-	list_create(&rmrr_states, sizeof (rmrr_info_t),
-	    offsetof(rmrr_info_t, node4states));
-
-	root_devinfo = ddi_root_node();
-	ASSERT(root_devinfo);
-
-	check_hwquirk();
-
-	iommu_page_init();
-
-	/*
-	 * initialize each iommu unit
-	 */
-	cmn_err(CE_CONT, "?Creating iommu state structures\n");
-	for (i = 0; i < DMAR_MAX_SEGMENT; i++) {
-		for_each_in_list(&(dmar_info->dmari_drhd[i]), drhd) {
-			if (create_iommu_state(drhd) != DDI_SUCCESS)
-				goto iommu_init_fail;
-		}
-	}
-
-	/*
-	 * register interrupt remap ops
-	 */
-	if ((dmar_info->dmari_intr_remap == B_TRUE) && !intrr_disable) {
-		psm_vt_ops = &intr_remap_ops;
-	}
-
-	/*
-	 * build identity map for devices in the rmrr scope
-	 */
-	cmn_err(CE_CONT, "?Preparing identity map for rmrr\n");
-	build_rmrr_identity_map();
-
-	/*
-	 * build identity map for isa and gfx devices
-	 */
-	cmn_err(CE_CONT, "?Preparing identity map for gfx\n");
-	build_isa_gfx_identity_map();
-
-	/*
-	 * initialize the dvma cookie cache
-	 */
-	for (i = 0; i < MAX_COOKIE_CACHE_SIZE; i++) {
-		mutex_init(&(cookie_cache[i].dch_lock), NULL,
-		    MUTEX_DRIVER, NULL);
-		cookie_cache[i].dch_count = 0;
-		cookie_cache[i].dch_next = NULL;
-	}
-
-	/*
-	 * register the iommu init function with the rootnex
-	 */
-	rootnex_iommu_init = intel_iommu_init;
-
-	return (DDI_SUCCESS);
-
-iommu_init_fail:
-
-	/*
-	 * free iommu state structure
-	 */
-	while (iommu = list_head(&iommu_states)) {
-		list_remove(&iommu_states, iommu);
-		destroy_iommu_state(iommu);
-	}
-	list_destroy(&iommu_states);
-
-	return (DDI_FAILURE);
-}
-
-/*
- * get_level_table()
- *   get the level n page table; NULL is returned on
- *   failure
- */
-static caddr_t
-get_level_table(dmar_domain_state_t *domain,
-    uint64_t dvma_pn, uint_t n)
-{
-	iovpte_t vpte;
-	uint_t level;
-	uint_t i, offset;
-
-	level = domain->dm_iommu->iu_level;
-	ASSERT(level >= n);
-	vpte = &(domain->dm_pt_tree);
-
-	/* walk to the level n page table */
-	for (i = level; i > n; i--) {
-		offset = dvma_level_offset(dvma_pn, i);
-		vpte = (iovpte_t)(vpte->vp) + offset;
-	}
-
-	return (vpte->pp);
-}
-
-/*
- * iommu_alloc_cookie_array()
- *   allocate the cookie array which is needed by map sgl
- */
-static int
-iommu_alloc_cookie_array(rootnex_dma_t *dma,
-    struct ddi_dma_req *dmareq, uint_t prealloc)
-{
-	int kmflag;
-	rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo);
-
-	/* figure out the rough estimate of array size */
-	sinfo->si_max_pages =
-	    (dmareq->dmar_object.dmao_size + IOMMU_PAGE_OFFSET) /
-	    sinfo->si_max_cookie_size + 1;
-
-	/* the preallocated buffer fits this size */
-	if (sinfo->si_max_pages <= prealloc) {
-		dma->dp_cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer;
-		dma->dp_need_to_free_cookie = B_FALSE;
-	/* we need to allocate new array */
-	} else {
-		/* convert the sleep flags */
-		if (dmareq->dmar_fp == DDI_DMA_SLEEP) {
-			kmflag =  KM_SLEEP;
-		} else {
-			kmflag =  KM_NOSLEEP;
-		}
-
-		dma->dp_cookie_size = sinfo->si_max_pages *
-		    sizeof (ddi_dma_cookie_t);
-		dma->dp_cookies = kmem_alloc(dma->dp_cookie_size, kmflag);
-		if (dma->dp_cookies == NULL) {
-			return (IOMMU_SGL_NORESOURCES);
-		}
-		dma->dp_need_to_free_cookie = B_TRUE;
-	}
-
-	/* allocate the dvma cookie array */
-	dma->dp_dvma_cookies = get_dvma_cookie_array(sinfo->si_max_pages);
-
-	return (IOMMU_SGL_SUCCESS);
-}
-
-/*
- * iommu_alloc_dvma()
- *   alloc a dvma range for the caller
- */
-static int
-iommu_alloc_dvma(dmar_domain_state_t *domain, uint_t size,
-    ddi_dma_impl_t *hp, uint64_t *dvma, uint_t cnt)
-{
-	rootnex_dma_t *dma;
-	ddi_dma_attr_t *dma_attr;
-	iommu_dvma_cookie_t *dcookie;
-	uint64_t ioaddr;
-	size_t xsize, align, nocross;
-	uint64_t minaddr, maxaddr;
-
-	/* shortcuts */
-	dma = (rootnex_dma_t *)hp->dmai_private;
-	dma_attr = &(hp->dmai_attr);
-	dcookie = dma->dp_dvma_cookies;
-
-	/* parameters */
-	xsize = (size + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK;
-	align = MAX((size_t)(dma_attr->dma_attr_align), IOMMU_PAGE_SIZE);
-	nocross = (size_t)(dma_attr->dma_attr_seg + 1);
-	minaddr = dma_attr->dma_attr_addr_lo;
-	maxaddr = dma_attr->dma_attr_addr_hi + 1;
-
-	/* handle the rollover cases */
-	if (maxaddr < dma_attr->dma_attr_addr_hi) {
-		maxaddr = dma_attr->dma_attr_addr_hi;
-	}
-
-	/* get from cache first */
-	ioaddr = iommu_dvma_cache_get(domain, xsize, align, nocross);
-
-	if (ioaddr == NULL) {
-		/* allocate from vmem arena */
-		ioaddr = (uint64_t)(uintptr_t)vmem_xalloc(domain->dm_dvma_map,
-		    xsize, align, 0, nocross,
-		    (void *)(uintptr_t)minaddr,
-		    (void *)(uintptr_t)maxaddr,
-		    VM_NOSLEEP);
-
-		/* if xalloc failed, we have to flush the cache and retry */
-		if (ioaddr == NULL) {
-			iommu_dvma_cache_flush(domain, dma->dp_dip);
-			ioaddr = (uint64_t)(uintptr_t)vmem_xalloc(
-			    domain->dm_dvma_map,
-			    xsize, align, 0, nocross,
-			    (void *)(uintptr_t)minaddr,
-			    (void *)(uintptr_t)maxaddr,
-			    VM_NOSLEEP);
-			ASSERT(ioaddr);
-		}
-	}
-
-	ASSERT(ioaddr >= minaddr);
-	ASSERT(ioaddr + size - 1 < maxaddr);
-
-	*dvma = ioaddr;
-
-	/*
-	 * save the dvma range in the device dvma cookie
-	 */
-	dcookie[cnt].dc_addr = ioaddr;
-	dcookie[cnt].dc_size = xsize;
-	dcookie[cnt].dc_domain = domain;
-	dcookie[cnt].dc_align = align;
-
-	return (DDI_SUCCESS);
-}
-
-/*
- * iommu_map_dvma()
- *   map dvma to the physical addresses, the actual
- *   mapped dvma page number is returned
- */
-static int
-iommu_map_dvma(dmar_domain_state_t *domain, uint64_t dvma,
-    uint64_t paddr, uint_t psize, struct ddi_dma_req *dmareq)
-{
-	uint64_t start, end;
-	int flags;
-
-	start = paddr & IOMMU_PAGE_MASK;
-	end = (paddr + psize - 1) & IOMMU_PAGE_MASK;
-	flags = dmareq->dmar_flags & DDI_DMA_RDWR;
-
-	/* map each physical address */
-	(void) iommu_map_page_range(domain, dvma, start, end, flags);
-	return (IOMMU_BTOP(end - start) + 1);
-}
-
-/*
- * intel_iommu_map_sgl()
- *   called from rootnex_dma_bindhdl(), to build dma
- *   cookies when iommu is enabled
- */
-int
-intel_iommu_map_sgl(ddi_dma_handle_t handle,
-    struct ddi_dma_req *dmareq, uint_t prealloc)
-{
-	ddi_dma_atyp_t buftype;
-	uint64_t offset;
-	page_t **pparray;
-	uint64_t paddr;
-	uint64_t dvma;
-	uint_t psize;
-	uint_t size;
-	uint64_t maxseg;
-	caddr_t vaddr;
-	uint_t pcnt, cnt;
-	page_t *page;
-	ddi_dma_cookie_t *sgl;
-	rootnex_sglinfo_t *sglinfo;
-	ddi_dma_obj_t *dmar_object;
-	ddi_dma_impl_t *hp;
-	rootnex_dma_t *dma;
-	dmar_domain_state_t *domain;
-	int e;
-
-	hp = (ddi_dma_impl_t *)handle;
-	dma = (rootnex_dma_t *)hp->dmai_private;
-	sglinfo = &(dma->dp_sglinfo);
-	dmar_object = &(dmareq->dmar_object);
-	maxseg = sglinfo->si_max_cookie_size;
-	pparray = dmar_object->dmao_obj.virt_obj.v_priv;
-	vaddr = dmar_object->dmao_obj.virt_obj.v_addr;
-	buftype = dmar_object->dmao_type;
-	size = dmar_object->dmao_size;
-
-	/* get domain for the dma request */
-	if (iommu_get_domain(dma->dp_dip, &domain) != DDI_SUCCESS) {
-		cmn_err(CE_WARN, "get domain for %s failed",
-		    ddi_node_name(dma->dp_dip));
-		return (IOMMU_SGL_NORESOURCES);
-	}
-
-	/* direct return if drhd is disabled */
-	if (!(domain->dm_iommu->iu_enabled & DMAR_ENABLE) ||
-	    domain->dm_identity)
-		return (IOMMU_SGL_DISABLE);
-
-	/*
-	 * allocate the cookie arrays; if the pre-allocated
-	 * space is not enough, allocate new ones
-	 */
-	if (iommu_alloc_cookie_array(dma, dmareq, prealloc)
-	    != IOMMU_SGL_SUCCESS)
-		return (IOMMU_SGL_NORESOURCES);
-	hp->dmai_cookie = dma->dp_cookies;
-	sgl = dma->dp_cookies;
-
-	pcnt = 0;
-	cnt = 0;
-
-	/* retrieve paddr, psize, offset from dmareq */
-	if (buftype == DMA_OTYP_PAGES) {
-		page = dmar_object->dmao_obj.pp_obj.pp_pp;
-		ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
-		offset =  dmar_object->dmao_obj.pp_obj.pp_offset &
-		    MMU_PAGEOFFSET;
-		paddr = pfn_to_pa(page->p_pagenum) + offset;
-		psize = MIN((MMU_PAGESIZE - offset), size);
-		sglinfo->si_asp = NULL;
-		page = page->p_next;
-	} else {
-		ASSERT((buftype == DMA_OTYP_VADDR) ||
-		    (buftype == DMA_OTYP_BUFVADDR));
-		sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as;
-		if (sglinfo->si_asp == NULL) {
-			sglinfo->si_asp = &kas;
-		}
-		offset = (uintptr_t)vaddr & MMU_PAGEOFFSET;
-
-		if (pparray != NULL) {
-			ASSERT(!PP_ISFREE(pparray[pcnt]));
-			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset;
-			psize = MIN((MMU_PAGESIZE - offset), size);
-			pcnt++;
-		} else {
-			paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat,
-			    vaddr)) + offset;
-			psize = MIN(size, (MMU_PAGESIZE - offset));
-			vaddr += psize;
-		}
-	}
-
-	/* save the iommu page offset */
-	sglinfo->si_buf_offset = offset & IOMMU_PAGE_OFFSET;
-
-	/*
-	 * allocate the dvma and map [paddr, paddr+psize)
-	 */
-	e = iommu_alloc_dvma(domain, MIN(size + sglinfo->si_buf_offset,
-	    maxseg), hp, &dvma, cnt);
-	if (e != DDI_SUCCESS)
-		return (IOMMU_SGL_NORESOURCES);
-	e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
-
-	/*
-	 * set up the first cookie with the dvma of the page
-	 * and its size; the offset into the first page is
-	 * not accounted for yet
-	 */
-	sgl[cnt].dmac_laddress = dvma;
-	sgl[cnt].dmac_size = psize + sglinfo->si_buf_offset;
-	sgl[cnt].dmac_type = 0;
-	dvma += IOMMU_PTOB(e);
-
-	size -= psize;
-	while (size > 0) {
-		/* get the size for this page (i.e. partial or full page) */
-		psize = MIN(size, MMU_PAGESIZE);
-		if (buftype == DMA_OTYP_PAGES) {
-			/* get the paddr from the page_t */
-			ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page));
-			paddr = pfn_to_pa(page->p_pagenum);
-			page = page->p_next;
-		} else if (pparray != NULL) {
-			/* index into the array of page_t's to get the paddr */
-			ASSERT(!PP_ISFREE(pparray[pcnt]));
-			paddr = pfn_to_pa(pparray[pcnt]->p_pagenum);
-			pcnt++;
-		} else {
-			/* call into the VM to get the paddr */
-			paddr = pfn_to_pa(hat_getpfnum
-			    (sglinfo->si_asp->a_hat, vaddr));
-			vaddr += psize;
-		}
-
-		/*
-		 * check to see if this page would put us
-		 * over the max cookie size
-		 */
-		if ((sgl[cnt].dmac_size + psize) > maxseg) {
-			/* use the next cookie */
-			cnt++;
-
-			/* allocate the dvma and map [paddr, paddr+psize) */
-			e = iommu_alloc_dvma(domain, MIN(size, maxseg),
-			    hp, &dvma, cnt);
-			if (e != DDI_SUCCESS)
-				return (IOMMU_SGL_NORESOURCES);
-			e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
-
-			/* save the cookie information */
-			sgl[cnt].dmac_laddress = dvma;
-			sgl[cnt].dmac_size = psize;
-			sgl[cnt].dmac_type = 0;
-			dvma += IOMMU_PTOB(e);
-
-		/*
-		 * we can add this page to the current cookie
-		 */
-		} else {
-			e  = iommu_map_dvma(domain, dvma, paddr, psize, dmareq);
-			sgl[cnt].dmac_size += psize;
-			dvma += IOMMU_PTOB(e);
-		}
-
-		size -= psize;
-	}
-
-	/* account for the offset into the first page */
-	sgl[0].dmac_laddress += sglinfo->si_buf_offset;
-	sgl[0].dmac_size -= sglinfo->si_buf_offset;
-
-	/* save away how many cookies we have */
-	sglinfo->si_sgl_size = cnt + 1;
-
-	return (IOMMU_SGL_SUCCESS);
-}
-
-/*
- * iommu_clear_leaf_pte()
- *   clear a single leaf pte
- */
-static void
-iommu_clear_leaf_pte(dmar_domain_state_t *domain, uint64_t dvma, uint64_t size)
-{
-	iopte_t pte;
-	uint_t offset;
-	caddr_t leaf_table, dirt;
-	uint64_t csize = 0;
-	uint64_t cdvma = dvma & IOMMU_PAGE_MASK;
-	int count;
-
-	while (csize < size) {
-
-		/* retrieve the leaf page table */
-		leaf_table = get_level_table(domain, IOMMU_BTOP(cdvma), 1);
-		if (!leaf_table) {
-			cmn_err(CE_WARN, "get level 1 table for 0x%"
-			    PRIx64 " failed", cdvma);
-			return;
-		}
-
-		/* map the leaf page and walk to the pte */
-		offset = dvma_level_offset(IOMMU_BTOP(cdvma), 1);
-
-		/* clear the ptes */
-		count = 0;
-		dirt = (caddr_t)((iopte_t)leaf_table + offset);
-		while ((csize < size) &&
-		    (offset < IOMMU_PTE_MAX)) {
-			pte = (iopte_t)leaf_table + offset;
-			if (!*pte) {
-				cmn_err(CE_WARN, "try to clear NULL pte");
-			} else {
-				*pte = 0;
-			}
-			csize += IOMMU_PAGE_SIZE;
-			offset++;
-			count++;
-		}
-
-		/* flush cpu and iotlb cache */
-		domain->dm_iommu->iu_dmar_ops->do_clflush(dirt,
-		    count * sizeof (uint64_t));
-		domain->dm_iommu->iu_dmar_ops->do_iotlb_psi(domain->dm_iommu,
-		    domain->dm_domain_id, cdvma, count, TLB_IVA_WHOLE);
-
-		/* unmap the leaf page */
-		cdvma += IOMMU_PTOB(count);
-	}
-}
-
-/*
- * intel_iommu_unmap_sgl()
- *   called from rootnex_dma_unbindhdl(), to unbind dma
- *   cookies when iommu is enabled
- */
-void
-intel_iommu_unmap_sgl(ddi_dma_handle_t handle)
-{
-	ddi_dma_impl_t *hp;
-	rootnex_dma_t *dma;
-	dmar_domain_state_t *domain;
-	iommu_dvma_cookie_t *dcookies;
-	rootnex_sglinfo_t *sinfo;
-	uint64_t i;
-
-	hp = (ddi_dma_impl_t *)handle;
-	dma = (rootnex_dma_t *)hp->dmai_private;
-	dcookies = dma->dp_dvma_cookies;
-	sinfo = &(dma->dp_sglinfo);
-
-	/* get the device domain, no return check needed here */
-	(void) iommu_get_domain(dma->dp_dip, &domain);
-
-	/* if the drhd is disabled, nothing will be done */
-	if (!(domain->dm_iommu->iu_enabled & DMAR_ENABLE) ||
-	    domain->dm_identity)
-		return;
-
-	/* the drhd is enabled */
-	for (i = 0; i < sinfo->si_sgl_size; i++) {
-		/* clear leaf ptes */
-		iommu_clear_leaf_pte(domain, dcookies[i].dc_addr,
-		    dcookies[i].dc_size);
-	}
-
-	domain->dm_iommu->iu_dmar_ops->do_reap_wait(domain->dm_iommu);
-	domain->dm_iommu->iu_dmar_ops->do_plant_wait(domain->dm_iommu,
-	    dcookies, sinfo->si_sgl_size, sinfo->si_max_pages);
-}
-
-/*
- * initialize the invalidation request queue structure.
- * call ddi_dma_mem_alloc to allocate physically contiguous
- * pages for the invalidation queue table
- */
-static int
-iommu_qinv_init(intel_iommu_state_t *iommu)
-{
-	inv_queue_state_t *inv_queue;
-	size_t size;
-
-	ddi_dma_attr_t inv_queue_dma_attr = {
-		DMA_ATTR_V0,
-		0U,
-		0xffffffffU,
-		0xffffffffU,
-		MMU_PAGESIZE, /* page aligned */
-		0x1,
-		0x1,
-		0xffffffffU,
-		0xffffffffU,
-		1,
-		4,
-		0
-	};
-
-	ddi_device_acc_attr_t inv_queue_acc_attr = {
-		DDI_DEVICE_ATTR_V0,
-		DDI_NEVERSWAP_ACC,
-		DDI_STRICTORDER_ACC
-	};
-
-	if (qinv_iqa_qs > QINV_MAX_QUEUE_SIZE)
-		qinv_iqa_qs = QINV_MAX_QUEUE_SIZE;
-
-	inv_queue = (inv_queue_state_t *)
-	    kmem_zalloc(sizeof (inv_queue_state_t), KM_SLEEP);
-
-	/* set devi_ops in dev info structure for ddi_dma_mem_alloc */
-	DEVI(iommu->iu_drhd->di_dip)->devi_ops =
-	    DEVI(ddi_root_node())->devi_ops;
-
-	/*
-	 * set devi_bus_dma_allochdl in dev info structure for
-	 * ddi_dma_free_handle
-	 */
-	DEVI(iommu->iu_drhd->di_dip)->devi_bus_dma_allochdl =
-	    DEVI(ddi_root_node());
-
-	if (ddi_dma_alloc_handle(iommu->iu_drhd->di_dip,
-	    &inv_queue_dma_attr,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(inv_queue->iq_table.dma_hdl)) != DDI_SUCCESS) {
-		cmn_err(CE_WARN,
-		    "alloc invalidation queue table handle failed\n");
-		goto queue_table_handle_failed;
-	}
-
-	if (ddi_dma_alloc_handle(iommu->iu_drhd->di_dip,
-	    &inv_queue_dma_attr,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(inv_queue->iq_sync.dma_hdl)) != DDI_SUCCESS) {
-		cmn_err(CE_WARN,
-		    "alloc invalidation queue sync mem handle failed\n");
-		goto sync_table_handle_failed;
-	}
-
-	inv_queue->iq_table.size = (1 << (qinv_iqa_qs + 8));
-	size = inv_queue->iq_table.size * QINV_ENTRY_SIZE;
-
-	/* alloc physical contiguous pages for invalidation queue */
-	if (ddi_dma_mem_alloc(inv_queue->iq_table.dma_hdl,
-	    size,
-	    &inv_queue_acc_attr,
-	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(inv_queue->iq_table.vaddr),
-	    &size,
-	    &(inv_queue->iq_table.acc_hdl)) != DDI_SUCCESS) {
-		cmn_err(CE_WARN,
-		    "alloc invalidation queue table failed\n");
-		goto queue_table_mem_failed;
-	}
-
-	ASSERT(!((uintptr_t)inv_queue->iq_table.vaddr & MMU_PAGEOFFSET));
-	bzero(inv_queue->iq_table.vaddr, size);
-
-	/* get the base physical address of invalidation request queue */
-	inv_queue->iq_table.paddr = pfn_to_pa(
-	    hat_getpfnum(kas.a_hat, inv_queue->iq_table.vaddr));
-
-	inv_queue->iq_table.head = inv_queue->iq_table.tail = 0;
-
-	inv_queue->iq_sync.size = inv_queue->iq_table.size;
-	size = inv_queue->iq_sync.size * QINV_SYNC_DATA_SIZE;
-
-	/* alloc status memory for invalidation wait descriptor */
-	if (ddi_dma_mem_alloc(inv_queue->iq_sync.dma_hdl,
-	    size,
-	    &inv_queue_acc_attr,
-	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(inv_queue->iq_sync.vaddr),
-	    &size,
-	    &(inv_queue->iq_sync.acc_hdl)) != DDI_SUCCESS) {
-		cmn_err(CE_WARN,
-		    "alloc invalidation queue sync mem failed\n");
-		goto sync_table_mem_failed;
-	}
-
-	ASSERT(!((uintptr_t)inv_queue->iq_sync.vaddr & MMU_PAGEOFFSET));
-	bzero(inv_queue->iq_sync.vaddr, size);
-	inv_queue->iq_sync.paddr = pfn_to_pa(
-	    hat_getpfnum(kas.a_hat, inv_queue->iq_sync.vaddr));
-
-	inv_queue->iq_sync.head = inv_queue->iq_sync.tail = 0;
-
-	mutex_init(&(inv_queue->iq_table.lock), NULL, MUTEX_DRIVER, NULL);
-	mutex_init(&(inv_queue->iq_sync.lock), NULL, MUTEX_DRIVER, NULL);
-
-	/*
-	 * init the iotlb pending node array used for submitting
-	 * iotlb invalidation queue requests
-	 */
-	inv_queue->iotlb_pend_node = (iotlb_pend_node_t **)
-	    kmem_zalloc(inv_queue->iq_sync.size
-	    * sizeof (iotlb_pend_node_t *), KM_SLEEP);
-
-	/* set invalidation queue structure */
-	iommu->iu_inv_queue = inv_queue;
-
-	return (DDI_SUCCESS);
-
-sync_table_mem_failed:
-	ddi_dma_mem_free(&(inv_queue->iq_table.acc_hdl));
-
-queue_table_mem_failed:
-	ddi_dma_free_handle(&(inv_queue->iq_sync.dma_hdl));
-
-sync_table_handle_failed:
-	ddi_dma_free_handle(&(inv_queue->iq_table.dma_hdl));
-
-queue_table_handle_failed:
-	kmem_free(inv_queue, sizeof (inv_queue_state_t));
-
-	return (ENOMEM);
-}
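/*
 * Sizing note, derived from the code above: the descriptor queue holds
 * 1 << (qinv_iqa_qs + 8) entries, so qinv_iqa_qs == 0 gives 256 descriptors
 * (one 4KB page if QINV_ENTRY_SIZE is the usual 16-byte VT-d descriptor).
 * A second buffer with the same number of entries (iq_sync) holds the
 * status data written back by invalidation wait descriptors.
 */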
-
-/* destroy invalidation queue structure */
-static void
-iommu_qinv_fini(intel_iommu_state_t *iommu)
-{
-	inv_queue_state_t *inv_queue;
-
-	inv_queue = iommu->iu_inv_queue;
-	kmem_free(inv_queue->iotlb_pend_node,
-	    inv_queue->iq_sync.size
-	    * sizeof (iotlb_pend_node_t *));
-	ddi_dma_mem_free(&(inv_queue->iq_sync.acc_hdl));
-	ddi_dma_mem_free(&(inv_queue->iq_table.acc_hdl));
-	ddi_dma_free_handle(&(inv_queue->iq_sync.dma_hdl));
-	ddi_dma_free_handle(&(inv_queue->iq_table.dma_hdl));
-	mutex_destroy(&(inv_queue->iq_table.lock));
-	mutex_destroy(&(inv_queue->iq_sync.lock));
-	kmem_free(inv_queue, sizeof (inv_queue_state_t));
-}
-
-/* enable queued invalidation interface */
-static void
-iommu_qinv_enable(intel_iommu_state_t *iommu)
-{
-	inv_queue_state_t *inv_queue;
-	uint64_t iqa_reg_value;
-	uint32_t status;
-
-	struct dmar_ops *dmar_ops;
-
-	inv_queue = iommu->iu_inv_queue;
-
-	iqa_reg_value = inv_queue->iq_table.paddr | qinv_iqa_qs;
-
-	mutex_enter(&iommu->iu_reg_lock);
-	/* Initialize the Invalidation Queue Tail register to zero */
-	iommu_put_reg64(iommu, IOMMU_REG_INVAL_QT, 0);
-
-	/* set invalidation queue base address register */
-	iommu_put_reg64(iommu, IOMMU_REG_INVAL_QAR, iqa_reg_value);
-
-	/* enable queued invalidation interface */
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_QIE);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_QIES), status);
-	mutex_exit(&iommu->iu_reg_lock);
-
-	iommu->iu_global_cmd_reg |= IOMMU_GCMD_QIE;
-	iommu->iu_enabled |= QINV_ENABLE;
-
-	/* set new queued invalidation interface */
-	dmar_ops = iommu->iu_dmar_ops;
-
-	dmar_ops->do_context_fsi = qinv_cc_fsi;
-	dmar_ops->do_context_dsi = qinv_cc_dsi;
-	dmar_ops->do_context_gbl = qinv_cc_gbl;
-	dmar_ops->do_iotlb_psi = qinv_iotlb_psi;
-	dmar_ops->do_iotlb_dsi = qinv_iotlb_dsi;
-	dmar_ops->do_iotlb_gbl = qinv_iotlb_gbl;
-	dmar_ops->do_plant_wait = qinv_plant_wait;
-	dmar_ops->do_reap_wait = qinv_reap_wait;
-}
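/*
 * The enable sequence above follows the usual VT-d queued-invalidation
 * bring-up: zero the queue tail (IQT), program the queue address register
 * (IQA) with the table base and size, set GCMD.QIE, then poll GSTS until
 * QIES is observed.  Only after that is the dmar_ops vector switched from
 * the register-based invalidation routines to the qinv_* routines.
 */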
-
-/* submit invalidation request descriptor to invalidation queue */
-static void
-qinv_submit_inv_dsc(intel_iommu_state_t *iommu, inv_dsc_t *dsc)
-{
-	inv_queue_state_t *inv_queue;
-	inv_queue_mem_t *iq_table;
-	uint_t tail;
-
-	inv_queue = iommu->iu_inv_queue;
-	iq_table = &(inv_queue->iq_table);
-
-	mutex_enter(&iq_table->lock);
-	tail = iq_table->tail;
-	iq_table->tail++;
-
-	if (iq_table->tail == iq_table->size)
-		iq_table->tail = 0;
-
-	while (iq_table->head == iq_table->tail) {
-		/*
-		 * inv queue table exhausted, wait for hardware to fetch
-		 * the next descriptor
-		 */
-		iq_table->head = QINV_IQA_HEAD(
-		    iommu_get_reg64(iommu, IOMMU_REG_INVAL_QH));
-	}
-
-	bcopy(dsc, iq_table->vaddr + tail * QINV_ENTRY_SIZE,
-	    QINV_ENTRY_SIZE);
-
-	iommu_put_reg64(iommu, IOMMU_REG_INVAL_QT,
-	    iq_table->tail << QINV_IQA_TAIL_SHIFT);
-
-	mutex_exit(&iq_table->lock);
-}
-
-/* queued invalidation interface -- invalidate context cache */
-static void
-qinv_cc_common(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id, ctt_inv_g_t type)
-{
-	inv_dsc_t dsc;
-
-	dsc.lo = CC_INV_DSC_LOW(function_mask, source_id, domain_id, type);
-	dsc.hi = CC_INV_DSC_HIGH;
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-
-	/* record the context cache statistics */
-	atomic_inc_64(&(iommu->iu_statistics.st_context_cache));
-}
-
-/* queued invalidation interface -- invalidate iotlb */
-static void
-qinv_iotlb_common(intel_iommu_state_t *iommu, uint_t domain_id,
-    uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type)
-{
-	inv_dsc_t dsc;
-	uint8_t dr = 0;
-	uint8_t dw = 0;
-
-	if (IOMMU_CAP_GET_DRD(iommu->iu_capability))
-		dr = 1;
-	if (IOMMU_CAP_GET_DWD(iommu->iu_capability))
-		dw = 1;
-
-	switch (type) {
-	case TLB_INV_G_PAGE:
-		if (!IOMMU_CAP_GET_PSI(iommu->iu_capability) ||
-		    am > IOMMU_CAP_GET_MAMV(iommu->iu_capability) ||
-		    addr & IOMMU_PAGE_OFFSET) {
-			type = TLB_INV_G_DOMAIN;
-			goto qinv_ignore_psi;
-		}
-		dsc.lo = IOTLB_INV_DSC_LOW(domain_id, dr, dw, type);
-		dsc.hi = IOTLB_INV_DSC_HIGH(addr, hint, am);
-		break;
-
-	qinv_ignore_psi:
-	case TLB_INV_G_DOMAIN:
-		dsc.lo = IOTLB_INV_DSC_LOW(domain_id, dr, dw, type);
-		dsc.hi = 0;
-		break;
-
-	case TLB_INV_G_GLOBAL:
-		dsc.lo = IOTLB_INV_DSC_LOW(0, dr, dw, type);
-		dsc.hi = 0;
-		break;
-	default:
-		cmn_err(CE_WARN, "incorrect iotlb flush type");
-		return;
-	}
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-
-	/*
-	 * check the result and record the statistics
-	 */
-	switch (type) {
-	/* global */
-	case TLB_INV_G_GLOBAL:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_global));
-		break;
-	/* domain */
-	case TLB_INV_G_DOMAIN:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_domain));
-		break;
-	/* psi */
-	case TLB_INV_G_PAGE:
-		atomic_inc_64(&(iommu->iu_statistics.st_iotlb_psi));
-		break;
-	default:
-		break;
-	}
-}
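/*
 * Fallback policy in qinv_iotlb_common() above: a page-selective (PSI)
 * invalidation is issued only when the hardware advertises PSI, the
 * requested address mask fits within CAP.MAMV and the address is page
 * aligned; otherwise the request is silently widened to a domain-selective
 * invalidation.
 */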
-
-/* queued invalidation interface -- invalidate dev_iotlb */
-static void
-qinv_dev_iotlb_common(intel_iommu_state_t *iommu, uint16_t sid,
-    uint64_t addr, uint_t size, uint_t max_invs_pd)
-{
-	inv_dsc_t dsc;
-
-	dsc.lo = DEV_IOTLB_INV_DSC_LOW(sid, max_invs_pd);
-	dsc.hi = DEV_IOTLB_INV_DSC_HIGH(addr, size);
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-}
-
-/* queued invalidation interface -- invalidate interrupt entry cache */
-static void
-qinv_iec_common(intel_iommu_state_t *iommu, uint_t iidx, uint_t im, uint_t g)
-{
-	inv_dsc_t dsc;
-
-	dsc.lo = IEC_INV_DSC_LOW(iidx, im, g);
-	dsc.hi = IEC_INV_DSC_HIGH;
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-}
-
-/* queued invalidation interface -- global invalidate interrupt entry cache */
-static void
-qinv_iec_global(intel_iommu_state_t *iommu)
-{
-	qinv_iec_common(iommu, 0, 0, IEC_INV_GLOBAL);
-	qinv_wait_sync(iommu);
-}
-
-/* queued invalidation interface -- invalidate single interrupt entry cache */
-static void
-qinv_iec_single(intel_iommu_state_t *iommu, uint_t iidx)
-{
-	qinv_iec_common(iommu, iidx, 0, IEC_INV_INDEX);
-	qinv_wait_sync(iommu);
-}
-
-/* queued invalidation interface -- invalidate interrupt entry caches */
-static void
-qinv_iec(intel_iommu_state_t *iommu, uint_t iidx, uint_t cnt)
-{
-	uint_t	i, mask = 0;
-
-	ASSERT(cnt != 0);
-
-	/* requested interrupt count is not a power of 2 */
-	if (!ISP2(cnt)) {
-		for (i = 0; i < cnt; i++) {
-			qinv_iec_common(iommu, iidx + i, 0, IEC_INV_INDEX);
-		}
-		qinv_wait_sync(iommu);
-		return;
-	}
-
-	while ((2 << mask) < cnt) {
-		mask++;
-	}
-
-	if (mask > IOMMU_ECAP_GET_MHMV(iommu->iu_excapability)) {
-		for (i = 0; i < cnt; i++) {
-			qinv_iec_common(iommu, iidx + i, 0, IEC_INV_INDEX);
-		}
-		qinv_wait_sync(iommu);
-		return;
-	}
-
-	qinv_iec_common(iommu, iidx, mask, IEC_INV_INDEX);
-
-	qinv_wait_sync(iommu);
-}
-
-/*
- * alloc free entry from sync status table
- */
-static uint_t
-qinv_alloc_sync_mem_entry(intel_iommu_state_t *iommu)
-{
-	inv_queue_mem_t *sync_mem;
-	uint_t tail;
-
-	sync_mem = &iommu->iu_inv_queue->iq_sync;
-
-sync_mem_exhausted:
-	mutex_enter(&sync_mem->lock);
-	tail = sync_mem->tail;
-	sync_mem->tail++;
-	if (sync_mem->tail == sync_mem->size)
-		sync_mem->tail = 0;
-
-	if (sync_mem->head == sync_mem->tail) {
-		/* should never happen */
-		cmn_err(CE_WARN, "sync mem exhausted\n");
-		sync_mem->tail = tail;
-		mutex_exit(&sync_mem->lock);
-		delay(IOMMU_ALLOC_RESOURCE_DELAY);
-		goto sync_mem_exhausted;
-	}
-	mutex_exit(&sync_mem->lock);
-
-	return (tail);
-}
-
-/*
- * queued invalidation interface -- invalidation wait descriptor
- *   fence flag not set, need status data to indicate the invalidation
- *   wait descriptor completion
- */
-static void
-qinv_wait_async_unfence(intel_iommu_state_t *iommu, iotlb_pend_node_t *node)
-{
-	inv_dsc_t dsc;
-	inv_queue_mem_t *sync_mem;
-	uint64_t saddr;
-	uint_t tail;
-
-	sync_mem = &iommu->iu_inv_queue->iq_sync;
-	tail = qinv_alloc_sync_mem_entry(iommu);
-
-	/* plant an iotlb pending node */
-	iommu->iu_inv_queue->iotlb_pend_node[tail] = node;
-
-	saddr = sync_mem->paddr + tail * QINV_SYNC_DATA_SIZE;
-
-	/*
-	 * sdata = QINV_SYNC_DATA_UNFENCE, fence = 0, sw = 1, if = 0
-	 * indicate the invalidation wait descriptor completion by
-	 * performing a coherent DWORD write to the status address,
-	 * not by generating an invalidation completion event
-	 */
-	dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_UNFENCE, 0, 1, 0);
-	dsc.hi = INV_WAIT_DSC_HIGH(saddr);
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-}
-
-/*
- * queued invalidation interface -- invalidation wait descriptor
- *   fence flag set, indicate descriptors following the invalidation
- *   wait descriptor must be processed by hardware only after the
- *   invalidation wait descriptor completes.
- */
-static void
-qinv_wait_async_fence(intel_iommu_state_t *iommu)
-{
-	inv_dsc_t dsc;
-
-	/* sw = 0, fence = 1, iflag = 0 */
-	dsc.lo = INV_WAIT_DSC_LOW(0, 1, 0, 0);
-	dsc.hi = 0;
-	qinv_submit_inv_dsc(iommu, &dsc);
-}
-
-/*
- * queued invalidation interface -- invalidation wait descriptor
- *   wait until the invalidation request has finished
- */
-static void
-qinv_wait_sync(intel_iommu_state_t *iommu)
-{
-	inv_dsc_t dsc;
-	inv_queue_mem_t *sync_mem;
-	uint64_t saddr;
-	uint_t tail;
-	volatile uint32_t *status;
-
-	sync_mem = &iommu->iu_inv_queue->iq_sync;
-	tail = qinv_alloc_sync_mem_entry(iommu);
-	saddr = sync_mem->paddr + tail * QINV_SYNC_DATA_SIZE;
-	status = (uint32_t *)(sync_mem->vaddr + tail * QINV_SYNC_DATA_SIZE);
-
-	/*
-	 * sdata = QINV_SYNC_DATA_FENCE, fence = 1, sw = 1, if = 0
-	 * indicate the invalidation wait descriptor completion by
-	 * performing a coherent DWORD write to the status address,
-	 * not by generating an invalidation completion event
-	 */
-	dsc.lo = INV_WAIT_DSC_LOW(QINV_SYNC_DATA_FENCE, 1, 1, 0);
-	dsc.hi = INV_WAIT_DSC_HIGH(saddr);
-
-	qinv_submit_inv_dsc(iommu, &dsc);
-
-	while ((*status) != QINV_SYNC_DATA_FENCE)
-		iommu_cpu_nop();
-	*status = QINV_SYNC_DATA_UNFENCE;
-}
-
-/* get already completed invalidation wait requests */
-static int
-qinv_wait_async_finish(intel_iommu_state_t *iommu, int *cnt)
-{
-	inv_queue_mem_t *sync_mem;
-	int index;
-	volatile uint32_t *value;
-
-	ASSERT((*cnt) == 0);
-
-	sync_mem = &iommu->iu_inv_queue->iq_sync;
-
-	mutex_enter(&sync_mem->lock);
-	index = sync_mem->head;
-	value = (uint32_t *)(sync_mem->vaddr + index
-	    * QINV_SYNC_DATA_SIZE);
-	while (*value == QINV_SYNC_DATA_UNFENCE) {
-		*value = 0;
-		(*cnt)++;
-		sync_mem->head++;
-		if (sync_mem->head == sync_mem->size) {
-			sync_mem->head = 0;
-			value = (uint32_t *)(sync_mem->vaddr);
-		} else
-			value = (uint32_t *)((char *)value +
-			    QINV_SYNC_DATA_SIZE);
-	}
-
-	mutex_exit(&sync_mem->lock);
-	if ((*cnt) > 0)
-		return (index);
-	else
-		return (-1);
-}
-
-/*
- * queued invalidation interface
- *   function based context cache invalidation
- */
-static void
-qinv_cc_fsi(intel_iommu_state_t *iommu, uint8_t function_mask,
-    uint16_t source_id, uint_t domain_id)
-{
-	qinv_cc_common(iommu, function_mask, source_id,
-	    domain_id, CTT_INV_G_DEVICE);
-	qinv_wait_sync(iommu);
-}
-
-/*
- * queued invalidation interface
- *   domain based context cache invalidation
- */
-static void
-qinv_cc_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
-{
-	qinv_cc_common(iommu, 0, 0, domain_id, CTT_INV_G_DOMAIN);
-	qinv_wait_sync(iommu);
-}
-
-/*
- * queued invalidation interface
- *   invalidate the global context cache
- */
-static void
-qinv_cc_gbl(intel_iommu_state_t *iommu)
-{
-	qinv_cc_common(iommu, 0, 0, 0, CTT_INV_G_GLOBAL);
-	qinv_wait_sync(iommu);
-}
-
-/*
- * queued invalidation interface
- *   page based iotlb invalidation
- */
-static void
-qinv_iotlb_psi(intel_iommu_state_t *iommu, uint_t domain_id,
-	uint64_t dvma, uint_t count, uint_t hint)
-{
-	uint_t am = 0;
-	uint_t max_am;
-
-	max_am = IOMMU_CAP_GET_MAMV(iommu->iu_capability);
-
-	/* choose page specified invalidation */
-	if (IOMMU_CAP_GET_PSI(iommu->iu_capability)) {
-		while (am <= max_am) {
-			if ((ADDR_AM_OFFSET(IOMMU_BTOP(dvma), am) + count)
-			    <= ADDR_AM_MAX(am)) {
-				qinv_iotlb_common(iommu, domain_id,
-				    dvma, am, hint, TLB_INV_G_PAGE);
-				break;
-			}
-			am++;
-		}
-		if (am > max_am) {
-			qinv_iotlb_common(iommu, domain_id,
-			    dvma, 0, hint, TLB_INV_G_DOMAIN);
-		}
-
-	/* choose domain invalidation */
-	} else {
-		qinv_iotlb_common(iommu, domain_id, dvma,
-		    0, hint, TLB_INV_G_DOMAIN);
-	}
-}
-
-/*
- * queued invalidation interface
- *   domain based iotlb invalidation
- */
-static void
-qinv_iotlb_dsi(intel_iommu_state_t *iommu, uint_t domain_id)
-{
-	qinv_iotlb_common(iommu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN);
-	qinv_wait_sync(iommu);
-}
-
-/*
- * queued invalidation interface
- *    global iotlb invalidation
- */
-static void
-qinv_iotlb_gbl(intel_iommu_state_t *iommu)
-{
-	qinv_iotlb_common(iommu, 0, 0, 0, 0, TLB_INV_G_GLOBAL);
-	qinv_wait_sync(iommu);
-}
-
-/*
- * the plant wait operation for queued invalidation interface
- */
-static void
-qinv_plant_wait(intel_iommu_state_t *iommu, iommu_dvma_cookie_t *dcookies,
-		uint_t count, uint_t array_size)
-{
-	iotlb_pend_node_t *node = NULL;
-	iotlb_pend_head_t *head;
-
-	head = &(iommu->iu_pend_head);
-	mutex_enter(&(head->ich_mem_lock));
-	node = list_head(&(head->ich_mem_list));
-	if (node) {
-		list_remove(&(head->ich_mem_list), node);
-	}
-	mutex_exit(&(head->ich_mem_lock));
-
-	/* no cache, alloc one */
-	if (node == NULL) {
-		node = kmem_zalloc(sizeof (iotlb_pend_node_t), KM_SLEEP);
-	}
-	node->icn_dcookies = dcookies;
-	node->icn_count = count;
-	node->icn_array_size = array_size;
-
-	/* plant an invalidation wait descriptor, not wait its completion */
-	qinv_wait_async_unfence(iommu, node);
-}
-
-/*
- * the reap wait operation for queued invalidation interface
- */
-static void
-qinv_reap_wait(intel_iommu_state_t *iommu)
-{
-	int index, cnt = 0;
-	iotlb_pend_node_t *node;
-	iotlb_pend_head_t *head;
-
-	head = &(iommu->iu_pend_head);
-
-	index = qinv_wait_async_finish(iommu, &cnt);
-
-	while (cnt--) {
-		node = iommu->iu_inv_queue->iotlb_pend_node[index];
-		if (node == NULL)
-			continue;
-		dmar_release_dvma_cookie(node->icn_dcookies,
-		    node->icn_count, node->icn_array_size);
-
-		mutex_enter(&(head->ich_mem_lock));
-		list_insert_head(&(head->ich_mem_list), node);
-		mutex_exit(&(head->ich_mem_lock));
-		iommu->iu_inv_queue->iotlb_pend_node[index] = NULL;
-		index++;
-		if (index == iommu->iu_inv_queue->iq_sync.size)
-			index = 0;
-	}
-}
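/*
 * Plant/reap pattern: qinv_plant_wait() queues an unfenced invalidation
 * wait descriptor and records the DVMA cookies in an iotlb_pend_node_t
 * keyed by the status slot; qinv_reap_wait() later walks the completed
 * status slots and releases those cookies.  DVMA ranges are therefore not
 * recycled until the hardware has confirmed the corresponding IOTLB
 * invalidations.
 */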
-
-/* init interrupt remapping table */
-static int
-intr_remap_init_unit(intel_iommu_state_t *iommu)
-{
-	intr_remap_tbl_state_t *intr_remap_tbl;
-	size_t size;
-
-	ddi_dma_attr_t intrr_dma_attr = {
-		DMA_ATTR_V0,
-		0U,
-		0xffffffffU,
-		0xffffffffU,
-		MMU_PAGESIZE,	/* page aligned */
-		0x1,
-		0x1,
-		0xffffffffU,
-		0xffffffffU,
-		1,
-		4,
-		0
-	};
-
-	ddi_device_acc_attr_t intrr_acc_attr = {
-		DDI_DEVICE_ATTR_V0,
-		DDI_NEVERSWAP_ACC,
-		DDI_STRICTORDER_ACC
-	};
-
-	if (intrr_apic_mode == LOCAL_X2APIC) {
-		if (!IOMMU_ECAP_GET_EIM(iommu->iu_excapability)) {
-			return (DDI_FAILURE);
-		}
-	}
-
-	if (intrr_irta_s > INTRR_MAX_IRTA_SIZE) {
-		intrr_irta_s = INTRR_MAX_IRTA_SIZE;
-	}
-
-	intr_remap_tbl = (intr_remap_tbl_state_t *)
-	    kmem_zalloc(sizeof (intr_remap_tbl_state_t), KM_SLEEP);
-
-	if (ddi_dma_alloc_handle(iommu->iu_drhd->di_dip,
-	    &intrr_dma_attr,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(intr_remap_tbl->dma_hdl)) != DDI_SUCCESS) {
-		goto intrr_tbl_handle_failed;
-	}
-
-	intr_remap_tbl->size = 1 << (intrr_irta_s + 1);
-	size = intr_remap_tbl->size * INTRR_RTE_SIZE;
-	if (ddi_dma_mem_alloc(intr_remap_tbl->dma_hdl,
-	    size,
-	    &intrr_acc_attr,
-	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
-	    DDI_DMA_SLEEP,
-	    NULL,
-	    &(intr_remap_tbl->vaddr),
-	    &size,
-	    &(intr_remap_tbl->acc_hdl)) != DDI_SUCCESS) {
-		goto intrr_tbl_mem_failed;
-
-	}
-
-	ASSERT(!((uintptr_t)intr_remap_tbl->vaddr & MMU_PAGEOFFSET));
-	bzero(intr_remap_tbl->vaddr, size);
-	intr_remap_tbl->paddr = pfn_to_pa(
-	    hat_getpfnum(kas.a_hat, intr_remap_tbl->vaddr));
-
-	mutex_init(&(intr_remap_tbl->lock), NULL, MUTEX_DRIVER, NULL);
-	bitset_init(&intr_remap_tbl->map);
-	bitset_resize(&intr_remap_tbl->map, intr_remap_tbl->size);
-	intr_remap_tbl->free = 0;
-
-	iommu->iu_intr_remap_tbl = intr_remap_tbl;
-
-	return (DDI_SUCCESS);
-
-intrr_tbl_mem_failed:
-	ddi_dma_free_handle(&(intr_remap_tbl->dma_hdl));
-
-intrr_tbl_handle_failed:
-	kmem_free(intr_remap_tbl, sizeof (intr_remap_tbl_state_t));
-
-	return (ENOMEM);
-}
-
-/* destroy interrupt remapping table */
-static void
-intr_remap_fini_unit(intel_iommu_state_t *iommu)
-{
-	intr_remap_tbl_state_t *intr_remap_tbl;
-
-	intr_remap_tbl = iommu->iu_intr_remap_tbl;
-	bitset_fini(&intr_remap_tbl->map);
-	ddi_dma_mem_free(&(intr_remap_tbl->acc_hdl));
-	ddi_dma_free_handle(&(intr_remap_tbl->dma_hdl));
-	kmem_free(intr_remap_tbl, sizeof (intr_remap_tbl_state_t));
-}
-
-/* enable interrupt remapping hardware unit */
-static void
-intr_remap_enable_unit(intel_iommu_state_t *iommu)
-{
-	uint32_t status;
-	uint64_t irta_reg;
-	intr_remap_tbl_state_t *intr_remap_tbl;
-
-	intr_remap_tbl = iommu->iu_intr_remap_tbl;
-
-	irta_reg = intr_remap_tbl->paddr | intrr_irta_s;
-
-	if (intrr_apic_mode == LOCAL_X2APIC)
-		irta_reg |= (0x1 << 11);
-
-	/* set interrupt remap table pointer */
-	mutex_enter(&(iommu->iu_reg_lock));
-	iommu_put_reg64(iommu, IOMMU_REG_IRTAR,	irta_reg);
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_SIRTP);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_IRTPS), status);
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	/* global flush intr entry cache */
-	qinv_iec_global(iommu);
-
-	/* enable interrupt remapping */
-	mutex_enter(&(iommu->iu_reg_lock));
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_IRE);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_IRES),
-	    status);
-	iommu->iu_global_cmd_reg |= IOMMU_GCMD_IRE;
-
-	/* set compatible mode */
-	iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD,
-	    iommu->iu_global_cmd_reg | IOMMU_GCMD_CFI);
-	iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS,
-	    iommu_get_reg32, (status & IOMMU_GSTS_CFIS),
-	    status);
-	iommu->iu_global_cmd_reg |= IOMMU_GCMD_CFI;
-	mutex_exit(&(iommu->iu_reg_lock));
-
-	iommu->iu_enabled |= INTRR_ENABLE;
-}
-
-/*
- * helper function to find a free interrupt remapping
- * table entry
- */
-static uint_t
-bitset_find_free(bitset_t *b, uint_t post)
-{
-	uint_t	i;
-	uint_t	cap = bitset_capacity(b);
-
-	if (post == cap)
-		post = 0;
-
-	ASSERT(post < cap);
-
-	for (i = post; i < cap; i++) {
-		if (!bitset_in_set(b, i))
-			return (i);
-	}
-
-	for (i = 0; i < post; i++) {
-		if (!bitset_in_set(b, i))
-			return (i);
-	}
-
-	return (INTRR_IIDX_FULL);	/* no free index */
-}
-
-/*
- * helper function to find 'count' contiguous free
- * interrupt remapping table entries
- */
-static uint_t
-bitset_find_multi_free(bitset_t *b, uint_t post, uint_t count)
-{
-	uint_t  i, j;
-	uint_t	cap = bitset_capacity(b);
-
-	if (post == INTRR_IIDX_FULL) {
-		return (INTRR_IIDX_FULL);
-	}
-
-	if (count > cap)
-		return (INTRR_IIDX_FULL);
-
-	ASSERT(post < cap);
-
-	for (i = post; (i + count) <= cap; i++) {
-		for (j = 0; j < count; j++) {
-			if (bitset_in_set(b, (i + j))) {
-				i = i + j;
-				break;
-			}
-			if (j == count - 1)
-				return (i);
-		}
-	}
-
-	for (i = 0; (i < post) && ((i + count) <= cap); i++) {
-		for (j = 0; j < count; j++) {
-			if (bitset_in_set(b, (i + j))) {
-				i = i + j;
-				break;
-			}
-			if (j == count - 1)
-				return (i);
-		}
-	}
-
-	return (INTRR_IIDX_FULL);  		/* no free index */
-}
-
-/* alloc one interrupt remapping table entry */
-static int
-intrr_tbl_alloc_entry(intr_remap_tbl_state_t *intr_remap_tbl)
-{
-	uint32_t iidx;
-
-retry_alloc_iidx:
-	mutex_enter(&intr_remap_tbl->lock);
-	iidx = intr_remap_tbl->free;
-	if (iidx == INTRR_IIDX_FULL) {
-		/* no free intr entry, use compatible format intr */
-		mutex_exit(&intr_remap_tbl->lock);
-		if (intrr_apic_mode == LOCAL_X2APIC) {
-			/*
-			 * x2apic mode does not allow compatible
-			 * format interrupts
-			 */
-			delay(IOMMU_ALLOC_RESOURCE_DELAY);
-			goto retry_alloc_iidx;
-		}
-	} else {
-		bitset_add(&intr_remap_tbl->map, iidx);
-		intr_remap_tbl->free = bitset_find_free(&intr_remap_tbl->map,
-		    iidx + 1);
-		mutex_exit(&intr_remap_tbl->lock);
-	}
-
-	return (iidx);
-}
-
-/* alloc 'cnt' contiguous interrupt remapping table entries */
-static int
-intrr_tbl_alloc_multi_entries(intr_remap_tbl_state_t *intr_remap_tbl,
-    uint_t cnt)
-{
-	uint_t iidx, pos, i;
-
-retry_alloc_iidxs:
-	mutex_enter(&intr_remap_tbl->lock);
-	pos = intr_remap_tbl->free;
-	iidx = bitset_find_multi_free(&intr_remap_tbl->map, pos, cnt);
-	if (iidx != INTRR_IIDX_FULL) {
-		if (iidx <= pos && pos < (iidx + cnt)) {
-			intr_remap_tbl->free = bitset_find_free(
-			    &intr_remap_tbl->map, iidx + cnt);
-		}
-		for (i = 0; i < cnt; i++) {
-			bitset_add(&intr_remap_tbl->map, iidx + i);
-		}
-		mutex_exit(&intr_remap_tbl->lock);
-	} else {
-		mutex_exit(&intr_remap_tbl->lock);
-		if (intrr_apic_mode == LOCAL_X2APIC) {
-			/* x2apic mode does not allow compatible interrupts */
-			delay(IOMMU_ALLOC_RESOURCE_DELAY);
-			goto retry_alloc_iidxs;
-		}
-	}
-
-	return (iidx);
-}
-
-/* get ioapic source id and iommu structure for ioapics */
-static void
-get_ioapic_iommu_info(void)
-{
-	ioapic_drhd_info_t *ioapic_dinfo;
-	uint_t i;
-
-	for_each_in_list(&ioapic_drhd_infos, ioapic_dinfo) {
-		for (i = 0; i < MAX_IO_APIC; i++) {
-			if (ioapic_dinfo->ioapic_id == apic_io_id[i]) {
-				ioapic_iommu_infos[i] = kmem_zalloc(
-				    sizeof (ioapic_iommu_info_t), KM_SLEEP);
-				ioapic_iommu_infos[i]->sid = ioapic_dinfo->sid;
-				ioapic_iommu_infos[i]->iommu =
-				    (intel_iommu_state_t *)
-				    ioapic_dinfo->drhd->di_iommu;
-				break;
-			}
-		}
-	}
-}
-
-/* initialize interrupt remapping */
-static int
-intr_remap_init(int apic_mode)
-{
-	intel_iommu_state_t *iommu;
-	int intrr_all_disable = 1;
-
-	intrr_apic_mode = apic_mode;
-
-	for_each_in_list(&iommu_states, iommu) {
-		if ((iommu->iu_enabled & QINV_ENABLE) &&
-		    IOMMU_ECAP_GET_IR(iommu->iu_excapability)) {
-			if (intr_remap_init_unit(iommu) == DDI_SUCCESS) {
-				intrr_all_disable = 0;
-			}
-		}
-	}
-
-	if (intrr_all_disable) {
-		/*
-		 * if all drhd units have intr remapping disabled,
-		 * return FAILURE
-		 */
-		return (DDI_FAILURE);
-	} else {
-		return (DDI_SUCCESS);
-	}
-}
-
-/* enable interrupt remapping */
-static void
-intr_remap_enable(int suppress_brdcst_eoi)
-{
-	intel_iommu_state_t *iommu;
-
-	intrr_suppress_brdcst_eoi = suppress_brdcst_eoi;
-
-	for_each_in_list(&iommu_states, iommu) {
-		if (iommu->iu_intr_remap_tbl)
-			intr_remap_enable_unit(iommu);
-	}
-
-	/* get iommu structure and interrupt source id for ioapic */
-	get_ioapic_iommu_info();
-}
-
-/* alloc remapping entry for the interrupt */
-static void
-intr_remap_alloc_entry(apic_irq_t *irq_ptr)
-{
-	intel_iommu_state_t	*iommu;
-	intr_remap_tbl_state_t *intr_remap_tbl;
-	uint32_t		iidx, cnt, i;
-	uint_t			vector, irqno;
-	uint32_t		sid_svt_sq;
-
-	if (AIRQ_PRIVATE(irq_ptr) == INTRR_DISABLE ||
-	    AIRQ_PRIVATE(irq_ptr) != NULL) {
-		return;
-	}
-
-	AIRQ_PRIVATE(irq_ptr) =
-	    kmem_zalloc(sizeof (intr_remap_private_t), KM_SLEEP);
-
-	intr_remap_get_iommu(irq_ptr);
-
-	iommu = INTRR_PRIVATE(irq_ptr)->ir_iommu;
-	if (iommu == NULL) {
-		goto intr_remap_disable;
-	}
-
-	intr_remap_tbl = iommu->iu_intr_remap_tbl;
-
-	if (irq_ptr->airq_mps_intr_index == MSI_INDEX) {
-		cnt = irq_ptr->airq_intin_no;
-	} else {
-		cnt = 1;
-	}
-
-	if (cnt == 1) {
-		iidx = intrr_tbl_alloc_entry(intr_remap_tbl);
-	} else {
-		iidx = intrr_tbl_alloc_multi_entries(intr_remap_tbl, cnt);
-	}
-
-	if (iidx == INTRR_IIDX_FULL) {
-		goto intr_remap_disable;
-	}
-
-	INTRR_PRIVATE(irq_ptr)->ir_iidx = iidx;
-
-	intr_remap_get_sid(irq_ptr);
-
-	if (cnt == 1) {
-		if (IOMMU_CAP_GET_CM(iommu->iu_capability)) {
-			qinv_iec_single(iommu, iidx);
-		} else {
-			iommu->iu_dmar_ops->do_flwb(iommu);
-		}
-		return;
-	}
-
-	sid_svt_sq = INTRR_PRIVATE(irq_ptr)->ir_sid_svt_sq;
-
-	vector = irq_ptr->airq_vector;
-
-	for (i = 1; i < cnt; i++) {
-		irqno = apic_vector_to_irq[vector + i];
-		irq_ptr = apic_irq_table[irqno];
-
-		ASSERT(irq_ptr);
-
-		AIRQ_PRIVATE(irq_ptr) =
-		    kmem_zalloc(sizeof (intr_remap_private_t), KM_SLEEP);
-
-		INTRR_PRIVATE(irq_ptr)->ir_iommu = iommu;
-		INTRR_PRIVATE(irq_ptr)->ir_sid_svt_sq = sid_svt_sq;
-		INTRR_PRIVATE(irq_ptr)->ir_iidx = iidx + i;
-	}
-
-	if (IOMMU_CAP_GET_CM(iommu->iu_capability)) {
-		qinv_iec(iommu, iidx, cnt);
-	} else {
-		iommu->iu_dmar_ops->do_flwb(iommu);
-	}
-
-	return;
-
-intr_remap_disable:
-	kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof (intr_remap_private_t));
-	AIRQ_PRIVATE(irq_ptr) = INTRR_DISABLE;
-}
-
-/* helper function to get iommu structure */
-static void intr_remap_get_iommu(apic_irq_t *irq_ptr)
-{
-	intel_iommu_state_t	*iommu = NULL;
-
-	ASSERT(INTRR_PRIVATE(irq_ptr)->ir_iommu == NULL);
-
-	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
-		/* for fixed interrupt */
-		uint_t ioapic_index = irq_ptr->airq_ioapicindex;
-		if (ioapic_iommu_infos[ioapic_index])
-			iommu = ioapic_iommu_infos[ioapic_index]->iommu;
-	} else {
-		if (irq_ptr->airq_dip != NULL) {
-			iommu = iommu_get_dmar(irq_ptr->airq_dip);
-		}
-	}
-
-	if ((iommu != NULL) && (iommu->iu_enabled & INTRR_ENABLE)) {
-		INTRR_PRIVATE(irq_ptr)->ir_iommu = iommu;
-	}
-}
-
-/* helper function to get interrupt request source id */
-static void
-intr_remap_get_sid(apic_irq_t *irq_ptr)
-{
-	dev_info_t	*dip, *pdip;
-	iommu_private_t	*private;
-	uint16_t	sid;
-	uchar_t		svt, sq;
-
-	if (!intrr_enable_sid_verify) {
-		return;
-	}
-
-	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
-		/* for interrupt through I/O APIC */
-		uint_t ioapic_index = irq_ptr->airq_ioapicindex;
-
-		sid = ioapic_iommu_infos[ioapic_index]->sid;
-
-		svt = SVT_ALL_VERIFY;
-		sq = SQ_VERIFY_ALL;
-	} else {
-		/* MSI/MSI-X interrupt */
-		dip = irq_ptr->airq_dip;
-		ASSERT(dip);
-		pdip = iommu_get_pci_top_bridge(dip);
-		if (pdip == NULL) {
-			/* pcie device */
-			private = DEVI(dip)->devi_iommu_private;
-			ASSERT(private);
-			sid = (private->idp_bus << 8) | private->idp_devfn;
-			svt = SVT_ALL_VERIFY;
-			sq = SQ_VERIFY_ALL;
-		} else {
-			private = DEVI(pdip)->devi_iommu_private;
-			ASSERT(private);
-
-			if (private->idp_bbp_type == IOMMU_PPB_PCIE_PCI) {
-				/* device behind pcie to pci bridge */
-				sid = (private->idp_bus << 8) | \
-				    private->idp_sec;
-				svt = SVT_BUS_VERIFY;
-				sq = SQ_VERIFY_ALL;
-			} else {
-				/* device behind pci to pci bridge */
-				sid = (private->idp_bus << 8) | \
-				    private->idp_devfn;
-				svt = SVT_ALL_VERIFY;
-				sq = SQ_VERIFY_ALL;
-			}
-		}
-	}
-
-	INTRR_PRIVATE(irq_ptr)->ir_sid_svt_sq = sid | (svt << 18) | (sq << 16);
-}
-
-/* remapping the interrupt */
-static void
-intr_remap_map_entry(apic_irq_t *irq_ptr, void *intr_data)
-{
-	intel_iommu_state_t	*iommu;
-	intr_remap_tbl_state_t	*intr_remap_tbl;
-	ioapic_rdt_t	*irdt = (ioapic_rdt_t *)intr_data;
-	msi_regs_t	*mregs = (msi_regs_t *)intr_data;
-	intr_rte_t	irte;
-	uint_t		iidx, i, cnt;
-	uint32_t	dst, sid_svt_sq;
-	uchar_t		vector, dlm, tm, rh, dm;
-
-	if (AIRQ_PRIVATE(irq_ptr) == INTRR_DISABLE) {
-		return;
-	}
-
-	if (irq_ptr->airq_mps_intr_index == MSI_INDEX) {
-		cnt = irq_ptr->airq_intin_no;
-	} else {
-		cnt = 1;
-	}
-
-	iidx = INTRR_PRIVATE(irq_ptr)->ir_iidx;
-	iommu = INTRR_PRIVATE(irq_ptr)->ir_iommu;
-	intr_remap_tbl = iommu->iu_intr_remap_tbl;
-	sid_svt_sq = INTRR_PRIVATE(irq_ptr)->ir_sid_svt_sq;
-	vector = irq_ptr->airq_vector;
-
-	if (!APIC_IS_MSI_OR_MSIX_INDEX(irq_ptr->airq_mps_intr_index)) {
-		dm = RDT_DM(irdt->ir_lo);
-		rh = 0;
-		tm = RDT_TM(irdt->ir_lo);
-		dlm = RDT_DLM(irdt->ir_lo);
-		dst = irdt->ir_hi;
-
-		/*
-		 * Mark the IRTE's TM as Edge to suppress broadcast EOI.
-		 */
-		if (intrr_suppress_brdcst_eoi) {
-			tm = TRIGGER_MODE_EDGE;
-		}
-	} else {
-		dm = MSI_ADDR_DM_PHYSICAL;
-		rh = MSI_ADDR_RH_FIXED;
-		tm = TRIGGER_MODE_EDGE;
-		dlm = 0;
-		dst = mregs->mr_addr;
-	}
-
-	if (intrr_apic_mode == LOCAL_APIC)
-		dst = (dst & 0xFF) << 8;
-
-	if (cnt == 1) {
-		irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1);
-		irte.hi = IRTE_HIGH(sid_svt_sq);
-
-		/* set interrupt remapping table entry */
-		bcopy(&irte, intr_remap_tbl->vaddr +
-		    iidx * INTRR_RTE_SIZE,
-		    INTRR_RTE_SIZE);
-
-		qinv_iec_single(iommu, iidx);
-
-	} else {
-		vector = irq_ptr->airq_vector;
-		for (i = 0; i < cnt; i++) {
-			irte.lo = IRTE_LOW(dst, vector, dlm, tm, rh, dm, 0, 1);
-			irte.hi = IRTE_HIGH(sid_svt_sq);
-
-			/* set interrupt remapping table entry */
-			bcopy(&irte, intr_remap_tbl->vaddr +
-			    iidx * INTRR_RTE_SIZE,
-			    INTRR_RTE_SIZE);
-			vector++;
-			iidx++;
-		}
-
-		qinv_iec(iommu, iidx, cnt);
-	}
-}
-
-/* free the remapping entry */
-static void
-intr_remap_free_entry(apic_irq_t *irq_ptr)
-{
-	intel_iommu_state_t *iommu;
-	intr_remap_tbl_state_t *intr_remap_tbl;
-	uint32_t iidx;
-
-	if (AIRQ_PRIVATE(irq_ptr) == INTRR_DISABLE) {
-		AIRQ_PRIVATE(irq_ptr) = NULL;
-		return;
-	}
-
-	iommu = INTRR_PRIVATE(irq_ptr)->ir_iommu;
-	intr_remap_tbl = iommu->iu_intr_remap_tbl;
-	iidx = INTRR_PRIVATE(irq_ptr)->ir_iidx;
-
-	bzero(intr_remap_tbl->vaddr + iidx * INTRR_RTE_SIZE,
-	    INTRR_RTE_SIZE);
-
-	qinv_iec_single(iommu, iidx);
-
-	mutex_enter(&intr_remap_tbl->lock);
-	bitset_del(&intr_remap_tbl->map, iidx);
-	if (intr_remap_tbl->free == INTRR_IIDX_FULL) {
-		intr_remap_tbl->free = iidx;
-	}
-	mutex_exit(&intr_remap_tbl->lock);
-
-	kmem_free(AIRQ_PRIVATE(irq_ptr), sizeof (intr_remap_private_t));
-	AIRQ_PRIVATE(irq_ptr) = NULL;
-}
-
-/* record the ioapic rdt entry */
-static void
-intr_remap_record_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt)
-{
-	uint32_t rdt_entry, tm, pol, iidx, vector;
-
-	rdt_entry = irdt->ir_lo;
-
-	if (INTRR_PRIVATE(irq_ptr) != NULL) {
-		iidx = INTRR_PRIVATE(irq_ptr)->ir_iidx;
-		tm = RDT_TM(rdt_entry);
-		pol = RDT_POL(rdt_entry);
-		vector = irq_ptr->airq_vector;
-		irdt->ir_lo = (tm << INTRR_IOAPIC_TM_SHIFT) |
-		    (pol << INTRR_IOAPIC_POL_SHIFT) |
-		    ((iidx >> 15) << INTRR_IOAPIC_IIDX15_SHIFT) |
-		    vector;
-		irdt->ir_hi = (iidx << INTRR_IOAPIC_IIDX_SHIFT) |
-		    (1 << INTRR_IOAPIC_FORMAT_SHIFT);
-	} else {
-		irdt->ir_hi <<= APIC_ID_BIT_OFFSET;
-	}
-}
-
-/* record the msi interrupt structure */
-/*ARGSUSED*/
-static void
-intr_remap_record_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs)
-{
-	uint_t	iidx;
-
-	if (INTRR_PRIVATE(irq_ptr) != NULL) {
-		iidx = INTRR_PRIVATE(irq_ptr)->ir_iidx;
-
-		mregs->mr_data = 0;
-		mregs->mr_addr = MSI_ADDR_HDR |
-		    ((iidx & 0x7fff) << INTRR_MSI_IIDX_SHIFT) |
-		    (1 << INTRR_MSI_FORMAT_SHIFT) | (1 << INTRR_MSI_SHV_SHIFT) |
-		    ((iidx >> 15) << INTRR_MSI_IIDX15_SHIFT);
-	} else {
-		mregs->mr_addr = MSI_ADDR_HDR |
-		    (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) |
-		    (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) |
-		    (mregs->mr_addr << MSI_ADDR_DEST_SHIFT);
-		mregs->mr_data = (MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) |
-		    mregs->mr_data;
-	}
-}
--- a/usr/src/uts/i86pc/io/iommu_rscs.c	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,392 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-
-#include <sys/conf.h>
-#include <sys/autoconf.h>
-#include <sys/sysmacros.h>
-#include <sys/debug.h>
-#include <sys/psw.h>
-#include <sys/ddidmareq.h>
-#include <sys/kmem.h>
-#include <sys/cmn_err.h>
-#include <vm/seg.h>
-#include <vm/seg_kmem.h>
-#include <vm/seg_kpm.h>
-#include <vm/seg_dev.h>
-#include <sys/vmem.h>
-#include <vm/hat.h>
-#include <vm/as.h>
-#include <vm/page.h>
-#include <sys/avintr.h>
-#include <sys/errno.h>
-#include <sys/modctl.h>
-#include <sys/ddi_impldefs.h>
-#include <sys/sunddi.h>
-#include <sys/sunndi.h>
-#include <sys/mach_intr.h>
-#include <vm/hat_i86.h>
-#include <sys/machsystm.h>
-#include <sys/iommu_rscs.h>
-#include <sys/intel_iommu.h>
-
-ddi_dma_attr_t page_dma_attr = {
-	DMA_ATTR_V0,
-	0U,
-	0xffffffffU,
-	0xffffffffU,
-	MMU_PAGESIZE, /* page aligned */
-	0x1,
-	0x1,
-	0xffffffffU,
-	0xffffffffU,
-	1,
-	4,
-	0
-};
-
-ddi_device_acc_attr_t page_acc_attr = {
-	DDI_DEVICE_ATTR_V0,
-	DDI_NEVERSWAP_ACC,
-	DDI_STRICTORDER_ACC
-};
-
-typedef struct iommu_rscs_s {
-	/*
-	 * Bounds of resource allocation. We will start allocating at rs_min
-	 * and rollover at rs_max+1 (rs_max is included). e.g. for rs_min=0
-	 * and rs_max=7, we will have 8 total resources which can be alloced.
-	 */
-	uint_t rs_min;
-	uint_t rs_max;
-
-	/*
-	 * rs_free points to an array of 64-bit values used to track resource
-	 * allocation. rs_free_size is the free buffer size in bytes.
-	 */
-	uint64_t *rs_free;
-	uint_t rs_free_size;
-
-	/*
-	 * last tracks the last alloc'd resource. This allows us to do a round
-	 * robin allocation.
-	 */
-	uint_t rs_last;
-
-	kmutex_t rs_mutex;
-} iommu_rscs_state_t;
-
-static uint_t
-iommu_pghdl_hash_func(paddr_t paddr)
-{
-	return (paddr % IOMMU_PGHDL_HASH_SIZE);
-}
-
-/*
- * iommu_page_alloc()
- *
- */
-iommu_pghdl_t *
-iommu_page_alloc(intel_iommu_state_t *iommu, int kmflag)
-{
-	size_t actual_size = 0;
-	iommu_pghdl_t *pghdl;
-	caddr_t vaddr;
-	uint_t idx;
-
-	ASSERT(kmflag == KM_SLEEP || kmflag == KM_NOSLEEP);
-
-	pghdl = kmem_zalloc(sizeof (*pghdl), kmflag);
-	if (pghdl == NULL) {
-		return (0);
-	}
-
-	if (ddi_dma_alloc_handle(ddi_root_node(), &page_dma_attr, DDI_DMA_SLEEP,
-	    NULL, &pghdl->dma_hdl) != DDI_SUCCESS) {
-		kmem_free(pghdl, sizeof (*pghdl));
-		return (0);
-	}
-
-	if (ddi_dma_mem_alloc(pghdl->dma_hdl, PAGESIZE, &page_acc_attr,
-	    DDI_DMA_CONSISTENT | IOMEM_DATA_UNCACHED,
-	    (kmflag == KM_SLEEP) ? DDI_DMA_SLEEP : DDI_DMA_DONTWAIT,
-	    NULL, &vaddr, &actual_size, &pghdl->mem_hdl) != DDI_SUCCESS) {
-		ddi_dma_free_handle(&pghdl->dma_hdl);
-		kmem_free(pghdl, sizeof (*pghdl));
-		return (0);
-	}
-
-	ASSERT(actual_size == PAGESIZE);
-
-	if (actual_size != PAGESIZE) {
-		ddi_dma_mem_free(&pghdl->mem_hdl);
-		ddi_dma_free_handle(&pghdl->dma_hdl);
-		kmem_free(pghdl, sizeof (*pghdl));
-		return (0);
-
-	}
-
-	pghdl->paddr = pfn_to_pa(hat_getpfnum(kas.a_hat, vaddr));
-	pghdl->vaddr = vaddr;
-
-	idx = iommu_pghdl_hash_func(pghdl->paddr);
-	pghdl->next = iommu->iu_pghdl_hash[idx];
-	if (pghdl->next)
-		pghdl->next->prev = pghdl;
-	iommu->iu_pghdl_hash[idx] = pghdl;
-
-	return (pghdl);
-}
-
-/*
- * iommu_page_free()
- */
-void
-iommu_page_free(intel_iommu_state_t *iommu, paddr_t paddr)
-{
-	uint_t idx;
-	iommu_pghdl_t *pghdl;
-
-	idx = iommu_pghdl_hash_func(paddr);
-	pghdl = iommu->iu_pghdl_hash[idx];
-	while (pghdl && pghdl->paddr != paddr)
-		pghdl = pghdl->next;
-	if (pghdl == NULL) {
-		cmn_err(CE_PANIC,
-		    "Freeing a free IOMMU page: paddr=0x%" PRIx64,
-		    paddr);
-		/*NOTREACHED*/
-	}
-	if (pghdl->prev == NULL)
-		iommu->iu_pghdl_hash[idx] = pghdl->next;
-	else
-		pghdl->prev->next = pghdl->next;
-	if (pghdl->next)
-		pghdl->next->prev = pghdl->prev;
-
-	ddi_dma_mem_free(&pghdl->mem_hdl);
-	ddi_dma_free_handle(&pghdl->dma_hdl);
-	kmem_free(pghdl, sizeof (*pghdl));
-}
-
-/*
- * iommu_get_vaddr()
- */
-caddr_t
-iommu_get_vaddr(intel_iommu_state_t *iommu, paddr_t paddr)
-{
-	uint_t idx;
-	iommu_pghdl_t *pghdl;
-
-	idx = iommu_pghdl_hash_func(paddr);
-	pghdl = iommu->iu_pghdl_hash[idx];
-	while (pghdl && pghdl->paddr != paddr)
-		pghdl = pghdl->next;
-	if (pghdl == NULL) {
-		return (0);
-	}
-	return (pghdl->vaddr);
-}
-
-
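
The iommu_page_alloc()/iommu_page_free()/iommu_get_vaddr() routines above keep page handles on a small hash table keyed by physical address (paddr % IOMMU_PGHDL_HASH_SIZE), with doubly linked chains so a handle can be unlinked without rescanning its bucket. A minimal standalone sketch of that insert/lookup/remove pattern follows; the kernel allocators are replaced with calloc()/free() and the panic on a double free is reduced to an early return.

/*
 * Sketch of the paddr-keyed handle hash used above. The table size and
 * field names are modeled on the code above; not the driver's code.
 */
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define	IOMMU_PGHDL_HASH_SIZE	64

typedef struct pghdl {
	uint64_t paddr;
	void *vaddr;
	struct pghdl *next;
	struct pghdl *prev;
} pghdl_t;

static pghdl_t *pghdl_hash[IOMMU_PGHDL_HASH_SIZE];

static unsigned
hash_func(uint64_t paddr)
{
	return (paddr % IOMMU_PGHDL_HASH_SIZE);
}

static pghdl_t *
pghdl_insert(uint64_t paddr, void *vaddr)
{
	unsigned idx = hash_func(paddr);
	pghdl_t *p = calloc(1, sizeof (*p));

	if (p == NULL)
		return (NULL);
	p->paddr = paddr;
	p->vaddr = vaddr;
	p->next = pghdl_hash[idx];	/* push on the chain head */
	if (p->next != NULL)
		p->next->prev = p;
	pghdl_hash[idx] = p;
	return (p);
}

static pghdl_t *
pghdl_lookup(uint64_t paddr)
{
	pghdl_t *p = pghdl_hash[hash_func(paddr)];

	while (p != NULL && p->paddr != paddr)
		p = p->next;
	return (p);
}

static void
pghdl_remove(uint64_t paddr)
{
	pghdl_t *p = pghdl_lookup(paddr);

	if (p == NULL)
		return;			/* the driver panics here instead */
	if (p->prev == NULL)
		pghdl_hash[hash_func(paddr)] = p->next;
	else
		p->prev->next = p->next;
	if (p->next != NULL)
		p->next->prev = p->prev;
	free(p);
}

int
main(void)
{
	static char page[4096];

	(void) pghdl_insert(0x8379c000ULL, page);
	printf("vaddr %p\n", pghdl_lookup(0x8379c000ULL)->vaddr);
	pghdl_remove(0x8379c000ULL);
	printf("after free: %p\n", (void *)pghdl_lookup(0x8379c000ULL));
	return (0);
}
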
-/*
- * iommu_rscs_init()
- *    Initialize the resource structure. init() returns a handle to be
- *    used for the rest of the resource functions. This code is written assuming
- *    that min_val will be close to 0. Therefore, we will allocate the free
- *    buffer only taking max_val into account.
- */
-void
-iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle)
-{
-	iommu_rscs_state_t *rstruct;
-	uint_t array_size;
-	uint_t index;
-
-
-	ASSERT(handle != NULL);
-	ASSERT(min_val < max_val);
-
-	/* alloc space for resource structure */
-	rstruct = kmem_alloc(sizeof (iommu_rscs_state_t), KM_SLEEP);
-
-	/*
-	 * Test to see if the max value is 64-bit aligned. If so, we don't need
-	 * to allocate an extra 64-bit word. alloc space for free buffer
-	 * (8 bytes per uint64_t).
-	 */
-	if ((max_val & 0x3F) == 0) {
-		rstruct->rs_free_size = (max_val >> 6) * 8;
-	} else {
-		rstruct->rs_free_size = ((max_val >> 6) + 1) * 8;
-	}
-	rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP);
-
-	/* Initialize resource structure */
-	rstruct->rs_min = min_val;
-	rstruct->rs_last = min_val;
-	rstruct->rs_max = max_val;
-	mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER, NULL);
-
-	/* Mark all resources as free */
-	array_size = rstruct->rs_free_size >> 3;
-	for (index = 0; index < array_size; index++) {
-		rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF;
-	}
-
-	/* setup handle which is returned from this function */
-	*handle = rstruct;
-}
-
-
-/*
- * iommu_rscs_fini()
- *    Frees up the space allocated in init().  Notice that a pointer to the
- *    handle is used for the parameter.  fini() will set the handle to NULL
- *    before returning.
- */
-void
-iommu_rscs_fini(iommu_rscs_t *handle)
-{
-	iommu_rscs_state_t *rstruct;
-
-
-	ASSERT(handle != NULL);
-
-	rstruct = (iommu_rscs_state_t *)*handle;
-
-	mutex_destroy(&rstruct->rs_mutex);
-	kmem_free(rstruct->rs_free, rstruct->rs_free_size);
-	kmem_free(rstruct, sizeof (iommu_rscs_state_t));
-
-	/* set handle to null.  This helps catch bugs. */
-	*handle = NULL;
-}
-
-
-/*
- * iommu_rscs_alloc()
- *    alloc a resource. If alloc fails, we are out of resources.
- */
-int
-iommu_rscs_alloc(iommu_rscs_t handle, uint_t *resource)
-{
-	iommu_rscs_state_t *rstruct;
-	uint_t array_idx;
-	uint64_t free;
-	uint_t index;
-	uint_t last;
-	uint_t min;
-	uint_t max;
-
-
-	ASSERT(handle != NULL);
-	ASSERT(resource != NULL);
-
-	rstruct = (iommu_rscs_state_t *)handle;
-
-	mutex_enter(&rstruct->rs_mutex);
-	min = rstruct->rs_min;
-	max = rstruct->rs_max;
-
-	/*
-	 * Find a free resource. This will return out of the loop once it finds
-	 * a free resource. There are a total of 'max'-'min'+1 resources.
-	 * Performs a round robin allocation.
-	 */
-	for (index = min; index <= max; index++) {
-
-		array_idx = rstruct->rs_last >> 6;
-		free = rstruct->rs_free[array_idx];
-		last = rstruct->rs_last & 0x3F;
-
-		/* if the next resource to check is free */
-		if ((free & ((uint64_t)1 << last)) != 0) {
-			/* we are using this resource */
-			*resource = rstruct->rs_last;
-
-			/* take it out of the free list */
-			rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last);
-
-			/*
-			 * increment the last count so we start checking the
-			 * next resource on the next alloc().  Note the rollover
-			 * at 'max'+1.
-			 */
-			rstruct->rs_last++;
-			if (rstruct->rs_last > max) {
-				rstruct->rs_last = rstruct->rs_min;
-			}
-
-			/* unlock the resource structure */
-			mutex_exit(&rstruct->rs_mutex);
-
-			return (DDI_SUCCESS);
-		}
-
-		/*
-		 * This resource is not free, lets go to the next one. Note the
-		 * rollover at 'max'.
-		 */
-		rstruct->rs_last++;
-		if (rstruct->rs_last > max) {
-			rstruct->rs_last = rstruct->rs_min;
-		}
-	}
-
-	mutex_exit(&rstruct->rs_mutex);
-
-	return (DDI_FAILURE);
-}
-
-
-/*
- * iommu_rscs_free()
- *    Free the previously alloc'd resource.  Once a resource has been free'd,
- *    it can be used again when alloc is called.
- */
-void
-iommu_rscs_free(iommu_rscs_t handle, uint_t resource)
-{
-	iommu_rscs_state_t *rstruct;
-	uint_t array_idx;
-	uint_t offset;
-
-
-	ASSERT(handle != NULL);
-
-	rstruct = (iommu_rscs_state_t *)handle;
-	ASSERT(resource >= rstruct->rs_min);
-	ASSERT(resource <= rstruct->rs_max);
-
-	mutex_enter(&rstruct->rs_mutex);
-
-	/* Put the resource back in the free list */
-	array_idx = resource >> 6;
-	offset = resource & 0x3F;
-	rstruct->rs_free[array_idx] |= ((uint64_t)1 << offset);
-
-	mutex_exit(&rstruct->rs_mutex);
-}
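
The iommu_rscs_init()/iommu_rscs_alloc()/iommu_rscs_free() routines above implement a simple round-robin allocator over a uint64_t bitmap: a set bit means free, rs_last remembers where the previous search stopped, and an allocation scans at most max-min+1 slots before failing. The standalone sketch below reproduces that scheme under the same assumption the comment states (min_val close to 0, free-buffer size derived from max_val only); the mutex and the DDI return codes are omitted.

/*
 * Standalone sketch of the round-robin bitmap allocator above;
 * locking and DDI_SUCCESS/DDI_FAILURE are omitted.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

typedef struct rscs {
	unsigned min, max, last;
	uint64_t *free;		/* one bit per resource, 1 == free */
	size_t nwords;
} rscs_t;

static void
rscs_init(rscs_t *rs, unsigned min, unsigned max)
{
	rs->min = min;
	rs->max = max;
	rs->last = min;
	/* one uint64_t per 64 resources, rounded up (cf. the code above) */
	rs->nwords = (max >> 6) + (((max & 0x3F) == 0) ? 0 : 1);
	rs->free = malloc(rs->nwords * sizeof (uint64_t));
	memset(rs->free, 0xFF, rs->nwords * sizeof (uint64_t));
}

static int
rscs_alloc(rscs_t *rs, unsigned *resource)
{
	unsigned i;

	for (i = rs->min; i <= rs->max; i++) {
		unsigned idx = rs->last >> 6;
		unsigned bit = rs->last & 0x3F;
		int found = (rs->free[idx] & ((uint64_t)1 << bit)) != 0;

		if (found) {
			*resource = rs->last;
			rs->free[idx] &= ~((uint64_t)1 << bit);
		}
		/* advance with rollover at max+1 (round robin) */
		rs->last = (rs->last >= rs->max) ? rs->min : rs->last + 1;
		if (found)
			return (0);
	}
	return (-1);			/* out of resources */
}

static void
rscs_free(rscs_t *rs, unsigned resource)
{
	rs->free[resource >> 6] |= ((uint64_t)1 << (resource & 0x3F));
}

int
main(void)
{
	rscs_t rs;
	unsigned r;

	rscs_init(&rs, 0, 7);
	while (rscs_alloc(&rs, &r) == 0)
		printf("allocated %u\n", r);	/* prints 0..7, then fails */
	rscs_free(&rs, 3);
	(void) rscs_alloc(&rs, &r);
	printf("after free: %u\n", r);		/* 3 again */
	return (0);
}
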
--- a/usr/src/uts/i86pc/io/mp_platform_common.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/io/mp_platform_common.c	Sat Jan 30 18:23:16 2010 -0800
@@ -1759,7 +1759,7 @@
 	}
 
 #if !defined(__xpv)
-	apic_vt_ops->apic_intrr_free_entry(irqptr);
+	apic_vt_ops->apic_intrmap_free_entry(irqptr);
 #endif
 
 	/*
@@ -2966,10 +2966,10 @@
 #if !defined(__xpv)
 			irdt.ir_hi = AV_TOALL >> APIC_ID_BIT_OFFSET;
 
-			apic_vt_ops->apic_intrr_alloc_entry(irq_ptr);
-			apic_vt_ops->apic_intrr_map_entry(
+			apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr);
+			apic_vt_ops->apic_intrmap_map_entry(
 			    irq_ptr, (void *)&irdt);
-			apic_vt_ops->apic_intrr_record_rdt(irq_ptr, &irdt);
+			apic_vt_ops->apic_intrmap_record_rdt(irq_ptr, &irdt);
 
 			/* Write the RDT entry -- no specific CPU binding */
 			WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
@@ -3010,9 +3010,9 @@
 		irdt.ir_hi = cpu_infop->aci_local_id;
 
 #if !defined(__xpv)
-		apic_vt_ops->apic_intrr_alloc_entry(irq_ptr);
-		apic_vt_ops->apic_intrr_map_entry(irq_ptr, (void *)&irdt);
-		apic_vt_ops->apic_intrr_record_rdt(irq_ptr, &irdt);
+		apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr);
+		apic_vt_ops->apic_intrmap_map_entry(irq_ptr, (void *)&irdt);
+		apic_vt_ops->apic_intrmap_record_rdt(irq_ptr, &irdt);
 
 		/* Write the RDT entry -- bind to a specific CPU: */
 		WRITE_IOAPIC_RDT_ENTRY_HIGH_DWORD(ioapicindex, intin_no,
--- a/usr/src/uts/i86pc/io/pcplusmp/apic.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic.c	Sat Jan 30 18:23:16 2010 -0800
@@ -108,7 +108,7 @@
 static void	apic_timer_enable(void);
 static void	apic_timer_disable(void);
 static void	apic_post_cyclic_setup(void *arg);
-static void	apic_intrr_init(int apic_mode);
+static void	apic_intrmap_init(int apic_mode);
 static void	apic_record_ioapic_rdt(apic_irq_t *irq_ptr, ioapic_rdt_t *irdt);
 static void	apic_record_msi(apic_irq_t *irq_ptr, msi_regs_t *mregs);
 
@@ -366,7 +366,7 @@
 uint32_t	apic_divide_reg_init = 0;	/* 0 - divide by 2 */
 
 /* default apic ops without interrupt remapping */
-static apic_intrr_ops_t apic_nointrr_ops = {
+static apic_intrmap_ops_t apic_nointrmap_ops = {
 	(int (*)(int))return_instr,
 	(void (*)(int))return_instr,
 	(void (*)(apic_irq_t *))return_instr,
@@ -376,7 +376,7 @@
 	apic_record_msi,
 };
 
-apic_intrr_ops_t *apic_vt_ops = &apic_nointrr_ops;
+apic_intrmap_ops_t *apic_vt_ops = &apic_nointrmap_ops;
 
 /*
  *	This is the loadable module wrapper
@@ -759,7 +759,7 @@
 	 * Initialize and enable interrupt remapping before apic
 	 * hardware initialization
 	 */
-	apic_intrr_init(apic_mode);
+	apic_intrmap_init(apic_mode);
 
 	/*
 	 * On UniSys Model 6520, the BIOS leaves vector 0x20 isr
@@ -2591,7 +2591,7 @@
 }
 
 static void
-apic_intrr_init(int apic_mode)
+apic_intrmap_init(int apic_mode)
 {
 	int suppress_brdcst_eoi = 0;
 
@@ -2602,8 +2602,9 @@
 		 * documentation (yet)), initialize interrupt remapping
 		 * support before initializing the X2APIC unit.
 		 */
-		if (((apic_intrr_ops_t *)psm_vt_ops)->apic_intrr_init(apic_mode)
-		    == DDI_SUCCESS) {
+		if (((apic_intrmap_ops_t *)psm_vt_ops)->
+		    apic_intrmap_init(apic_mode) == DDI_SUCCESS) {
+
 			apic_vt_ops = psm_vt_ops;
 
 			/*
@@ -2615,7 +2616,7 @@
 				suppress_brdcst_eoi = 1;
 			}
 
-			apic_vt_ops->apic_intrr_enable(suppress_brdcst_eoi);
+			apic_vt_ops->apic_intrmap_enable(suppress_brdcst_eoi);
 
 			if (apic_detect_x2apic()) {
 				apic_enable_x2apic();
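
apic_intrmap_init() above swaps the default no-op apic_nointrmap_ops table for the platform's psm_vt_ops only if the remapping driver's init routine reports success, so every caller keeps dispatching through apic_vt_ops-> without caring whether remapping is active. Below is a minimal sketch of that ops-vector pattern; the type and function names are hypothetical and the bodies are placeholders, not the pcplusmp code.

/*
 * Sketch of the ops-vector pattern used by apic_vt_ops above: a table
 * of function pointers defaults to no-ops and is swapped for the real
 * implementation only when its init routine succeeds.
 */
#include <stdio.h>

typedef struct intrmap_ops {
	int  (*init)(int mode);
	void (*alloc_entry)(int irq);
	void (*free_entry)(int irq);
} intrmap_ops_t;

static int  noop_init(int mode) { (void) mode; return (-1); }
static void noop_entry(int irq) { (void) irq; }

static intrmap_ops_t noop_ops = { noop_init, noop_entry, noop_entry };

static int  real_init(int mode) { (void) mode; return (0); }
static void real_alloc(int irq) { printf("remap alloc for irq %d\n", irq); }
static void real_free(int irq)  { printf("remap free for irq %d\n", irq); }

static intrmap_ops_t real_ops = { real_init, real_alloc, real_free };

/* all callers dispatch through this pointer, like apic_vt_ops */
static intrmap_ops_t *vt_ops = &noop_ops;

int
main(void)
{
	/* swap in the real table only if its init succeeds */
	if (real_ops.init(0) == 0)
		vt_ops = &real_ops;

	vt_ops->alloc_entry(9);
	vt_ops->free_entry(9);
	return (0);
}
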
--- a/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/io/pcplusmp/apic_introp.c	Sat Jan 30 18:23:16 2010 -0800
@@ -94,9 +94,9 @@
 	msi_regs.mr_data = vector;
 	msi_regs.mr_addr = target_apic_id;
 
-	apic_vt_ops->apic_intrr_alloc_entry(irq_ptr);
-	apic_vt_ops->apic_intrr_map_entry(irq_ptr, (void *)&msi_regs);
-	apic_vt_ops->apic_intrr_record_msi(irq_ptr, &msi_regs);
+	apic_vt_ops->apic_intrmap_alloc_entry(irq_ptr);
+	apic_vt_ops->apic_intrmap_map_entry(irq_ptr, (void *)&msi_regs);
+	apic_vt_ops->apic_intrmap_record_msi(irq_ptr, &msi_regs);
 
 	/* MSI Address */
 	msi_addr = msi_regs.mr_addr;
--- a/usr/src/uts/i86pc/io/rootnex.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/io/rootnex.c	Sat Jan 30 18:23:16 2010 -0800
@@ -67,8 +67,10 @@
 #include <sys/hypervisor.h>
 #include <sys/bootconf.h>
 #include <vm/kboot_mmu.h>
-#else
-#include <sys/intel_iommu.h>
+#endif
+
+#if defined(__amd64) && !defined(__xpv)
+#include <sys/immu.h>
 #endif
 
 
@@ -90,6 +92,8 @@
 int rootnex_sync_check_parms = 0;
 #endif
 
+boolean_t rootnex_dmar_not_setup;
+
 /* Master Abort and Target Abort panic flag */
 int rootnex_fm_ma_ta_panic_flag = 0;
 
@@ -220,7 +224,7 @@
     ddi_dma_cookie_t *cookiep, uint_t *ccountp);
 static int rootnex_coredma_unbindhdl(dev_info_t *dip, dev_info_t *rdip,
     ddi_dma_handle_t handle);
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 static void rootnex_coredma_reset_cookies(dev_info_t *dip,
     ddi_dma_handle_t handle);
 static int rootnex_coredma_get_cookies(dev_info_t *dip, ddi_dma_handle_t handle,
@@ -271,6 +275,7 @@
 
 static int rootnex_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
 static int rootnex_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int rootnex_quiesce(dev_info_t *dip);
 
 static struct dev_ops rootnex_ops = {
 	DEVO_REV,
@@ -284,7 +289,7 @@
 	&rootnex_cb_ops,
 	&rootnex_bus_ops,
 	NULL,
-	ddi_quiesce_not_needed,		/* quiesce */
+	rootnex_quiesce,		/* quiesce */
 };
 
 static struct modldrv rootnex_modldrv = {
@@ -299,7 +304,7 @@
 	NULL
 };
 
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 static iommulib_nexops_t iommulib_nexops = {
 	IOMMU_NEXOPS_VERSION,
 	"Rootnex IOMMU ops Vers 1.1",
@@ -437,7 +442,11 @@
 	case DDI_ATTACH:
 		break;
 	case DDI_RESUME:
+#if defined(__amd64) && !defined(__xpv)
+		return (immu_unquiesce());
+#else
 		return (DDI_SUCCESS);
+#endif
 	default:
 		return (DDI_FAILURE);
 	}
@@ -453,7 +462,6 @@
 	rootnex_state->r_err_ibc = (ddi_iblock_cookie_t)ipltospl(15);
 	rootnex_state->r_reserved_msg_printed = B_FALSE;
 	rootnex_cnt = &rootnex_state->r_counters[0];
-	rootnex_state->r_intel_iommu_enabled = B_FALSE;
 
 	/*
 	 * Set minimum fm capability level for i86pc platforms and then
@@ -481,21 +489,7 @@
 	/* Initialize rootnex event handle */
 	i_ddi_rootnex_init_events(dip);
 
-#if !defined(__xpv)
-#if defined(__amd64)
-	/* probe intel iommu */
-	intel_iommu_probe_and_parse();
-
-	/* attach the iommu nodes */
-	if (intel_iommu_support) {
-		if (intel_iommu_attach_dmar_nodes() == DDI_SUCCESS) {
-			rootnex_state->r_intel_iommu_enabled = B_TRUE;
-		} else {
-			intel_iommu_release_dmar_info();
-		}
-	}
-#endif
-
+#if defined(__amd64) && !defined(__xpv)
 	e = iommulib_nexus_register(dip, &iommulib_nexops,
 	    &rootnex_state->r_iommulib_handle);
 
@@ -516,12 +510,16 @@
 {
 	switch (cmd) {
 	case DDI_SUSPEND:
-		break;
+#if defined(__amd64) && !defined(__xpv)
+		return (immu_quiesce());
+#else
+		return (DDI_SUCCESS);
+#endif
 	default:
 		return (DDI_FAILURE);
 	}
-
-	return (DDI_SUCCESS);
+	/*NOTREACHED*/
+
 }
 
 
@@ -1746,7 +1744,7 @@
 rootnex_dma_allochdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_attr_t *attr,
     int (*waitfp)(caddr_t), caddr_t arg, ddi_dma_handle_t *handlep)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	uint_t error = ENOTSUP;
 	int retval;
 
@@ -1806,7 +1804,7 @@
 static int
 rootnex_dma_freehdl(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	if (IOMMU_USED(rdip)) {
 		return (iommulib_nexdma_freehdl(dip, rdip, handle));
 	}
@@ -1814,7 +1812,6 @@
 	return (rootnex_coredma_freehdl(dip, rdip, handle));
 }
 
-
 /*ARGSUSED*/
 static int
 rootnex_coredma_bindhdl(dev_info_t *dip, dev_info_t *rdip,
@@ -1828,7 +1825,6 @@
 	int kmflag;
 	int e;
 
-
 	hp = (ddi_dma_impl_t *)handle;
 	dma = (rootnex_dma_t *)hp->dmai_private;
 	sinfo = &dma->dp_sglinfo;
@@ -1879,36 +1875,25 @@
 	/* save away the original bind info */
 	dma->dp_dma = dmareq->dmar_object;
 
-#if !defined(__xpv)
-	if (rootnex_state->r_intel_iommu_enabled) {
-		e = intel_iommu_map_sgl(handle, dmareq,
-		    rootnex_state->r_prealloc_cookies);
-
-		switch (e) {
-		case IOMMU_SGL_SUCCESS:
-			goto rootnex_sgl_end;
-
-		case IOMMU_SGL_DISABLE:
-			goto rootnex_sgl_start;
-
-		case IOMMU_SGL_NORESOURCES:
-			cmn_err(CE_WARN, "iommu map sgl failed for %s",
-			    ddi_node_name(dma->dp_dip));
-			rootnex_clean_dmahdl(hp);
-			return (DDI_DMA_NORESOURCES);
-
-		default:
-			cmn_err(CE_WARN,
-			    "undefined value returned from"
-			    " intel_iommu_map_sgl: %d",
-			    e);
-			rootnex_clean_dmahdl(hp);
-			return (DDI_DMA_NORESOURCES);
-		}
+#if defined(__amd64) && !defined(__xpv)
+	e = immu_map_sgl(hp, dmareq, rootnex_prealloc_cookies, rdip);
+	switch (e) {
+	case DDI_DMA_MAPPED:
+		goto out;
+	case DDI_DMA_USE_PHYSICAL:
+		break;
+	case DDI_DMA_PARTIAL:
+		ddi_err(DER_PANIC, rdip, "Partial DVMA map");
+		e = DDI_DMA_NORESOURCES;
+		/*FALLTHROUGH*/
+	default:
+		ddi_err(DER_MODE, rdip, "DVMA map failed");
+		ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_BIND_FAIL]);
+		rootnex_clean_dmahdl(hp);
+		return (e);
 	}
 #endif
 
-rootnex_sgl_start:
 	/*
 	 * Figure out a rough estimate of what maximum number of pages this
 	 * buffer could use (a high estimate of course).
@@ -1963,15 +1948,15 @@
 
 	/*
 	 * Get the real sgl. rootnex_get_sgl will fill in cookie array while
-	 * looking at the contraints in the dma structure. It will then put some
-	 * additional state about the sgl in the dma struct (i.e. is the sgl
-	 * clean, or do we need to do some munging; how many pages need to be
-	 * copied, etc.)
+	 * looking at the constraints in the dma structure. It will then put
+	 * some additional state about the sgl in the dma struct (i.e. is
+	 * the sgl clean, or do we need to do some munging; how many pages
+	 * need to be copied, etc.)
 	 */
 	rootnex_get_sgl(&dmareq->dmar_object, dma->dp_cookies,
 	    &dma->dp_sglinfo);
 
-rootnex_sgl_end:
+out:
 	ASSERT(sinfo->si_sgl_size <= sinfo->si_max_pages);
 	/* if we don't need a copy buffer, we don't need to sync */
 	if (sinfo->si_copybuf_req == 0) {
@@ -2008,11 +1993,12 @@
 		*ccountp = sinfo->si_sgl_size;
 		hp->dmai_cookie++;
 		hp->dmai_rflags &= ~DDI_DMA_PARTIAL;
-		hp->dmai_nwin = 1;
-		ROOTNEX_DPROF_INC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]);
-		ROOTNEX_DPROBE3(rootnex__bind__fast, dev_info_t *, rdip,
-		    uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS], uint_t,
-		    dma->dp_dma.dmao_size);
+		ROOTNEX_PROF_INC(&rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS]);
+		DTRACE_PROBE3(rootnex__bind__fast, dev_info_t *, rdip,
+		    uint64_t, rootnex_cnt[ROOTNEX_CNT_ACTIVE_BINDS],
+		    uint_t, dma->dp_dma.dmao_size);
+
+
 		return (DDI_DMA_MAPPED);
 	}
 
@@ -2055,6 +2041,7 @@
 	if (e == DDI_DMA_MAPPED) {
 		hp->dmai_rflags &= ~DDI_DMA_PARTIAL;
 		*ccountp = sinfo->si_sgl_size;
+		hp->dmai_nwin = 1;
 	} else {
 		hp->dmai_rflags |= DDI_DMA_PARTIAL;
 		*ccountp = dma->dp_window[dma->dp_current_win].wd_cookie_cnt;
@@ -2070,7 +2057,6 @@
 	return (e);
 }
 
-
 /*
  * rootnex_dma_bindhdl()
  *    called from ddi_dma_addr_bind_handle() and ddi_dma_buf_bind_handle().
@@ -2080,7 +2066,7 @@
     ddi_dma_handle_t handle, struct ddi_dma_req *dmareq,
     ddi_dma_cookie_t *cookiep, uint_t *ccountp)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	if (IOMMU_USED(rdip)) {
 		return (iommulib_nexdma_bindhdl(dip, rdip, handle, dmareq,
 		    cookiep, ccountp));
@@ -2090,6 +2076,8 @@
 	    cookiep, ccountp));
 }
 
+
+
 /*ARGSUSED*/
 static int
 rootnex_coredma_unbindhdl(dev_info_t *dip, dev_info_t *rdip,
@@ -2136,12 +2124,13 @@
 	rootnex_teardown_copybuf(dma);
 	rootnex_teardown_windows(dma);
 
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	/*
-	 * If intel iommu enabled, clean up the page tables and free the dvma
+	 * Clean up the page tables and free the dvma
 	 */
-	if (rootnex_state->r_intel_iommu_enabled) {
-		intel_iommu_unmap_sgl(handle);
+	e = immu_unmap_sgl(hp, rdip);
+	if (e != DDI_DMA_USE_PHYSICAL && e != DDI_SUCCESS) {
+		return (e);
 	}
 #endif
 
@@ -2178,7 +2167,7 @@
 rootnex_dma_unbindhdl(dev_info_t *dip, dev_info_t *rdip,
     ddi_dma_handle_t handle)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	if (IOMMU_USED(rdip)) {
 		return (iommulib_nexdma_unbindhdl(dip, rdip, handle));
 	}
@@ -2186,7 +2175,7 @@
 	return (rootnex_coredma_unbindhdl(dip, rdip, handle));
 }
 
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 
 static int
 rootnex_coredma_get_sleep_flags(ddi_dma_handle_t handle)
@@ -2491,7 +2480,6 @@
 	return (DDI_SUCCESS);
 }
 
-
 /*
  * rootnex_valid_bind_parms()
  *    Called in ddi_dma_*_bind_handle path to validate its parameters.
@@ -2794,7 +2782,6 @@
 	}
 }
 
-
 /*
  * rootnex_bind_slowpath()
  *    Call in the bind path if the calling driver can't use the sgl without
@@ -4229,7 +4216,7 @@
 rootnex_dma_sync(dev_info_t *dip, dev_info_t *rdip, ddi_dma_handle_t handle,
     off_t off, size_t len, uint_t cache_flags)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	if (IOMMU_USED(rdip)) {
 		return (iommulib_nexdma_sync(dip, rdip, handle, off, len,
 		    cache_flags));
@@ -4516,7 +4503,7 @@
     uint_t win, off_t *offp, size_t *lenp, ddi_dma_cookie_t *cookiep,
     uint_t *ccountp)
 {
-#if !defined(__xpv)
+#if defined(__amd64) && !defined(__xpv)
 	if (IOMMU_USED(rdip)) {
 		return (iommulib_nexdma_win(dip, rdip, handle, win, offp, lenp,
 		    cookiep, ccountp));
@@ -4916,8 +4903,8 @@
 			end_addr = start_addr + csize;
 
 			/*
-			 * if the faulted address is within the physical address
-			 * range of the cookie, return DDI_FM_NONFATAL.
+			 * if the faulted address is within the physical
+			 * address of the cookie, return DDI_FM_NONFATAL.
 			 */
 			if ((fault_addr >= start_addr) &&
 			    (fault_addr <= end_addr)) {
@@ -4929,3 +4916,34 @@
 	/* fault_addr not within this DMA handle */
 	return (DDI_FM_UNKNOWN);
 }
+
+/*ARGSUSED*/
+static int
+rootnex_quiesce(dev_info_t *dip)
+{
+#if defined(__amd64) && !defined(__xpv)
+	return (immu_quiesce());
+#else
+	return (DDI_SUCCESS);
+#endif
+}
+
+#if defined(__xpv)
+void
+immu_init(void)
+{
+	;
+}
+
+void
+immu_startup(void)
+{
+	;
+}
+/*ARGSUSED*/
+void
+immu_physmem_update(uint64_t addr, uint64_t size)
+{
+	;
+}
+#endif
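
Throughout rootnex.c the DMA entry points now follow one shape: on amd64 non-xpv builds, if IOMMU_USED(rdip) the call is forwarded to the iommulib nexops, otherwise it falls through to the rootnex_coredma_* implementation; on xpv builds the immu_* entry points collapse to the empty stubs above. The sketch below shows only that dispatch shape; the names are hypothetical stand-ins for the DDI types, and the real code tests IOMMU_USED(rdip) and calls iommulib_nexdma_*().

/*
 * Sketch of the forward-or-fall-through dispatch used by the rootnex
 * DMA entry points above. Not driver code; names are invented.
 */
#include <stdio.h>

typedef struct devnode { int uses_iommu; const char *name; } devnode_t;

static int
iommulib_bindhdl(devnode_t *d) { printf("%s: iommu bind\n", d->name); return (0); }

static int
coredma_bindhdl(devnode_t *d) { printf("%s: core bind\n", d->name); return (0); }

static int
dma_bindhdl(devnode_t *d)
{
#if defined(__amd64) && !defined(__xpv)		/* same guard as above */
	if (d->uses_iommu)
		return (iommulib_bindhdl(d));
#endif
	return (coredma_bindhdl(d));
}

int
main(void)
{
	devnode_t nic = { 1, "nic" }, disk = { 0, "disk" };

	(void) dma_bindhdl(&nic);
	(void) dma_bindhdl(&disk);
	return (0);
}
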
--- a/usr/src/uts/i86pc/os/ddi_impl.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/os/ddi_impl.c	Sat Jan 30 18:23:16 2010 -0800
@@ -101,6 +101,10 @@
 
 static int kmem_override_cache_attrs(caddr_t, size_t, uint_t);
 
+#if defined(__amd64) && !defined(__xpv)
+extern void immu_init(void);
+#endif
+
 #define	CTGENTRIES	15
 
 static struct ctgas {
@@ -202,6 +206,18 @@
 	/* reprogram devices not set up by firmware (BIOS) */
 	impl_bus_reprobe();
 
+#if defined(__amd64) && !defined(__xpv)
+	/*
+	 * Set up, but do not start, the IOMMU.
+	 * Startup happens later via a direct call
+	 * to IOMMU code by boot code.
+	 * At this point, all PCI bus renumbering
+	 * is done, so it is safe to init the IMMU,
+	 * a.k.a. the Intel IOMMU.
+	 */
+	immu_init();
+#endif
+
 	/*
 	 * attach the isa nexus to get ACPI resource usage
 	 * isa is "kind of" a pseudo node
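
The comment above, together with the immu_startup() call added to startup.c further down, describes a two-phase bring-up: immu_init() only discovers and sets up state once PCI bus renumbering is done, and immu_startup() enables translation later, just before interrupts are enabled. The sketch below illustrates that split; the flag names mirror the immu_setup/immu_running globals declared in immu.h, but the bodies are placeholders, not the real implementation.

/*
 * Sketch of the two-phase setup/startup split described above.
 */
#include <stdio.h>

static int immu_setup_done;
static int immu_running;

static void
immu_init(void)
{
	/* parse the DMAR table, set up per-unit state; do not enable */
	immu_setup_done = 1;
	printf("immu: setup complete\n");
}

static void
immu_startup(void)
{
	if (!immu_setup_done)
		return;			/* nothing to start */
	/* program the root table, enable translation on each unit */
	immu_running = 1;
	printf("immu: translation enabled\n");
}

int
main(void)
{
	immu_init();	/* called from ddi_impl.c after PCI renumbering */
	immu_startup();	/* called from startup.c before enabling interrupts */
	return (0);
}
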
--- a/usr/src/uts/i86pc/os/fakebop.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/os/fakebop.c	Sat Jan 30 18:23:16 2010 -0800
@@ -58,7 +58,6 @@
 #endif
 #include <vm/kboot_mmu.h>
 #include <vm/hat_pte.h>
-#include <sys/dmar_acpi.h>
 #include <sys/kobj.h>
 #include <sys/kobj_lex.h>
 #include <sys/pci_cfgspace_impl.h>
@@ -2225,13 +2224,6 @@
 	    tp->number * tp->number);
 }
 
-static void
-process_dmar(struct dmar *tp)
-{
-	bsetprop(DMAR_TABLE_PROPNAME, strlen(DMAR_TABLE_PROPNAME),
-	    tp, tp->hdr.len);
-}
-
 #else /* __xpv */
 static void
 enumerate_xen_cpus()
@@ -2274,8 +2266,6 @@
 	if (slit_ptr = (struct slit *)find_fw_table("SLIT"))
 		process_slit(slit_ptr);
 
-	if (tp = find_fw_table("DMAR"))
-		process_dmar((struct dmar *)tp);
 	tp = find_fw_table("MCFG");
 #else /* __xpv */
 	enumerate_xen_cpus();
--- a/usr/src/uts/i86pc/os/startup.c	Sat Jan 30 15:04:39 2010 -0800

+++ b/usr/src/uts/i86pc/os/startup.c	Sat Jan 30 18:23:16 2010 -0800
@@ -151,6 +151,10 @@
 
 void *gfx_devinfo_list;
 
+#if defined(__amd64) && !defined(__xpv)
+extern void immu_startup(void);
+#endif
+
 /*
  * XXX make declaration below "static" when drivers no longer use this
  * interface.
@@ -171,10 +175,6 @@
 static void startup_end(void);
 static void layout_kernel_va(void);
 
-#if !defined(__xpv)
-void (*rootnex_iommu_init)(void) = NULL;
-#endif
-
 /*
  * Declare these as initialized data so we can patch them.
  */
@@ -2137,11 +2137,14 @@
 	xs_domu_init();
 #endif
 
-#if !defined(__xpv)
-	if (rootnex_iommu_init != NULL) {
-		rootnex_iommu_init();
-	}
+#if defined(__amd64) && !defined(__xpv)
+	/*
+	 * The Intel IOMMU has been set up/initialized in ddi_impl.c.
+	 * Start it up now.
+	 */
+	immu_startup();
 #endif
+
 	PRM_POINT("Enabling interrupts");
 	(*picinitf)();
 	sti();
--- a/usr/src/uts/i86pc/rootnex/Makefile	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/rootnex/Makefile	Sat Jan 30 18:23:16 2010 -0800
@@ -20,7 +20,7 @@
 #
 #
 # uts/i86pc/rootnex/Makefile
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #	This makefile drives the production of the rootnex driver
@@ -85,7 +85,7 @@
 
 modlintlib:	$(MODLINTLIB_DEPS)
 
-clean.lint:	$(CLEAN_LINT_DEPS)
+clean.lint: 	$(CLEAN_DEPS)
 
 install:	$(INSTALL_DEPS) $(CONF_INSTALL_DEPS)
 
--- a/usr/src/uts/i86pc/sys/apic.h	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/sys/apic.h	Sat Jan 30 18:23:16 2010 -0800
@@ -486,7 +486,7 @@
 	uint_t	airq_busy;		/* How frequently did clock find */
 					/* us in this */
 	struct apic_irq *airq_next;	/* chain of intpts sharing a vector */
-	void		*airq_intrr_private; /* intr remap private data */
+	void		*airq_intrmap_private; /* intr remap private data */
 } apic_irq_t;
 
 #define	IRQ_USER_BOUND	0x80000000 /* user requested bind if set in airq_cpu */
@@ -556,15 +556,15 @@
 /*
  * APIC ops to support intel interrupt remapping
  */
-typedef struct apic_intrr_ops {
-	int	(*apic_intrr_init)(int);
-	void	(*apic_intrr_enable)(int);
-	void	(*apic_intrr_alloc_entry)(apic_irq_t *);
-	void	(*apic_intrr_map_entry)(apic_irq_t *, void *);
-	void	(*apic_intrr_free_entry)(apic_irq_t *);
-	void	(*apic_intrr_record_rdt)(apic_irq_t *, ioapic_rdt_t *);
-	void	(*apic_intrr_record_msi)(apic_irq_t *, msi_regs_t *);
-} apic_intrr_ops_t;
+typedef struct apic_intrmap_ops {
+	int	(*apic_intrmap_init)(int);
+	void	(*apic_intrmap_enable)(int);
+	void	(*apic_intrmap_alloc_entry)(apic_irq_t *);
+	void	(*apic_intrmap_map_entry)(apic_irq_t *, void *);
+	void	(*apic_intrmap_free_entry)(apic_irq_t *);
+	void	(*apic_intrmap_record_rdt)(apic_irq_t *, ioapic_rdt_t *);
+	void	(*apic_intrmap_record_msi)(apic_irq_t *, msi_regs_t *);
+} apic_intrmap_ops_t;
 
 /*
  * Various poweroff methods and ports & bits for them
@@ -862,7 +862,7 @@
 extern void apic_set_directed_EOI_handler();
 extern int apic_directed_EOI_supported();
 
-extern apic_intrr_ops_t *apic_vt_ops;
+extern apic_intrmap_ops_t *apic_vt_ops;
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/i86pc/sys/dmar_acpi.h	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,236 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Portions Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2008, Intel Corporation.
- * All rights reserved.
- */
-
-#ifndef _SYS_DMAR_ACPI_H
-#define	_SYS_DMAR_ACPI_H
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-#define	DMAR_TABLE_PROPNAME	"dmar-table"
-
-#define	DMAR_UNIT_TYPE_DRHD	0
-#define	DMAR_UNIT_TYPE_RMRR	1
-#define	DMAR_UNIT_TYPE_ATSR	2
-
-#define	DEV_SCOPE_ENDPOINT	1
-#define	DEV_SCOPE_P2P		2
-#define	DEV_SCOPE_IOAPIC	3
-#define	DEV_SCOPE_HPET		4
-
-#define	INCLUDE_PCI_ALL		0x01
-#define	DMAR_MAX_SEGMENT	1
-
-#define	IOMMU_PAGE_SIZE_4K	(1UL << 12)
-#define	IOMMU_REG_SIZE		(1UL << 12)
-#define	PARSE_DMAR_SUCCESS	1
-#define	PARSE_DMAR_FAIL		0
-
-#define	for_each_in_list(list, node) \
-	for (node = list_head(list); node != NULL; \
-	    node = list_next(list, node))
-
-/*
- * The following structures describe the format of the
- * DMAR ACPI table. They are used to parse the
- * DMAR ACPI table.
- *
- * Read the spec for the meaning of each member.
- */
-
-/* DMAR ACPI table header */
-typedef struct dmar_acpi_head {
-	char		dh_sig[4];
-	uint32_t	dh_len;
-	uint8_t		dh_rev;
-	uint8_t		dh_checksum;
-	char		dh_oemid[6];
-	char		dh_oemtblid[8];
-	uint32_t	dh_oemrev;
-	char		dh_asl[4];
-	uint32_t	dh_aslrev;
-	uint8_t		dh_haw;
-	uint8_t		dh_flags;
-	uint8_t		dh_reserved[10];
-} dmar_acpi_head_t;
-
-/* Remapping structure header */
-typedef struct dmar_acpi_unit_head {
-	uint16_t	uh_type;
-	uint16_t	uh_length;
-} dmar_acpi_unit_head_t;
-
-/* DRHD unit structure */
-typedef struct dmar_acpi_drhd {
-	dmar_acpi_unit_head_t	dr_header;
-	uint8_t			dr_flags;
-	uint8_t			dr_reserved;
-	uint16_t		dr_segment;
-	uint64_t		dr_baseaddr;
-} dmar_acpi_drhd_t;
-
-/* Device scope structure */
-typedef struct dmar_acpi_dev_scope {
-	uint8_t		ds_type;
-	uint8_t		ds_length;
-	uint8_t		ds_reserved[2];
-	uint8_t		ds_enumid;
-	uint8_t		ds_sbusnum;
-} dmar_acpi_dev_scope_t;
-
-/* RMRR unit structure */
-typedef struct dmar_acpi_rmrr {
-	dmar_acpi_unit_head_t	rm_header;
-	uint8_t			rm_reserved[2];
-	uint16_t		rm_segment;
-	uint64_t		rm_baseaddr;
-	uint64_t		rm_limiaddr;
-} dmar_acpi_rmrr_t;
-
-/*
- * The following structures describes kernel recorded
- * information about the DRHD and RMRR.
- */
-
-/*
- * DRHD information structure
- *
- * node           - the drhd info structure is inserted in the
- *                  list embedded in the intel_dmar_info
- * di_segment     - the pci segment associated with this drhd
- * di_reg_base    - base address of the register set, the size
- *                  of this set is 4K
- * di_include_all - is it an include_all unit
- * di_dev_list    - the dev_info list obtained from the device scope;
- *                  each node of this list is a pci_dev_info_t,
- *                  which represents a single pci device
- * di_dip         - pointer to the dev_info for this drhd in the
- *                  device tree
- * di_iommu	  - link to the iommu state structure
- */
-typedef struct drhd_info {
-	list_node_t 	node;
-	uint16_t 	di_segment;
-	uint64_t 	di_reg_base;
-	boolean_t	di_include_all;
-	list_t 		di_dev_list;
-	dev_info_t	*di_dip;
-	void		*di_iommu;
-} drhd_info_t;
-
-/*
- * RMRR information structure
- *
- * node        - the rmrr info structure is inserted in the
- *               list embedded in the intel_dmar_info
- * ri_segment  - the pci segment associated with this rmrr
- * ri_baseaddr - the low address of the reserved range
- * ri_limiaddr - the high address of the reserved range
- * ri_dev_list - the dev_info list obtained from the device scope;
- *               each node of this list is a pci_dev_info_t,
- *               which represents a single pci device
- */
-typedef struct rmrr_info {
-	list_node_t	node;
-	list_node_t	node4states;
-	uint16_t	ri_segment;
-	uint64_t	ri_baseaddr;
-	uint64_t	ri_limiaddr;
-	list_t		ri_dev_list;
-} rmrr_info_t;
-
-/*
- * Intel IOMMU information structure
- *
- * dmari_haw        - haw (host address width) indicates the max-
- *                    imum DMA physical addressability by this
- *                    platform.
- * dmari_intr_remap - does this platform support intr remapping
- * dmari_drhd       - the list array of drhd units with the
- *                    segment number as the index into this array
- * dmari_rmrr       - list array for the rmrr
- */
-typedef struct intel_dmar_info {
-	uint8_t		dmari_haw;
-	boolean_t	dmari_intr_remap;
-	list_t		dmari_drhd[DMAR_MAX_SEGMENT];
-	list_t		dmari_rmrr[DMAR_MAX_SEGMENT];
-} intel_dmar_info_t;
-
-/*
- * The pci device node in the dev_list of drhd_info and
- * rmrr_info
- *
- * node		  - list node
- * bus, dev, func - bus, device and function number of
- *		  - this pci device
- * pdi_type	  - type of this device, includes
- *		    0x01 : pci endpoint
- *		    0x03 : ioapic
- *		    0x03 : ioapci
- *		    0x04 : msi capable hpet
- * pdi_sec_bus	  - record the bus number of the PCI bus
- *		    segment to which the secondary interface
- *		    of the bridge is connected
- * pdi_sub_bus	  - record the bus number of the highest
- *		    numbered PCI bus segment which is behind
- *		    (or subordinate to) the bridge
- */
-typedef struct pci_dev_scope {
-	list_node_t node;
-	uint8_t pds_bus;
-	uint8_t pds_dev;
-	uint8_t pds_func;
-	uint8_t pds_type;
-} pci_dev_scope_t;
-
-extern boolean_t intel_iommu_support;
-extern intel_dmar_info_t *dmar_info;
-extern void intel_iommu_release_dmar_info(void);
-extern void intel_iommu_probe_and_parse(void);
-
-/*
- * interrupt source id and drhd info for ioapic
- */
-typedef struct ioapic_drhd_info {
-	list_node_t	node;
-	uchar_t		ioapic_id;	/* ioapic id */
-	uint16_t	sid;		/* ioapic source id */
-	drhd_info_t	*drhd;
-} ioapic_drhd_info_t;
-
-extern list_t ioapic_drhd_infos;
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif	/* _SYS_DMAR_ACPI_H */
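
The structures removed above (and their replacements in the new immu.h that follows) describe the ACPI DMAR table: a fixed header followed by variable-length remapping structures, each introduced by a type/length pair (DRHD carrying a register base, RMRR a reserved memory range, and so on). A hedged sketch of walking such a buffer by type/length follows; the buffer contents are fabricated for illustration, and a real parser would read the table the boot code saved for it.

/*
 * Sketch of walking DMAR remapping structures using the type/length
 * header described by dmar_acpi_unit_head_t above. Not a drop-in parser.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	DMAR_UNIT_TYPE_DRHD	0
#define	DMAR_UNIT_TYPE_RMRR	1

typedef struct unit_head {
	uint16_t uh_type;
	uint16_t uh_length;
} unit_head_t;

static void
walk_dmar_units(const uint8_t *buf, size_t len)
{
	size_t off = 0;

	while (off + sizeof (unit_head_t) <= len) {
		unit_head_t uh;

		memcpy(&uh, buf + off, sizeof (uh));
		if (uh.uh_length < sizeof (uh))
			break;			/* malformed entry */
		switch (uh.uh_type) {
		case DMAR_UNIT_TYPE_DRHD:
			printf("DRHD unit, %u bytes\n", uh.uh_length);
			break;
		case DMAR_UNIT_TYPE_RMRR:
			printf("RMRR unit, %u bytes\n", uh.uh_length);
			break;
		default:
			printf("unit type %u ignored\n", uh.uh_type);
			break;
		}
		off += uh.uh_length;
	}
}

int
main(void)
{
	/* two fake units: a 16-byte DRHD followed by a 24-byte RMRR */
	uint8_t dmar_buf[40] = { 0 };
	unit_head_t drhd = { DMAR_UNIT_TYPE_DRHD, 16 };
	unit_head_t rmrr = { DMAR_UNIT_TYPE_RMRR, 24 };

	memcpy(dmar_buf, &drhd, sizeof (drhd));
	memcpy(dmar_buf + 16, &rmrr, sizeof (rmrr));
	walk_dmar_units(dmar_buf, sizeof (dmar_buf));
	return (0);
}
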
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/sys/immu.h	Sat Jan 30 18:23:16 2010 -0800
@@ -0,0 +1,835 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef	_SYS_INTEL_IOMMU_H
+#define	_SYS_INTEL_IOMMU_H
+
+/*
+ * Intel IOMMU implementation specific state
+ */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/bitset.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/rootnex.h>
+
+/*
+ * Some ON drivers have bugs. Keep this define until all such drivers
+ * have been fixed
+ */
+#define	BUGGY_DRIVERS 1
+
+/* PD(T)E entries */
+typedef uint64_t hw_pdte_t;
+
+#define	IMMU_MAXNAMELEN (64)
+#define	IMMU_MAXSEG	(1)
+#define	IMMU_REGSZ	(1UL << 12)
+#define	IMMU_PAGESIZE   (4096)
+#define	IMMU_PAGESHIFT	(12)
+#define	IMMU_PAGEOFFSET	(IMMU_PAGESIZE - 1)
+#define	IMMU_PAGEMASK	(~IMMU_PAGEOFFSET)
+#define	IMMU_BTOP(b)	(((uint64_t)b) >> IMMU_PAGESHIFT)
+#define	IMMU_PTOB(p)	(((uint64_t)p) << IMMU_PAGESHIFT)
+#define	IMMU_PGTABLE_MAX_LEVELS	(6)
+#define	IMMU_ROUNDUP(size) (((size) + IMMU_PAGEOFFSET) & ~IMMU_PAGEOFFSET)
+#define	IMMU_ROUNDOWN(addr) ((addr) & ~IMMU_PAGEOFFSET)
+#define	IMMU_PGTABLE_LEVEL_STRIDE	(9)
+#define	IMMU_PGTABLE_LEVEL_MASK	((1<<IMMU_PGTABLE_LEVEL_STRIDE) - 1)
+#define	IMMU_PGTABLE_OFFSHIFT  (IMMU_PAGESHIFT - IMMU_PGTABLE_LEVEL_STRIDE)
+#define	IMMU_PGTABLE_MAXIDX  ((IMMU_PAGESIZE / sizeof (hw_pdte_t)) - 1)
+
+#define	IMMU_ROUNDUP(size) (((size) + IMMU_PAGEOFFSET) & ~IMMU_PAGEOFFSET)
+#define	IMMU_ROUNDOWN(addr) ((addr) & ~IMMU_PAGEOFFSET)
+
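
A quick worked check of the page-arithmetic macros above, for 4K pages; the macros are copied from the header being added and exercised from a throwaway main(), so this is a sketch rather than driver code.

/*
 * BTOP/PTOB/ROUNDUP sanity check (4K pages).
 */
#include <stdint.h>
#include <stdio.h>

#define	IMMU_PAGESIZE	(4096)
#define	IMMU_PAGESHIFT	(12)
#define	IMMU_PAGEOFFSET	(IMMU_PAGESIZE - 1)
#define	IMMU_BTOP(b)	(((uint64_t)b) >> IMMU_PAGESHIFT)
#define	IMMU_PTOB(p)	(((uint64_t)p) << IMMU_PAGESHIFT)
#define	IMMU_ROUNDUP(size) (((size) + IMMU_PAGEOFFSET) & ~IMMU_PAGEOFFSET)

int
main(void)
{
	printf("BTOP(0x8379c000) = 0x%llx\n",
	    (unsigned long long)IMMU_BTOP(0x8379c000ULL));	/* 0x8379c */
	printf("PTOB(0x8379c)    = 0x%llx\n",
	    (unsigned long long)IMMU_PTOB(0x8379cULL));		/* 0x8379c000 */
	printf("ROUNDUP(5000)    = %llu\n",
	    (unsigned long long)IMMU_ROUNDUP(5000ULL));		/* 8192 */
	return (0);
}
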
+/*
+ * DMAR global defines
+ */
+#define	DMAR_TABLE	"dmar-table"
+#define	DMAR_INTRMAP_SUPPORT	(0x01)
+
+/* DMAR unit types */
+#define	DMAR_DRHD	0
+#define	DMAR_RMRR	1
+#define	DMAR_ATSR	2
+#define	DMAR_RHSA	3
+
+/* DRHD flag values */
+#define	DMAR_INCLUDE_ALL	(0x01)
+
+/* Device scope types */
+#define	DMAR_ENDPOINT	1
+#define	DMAR_SUBTREE	2
+#define	DMAR_IOAPIC	3
+#define	DMAR_HPET	4
+
+
+/* Forward declarations for IOMMU state structure and DVMA domain struct */
+struct immu;
+struct domain;
+
+/*
+ * The following structures describe the format of the DMAR ACPI table.
+ * They are used to parse the DMAR ACPI table. Read the spec for the
+ * meaning of each member.
+ */
+
+/* lengths of various strings */
+#define	DMAR_SIG_LEN    (4)	/* table signature */
+#define	DMAR_OEMID_LEN  (6)	/* OEM ID */
+#define	DMAR_TBLID_LEN  (8)	/* OEM table ID */
+#define	DMAR_ASL_LEN    (4)	/* ASL len */
+
+typedef struct dmar_table {
+	kmutex_t	tbl_lock;
+	uint8_t		tbl_haw;
+	boolean_t	tbl_intrmap;
+	list_t		tbl_drhd_list[IMMU_MAXSEG];
+	list_t		tbl_rmrr_list[IMMU_MAXSEG];
+	char		*tbl_oem_id;
+	char		*tbl_oem_tblid;
+	uint32_t	tbl_oem_rev;
+	caddr_t		tbl_raw;
+	int		tbl_rawlen;
+} dmar_table_t;
+
+typedef struct drhd {
+	kmutex_t	dr_lock;   /* protects the dmar field */
+	struct immu	*dr_immu;
+	dev_info_t	*dr_dip;
+	uint16_t 	dr_seg;
+	uint64_t 	dr_regs;
+	boolean_t	dr_include_all;
+	list_t 		dr_scope_list;
+	list_node_t 	dr_node;
+} drhd_t;
+
+typedef struct rmrr {
+	kmutex_t	rm_lock;
+	uint16_t	rm_seg;
+	uint64_t	rm_base;
+	uint64_t	rm_limit;
+	list_t		rm_scope_list;
+	list_node_t	rm_node;
+} rmrr_t;
+
+/*
+ * Macros based on PCI spec
+ */
+#define	IMMU_PCI_DEV(devfunc)    ((uint64_t)devfunc >> 3) /* from devfunc  */
+#define	IMMU_PCI_FUNC(devfunc)   (devfunc & 7)  /* get func from devfunc */
+#define	IMMU_PCI_DEVFUNC(d, f)   (((d) << 3) | (f))  /* create devfunc */
+
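
A worked example of the devfunc packing macros above: device 0x1f, function 3 packs to 0xfb and unpacks back. This is a sketch with the macros simplified (no uint64_t cast), not driver code.

/*
 * IMMU_PCI_DEV/FUNC/DEVFUNC round trip.
 */
#include <stdio.h>

#define	IMMU_PCI_DEV(devfunc)	((devfunc) >> 3)
#define	IMMU_PCI_FUNC(devfunc)	((devfunc) & 7)
#define	IMMU_PCI_DEVFUNC(d, f)	(((d) << 3) | (f))

int
main(void)
{
	int devfunc = IMMU_PCI_DEVFUNC(0x1f, 3);

	printf("devfunc 0x%x -> dev 0x%x func %d\n",
	    devfunc, IMMU_PCI_DEV(devfunc), IMMU_PCI_FUNC(devfunc));
	return (0);
}
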
+typedef struct scope {
+	uint8_t scp_type;
+	uint8_t scp_enumid;
+	uint8_t scp_bus;
+	uint8_t scp_dev;
+	uint8_t scp_func;
+	list_node_t scp_node;
+} scope_t;
+
+/*
+ * interrupt source id and drhd info for ioapic
+ */
+typedef struct ioapic_drhd {
+	uchar_t		ioapic_ioapicid;
+	uint16_t	ioapic_sid;	/* ioapic source id */
+	drhd_t		*ioapic_drhd;
+	list_node_t	ioapic_node;
+} ioapic_drhd_t;
+
+typedef struct memrng {
+	uint64_t mrng_start;
+	uint64_t mrng_npages;
+} memrng_t;
+
+typedef enum immu_flags {
+	IMMU_FLAGS_NONE = 0x1,
+	IMMU_FLAGS_SLEEP = 0x1,
+	IMMU_FLAGS_NOSLEEP = 0x2,
+	IMMU_FLAGS_READ = 0x4,
+	IMMU_FLAGS_WRITE = 0x8,
+	IMMU_FLAGS_DONTPASS = 0x10,
+	IMMU_FLAGS_ALLOC = 0x20,
+	IMMU_FLAGS_MUST_MATCH = 0x40,
+	IMMU_FLAGS_PAGE1 = 0x80,
+	IMMU_FLAGS_UNITY = 0x100,
+	IMMU_FLAGS_DMAHDL = 0x200,
+	IMMU_FLAGS_MEMRNG = 0x400
+} immu_flags_t;
+
+typedef enum cont_avail {
+	IMMU_CONT_BAD = 0x0,
+	IMMU_CONT_UNINITED = 0x1,
+	IMMU_CONT_INITED = 0x2
+} cont_avail_t;
+
+/* Size of root and context tables and their entries */
+#define	IMMU_ROOT_TBLSZ		(4096)
+#define	IMMU_CONT_TBLSZ		(4096)
+#define	IMMU_ROOT_NUM		(256)
+#define	IMMU_CONT_NUM		(256)
+
+/* register offset */
+#define	IMMU_REG_VERSION	(0x00)  /* Version Register, 32 bit */
+#define	IMMU_REG_CAP		(0x08)  /* Capability Register, 64 bit */
+#define	IMMU_REG_EXCAP		(0x10)  /* Extended Capability Reg, 64 bit */
+#define	IMMU_REG_GLOBAL_CMD	(0x18)  /* Global Command Register, 32 bit */
+#define	IMMU_REG_GLOBAL_STS	(0x1C)  /* Global Status Register, 32 bit */
+#define	IMMU_REG_ROOTENTRY	(0x20)  /* Root-Entry Table Addr Reg, 64 bit */
+#define	IMMU_REG_CONTEXT_CMD	(0x28)  /* Context Command Register, 64 bit */
+#define	IMMU_REG_FAULT_STS	(0x34)  /* Fault Status Register, 32 bit */
+#define	IMMU_REG_FEVNT_CON	(0x38)  /* Fault Event Control Reg, 32 bit */
+#define	IMMU_REG_FEVNT_DATA	(0x3C)  /* Fault Event Data Register, 32 bit */
+#define	IMMU_REG_FEVNT_ADDR	(0x40)  /* Fault Event Address Reg, 32 bit */
+#define	IMMU_REG_FEVNT_UADDR	(0x44)  /* Fault Event Upper Addr Reg, 32 bit */
+#define	IMMU_REG_AFAULT_LOG	(0x58)  /* Advanced Fault Log Reg, 64 bit */
+#define	IMMU_REG_PMER		(0x64)  /* Protected Memory Enble Reg, 32 bit */
+#define	IMMU_REG_PLMBR		(0x68)  /* Protected Low Mem Base Reg, 32 bit */
+#define	IMMU_REG_PLMLR		(0x6C)  /* Protected Low Mem Lim Reg, 32 bit */
+#define	IMMU_REG_PHMBR		(0X70)  /* Protectd High Mem Base Reg, 64 bit */
+#define	IMMU_REG_PHMLR		(0x78)  /* Protected High Mem Lim Reg, 64 bit */
+#define	IMMU_REG_INVAL_QH	(0x80)  /* Invalidation Queue Head, 64 bit */
+#define	IMMU_REG_INVAL_QT	(0x88)  /* Invalidation Queue Tail, 64 bit */
+#define	IMMU_REG_INVAL_QAR	(0x90)  /* Invalidtion Queue Addr Reg, 64 bit */
+#define	IMMU_REG_INVAL_CSR	(0x9C)  /* Inval Compl Status Reg, 32 bit */
+#define	IMMU_REG_INVAL_CECR	(0xA0)  /* Inval Compl Evnt Ctrl Reg, 32 bit */
+#define	IMMU_REG_INVAL_CEDR	(0xA4)  /* Inval Compl Evnt Data Reg, 32 bit */
+#define	IMMU_REG_INVAL_CEAR	(0xA8)  /* Inval Compl Event Addr Reg, 32 bit */
+#define	IMMU_REG_INVAL_CEUAR	(0xAC)  /* Inval Comp Evnt Up Addr reg, 32bit */
+#define	IMMU_REG_IRTAR		(0xB8)  /* INTR Remap Tbl Addr Reg, 64 bit */
+
+/* ioapic memory region */
+#define	IOAPIC_REGION_START	(0xfee00000)
+#define	IOAPIC_REGION_END	(0xfeefffff)
+
+/* fault register */
+#define	IMMU_FAULT_STS_PPF		(2)
+#define	IMMU_FAULT_STS_PFO		(1)
+#define	IMMU_FAULT_STS_ITE		(1 << 6)
+#define	IMMU_FAULT_STS_ICE		(1 << 5)
+#define	IMMU_FAULT_STS_IQE		(1 << 4)
+#define	IMMU_FAULT_GET_INDEX(x)		((((uint64_t)x) >> 8) & 0xff)
+#define	IMMU_FRR_GET_F(x)		(((uint64_t)x) >> 63)
+#define	IMMU_FRR_GET_FR(x)		((((uint64_t)x) >> 32) & 0xff)
+#define	IMMU_FRR_GET_FT(x)		((((uint64_t)x) >> 62) & 0x1)
+#define	IMMU_FRR_GET_SID(x)		((x) & 0xffff)
+
+/* (ex)capability register */
+#define	IMMU_CAP_GET_NFR(x)		(((((uint64_t)x) >> 40) & 0xff) + 1)
+#define	IMMU_CAP_GET_DWD(x)		((((uint64_t)x) >> 54) & 1)
+#define	IMMU_CAP_GET_DRD(x)		((((uint64_t)x) >> 55) & 1)
+#define	IMMU_CAP_GET_PSI(x)		((((uint64_t)x) >> 39) & 1)
+#define	IMMU_CAP_GET_SPS(x)		((((uint64_t)x) >> 34) & 0xf)
+#define	IMMU_CAP_GET_ISOCH(x)		((((uint64_t)x) >> 23) & 1)
+#define	IMMU_CAP_GET_ZLR(x)		((((uint64_t)x) >> 22) & 1)
+#define	IMMU_CAP_GET_MAMV(x)		((((uint64_t)x) >> 48) & 0x3f)
+#define	IMMU_CAP_GET_CM(x)		((((uint64_t)x) >> 7) & 1)
+#define	IMMU_CAP_GET_PHMR(x)		((((uint64_t)x) >> 6) & 1)
+#define	IMMU_CAP_GET_PLMR(x)		((((uint64_t)x) >> 5) & 1)
+#define	IMMU_CAP_GET_RWBF(x)		((((uint64_t)x) >> 4) & 1)
+#define	IMMU_CAP_GET_AFL(x)		((((uint64_t)x) >> 3) & 1)
+#define	IMMU_CAP_GET_FRO(x)		(((((uint64_t)x) >> 24) & 0x3ff) * 16)
+#define	IMMU_CAP_MGAW(x)		(((((uint64_t)x) >> 16) & 0x3f) + 1)
+#define	IMMU_CAP_SAGAW(x)		((((uint64_t)x) >> 8) & 0x1f)
+#define	IMMU_CAP_ND(x)			((1 << (((x) & 0x7) * 2 + 4)) - 1)
+#define	IMMU_ECAP_GET_IRO(x)		(((((uint64_t)x) >> 8) & 0x3ff) << 4)
+#define	IMMU_ECAP_GET_MHMV(x)		(((uint64_t)x >> 20) & 0xf)
+#define	IMMU_ECAP_GET_SC(x)		((x) & 0x80)
+#define	IMMU_ECAP_GET_PT(x)		((x) & 0x40)
+#define	IMMU_ECAP_GET_CH(x)		((x) & 0x20)
+#define	IMMU_ECAP_GET_EIM(x)		((x) & 0x10)
+#define	IMMU_ECAP_GET_IR(x)		((x) & 0x8)
+#define	IMMU_ECAP_GET_DI(x)		((x) & 0x4)
+#define	IMMU_ECAP_GET_QI(x)		((x) & 0x2)
+#define	IMMU_ECAP_GET_C(x)		((x) & 0x1)
+
+#define	IMMU_CAP_SET_RWBF(x)		((x) |= (1 << 4))
+
+
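
The capability-register accessors above pull bit fields out of the 64-bit capability value. Below is a small standalone check that extracts MGAW, SAGAW and FRO from a fabricated capability word; the bit positions are copied from the macros above, the sample value and its interpretation are illustrative only.

/*
 * Capability field extraction sketch.
 */
#include <stdint.h>
#include <stdio.h>

#define	IMMU_CAP_MGAW(x)	(((((uint64_t)x) >> 16) & 0x3f) + 1)
#define	IMMU_CAP_SAGAW(x)	((((uint64_t)x) >> 8) & 0x1f)
#define	IMMU_CAP_GET_FRO(x)	(((((uint64_t)x) >> 24) & 0x3ff) * 16)

int
main(void)
{
	/* MGAW field = 47 -> 48-bit addressing; SAGAW = 0x4; FRO field = 0x20 */
	uint64_t cap = ((uint64_t)47 << 16) | ((uint64_t)0x4 << 8) |
	    ((uint64_t)0x20 << 24);

	printf("MGAW  = %llu bits\n", (unsigned long long)IMMU_CAP_MGAW(cap));
	printf("SAGAW = 0x%llx\n", (unsigned long long)IMMU_CAP_SAGAW(cap));
	printf("FRO   = 0x%llx bytes\n",
	    (unsigned long long)IMMU_CAP_GET_FRO(cap));
	return (0);
}
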
+/* iotlb invalidation */
+#define	TLB_INV_GLOBAL		(((uint64_t)1) << 60)
+#define	TLB_INV_DOMAIN		(((uint64_t)2) << 60)
+#define	TLB_INV_PAGE		(((uint64_t)3) << 60)
+#define	TLB_INV_GET_IAIG(x)	((((uint64_t)x) >> 57) & 7)
+#define	TLB_INV_DRAIN_READ	(((uint64_t)1) << 49)
+#define	TLB_INV_DRAIN_WRITE	(((uint64_t)1) << 48)
+#define	TLB_INV_DID(x)		(((uint64_t)((x) & 0xffff)) << 32)
+#define	TLB_INV_IVT		(((uint64_t)1) << 63)
+#define	TLB_IVA_HINT(x)		(((x) & 0x1) << 6)
+#define	TLB_IVA_LEAF		1
+#define	TLB_IVA_WHOLE		0
+
+/* don't use value 0 for enums - to catch uninitialized use */
+typedef enum iotlb_inv {
+	IOTLB_PSI = 1,
+	IOTLB_DSI,
+	IOTLB_GLOBAL
+} immu_iotlb_inv_t;
+
+typedef enum context_inv {
+	CONTEXT_FSI = 1,
+	CONTEXT_DSI,
+	CONTEXT_GLOBAL
+} immu_context_inv_t;
+
+/* context invalidation */
+#define	CCMD_INV_ICC		(((uint64_t)1) << 63)
+#define	CCMD_INV_GLOBAL		(((uint64_t)1) << 61)
+#define	CCMD_INV_DOMAIN		(((uint64_t)2) << 61)
+#define	CCMD_INV_DEVICE		(((uint64_t)3) << 61)
+#define	CCMD_INV_DID(x)		((uint64_t)((x) & 0xffff))
+#define	CCMD_INV_SID(x)		(((uint64_t)((x) & 0xffff)) << 16)
+#define	CCMD_INV_FM(x)		(((uint64_t)((x) & 0x3)) << 32)
+
+/* global command register */
+#define	IMMU_GCMD_TE		(((uint32_t)1) << 31)
+#define	IMMU_GCMD_SRTP		(((uint32_t)1) << 30)
+#define	IMMU_GCMD_SFL		(((uint32_t)1) << 29)
+#define	IMMU_GCMD_EAFL		(((uint32_t)1) << 28)
+#define	IMMU_GCMD_WBF		(((uint32_t)1) << 27)
+#define	IMMU_GCMD_QIE		(((uint32_t)1) << 26)
+#define	IMMU_GCMD_IRE		(((uint32_t)1) << 25)
+#define	IMMU_GCMD_SIRTP	(((uint32_t)1) << 24)
+#define	IMMU_GCMD_CFI		(((uint32_t)1) << 23)
+
+/* global status register */
+#define	IMMU_GSTS_TES		(((uint32_t)1) << 31)
+#define	IMMU_GSTS_RTPS		(((uint32_t)1) << 30)
+#define	IMMU_GSTS_FLS		(((uint32_t)1) << 29)
+#define	IMMU_GSTS_AFLS		(((uint32_t)1) << 28)
+#define	IMMU_GSTS_WBFS		(((uint32_t)1) << 27)
+#define	IMMU_GSTS_QIES		(((uint32_t)1) << 26)
+#define	IMMU_GSTS_IRES		(((uint32_t)1) << 25)
+#define	IMMU_GSTS_IRTPS	(((uint32_t)1) << 24)
+#define	IMMU_GSTS_CFIS		(((uint32_t)1) << 23)
+
+/* psi address mask */
+#define	ADDR_AM_MAX(m)		(((uint_t)1) << (m))
+#define	ADDR_AM_OFFSET(n, m)	((n) & (ADDR_AM_MAX(m) - 1))
+
+/* dmar fault event */
+#define	IMMU_INTR_IPL			(8)
+#define	IMMU_REG_FEVNT_CON_IM_SHIFT	(31)
+
+#define	IMMU_ALLOC_RESOURCE_DELAY    (drv_usectohz(5000))
+
+/* max value of Size field of Interrupt Remapping Table Address Register */
+#define	INTRMAP_MAX_IRTA_SIZE	0xf
+
+/* interrupt remapping table entry size */
+#define	INTRMAP_RTE_SIZE		0x10
+
+/* ioapic redirection table entry related shift of remappable interrupt */
+#define	INTRMAP_IOAPIC_IDX_SHIFT		17
+#define	INTRMAP_IOAPIC_FORMAT_SHIFT	16
+#define	INTRMAP_IOAPIC_TM_SHIFT		15
+#define	INTRMAP_IOAPIC_POL_SHIFT		13
+#define	INTRMAP_IOAPIC_IDX15_SHIFT	11
+
+/* msi intr entry related shift of remappable interrupt */
+#define	INTRMAP_MSI_IDX_SHIFT	5
+#define	INTRMAP_MSI_FORMAT_SHIFT	4
+#define	INTRMAP_MSI_SHV_SHIFT	3
+#define	INTRMAP_MSI_IDX15_SHIFT	2
+
+#define	INTRMAP_IDX_FULL		(uint_t)-1
+
+#define	RDT_DLM(rdt)	BITX((rdt), 10, 8)
+#define	RDT_DM(rdt)	BT_TEST(&(rdt), 11)
+#define	RDT_POL(rdt)	BT_TEST(&(rdt), 13)
+#define	RDT_TM(rdt)	BT_TEST(&(rdt), 15)
+
+#define	INTRMAP_DISABLE	(void *)-1
+
+/*
+ * invalidation granularity
+ */
+typedef enum {
+	TLB_INV_G_GLOBAL = 1,
+	TLB_INV_G_DOMAIN,
+	TLB_INV_G_PAGE
+} tlb_inv_g_t;
+
+typedef enum {
+	CTT_INV_G_GLOBAL = 1,
+	CTT_INV_G_DOMAIN,
+	CTT_INV_G_DEVICE
+} ctt_inv_g_t;
+
+typedef enum {
+	IEC_INV_GLOBAL = 0,
+	IEC_INV_INDEX
+} iec_inv_g_t;
+
+
+struct inv_queue_state;
+struct intrmap_tbl_state;
+
+/* A software page table structure */
+typedef struct pgtable {
+	krwlock_t swpg_rwlock;
+	caddr_t hwpg_vaddr;   /* HW pgtable VA */
+	paddr_t hwpg_paddr;   /* HW pgtable PA */
+	ddi_dma_handle_t hwpg_dmahdl;
+	ddi_acc_handle_t hwpg_memhdl;
+	struct pgtable **swpg_next_array;
+	list_node_t swpg_domain_node;  /* domain list of pgtables */
+} pgtable_t;
+
+/* interrupt remapping table state info */
+typedef struct intrmap {
+	kmutex_t		intrmap_lock;
+	ddi_dma_handle_t	intrmap_dma_hdl;
+	ddi_acc_handle_t	intrmap_acc_hdl;
+	caddr_t			intrmap_vaddr;
+	paddr_t			intrmap_paddr;
+	uint_t			intrmap_size;
+	bitset_t		intrmap_map;
+	uint_t			intrmap_free;
+} intrmap_t;
+
+typedef struct hw_rce {
+	uint64_t lo;
+	uint64_t hi;
+} hw_rce_t;
+
+
+#define	ROOT_GET_P(hrent) ((hrent)->lo & 0x1)
+#define	ROOT_SET_P(hrent) ((hrent)->lo |= 0x1)
+
+#define	ROOT_GET_CONT(hrent) ((hrent)->lo & ~(0xFFF))
+#define	ROOT_SET_CONT(hrent, paddr) ((hrent)->lo |= (paddr & (~0xFFF)))
+
+#define	TTYPE_XLATE_ONLY  (0x0)
+#define	TTYPE_XLATE_IOTLB (0x1)
+#define	TTYPE_PASSTHRU    (0x2)
+#define	TTYPE_RESERVED    (0x3)
+
+#define	CONT_GET_DID(hcent) ((((uint64_t)(hcent)->hi) >> 8) & 0xFFFF)
+#define	CONT_SET_DID(hcent, did) ((hcent)->hi |= ((0xFFFF & (did)) << 8))
+
+#define	CONT_GET_AVAIL(hcent) ((((uint64_t)((hcent)->hi)) >> 0x3) & 0xF)
+#define	CONT_SET_AVAIL(hcent, av) ((hcent)->hi |= ((0xF & (av)) << 0x3))
+
+#define	CONT_GET_LO_AW(hcent) (30 + 9 *((hcent)->hi & 0x7))
+#define	CONT_GET_AW(hcent) \
+	((CONT_GET_LO_AW(hcent) == 66) ? 64 : CONT_GET_LO_AW(hcent))
+#define	CONT_SET_AW(hcent, aw) \
+	((hcent)->hi |= (((((aw) + 2) - 30) / 9) & 0x7))
+
+#define	CONT_GET_ASR(hcent) ((hcent)->lo & ~(0xFFF))
+#define	CONT_SET_ASR(hcent, paddr) ((hcent)->lo |= (paddr & (~0xFFF)))
+
+#define	CONT_GET_TTYPE(hcent) ((((uint64_t)(hcent)->lo) >> 0x2) & 0x3)
+#define	CONT_SET_TTYPE(hcent, ttype) ((hcent)->lo |= (((ttype) & 0x3) << 0x2))
+
+#define	CONT_GET_P(hcent) ((hcent)->lo & 0x1)
+#define	CONT_SET_P(hcent) ((hcent)->lo |= 0x1)
+
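
The ROOT_*/CONT_* accessors above build and read the root- and context-table entries. Below is a sketch of filling in a context entry: set the page-table root (ASR), a domain id and the address width, then mark it present and read the fields back. The macros are copied (lightly adapted for a standalone build) from the definitions above and the values are fabricated.

/*
 * Context-entry construction sketch.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct hw_rce {
	uint64_t lo;
	uint64_t hi;
} hw_rce_t;

#define	CONT_SET_ASR(hcent, paddr)	((hcent)->lo |= ((paddr) & ~0xFFFULL))
#define	CONT_GET_ASR(hcent)		((hcent)->lo & ~0xFFFULL)
#define	CONT_SET_DID(hcent, did)	((hcent)->hi |= ((0xFFFFULL & (did)) << 8))
#define	CONT_GET_DID(hcent)		(((uint64_t)(hcent)->hi >> 8) & 0xFFFF)
#define	CONT_SET_AW(hcent, aw)		((hcent)->hi |= (((((aw) + 2) - 30) / 9) & 0x7))
#define	CONT_SET_P(hcent)		((hcent)->lo |= 0x1)
#define	CONT_GET_P(hcent)		((hcent)->lo & 0x1)

int
main(void)
{
	hw_rce_t ce = { 0, 0 };

	CONT_SET_ASR(&ce, 0x8379c000ULL);	/* root pgtable paddr */
	CONT_SET_DID(&ce, 42);			/* domain id */
	CONT_SET_AW(&ce, 48);			/* 48-bit address width */
	CONT_SET_P(&ce);			/* mark present last */

	printf("asr=0x%llx did=%llu present=%llu\n",
	    (unsigned long long)CONT_GET_ASR(&ce),
	    (unsigned long long)CONT_GET_DID(&ce),
	    (unsigned long long)CONT_GET_P(&ce));
	return (0);
}
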
+
+/* we use the bit 63 (available for system SW) as a present bit */
+#define	PDTE_SW4(hw_pdte) ((hw_pdte) & ((uint64_t)1<<63))
+#define	PDTE_CLEAR_SW4(hw_pdte) ((hw_pdte) &= ~((uint64_t)1<<63))
+
+#define	PDTE_P(hw_pdte) ((hw_pdte) & ((uint64_t)1<<63))
+#define	PDTE_CLEAR_P(hw_pdte) ((hw_pdte) &= ~((uint64_t)1<<63))
+#define	PDTE_SET_P(hw_pdte) ((hw_pdte) |= ((uint64_t)1<<63))
+
+#define	PDTE_TM(hw_pdte) ((hw_pdte) & ((uint64_t)1<<62))
+#define	PDTE_CLEAR_TM(hw_pdte) ((hw_pdte) &= ~((uint64_t)1<<62))
+
+#define	PDTE_SW3(hw_pdte) \
+	(((hw_pdte) & ~(((uint64_t)0x3<<62)|(((uint64_t)1<<52)-1))) >> 52)
+#define	PDTE_SW3_OVERFLOW(hw_pdte) \
+	(PDTE_SW3(hw_pdte) == 0x3FF)
+#define	PDTE_CLEAR_SW3(hw_pdte) \
+	((hw_pdte) &= (((uint64_t)0x3<<62)|(((uint64_t)1<<52)-1)))
+#define	PDTE_SET_SW3(hw_pdte, ref) \
+	((hw_pdte) |= ((((uint64_t)(ref)) & 0x3FF) << 52))
+
+#define	PDTE_PADDR(hw_pdte) ((hw_pdte) & ~(((uint64_t)0xFFF<<52)|((1<<12)-1)))
+#define	PDTE_CLEAR_PADDR(hw_pdte) \
+		((hw_pdte) &= (((uint64_t)0xFFF<<52)|((1<<12)-1)))
+#define	PDTE_SET_PADDR(hw_pdte, paddr) ((hw_pdte) |= PDTE_PADDR(paddr))
+
+#define	PDTE_SNP(hw_pdte) ((hw_pdte) & (1<<11))
+#define	PDTE_CLEAR_SNP(hw_pdte) ((hw_pdte) &= ~(1<<11))
+#define	PDTE_SET_SNP(hw_pdte) ((hw_pdte) |= (1<<11))
+
+#define	PDTE_SW2(hw_pdte) ((hw_pdte) & (0x700))
+#define	PDTE_CLEAR_SW2(hw_pdte) ((hw_pdte) &= ~(0x700))
+
+#define	PDTE_SP(hw_pdte) ((hw_pdte) & (0x80))
+#define	PDTE_CLEAR_SP(hw_pdte) ((hw_pdte) &= ~(0x80))
+
+#define	PDTE_SW1(hw_pdte) ((hw_pdte) & (0x7C))
+#define	PDTE_CLEAR_SW1(hw_pdte) ((hw_pdte) &= ~(0x7C))
+
+#define	PDTE_WRITE(hw_pdte) ((hw_pdte) & (0x2))
+#define	PDTE_CLEAR_WRITE(hw_pdte) ((hw_pdte) &= ~(0x2))
+#define	PDTE_SET_WRITE(hw_pdte) ((hw_pdte) |= (0x2))
+
+#define	PDTE_READ(hw_pdte) ((hw_pdte) & (0x1))
+#define	PDTE_CLEAR_READ(hw_pdte) ((hw_pdte) &= ~(0x1))
+#define	PDTE_SET_READ(hw_pdte) ((hw_pdte) |= (0x1))
+
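
As the comment above notes, bit 63 of a PD(T)E (reserved for software use) is used as a present bit. The sketch below builds a PDTE with the macros above: install a page frame address, grant read/write, set the software present bit and read the fields back. Macros are copied from the header; not driver code.

/*
 * PDTE construction sketch.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t hw_pdte_t;

#define	PDTE_SET_P(p)		((p) |= ((uint64_t)1 << 63))
#define	PDTE_P(p)		((p) & ((uint64_t)1 << 63))
#define	PDTE_PADDR(p)		((p) & ~(((uint64_t)0xFFF << 52) | ((1 << 12) - 1)))
#define	PDTE_SET_PADDR(p, pa)	((p) |= PDTE_PADDR(pa))
#define	PDTE_SET_READ(p)	((p) |= 0x1)
#define	PDTE_SET_WRITE(p)	((p) |= 0x2)

int
main(void)
{
	hw_pdte_t pdte = 0;

	PDTE_SET_PADDR(pdte, 0x8379c000ULL);
	PDTE_SET_READ(pdte);
	PDTE_SET_WRITE(pdte);
	PDTE_SET_P(pdte);	/* software "present" bit, bit 63 */

	printf("pdte=0x%llx paddr=0x%llx present=%d\n",
	    (unsigned long long)pdte,
	    (unsigned long long)PDTE_PADDR(pdte),
	    PDTE_P(pdte) != 0);
	return (0);
}
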
+typedef struct immu {
+	kmutex_t		immu_lock;
+	char			*immu_name;
+
+	/* lock grabbed by interrupt handler */
+	kmutex_t		immu_intr_lock;
+
+	/* ACPI/DMAR table related */
+	void			*immu_dmar_unit;
+	dev_info_t		*immu_dip;
+	struct domain		*immu_unity_domain;
+
+	/* IOMMU register related */
+	kmutex_t		immu_regs_lock;
+	boolean_t		immu_regs_setup;
+	boolean_t		immu_regs_running;
+	boolean_t		immu_regs_quiesced;
+	ddi_acc_handle_t	immu_regs_handle;
+	caddr_t			immu_regs_addr;
+	uint64_t		immu_regs_cap;
+	uint64_t		immu_regs_excap;
+	uint32_t		immu_regs_cmdval;
+	uint32_t		immu_regs_intr_msi_addr;
+	uint32_t		immu_regs_intr_msi_data;
+	uint32_t		immu_regs_intr_uaddr;
+
+	/* DVMA related */
+	kmutex_t		immu_dvma_lock;
+	boolean_t		immu_dvma_setup;
+	boolean_t		immu_dvma_running;
+	int			immu_dvma_gaw;
+	int			immu_dvma_agaw;
+	int			immu_dvma_nlevels;
+	boolean_t		immu_dvma_coherent;
+
+	/* DVMA context related */
+	krwlock_t		immu_ctx_rwlock;
+	pgtable_t		*immu_ctx_root;
+
+	/* DVMA domain related */
+	int			immu_max_domains;
+	vmem_t			*immu_did_arena;
+	char			immu_did_arena_name[IMMU_MAXNAMELEN];
+	list_t			immu_domain_list;
+
+	/* DVMA special devices */
+	boolean_t		immu_dvma_gfx_only;
+	list_t			immu_dvma_lpc_list;
+	list_t			immu_dvma_gfx_list;
+
+	/* interrupt remapping related */
+	kmutex_t		immu_intrmap_lock;
+	boolean_t		immu_intrmap_setup;
+	boolean_t		immu_intrmap_running;
+	intrmap_t		*immu_intrmap;
+	uint64_t		immu_intrmap_irta_reg;
+
+	/* queued invalidation related */
+	kmutex_t		immu_qinv_lock;
+	boolean_t		immu_qinv_setup;
+	boolean_t		immu_qinv_running;
+	boolean_t		immu_qinv_enabled;
+	void			*immu_qinv;
+	uint64_t		immu_qinv_reg_value;
+
+	/* list_node for system-wide list of DMAR units */
+	list_node_t		immu_node;
+} immu_t;
+
+/* properties that control DVMA */
+#define	DDI_DVMA_MAPTYPE_PROP	"ddi-dvma-mapping"
+
+/* property values */
+#define	DDI_DVMA_MAPTYPE_UNITY	"unity"
+
+typedef enum immu_maptype {
+	IMMU_MAPTYPE_BAD = 0,    /* 0 is always bad */
+	IMMU_MAPTYPE_UNITY = 1,
+	IMMU_MAPTYPE_XLATE
+} immu_maptype_t;
+
+/*
+ * domain_t
+ *
+ */
+typedef struct domain {
+	/* the basics */
+	uint_t			dom_did;
+	immu_t			*dom_immu;
+
+	/* mapping related */
+	immu_maptype_t		dom_maptype;
+	vmem_t			*dom_dvma_arena;
+	char			dom_dvma_arena_name[IMMU_MAXNAMELEN];
+
+	/* pgtables */
+	pgtable_t		*dom_pgtable_root;
+	krwlock_t		dom_pgtable_rwlock;
+
+	/* list of pgtables for this domain */
+	list_t			dom_pglist;
+
+	/* list node for list of domains (unity or xlate) */
+	list_node_t		dom_maptype_node;
+	/* list node for list of domains off immu */
+	list_node_t		dom_immu_node;
+} domain_t;
+
+typedef enum immu_pcib {
+	IMMU_PCIB_BAD = 0,
+	IMMU_PCIB_NOBDF,
+	IMMU_PCIB_PCIE_PCIE,
+	IMMU_PCIB_PCIE_PCI,
+	IMMU_PCIB_PCI_PCI,
+	IMMU_PCIB_ENDPOINT
+} immu_pcib_t;
+
+/*
+ *  immu_devi_t
+ *      Intel IOMMU in devinfo node
+ */
+typedef struct immu_devi {
+	/* pci seg, bus, dev, func */
+	int		imd_seg;
+	int		imd_bus;
+	int		imd_devfunc;
+
+	/* ppb information */
+	immu_pcib_t	imd_pcib_type;
+	int		imd_sec;
+	int		imd_sub;
+
+	/* identifier for special devices */
+	boolean_t	imd_display;
+	boolean_t	imd_lpc;
+
+	/* dmar unit to which this dip belongs */
+	immu_t		*imd_immu;
+
+	/* domain ptr */
+	domain_t	*imd_domain;
+	dev_info_t	*imd_ddip;
+
+	/* my devinfo */
+	dev_info_t	*imd_dip;
+
+	/*
+	 * if we are a "special" devinfo
+	 * the node for the special linked list
+	 * off the DMAR unit structure
+	 */
+	list_node_t	imd_spc_node;
+} immu_devi_t;
+
+#define	IMMU_DEVI(dip)		((immu_devi_t *)(DEVI(dip)->devi_iommu))
+#define	IMMU_DEVI_SET(dip, imd)	(DEVI(dip)->devi_iommu = (void *)imd)
+
+/*
+ * struct dmar_arg
+ */
+typedef struct immu_arg {
+	int		ima_seg;
+	int		ima_bus;
+	int		ima_devfunc;
+	dev_info_t	*ima_rdip;
+	dev_info_t	*ima_ddip;
+} immu_arg_t;
+
+/*
+ * Globals used by IOMMU code
+ */
+/* shared between IOMMU files */
+extern dev_info_t *root_devinfo;
+extern kmutex_t immu_lock;
+extern list_t immu_list;
+extern boolean_t immu_setup;
+extern boolean_t immu_running;
+extern kmutex_t ioapic_drhd_lock;
+extern list_t ioapic_drhd_list;
+
+/* switches */
+
+/* Various features */
+extern boolean_t immu_enable;
+extern boolean_t immu_dvma_enable;
+extern boolean_t immu_gfxdvma_enable;
+extern boolean_t immu_intrmap_enable;
+extern boolean_t immu_qinv_enable;
+extern boolean_t immu_mmio_safe;
+
+/* various quirks that need working around */
+extern boolean_t immu_quirk_usbpage0;
+extern boolean_t immu_quirk_usbfullpa;
+extern boolean_t immu_quirk_usbrmrr;
+extern boolean_t immu_quirk_mobile4;
+
+/* debug messages */
+extern boolean_t immu_dmar_print;
+
+/* ################### Interfaces exported outside IOMMU code ############## */
+void immu_init(void);
+void immu_startup(void);
+void immu_shutdown(void);
+void immu_destroy(void);
+int immu_map_sgl(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq,
+    int prealloc_count, dev_info_t *rdip);
+int immu_unmap_sgl(ddi_dma_impl_t *hp, dev_info_t *rdip);
+void immu_device_tree_changed(void);
+void immu_physmem_update(uint64_t addr, uint64_t size);
+int immu_quiesce(void);
+int immu_unquiesce(void);
+/* ######################################################################### */
+
+/* ################# Interfaces used within IOMMU code #################### */
+
+/* functions in rootnex.c */
+int rootnex_dvcookies_alloc(ddi_dma_impl_t *hp,
+    struct ddi_dma_req *dmareq, dev_info_t *rdip, void *arg);
+void rootnex_dvcookies_free(dvcookie_t *dvcookies, void *arg);
+
+/* immu_dmar.c interfaces */
+int immu_dmar_setup(void);
+int immu_dmar_parse(void);
+void immu_dmar_startup(void);
+void immu_dmar_shutdown(void);
+void immu_dmar_destroy(void);
+boolean_t immu_dmar_blacklisted(char **strings_array, uint_t nstrings);
+immu_t *immu_dmar_get_immu(dev_info_t *rdip);
+char *immu_dmar_unit_name(void *dmar_unit);
+dev_info_t *immu_dmar_unit_dip(void *dmar_unit);
+void immu_dmar_set_immu(void *dmar_unit, immu_t *immu);
+void *immu_dmar_walk_units(int seg, void *dmar_unit);
+boolean_t immu_dmar_intrmap_supported(void);
+uint16_t immu_dmar_ioapic_sid(int ioapicid);
+immu_t *immu_dmar_ioapic_immu(int ioapicid);
+void immu_dmar_rmrr_map(void);
+
+/* immu.c interfaces */
+int immu_walk_ancestor(dev_info_t *rdip, dev_info_t *ddip,
+    int (*func)(dev_info_t *, void *arg), void *arg,
+    int *level, immu_flags_t immu_flags);
+
+/* immu_regs.c interfaces */
+void immu_regs_setup(list_t *immu_list);
+void immu_regs_startup(immu_t *immu);
+int immu_regs_resume(immu_t *immu);
+void immu_regs_suspend(immu_t *immu);
+void immu_regs_shutdown(immu_t *immu);
+void immu_regs_destroy(list_t *immu_list);
+
+void immu_regs_intr(immu_t *immu, uint32_t msi_addr, uint32_t msi_data,
+    uint32_t uaddr);
+
+boolean_t immu_regs_passthru_supported(immu_t *immu);
+boolean_t immu_regs_is_TM_reserved(immu_t *immu);
+boolean_t immu_regs_is_SNP_reserved(immu_t *immu);
+
+void immu_regs_wbf_flush(immu_t *immu);
+void immu_regs_cpu_flush(immu_t *immu, caddr_t addr, uint_t size);
+void immu_regs_iotlb_flush(immu_t *immu, uint_t domainid, uint64_t dvma,
+    uint64_t count, uint_t hint, immu_iotlb_inv_t type);
+void immu_regs_context_flush(immu_t *immu, uint8_t function_mask,
+    uint16_t source_id, uint_t did, immu_context_inv_t type);
+void immu_regs_set_root_table(immu_t *immu);
+void immu_regs_qinv_enable(immu_t *immu, uint64_t qinv_reg_value);
+void immu_regs_intr_enable(immu_t *immu, uint32_t msi_addr, uint32_t msi_data,
+    uint32_t uaddr);
+void immu_regs_intrmap_enable(immu_t *immu, uint64_t irta_reg);
+uint64_t immu_regs_get64(immu_t *immu, uint_t reg);
+void immu_regs_put64(immu_t *immu, uint_t reg, uint64_t val);
+uint32_t immu_regs_get32(immu_t *immu, uint_t reg);
+void immu_regs_put32(immu_t *immu, uint_t reg, uint32_t val);
+
+/* immu_dvma.c interfaces */
+void immu_dvma_setup(list_t *immu_list);
+void immu_dvma_startup(immu_t *immu);
+void immu_dvma_shutdown(immu_t *immu);
+void immu_dvma_destroy(list_t *immu_list);
+
+void immu_dvma_physmem_update(uint64_t addr, uint64_t size);
+int immu_dvma_map(ddi_dma_impl_t *hp, struct ddi_dma_req *dmareq, memrng_t *,
+    uint_t prealloc_count, dev_info_t *rdip, immu_flags_t immu_flags);
+int immu_dvma_unmap(ddi_dma_impl_t *hp, dev_info_t *rdip);
+int immu_dvma_alloc(dvcookie_t *first_dvcookie, void *arg);
+void immu_dvma_free(dvcookie_t *first_dvcookie, void *arg);
+int immu_devi_set(dev_info_t *dip, immu_flags_t immu_flags);
+immu_devi_t *immu_devi_get(dev_info_t *dip);
+immu_t *immu_dvma_get_immu(dev_info_t *dip, immu_flags_t immu_flags);
+
+
+/* immu_intrmap.c interfaces */
+void immu_intrmap_setup(list_t *immu_list);
+void immu_intrmap_startup(immu_t *immu);
+void immu_intrmap_shutdown(immu_t *immu);
+void immu_intrmap_destroy(list_t *immu_list);
+
+/* registers interrupt handler for IOMMU unit */
+void immu_intr_register(immu_t *immu);
+int immu_intr_handler(immu_t *immu);
+
+
+/* immu_qinv.c interfaces */
+void immu_qinv_setup(list_t *immu_list);
+void immu_qinv_startup(immu_t *immu);
+void immu_qinv_shutdown(immu_t *immu);
+void immu_qinv_destroy(list_t *immu_list);
+
+void immu_qinv_context_fsi(immu_t *immu, uint8_t function_mask,
+    uint16_t source_id, uint_t domain_id);
+void immu_qinv_context_dsi(immu_t *immu, uint_t domain_id);
+void immu_qinv_context_gbl(immu_t *immu);
+void immu_qinv_iotlb_psi(immu_t *immu, uint_t domain_id,
+    uint64_t dvma, uint_t count, uint_t hint);
+void immu_qinv_iotlb_dsi(immu_t *immu, uint_t domain_id);
+void immu_qinv_iotlb_gbl(immu_t *immu);
+void immu_qinv_intr_global(immu_t *immu);
+void immu_qinv_intr_one_cache(immu_t *immu, uint_t idx);
+void immu_qinv_intr_caches(immu_t *immu, uint_t idx, uint_t cnt);
+void immu_qinv_report_fault(immu_t *immu);
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_INTEL_IOMMU_H */
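
The new header ties DVMA mapping policy to a devinfo property: a node whose "ddi-dvma-mapping" property is "unity" gets a 1:1 (passthrough) domain, anything else a translated (xlate) domain. Below is a minimal stand-alone sketch of that decision, not part of the changeset: the helper name and the default-to-xlate behaviour for nodes without the property are assumptions, since the rootnex code that consumes the property is not shown in this hunk.

/*
 * Illustrative only: fold a "ddi-dvma-mapping" property value into an
 * immu_maptype_t.  NULL stands for "property not present".
 */
#include <stdio.h>
#include <string.h>

#define	DDI_DVMA_MAPTYPE_PROP	"ddi-dvma-mapping"
#define	DDI_DVMA_MAPTYPE_UNITY	"unity"

typedef enum immu_maptype {
	IMMU_MAPTYPE_BAD = 0,	/* 0 is always bad */
	IMMU_MAPTYPE_UNITY = 1,
	IMMU_MAPTYPE_XLATE
} immu_maptype_t;

static immu_maptype_t
map_prop_to_maptype(const char *propval)
{
	if (propval != NULL &&
	    strcmp(propval, DDI_DVMA_MAPTYPE_UNITY) == 0)
		return (IMMU_MAPTYPE_UNITY);	/* 1:1 passthrough domain */
	return (IMMU_MAPTYPE_XLATE);		/* assumed default */
}

int
main(void)
{
	printf("%d\n", map_prop_to_maptype("unity"));	/* prints 1 */
	printf("%d\n", map_prop_to_maptype(NULL));	/* prints 2 */
	return (0);
}

In the kernel the property string would presumably be fetched with ddi_prop_lookup_string() against the device node rather than passed in as a literal.
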
--- a/usr/src/uts/i86pc/sys/intel_iommu.h	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,733 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Portions Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-/*
- * Copyright (c) 2008, Intel Corporation.
- * All rights reserved.
- */
-
-#ifndef	_SYS_INTEL_IOMMU_H
-#define	_SYS_INTEL_IOMMU_H
-
-/*
- * Intel IOMMU implementation specific state
- */
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-#include <sys/types.h>
-#include <sys/bitset.h>
-#include <sys/dmar_acpi.h>
-#include <sys/iommu_rscs.h>
-#include <sys/cpu.h>
-#include <sys/kstat.h>
-
-/* extern functions */
-extern int intel_iommu_attach_dmar_nodes(void);
-extern int intel_iommu_map_sgl(ddi_dma_handle_t handle,
-    struct ddi_dma_req *dmareq, uint_t prealloc);
-extern void intel_iommu_unmap_sgl(ddi_dma_handle_t handle);
-extern void return_instr(void);
-
-/* define the return value for iommu_map_sgl */
-#define	IOMMU_SGL_SUCCESS	0
-#define	IOMMU_SGL_DISABLE	1
-#define	IOMMU_SGL_NORESOURCES	2
-
-/* register offset */
-#define	IOMMU_REG_VERSION	(0x00)  /* Version Register, 32 bit */
-#define	IOMMU_REG_CAP		(0x08)  /* Capability Register, 64 bit */
-#define	IOMMU_REG_EXCAP		(0x10)  /* Extended Capability Reg, 64 bit */
-#define	IOMMU_REG_GLOBAL_CMD	(0x18)  /* Global Command Register, 32 bit */
-#define	IOMMU_REG_GLOBAL_STS	(0x1C)  /* Global Status Register, 32 bit */
-#define	IOMMU_REG_ROOTENTRY	(0x20)  /* Root-Entry Table Addr Reg, 64 bit */
-#define	IOMMU_REG_CONTEXT_CMD	(0x28)  /* Context Command Register, 64 bit */
-#define	IOMMU_REG_FAULT_STS	(0x34)  /* Fault Status Register, 32 bit */
-#define	IOMMU_REG_FEVNT_CON	(0x38)  /* Fault Event Control Reg, 32 bit */
-#define	IOMMU_REG_FEVNT_DATA	(0x3C)  /* Fault Event Data Register, 32 bit */
-#define	IOMMU_REG_FEVNT_ADDR	(0x40)  /* Fault Event Address Reg, 32 bit */
-#define	IOMMU_REG_FEVNT_UADDR	(0x44)  /* Fault Event Upper Addr Reg, 32 bit */
-#define	IOMMU_REG_AFAULT_LOG	(0x58)  /* Advanced Fault Log Reg, 64 bit */
-#define	IOMMU_REG_PMER		(0x64)  /* Protected Memory Enble Reg, 32 bit */
-#define	IOMMU_REG_PLMBR		(0x68)  /* Protected Low Mem Base Reg, 32 bit */
-#define	IOMMU_REG_PLMLR		(0x6C)  /* Protected Low Mem Lim Reg, 32 bit */
-#define	IOMMU_REG_PHMBR		(0X70)  /* Protectd High Mem Base Reg, 64 bit */
-#define	IOMMU_REG_PHMLR		(0x78)  /* Protected High Mem Lim Reg, 64 bit */
-#define	IOMMU_REG_INVAL_QH	(0x80)  /* Invalidation Queue Head, 64 bit */
-#define	IOMMU_REG_INVAL_QT	(0x88)  /* Invalidation Queue Tail, 64 bit */
-#define	IOMMU_REG_INVAL_QAR	(0x90)  /* Invalidtion Queue Addr Reg, 64 bit */
-#define	IOMMU_REG_INVAL_CSR	(0x9C)  /* Inval Compl Status Reg, 32 bit */
-#define	IOMMU_REG_INVAL_CECR	(0xA0)  /* Inval Compl Evnt Ctrl Reg, 32 bit */
-#define	IOMMU_REG_INVAL_CEDR	(0xA4)  /* Inval Compl Evnt Data Reg, 32 bit */
-#define	IOMMU_REG_INVAL_CEAR	(0xA8)  /* Inval Compl Event Addr Reg, 32 bit */
-#define	IOMMU_REG_INVAL_CEUAR	(0xAC)  /* Inval Comp Evnt Up Addr reg, 32bit */
-#define	IOMMU_REG_IRTAR		(0xB8)  /* INTR Remap Tbl Addr Reg, 64 bit */
-
-/* ioapic memory region */
-#define	IOAPIC_REGION_START	(0xfee00000)
-#define	IOAPIC_REGION_END	(0xfeefffff)
-
-/* iommu page */
-#define	IOMMU_LEVEL_STRIDE	(9)
-#define	IOMMU_LEVEL_SIZE	((uint64_t)1 << IOMMU_LEVEL_STRIDE)
-#define	IOMMU_LEVEL_OFFSET	(IOMMU_LEVEL_SIZE - 1)
-#define	IOMMU_PAGE_SHIFT	(12)
-#define	IOMMU_PAGE_SIZE		(uint64_t)((uint64_t)1 << IOMMU_PAGE_SHIFT)
-#define	IOMMU_PAGE_MASK		~(IOMMU_PAGE_SIZE - 1)
-#define	IOMMU_PAGE_OFFSET	(IOMMU_PAGE_SIZE - 1)
-#define	IOMMU_PAGE_ROUND(x)	(((x) + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK)
-#define	IOMMU_PTOB(x)		(((uint64_t)(x)) << IOMMU_PAGE_SHIFT)
-#define	IOMMU_BTOP(x)		((x) >> IOMMU_PAGE_SHIFT)
-#define	IOMMU_BTOPR(x)		IOMMU_BTOP((x) + IOMMU_PAGE_OFFSET)
-#define	IOMMU_LEVEL_TO_AGAW(x)	((x) * 9 + 12)
-#define	IOMMU_IOVA_MAX_4G	(((uint64_t)1 << 32) - 1)
-#define	IOMMU_SIZE_4G		((uint64_t)1 << 32)
-#define	IOMMU_SIZE_2M		((uint64_t)1 << 21)
-#define	IOMMU_2M_MASK		~(IOMMU_SIZE_2M - 1)
-#define	IOMMU_PTE_MAX		(IOMMU_PAGE_SIZE >> 3)
-
-/* iommu page entry property */
-#define	IOMMU_PAGE_PROP_READ	(1)
-#define	IOMMU_PAGE_PROP_WRITE	(2)
-#define	IOMMU_PAGE_PROP_RW	(IOMMU_PAGE_PROP_READ | IOMMU_PAGE_PROP_WRITE)
-#define	IOMMU_PAGE_PROP_NOSYNC	(4)
-
-/* root context entry */
-#define	ROOT_ENTRY_GET_P(x)		(((x)->lo) & 0x1)
-#define	ROOT_ENTRY_SET_P(x)		((x)->lo) |= 0x1
-#define	ROOT_ENTRY_GET_CTP(x)		(((x)->lo) & IOMMU_PAGE_MASK)
-#define	ROOT_ENTRY_SET_CTP(x, p)	((x)->lo) |= ((p) & IOMMU_PAGE_MASK)
-#define	CONT_ENTRY_GET_P(x)		(((x)->lo) & 0x1)
-#define	CONT_ENTRY_SET_P(x)		((x)->lo) |= 0x1
-#define	CONT_ENTRY_SET_ASR(x, p)	((x)->lo) |= ((p) & IOMMU_PAGE_MASK)
-#define	CONT_ENTRY_GET_ASR(x)		(((x)->lo) & IOMMU_PAGE_MASK)
-#define	CONT_ENTRY_SET_AW(x, v)		((x)->hi) |= ((v) & 7)
-#define	CONT_ENTRY_SET_DID(x, v) ((x)->hi) |= (((v) & ((1 << 16) - 1)) << 8)
-
-/* fault register */
-#define	IOMMU_FAULT_STS_PPF		(2)
-#define	IOMMU_FAULT_STS_PFO		(1)
-#define	IOMMU_FAULT_STS_ITE		(1 << 6)
-#define	IOMMU_FAULT_STS_ICE		(1 << 5)
-#define	IOMMU_FAULT_STS_IQE		(1 << 4)
-#define	IOMMU_FAULT_GET_INDEX(x)	(((x) >> 8) & 0xff)
-#define	IOMMU_FRR_GET_F(x)		((x) >> 63)
-#define	IOMMU_FRR_GET_FR(x)		(((x) >> 32) & 0xff)
-#define	IOMMU_FRR_GET_FT(x)		(((x) >> 62) & 0x1)
-#define	IOMMU_FRR_GET_SID(x)		((x) & 0xffff)
-
-/* (ex)capability register */
-#define	IOMMU_CAP_GET_NFR(x)		((((x) >> 40) & 0xff) + 1)
-#define	IOMMU_CAP_GET_DWD(x)		(((x) >> 54) & 1)
-#define	IOMMU_CAP_GET_DRD(x)		(((x) >> 55) & 1)
-#define	IOMMU_CAP_GET_PSI(x)		(((x) >> 39) & 1)
-#define	IOMMU_CAP_GET_SPS(x)		(((x) >> 34) & 0xf)
-#define	IOMMU_CAP_GET_ISOCH(x)		(((x) >> 23) & 1)
-#define	IOMMU_CAP_GET_ZLR(x)		(((x) >> 22) & 1)
-#define	IOMMU_CAP_GET_MAMV(x)		(((x) >> 48) & 0x3f)
-#define	IOMMU_CAP_GET_CM(x)		(((x) >> 7) & 1)
-#define	IOMMU_CAP_GET_PHMR(x)		(((x) >> 6) & 1)
-#define	IOMMU_CAP_GET_PLMR(x)		(((x) >> 5) & 1)
-#define	IOMMU_CAP_GET_RWBF(x)		(((x) >> 4) & 1)
-#define	IOMMU_CAP_GET_AFL(x)		(((x) >> 3) & 1)
-#define	IOMMU_CAP_GET_FRO(x)		((((x) >> 24) & 0x3ff) * 16)
-#define	IOMMU_CAP_MGAW(x)		(((((uint64_t)x) >> 16) & 0x3f) + 1)
-#define	IOMMU_CAP_SAGAW(x)		(((x) >> 8) & 0x1f)
-#define	IOMMU_CAP_ND(x)			(1 << (((x) & 0x7) *2 + 4)) -1
-#define	IOMMU_ECAP_GET_IRO(x)		((((x) >> 8) & 0x3ff) << 4)
-#define	IOMMU_ECAP_GET_MHMV(x)		((x >> 20) & 0xf)
-#define	IOMMU_ECAP_GET_SC(x)		((x) & 0x80)
-#define	IOMMU_ECAP_GET_PT(x)		((x) & 0x40)
-#define	IOMMU_ECAP_GET_CH(x)		((x) & 0x20)
-#define	IOMMU_ECAP_GET_EIM(x)		((x) & 0x10)
-#define	IOMMU_ECAP_GET_IR(x)		((x) & 0x8)
-#define	IOMMU_ECAP_GET_DI(x)		((x) & 0x4)
-#define	IOMMU_ECAP_GET_QI(x)		((x) & 0x2)
-#define	IOMMU_ECAP_GET_C(x)		((x) & 0x1)
-
-
-/* iotlb invalidation */
-#define	TLB_INV_GLOBAL		(((uint64_t)1) << 60)
-#define	TLB_INV_DOMAIN		(((uint64_t)2) << 60)
-#define	TLB_INV_PAGE		(((uint64_t)3) << 60)
-#define	TLB_INV_GET_IAIG(x)	(((x) >> 57) & 7)
-#define	TLB_INV_DRAIN_READ	(((uint64_t)1) << 49)
-#define	TLB_INV_DRAIN_WRITE	(((uint64_t)1) << 48)
-#define	TLB_INV_DID(x)		(((uint64_t)((x) & 0xffff)) << 32)
-#define	TLB_INV_IVT		(((uint64_t)1) << 63)
-#define	TLB_IVA_HINT(x)		(((x) & 0x1) << 6)
-#define	TLB_IVA_LEAF		1
-#define	TLB_IVA_WHOLE		0
-
-/* context invalidation */
-#define	CCMD_INV_ICC		(((uint64_t)1) << 63)
-#define	CCMD_INV_GLOBAL		(((uint64_t)1) << 61)
-#define	CCMD_INV_DOMAIN		(((uint64_t)2) << 61)
-#define	CCMD_INV_DEVICE		(((uint64_t)3) << 61)
-#define	CCMD_INV_DID(x)		((uint64_t)((x) & 0xffff))
-#define	CCMD_INV_SID(x)		(((uint64_t)((x) & 0xffff)) << 16)
-#define	CCMD_INV_FM(x)		(((uint64_t)((x) & 0x3)) << 32)
-
-/* global command register */
-#define	IOMMU_GCMD_TE		(((uint32_t)1) << 31)
-#define	IOMMU_GCMD_SRTP		(((uint32_t)1) << 30)
-#define	IOMMU_GCMD_SFL		(((uint32_t)1) << 29)
-#define	IOMMU_GCMD_EAFL		(((uint32_t)1) << 28)
-#define	IOMMU_GCMD_WBF		(((uint32_t)1) << 27)
-#define	IOMMU_GCMD_QIE		(((uint32_t)1) << 26)
-#define	IOMMU_GCMD_IRE		(((uint32_t)1) << 25)
-#define	IOMMU_GCMD_SIRTP	(((uint32_t)1) << 24)
-#define	IOMMU_GCMD_CFI		(((uint32_t)1) << 23)
-
-/* global status register */
-#define	IOMMU_GSTS_TES		(((uint32_t)1) << 31)
-#define	IOMMU_GSTS_RTPS		(((uint32_t)1) << 30)
-#define	IOMMU_GSTS_FLS		(((uint32_t)1) << 29)
-#define	IOMMU_GSTS_AFLS		(((uint32_t)1) << 28)
-#define	IOMMU_GSTS_WBFS		(((uint32_t)1) << 27)
-#define	IOMMU_GSTS_QIES		(((uint32_t)1) << 26)
-#define	IOMMU_GSTS_IRES		(((uint32_t)1) << 25)
-#define	IOMMU_GSTS_IRTPS	(((uint32_t)1) << 24)
-#define	IOMMU_GSTS_CFIS		(((uint32_t)1) << 23)
-
-/* psi address mask */
-#define	ADDR_AM_MAX(m)		(((uint_t)1) << (m))
-#define	ADDR_AM_OFFSET(n, m)	((n) & (ADDR_AM_MAX(m) - 1))
-
-/* dmar fault event */
-#define	IOMMU_INTR_IPL			(8)
-#define	IOMMU_REG_FEVNT_CON_IM_SHIFT	(31)
-
-/* iommu enable state */
-#define	DMAR_ENABLE		0x1
-#define	QINV_ENABLE		0x2
-#define	INTRR_ENABLE		0x4
-
-/* invalidation queue table entry size */
-#define	QINV_ENTRY_SIZE		0x10
-
-/* max value of Queue Size field of Invalidation Queue Address Register */
-#define	QINV_MAX_QUEUE_SIZE	0x7
-
-/* status data size of invalidation wait descriptor */
-#define	QINV_SYNC_DATA_SIZE	0x4
-
-/* status data value of invalidation wait descriptor */
-#define	QINV_SYNC_DATA_FENCE	1
-#define	QINV_SYNC_DATA_UNFENCE	2
-
-/* invalidation queue head and tail */
-#define	QINV_IQA_HEAD(QH)	BITX((QH), 18, 4)
-#define	QINV_IQA_TAIL_SHIFT	4
-
-/* max value of Size field of Interrupt Remapping Table Address Register */
-#define	INTRR_MAX_IRTA_SIZE	0xf
-
-/* interrupt remapping table entry size */
-#define	INTRR_RTE_SIZE		0x10
-
-/* ioapic redirection table entry related shift of remappable interrupt */
-#define	INTRR_IOAPIC_IIDX_SHIFT		17
-#define	INTRR_IOAPIC_FORMAT_SHIFT	16
-#define	INTRR_IOAPIC_TM_SHIFT		15
-#define	INTRR_IOAPIC_POL_SHIFT		13
-#define	INTRR_IOAPIC_IIDX15_SHIFT	11
-
-/* msi intr entry related shift of remappable interrupt */
-#define	INTRR_MSI_IIDX_SHIFT	5
-#define	INTRR_MSI_FORMAT_SHIFT	4
-#define	INTRR_MSI_SHV_SHIFT	3
-#define	INTRR_MSI_IIDX15_SHIFT	2
-
-#define	INTRR_IIDX_FULL		(uint_t)-1
-
-#define	RDT_DLM(rdt)	BITX((rdt), 10, 8)
-#define	RDT_DM(rdt)	BT_TEST(&(rdt), 11)
-#define	RDT_POL(rdt)	BT_TEST(&(rdt), 13)
-#define	RDT_TM(rdt)	BT_TEST(&(rdt), 15)
-
-#define	INTRR_DISABLE	(void *)-1
-
-/* page entry structure */
-typedef uint64_t *iopte_t;
-
-/* root/context entry structure */
-typedef struct iorce {
-	uint64_t lo;
-	uint64_t hi;
-} *iorce_t;
-
-/* kernel maintained page table entry */
-typedef struct iovpte {
-	/*
-	 * pointer to the cpu accessible
-	 * iommu page table
-	 */
-	caddr_t vp;
-	/*
-	 * pointer to the real iommu
-	 * page table
-	 */
-	caddr_t pp;
-} *iovpte_t;
-
-/*
- * struct iommu_kstat
- *   kstat structure for iommu
- */
-typedef struct iommu_kstat {
-
-	/* hardware dependent */
-	kstat_named_t is_dmar_enabled;
-	kstat_named_t is_qinv_enabled;
-	kstat_named_t is_intrr_enabled;
-	kstat_named_t is_iotlb_psi;
-	kstat_named_t is_iotlb_domain;
-	kstat_named_t is_iotlb_global;
-	kstat_named_t is_write_buffer;
-	kstat_named_t is_context_cache;
-	kstat_named_t is_wait_complete_us;
-	kstat_named_t is_domain_alloc;
-
-	/* hardware independent */
-	kstat_named_t is_page_used;
-} iommu_kstat_t;
-
-/*
- * struct iommu_stat
- *   statistics for iommu
- */
-typedef struct iommu_stat {
-	uint64_t st_iotlb_psi;
-	uint64_t st_iotlb_domain;
-	uint64_t st_iotlb_global;
-	uint64_t st_write_buffer;
-	uint64_t st_context_cache;
-	uint64_t st_wait_complete_us;
-	uint64_t st_domain_alloc;
-} iommu_stat_t;
-
-struct intel_iommu_state;
-struct iommu_dvma_cookie;
-struct dmar_domain_state;
-
-/*
- * invalidation granularity
- */
-typedef enum {
-	TLB_INV_G_GLOBAL = 1,
-	TLB_INV_G_DOMAIN,
-	TLB_INV_G_PAGE
-} tlb_inv_g_t;
-
-typedef enum {
-	CTT_INV_G_GLOBAL = 1,
-	CTT_INV_G_DOMAIN,
-	CTT_INV_G_DEVICE
-} ctt_inv_g_t;
-
-typedef enum {
-	IEC_INV_GLOBAL = 0,
-	IEC_INV_INDEX
-} iec_inv_g_t;
-
-/*
- * struct dmar_ops
- *   dmar hardware operation functions
- */
-struct dmar_ops {
-	/* enable */
-	void (*do_enable)(struct intel_iommu_state *iommu);
-
-	/* page fault */
-	int (*do_fault)(struct intel_iommu_state *iommu);
-
-	/* cache related */
-	void (*do_flwb)(struct intel_iommu_state *iommu);
-	void (*do_iotlb_psi)(struct intel_iommu_state *iommu, uint_t domain_id,
-	    uint64_t dvma, uint_t count, uint_t hint);
-	void (*do_iotlb_dsi)(struct intel_iommu_state *iommu, uint_t domain_id);
-	void (*do_iotlb_gbl)(struct intel_iommu_state *iommu);
-	void (*do_context_fsi)(struct intel_iommu_state *iommu,
-	    uint8_t function_mask,
-	    uint16_t source_id, uint_t domain_id);
-	void (*do_context_dsi)(struct intel_iommu_state *iommu,
-	    uint_t domain_id);
-	void (*do_context_gbl)(struct intel_iommu_state *iommu);
-	void (*do_plant_wait)(struct intel_iommu_state *iommu,
-	    struct iommu_dvma_cookie *dcookies, uint_t count,
-	    uint_t array_size);
-	void (*do_reap_wait)(struct intel_iommu_state *iommu);
-
-	/* root entry */
-	void (*do_set_root_table)(struct intel_iommu_state *iommu);
-
-	/* cpu cache line flush */
-	void (*do_clflush)(caddr_t addr, uint_t size);
-};
-
-/*
- * struct iotlb_pend_node
- *   the pending data for iotlb flush
- */
-typedef struct iotlb_pend_node {
-	/* node to hook into the list */
-	list_node_t			node;
-	/* ptr to dvma cookie array */
-	struct iommu_dvma_cookie	*icn_dcookies;
-	/* valid cookie count */
-	uint_t				icn_count;
-	/* array size */
-	uint_t				icn_array_size;
-} iotlb_pend_node_t;
-
-/*
- * struct iotlb_pend_head
- *   the pending head for the iotlb flush
- */
-typedef struct iotlb_pend_head {
-	/* the pending iotlb list */
-	kmutex_t	ich_pend_lock;
-	list_t		ich_pend_list;
-	uint_t		ich_pend_count;
-
-	/* the pending node cache list */
-	kmutex_t	ich_mem_lock;
-	list_t		ich_mem_list;
-} iotlb_pend_head_t;
-
-struct inv_queue_state;
-struct intr_remap_tbl_state;
-struct iommu_pghdl;
-
-#define	IOMMU_PGHDL_HASH_SIZE	(256)
-
-/*
- * struct intel_iommu_state
- *   This structure describes the state information
- *   of each iommu unit in the platform. It is
- *   created in the dmarnex driver's attach(), and
- *   will be used in every DMA DDI and the iommu
- *   translation functions
- *
- * node			- the list node to hook it in iommu_states
- * iu_drhd		- the related drhd
- * iu_reg_handle	- register access handler
- * iu_reg_lock		- lock to protect register operation
- * iu_reg_address	- virtual address of the register base address
- * iu_capability	- copy of the capability register
- * iu_excapability	- copy of the extended capability register
- * iu_root_entry_paddr	- root entry page table
- * iu_root_context_lock	- root context entry lock
- * iu_gaw		- guest address width
- * iu_agaw		- adjusted guest address width
- * iu_level		- the page table level
- * iu_global_cmd_reg	- global command register save place
- * iu_max_domain	- the maximum domain numbers
- * iu_domain_id_hdl	- domain id allocator handler
- * iu_enabled		- the soft state of the iommu
- * iu_coherency		- hardware access is coherent
- * iu_kstat		- kstat pointer
- * iu_statistics	- iommu statistics
- * iu_dmar_ops		- iommu operation functions
- * iu_pend_head		- pending iotlb list
- * iu_inv_queue		- invalidation queue state
- * iu_intr_remap_tbl	- interrupt remapping table state
- * iu_pghdl_hash	- hash of pages allocated for IOMMU internal work.
- */
-typedef struct intel_iommu_state {
-	list_node_t		node;
-	drhd_info_t		*iu_drhd;
-	ddi_acc_handle_t	iu_reg_handle;
-	kmutex_t		iu_reg_lock;
-	caddr_t			iu_reg_address;
-	uint64_t		iu_capability;
-	uint64_t		iu_excapability;
-	paddr_t			iu_root_entry_paddr;
-	kmutex_t		iu_root_context_lock;
-	int			iu_gaw;
-	int			iu_agaw;
-	int			iu_level;
-	uint32_t		iu_global_cmd_reg;
-	int			iu_max_domain;
-	iommu_rscs_t		iu_domain_id_hdl;
-	uchar_t			iu_enabled;
-	boolean_t		iu_coherency;
-	kstat_t			*iu_kstat;
-	iommu_stat_t		iu_statistics;
-	struct dmar_ops		*iu_dmar_ops;
-	iotlb_pend_head_t	iu_pend_head;
-	struct inv_queue_state	*iu_inv_queue;
-	struct intr_remap_tbl_state	*iu_intr_remap_tbl;
-	struct iommu_pghdl	*iu_pghdl_hash[IOMMU_PGHDL_HASH_SIZE];
-} intel_iommu_state_t;
-
-/*
- * struct dvma_cache_node
- *   dvma cache node
- */
-typedef struct dvma_cache_node {
-	list_node_t		node;
-
-	/* parameters */
-	size_t			dcn_align;
-	uint64_t		dcn_dvma;
-} dvma_cache_node_t;
-
-/*
- * struct dvma_cache_head
- *   dvma cache head
- */
-typedef struct dvma_cache_head {
-	/* the list of the free dvma */
-	kmutex_t	dch_free_lock;
-	list_t		dch_free_list;
-	uint_t		dch_free_count;
-
-	/* the cache for the node memory */
-	kmutex_t	dch_mem_lock;
-	list_t		dch_mem_list;
-} dvma_cache_head_t;
-
-#define	DVMA_CACHE_HEAD_CNT	64
-
-/*
- * struct dmar_domain_state
- *   This structure describes the state information
- *   of an iommu domain. It is created and initialized
- *   when a driver calls ddi_dma_bind_handle(), and is
- *   used in each of the iommu translation functions
- *
- * dm_domain_id		- the domain id
- * dm_iommu		- iommu pointer this domain belongs to
- * dm_dvma_map		- dvma map
- * dm_dvma_cache	- dvma cache lists
- * dm_page_table_paddr	- page table address for this domain
- * dm_pgtable_lock	- lock to protect changes to page table.
- * dm_pt_tree		- the kernel maintained page tables
- * dm_identity		- whether this domain is identity mapped
- */
-typedef struct dmar_domain_state {
-	list_node_t		node;
-	uint_t			dm_domain_id;
-	intel_iommu_state_t	*dm_iommu;
-	vmem_t			*dm_dvma_map;
-	dvma_cache_head_t	dm_dvma_cache[DVMA_CACHE_HEAD_CNT];
-	paddr_t			dm_page_table_paddr;
-	kmutex_t		dm_pgtable_lock;
-	struct iovpte		dm_pt_tree;
-	boolean_t		dm_identity;
-} dmar_domain_state_t;
-
-/*
- * struct pci_dev_info
- *   pci device info structure
- */
-typedef struct pci_dev_info {
-	list_node_t	node;
-	int		pdi_seg;
-	int		pdi_bus;
-	int		pdi_devfn;
-	dev_info_t	*pdi_dip;
-} pci_dev_info_t;
-
-#define		IOMMU_PPB_NONE		0
-#define		IOMMU_PPB_PCIE_PCIE	1
-#define		IOMMU_PPB_PCIE_PCI	2
-#define		IOMMU_PPB_PCI_PCI	3
-
-#define		MAX_COOKIE_CACHE_SIZE	20
-/*
- * struct iommu_dvma_cookie
- *   this cookie records the dvma allocated for
- *   an individual device
- */
-typedef struct iommu_dvma_cookie {
-	uint64_t	dc_addr;
-	uint64_t	dc_size;
-	struct dmar_domain_state	*dc_domain;
-	size_t		dc_align;
-	struct iommu_dvma_cookie	*dc_next;
-} iommu_dvma_cookie_t;
-
-/*
- * struct dvma_cookie_head
- *   the cookie cache head
- */
-typedef struct dvma_cookie_head {
-	kmutex_t		dch_lock;
-	iommu_dvma_cookie_t	*dch_next;
-	uint_t			dch_count;
-} dvma_cookie_head_t;
-
-/* physically contiguous pages for invalidation queue */
-typedef struct inv_queue_mem {
-	kmutex_t		lock;
-	ddi_dma_handle_t	dma_hdl;
-	ddi_acc_handle_t	acc_hdl;
-	caddr_t			vaddr;
-	paddr_t			paddr;
-	uint_t			size;
-	uint16_t		head;
-	uint16_t		tail;
-} inv_queue_mem_t;
-
-/*
- * invalidation queue state
- *   This structure describes the state information of the
- *   invalidation queue table and related status memeory for
- *   invalidation wait descriptor
- *
- * iq_table		- invalidation queue table
- * iq_sync		- sync status memory for invalidation wait descriptor
- * iotlb_pend_node	- pending tlb node
- */
-typedef struct inv_queue_state {
-	inv_queue_mem_t		iq_table;
-	inv_queue_mem_t		iq_sync;
-	iotlb_pend_node_t	**iotlb_pend_node;
-} inv_queue_state_t;
-
-/* invalidation queue entry structure */
-typedef struct inv_dsc {
-	uint64_t	lo;
-	uint64_t	hi;
-} inv_dsc_t;
-
-/* helper macro for making queue invalidation descriptor */
-#define	INV_DSC_TYPE(dsc)	((dsc)->lo & 0xF)
-#define	CC_INV_DSC_HIGH		(0)
-#define	CC_INV_DSC_LOW(fm, sid, did, g)	(((uint64_t)(fm) << 48) | \
-	((uint64_t)(sid) << 32) | \
-	((uint64_t)(did) << 16) | \
-	((uint64_t)(g) << 4) | \
-	1)
-
-#define	IOTLB_INV_DSC_HIGH(addr, ih, am) (((uint64_t)(addr)) | \
-	((uint64_t)(ih) << 6) |	\
-	((uint64_t)(am)))
-
-#define	IOTLB_INV_DSC_LOW(did, dr, dw, g) (((uint64_t)(did) << 16) | \
-	((uint64_t)(dr) << 7) | \
-	((uint64_t)(dw) << 6) | \
-	((uint64_t)(g) << 4) | \
-	2)
-
-#define	DEV_IOTLB_INV_DSC_HIGH(addr, s) (((uint64_t)(addr)) | (s))
-
-#define	DEV_IOTLB_INV_DSC_LOW(sid, max_invs_pd) ( \
-	((uint64_t)(sid) << 32) | \
-	((uint64_t)(max_invs_pd) << 16) | \
-	3)
-
-#define	IEC_INV_DSC_HIGH (0)
-#define	IEC_INV_DSC_LOW(iidx, im, g) (((uint64_t)(iidx) << 32) | \
-	((uint64_t)(im) << 27) | \
-	((uint64_t)(g) << 4) | \
-	4)
-
-#define	INV_WAIT_DSC_HIGH(saddr) ((uint64_t)(saddr))
-
-#define	INV_WAIT_DSC_LOW(sdata, fn, sw, iflag) (((uint64_t)(sdata) << 32) | \
-	((uint64_t)(fn) << 6) | \
-	((uint64_t)(sw) << 5) | \
-	((uint64_t)(iflag) << 4) | \
-	5)
-
-/* save source id and iommu structure for ioapic */
-typedef struct ioapic_iommu_info {
-	uint16_t		sid;
-	intel_iommu_state_t	*iommu;
-} ioapic_iommu_info_t;
-
-typedef struct intr_remap_private {
-	intel_iommu_state_t	*ir_iommu;
-	uint16_t		ir_iidx;
-	uint32_t		ir_sid_svt_sq;
-} intr_remap_private_t;
-
-#define	INTRR_PRIVATE(airq) ((intr_remap_private_t *)airq->airq_intrr_private)
-#define	AIRQ_PRIVATE(airq) (airq->airq_intrr_private)
-
-/* interrupt remapping table state info */
-typedef struct intr_remap_tbl_state {
-	kmutex_t		lock;
-	ddi_dma_handle_t	dma_hdl;
-	ddi_acc_handle_t	acc_hdl;
-	caddr_t			vaddr;
-	paddr_t			paddr;
-	uint_t			size;
-	bitset_t		map;
-	uint_t			free;
-} intr_remap_tbl_state_t;
-
-/* interrupt remapping table entry */
-typedef struct intr_rte {
-	uint64_t	lo;
-	uint64_t	hi;
-} intr_rte_t;
-
-#define	IRTE_HIGH(sid_svt_sq) (sid_svt_sq)
-#define	IRTE_LOW(dst, vector, dlm, tm, rh, dm, fpd, p)	\
-	    (((uint64_t)(dst) << 32) |  \
-	    ((uint64_t)(vector) << 16) | \
-	    ((uint64_t)(dlm) << 5) | \
-	    ((uint64_t)(tm) << 4) | \
-	    ((uint64_t)(rh) << 3) | \
-	    ((uint64_t)(dm) << 2) | \
-	    ((uint64_t)(fpd) << 1) | \
-	    (p))
-
-typedef enum {
-	SVT_NO_VERIFY = 0, 	/* no verification */
-	SVT_ALL_VERIFY,		/* using sid and sq to verify */
-	SVT_BUS_VERIFY,		/* verify #startbus and #endbus */
-	SVT_RSVD
-} intrr_svt_t;
-
-typedef enum {
-	SQ_VERIFY_ALL = 0,	/* verify all 16 bits */
-	SQ_VERIFY_IGR_1,	/* ignore bit 3 */
-	SQ_VERIFY_IGR_2,	/* ignore bit 2-3 */
-	SQ_VERIFY_IGR_3		/* ignore bit 1-3 */
-} intrr_sq_t;
-
-/*
- * struct vmem_walk_arg
- *   the arg of the vmem walker
- */
-typedef struct vmem_walk_arg {
-	rmrr_info_t		*vwa_rmrr;
-	dmar_domain_state_t	*vwa_domain;
-	dev_info_t		*vwa_dip;
-} vmem_walk_arg_t;
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif	/* _SYS_INTEL_IOMMU_H */
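
The header removed above encodes the VT-d capability and extended-capability registers purely as bit-field extraction macros (IOMMU_CAP_*, IOMMU_ECAP_*). A stand-alone sketch of decoding a raw 64-bit capability value with a few of those macros follows; the sample value is made up, and the outer parentheses on IOMMU_CAP_ND() are added here for safety (the removed header left the trailing "- 1" outside the parentheses).

#include <stdio.h>
#include <stdint.h>

/* Field extractors copied from the removed header (ND re-parenthesized). */
#define	IOMMU_CAP_MGAW(x)	(((((uint64_t)(x)) >> 16) & 0x3f) + 1)
#define	IOMMU_CAP_SAGAW(x)	(((x) >> 8) & 0x1f)
#define	IOMMU_CAP_GET_RWBF(x)	(((x) >> 4) & 1)
#define	IOMMU_CAP_GET_CM(x)	(((x) >> 7) & 1)
#define	IOMMU_CAP_ND(x)		((1 << (((x) & 0x7) * 2 + 4)) - 1)

int
main(void)
{
	uint64_t cap = 0x00c9008020660262ULL;	/* made-up sample value */

	printf("MGAW  = %llu bits\n", (unsigned long long)IOMMU_CAP_MGAW(cap));
	printf("SAGAW = 0x%llx\n", (unsigned long long)IOMMU_CAP_SAGAW(cap));
	printf("RWBF  = %llu\n", (unsigned long long)IOMMU_CAP_GET_RWBF(cap));
	printf("CM    = %llu\n", (unsigned long long)IOMMU_CAP_GET_CM(cap));
	printf("max domain id = %d\n", (int)IOMMU_CAP_ND(cap));
	return (0);
}
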
--- a/usr/src/uts/i86pc/sys/iommu_rscs.h	Sat Jan 30 15:04:39 2010 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,81 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-
-#ifndef	_SYS_IOMMU_H
-#define	_SYS_IOMMU_H
-
-/*
- * XXX
- */
-
-#include <sys/types.h>
-#include <sys/conf.h>
-#include <sys/modctl.h>
-#include <sys/sunddi.h>
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-
-/*
- * iommu_page_alloc()
- *   allocate a 4K page and map it into KVA
- * iommu_page_free()
- *   unmap and free page from iommu_page_alloc()
- * iommu_page_map()
- *   map page into kva
- * iommu_page_unmap()
- *   unmap page out of kva
- */
-
-typedef struct iommu_pghdl {
-	ddi_dma_handle_t dma_hdl;
-	ddi_acc_handle_t mem_hdl;
-	paddr_t paddr;
-	caddr_t vaddr;
-	struct iommu_pghdl *prev;
-	struct iommu_pghdl *next;
-} iommu_pghdl_t;
-
-struct intel_iommu_state;
-
-iommu_pghdl_t *iommu_page_alloc(struct intel_iommu_state *iommu, int kmflag);
-void iommu_page_free(struct intel_iommu_state *iommu, paddr_t paddr);
-caddr_t iommu_get_vaddr(struct intel_iommu_state *iommu, paddr_t paddr);
-
-typedef struct iommu_rscs_s *iommu_rscs_t;
-
-void iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle);
-void iommu_rscs_fini(iommu_rscs_t *handle);
-int iommu_rscs_alloc(iommu_rscs_t handle, uint_t *rs);
-void iommu_rscs_free(iommu_rscs_t handle, uint_t rs);
-
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif	/* _SYS_IOMMU_H */
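
The header removed above declared a small numeric resource allocator (iommu_rscs_init(), iommu_rscs_alloc(), iommu_rscs_free(), iommu_rscs_fini()) plus page-allocation helpers; in the new code the domain-id role it served is taken over by a vmem arena (immu_did_arena in immu_t). The toy allocator below only illustrates the shape of that interface; the bitmap backing and every name in it are invented for the sketch and are not the Solaris implementation.

#include <stdio.h>
#include <stdlib.h>

/* Toy id allocator handing out values in [min, max]. */
typedef struct rscs {
	unsigned int	min;
	unsigned int	max;
	unsigned char	*used;		/* one byte per id; 0 means free */
} rscs_t;

static void
rscs_init(unsigned int min_val, unsigned int max_val, rscs_t **handle)
{
	rscs_t *rp = malloc(sizeof (rscs_t));

	rp->min = min_val;
	rp->max = max_val;
	rp->used = calloc(max_val - min_val + 1, 1);
	*handle = rp;
}

static int
rscs_alloc(rscs_t *rp, unsigned int *rs)
{
	for (unsigned int i = 0; i <= rp->max - rp->min; i++) {
		if (rp->used[i] == 0) {
			rp->used[i] = 1;
			*rs = rp->min + i;
			return (0);
		}
	}
	return (-1);			/* no free ids left */
}

static void
rscs_free(rscs_t *rp, unsigned int rs)
{
	rp->used[rs - rp->min] = 0;
}

int
main(void)
{
	rscs_t *h;
	unsigned int did;

	rscs_init(1, 15, &h);		/* e.g. domain ids 1..15 */
	if (rscs_alloc(h, &did) == 0)
		printf("allocated id %u\n", did);
	rscs_free(h, did);
	free(h->used);
	free(h);
	return (0);
}
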
--- a/usr/src/uts/i86pc/sys/rootnex.h	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/i86pc/sys/rootnex.h	Sat Jan 30 18:23:16 2010 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -205,6 +205,18 @@
 #endif
 } rootnex_window_t;
 
+typedef struct dvcookie {
+	uint64_t dvck_dvma;
+	uint64_t dvck_npages;
+	uint64_t dvck_sidx;
+	uint64_t dvck_eidx;
+} dvcookie_t;
+
+typedef struct dcookie {
+	paddr_t dck_paddr;
+	uint64_t dck_npages;
+} dcookie_t;
+
 /* per dma handle private state */
 typedef struct rootnex_dma_s {
 	/*
@@ -310,11 +322,16 @@
 	uchar_t			*dp_prealloc_buffer;
 
 	/*
-	 * intel iommu related state
-	 * dvma_cookies saves the dvma allocated for this handler, it has the
-	 * size of si_max_pages, set when bind handler and freed when unbind
+	 * Intel IOMMU (immu) related state
+	 * dp_dvcookies saves the DVMA ranges allocated for this handle;
+	 * dp_dvmax is the highest index used in dp_dvcookies
 	 */
-	void			*dp_dvma_cookies;
+	dvcookie_t		*dp_dvcookies;
+	uint64_t		dp_dvmax;
+	dcookie_t		*dp_dcookies;
+	uint64_t		dp_dmax;
+	uint64_t		dp_max_cookies;
+	uint64_t		dp_max_dcookies;
 
 	/*
 	 * sleep flags set on bind and unset on unbind
@@ -349,7 +366,6 @@
  *   r_dip - rootnex dip
  *   r_reserved_msg_printed - ctlops reserve message threshold
  *   r_counters - profile/performance counters
- *   r_intel_iommu_enabled - intel iommu enabled
  */
 typedef struct rootnex_state_s {
 	uint_t			r_prealloc_cookies;
@@ -361,11 +377,9 @@
 	ddi_iblock_cookie_t	r_err_ibc;
 	boolean_t		r_reserved_msg_printed;
 	uint64_t		r_counters[ROOTNEX_CNT_LAST];
-	boolean_t		r_intel_iommu_enabled;
 	iommulib_nexhandle_t    r_iommulib_handle;
 } rootnex_state_t;
 
-
 #ifdef	__cplusplus
 }
 #endif
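
The rootnex changes above add two cookie flavours to the per-handle DMA state: dvcookie_t describes a DVMA range and dcookie_t a run of physical pages, kept in the parallel dp_dvcookies/dp_dcookies arrays with dp_dvmax/dp_dmax as high-water indices. The sketch below shows one plausible reading of how they pair up, with dvck_sidx/dvck_eidx taken as the slice of the dcookie array backing each DVMA range; the header itself does not document those fields, so that interpretation, the paddr_t definition and the sample data are all assumptions.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t paddr_t;	/* assumed width for this sketch */

typedef struct dvcookie {
	uint64_t dvck_dvma;	/* DVMA start address */
	uint64_t dvck_npages;	/* pages covered by this DVMA range */
	uint64_t dvck_sidx;	/* assumed: first backing dcookie index */
	uint64_t dvck_eidx;	/* assumed: last backing dcookie index */
} dvcookie_t;

typedef struct dcookie {
	paddr_t dck_paddr;	/* physical start address */
	uint64_t dck_npages;	/* pages in this physical run */
} dcookie_t;

static void
dump_binding(const dvcookie_t *dv, uint64_t dvmax, const dcookie_t *dc)
{
	for (uint64_t i = 0; i < dvmax; i++) {
		printf("DVMA 0x%llx, %llu pages:\n",
		    (unsigned long long)dv[i].dvck_dvma,
		    (unsigned long long)dv[i].dvck_npages);
		for (uint64_t j = dv[i].dvck_sidx; j <= dv[i].dvck_eidx; j++)
			printf("  paddr 0x%llx, %llu pages\n",
			    (unsigned long long)dc[j].dck_paddr,
			    (unsigned long long)dc[j].dck_npages);
	}
}

int
main(void)
{
	dcookie_t dc[] = {
		{ 0x10000000ULL, 2 },
		{ 0x20000000ULL, 1 },
	};
	dvcookie_t dv[] = {
		{ 0xfe000000ULL, 3, 0, 1 },	/* one range, two phys runs */
	};

	dump_binding(dv, 1, dc);
	return (0);
}
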
--- a/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/intel/ia32/ml/modstubs.s	Sat Jan 30 18:23:16 2010 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1376,6 +1376,17 @@
 #endif
 
 /*
+ * Stubs for rootnex nexus driver.
+ */
+#ifndef ROOTNEX_MODULE
+	MODULE(rootnex,drv);
+	STUB(rootnex, immu_init, 0);
+	STUB(rootnex, immu_startup, 0);
+	STUB(rootnex, immu_physmem_update, 0);
+	END_MODULE(rootnex);
+#endif
+
+/*
  * Stubs for kernel socket, for iscsi
  */
 #ifndef KSOCKET_MODULE
--- a/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/intel/io/hotplug/pcicfg/pcicfg.c	Sat Jan 30 18:23:16 2010 -0800
@@ -3746,6 +3746,8 @@
 	 */
 	(void) i_ndi_config_node(new_child, DS_LINKED, 0);
 
+	DEVI_SET_PCI(new_child);
+
 	if ((header_type & PCI_HEADER_TYPE_M) == PCI_HEADER_PPB) {
 
 		DEBUG3("--Bridge found bus [0x%x] device[0x%x] func [0x%x]\n",
--- a/usr/src/uts/intel/io/pci/pci_boot.c	Sat Jan 30 15:04:39 2010 -0800
+++ b/usr/src/uts/intel/io/pci/pci_boot.c	Sat Jan 30 18:23:16 2010 -0800
@@ -43,7 +43,6 @@
 #include <sys/hotplug/pci/pciehpc_acpi.h>
 #include <sys/acpi/acpi.h>
 #include <sys/acpica.h>
-#include <sys/intel_iommu.h>
 #include <sys/iommulib.h>
 #include <sys/devcache.h>
 #include <sys/pci_cfgacc_x86.h>
@@ -2100,6 +2099,7 @@
 		}
 	}
 
+	DEVI_SET_PCI(dip);
 	reprogram = add_reg_props(dip, bus, dev, func, config_op, pciide);
 	(void) ndi_devi_bind_driver(dip, 0);