changeset 7589:7de800909a06
PSARC 2008/560 Intel IOMMU
6714111 Solaris needs to support the Intel IOMMU
author   | Vikram Hegde <Vikram.Hegde@Sun.COM>
date     | Sun, 14 Sep 2008 19:52:20 -0700
parents  | fc605a2defdc
children | c9805cafd4a9
files    | usr/src/uts/common/os/main.c usr/src/uts/common/sys/ddi_impldefs.h usr/src/uts/i86pc/Makefile.files usr/src/uts/i86pc/io/dmar_acpi.c usr/src/uts/i86pc/io/intel_iommu.c usr/src/uts/i86pc/io/iommu_rscs.c usr/src/uts/i86pc/io/rootnex.c usr/src/uts/i86pc/os/acpi_fw.h usr/src/uts/i86pc/os/cpuid.c usr/src/uts/i86pc/os/fakebop.c usr/src/uts/i86pc/sys/dmar_acpi.h usr/src/uts/i86pc/sys/intel_iommu.h usr/src/uts/i86pc/sys/iommu_rscs.h usr/src/uts/i86pc/sys/machsystm.h usr/src/uts/i86pc/sys/rootnex.h usr/src/uts/i86pc/vm/htable.c usr/src/uts/i86pc/vm/vm_machdep.c usr/src/uts/intel/ia32/ml/i86_subr.s usr/src/uts/intel/io/pci/pci_boot.c usr/src/uts/intel/sys/archsystm.h usr/src/uts/intel/sys/x86_archext.h
diffstat | 21 files changed, 5053 insertions(+), 69 deletions(-)
--- a/usr/src/uts/common/os/main.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/common/os/main.c Sun Sep 14 19:52:20 2008 -0700 @@ -27,8 +27,6 @@ /* All Rights Reserved */ -#pragma ident "%Z%%M% %I% %E% SMI" /* from SVr4.0 1.31 */ - #include <sys/types.h> #include <sys/param.h> #include <sys/sysmacros.h> @@ -346,6 +344,9 @@ lwp_rtt(); } +extern void return_instr(void); +void (*rootnex_iommu_add_intr)(void) = (void (*)(void))return_instr; + void main(void) { @@ -446,6 +447,11 @@ (void) spl0(); interrupts_unleashed = 1; + /* + * add intel iommu fault event handler + */ + rootnex_iommu_add_intr(); + vfs_mountroot(); /* Mount the root file system */ errorq_init(); /* after vfs_mountroot() so DDI root is ready */ cpu_kstat_init(CPU); /* after vfs_mountroot() so TOD is valid */
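The main.c change above installs a patchable hook: rootnex_iommu_add_intr starts out pointing at return_instr (a no-op), and the rootnex driver repoints it to the real fault-event setup once the IOMMU exists, so main() can call it unconditionally after interrupts are unleashed. A minimal user-space sketch of that pattern, using hypothetical names (noop, real_add_intr, iommu_add_intr_hook) rather than the kernel symbols:

#include <stdio.h>

/* stands in for the kernel's return_instr no-op */
static void
noop(void)
{
}

static void
real_add_intr(void)
{
	(void) printf("iommu fault interrupt wired up\n");
}

/* defaults to the no-op; the driver repoints it once it attaches */
static void (*iommu_add_intr_hook)(void) = noop;

int
main(void)
{
	iommu_add_intr_hook();			/* safe before the driver runs */
	iommu_add_intr_hook = real_add_intr;	/* driver attach happens */
	iommu_add_intr_hook();			/* now reaches the real code */
	return (0);
}

The callers never test for NULL or for IOMMU presence; the default no-op makes the call site unconditional, which is why main() above can invoke the hook on machines with no DMAR hardware at all.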
--- a/usr/src/uts/common/sys/ddi_impldefs.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/common/sys/ddi_impldefs.h Sun Sep 14 19:52:20 2008 -0700 @@ -26,8 +26,6 @@ #ifndef _SYS_DDI_IMPLDEFS_H #define _SYS_DDI_IMPLDEFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/param.h> #include <sys/t_lock.h> @@ -223,6 +221,9 @@ /* Declarations of the pure dynamic properties to snapshot */ struct i_ddi_prop_dyn *devi_prop_dyn_driver; /* prop_op */ struct i_ddi_prop_dyn *devi_prop_dyn_parent; /* bus_prop_op */ + + /* For intel iommu support */ + void *devi_iommu_private; }; #define DEVI(dev_info_type) ((struct dev_info *)(dev_info_type))
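devi_iommu_private extends dev_info with an opaque per-device pointer, the usual DDI idiom for subsystem-private state: the owner stores a pointer, and consumers cast it back (the later hunks do this through DEVI(dip)->devi_iommu_private). A compilable sketch of the idiom with trimmed, stand-in structures; the idp_* fields here are illustrative, not the real layout:

#include <stdio.h>
#include <stdlib.h>

struct iommu_private {		/* stand-in for the real private data */
	int idp_seg;
	int idp_bus;
	int idp_devfn;
};

struct dev_info {		/* heavily trimmed stand-in */
	void *devi_iommu_private;
};

int
main(void)
{
	struct dev_info di;
	struct iommu_private *priv;

	priv = calloc(1, sizeof (*priv));
	priv->idp_bus = 3;
	di.devi_iommu_private = priv;	/* owner stores opaque state */

	/* a consumer casts it back, as the diff does via DEVI(dip) */
	priv = (struct iommu_private *)di.devi_iommu_private;
	(void) printf("bus %d\n", priv->idp_bus);
	free(priv);
	return (0);
}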
--- a/usr/src/uts/i86pc/Makefile.files Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/Makefile.files Sun Sep 14 19:52:20 2008 -0700 @@ -184,7 +184,7 @@ ACPIPPM_OBJS += acpippm.o acpisleep.o -ROOTNEX_OBJS += rootnex.o +ROOTNEX_OBJS += rootnex.o iommu_rscs.o dmar_acpi.o intel_iommu.o TZMON_OBJS += tzmon.o UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/io/dmar_acpi.c Sun Sep 14 19:52:20 2008 -0700 @@ -0,0 +1,662 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Portions Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2008, Intel Corporation. + * All rights reserved. + */ + + +#include <sys/debug.h> +#include <sys/sysmacros.h> +#include <sys/types.h> +#include <sys/kmem.h> +#include <sys/sunddi.h> +#include <sys/list.h> +#include <sys/pci.h> +#include <sys/pci_cfgspace.h> +#include <sys/pci_impl.h> +#include <sys/sunndi.h> +#include <sys/ksynch.h> +#include <sys/cmn_err.h> +#include <sys/bootconf.h> +#include <sys/int_fmtio.h> +#include <sys/dmar_acpi.h> + +/* + * the following pci manipulate function pinter + * are defined in pci_cfgspace.h + */ +#define pci_getb (*pci_getb_func) + +/* + * define for debug + */ +int intel_dmar_acpi_debug = 0; +#define dcmn_err if (intel_dmar_acpi_debug) cmn_err + +/* + * global varables + */ +boolean_t intel_iommu_support; +intel_dmar_info_t *dmar_info; + +/* + * internal varables + */ +static void *dmart; + +/* + * helper functions to release the allocated resources + * when failed + */ +static void +release_dev_scope(list_t *lp) +{ + pci_dev_scope_t *devs; + + if (list_is_empty(lp)) + return; + + while ((devs = list_head(lp)) != NULL) { + list_remove(lp, devs); + kmem_free(devs, sizeof (pci_dev_scope_t)); + } +} + +static void +release_drhd_info(void) +{ + drhd_info_t *drhd; + list_t *lp; + int i; + + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + lp = &dmar_info->dmari_drhd[i]; + if (list_is_empty(lp)) + break; + + while ((drhd = list_head(lp)) != NULL) { + list_remove(lp, drhd); + + /* + * release the device scope + */ + release_dev_scope(&drhd->di_dev_list); + list_destroy(&drhd->di_dev_list); + kmem_free(drhd, sizeof (drhd_info_t)); + } + } +} + +static void +release_rmrr_info(void) +{ + rmrr_info_t *rmrr; + list_t *lp; + int i; + + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + lp = &dmar_info->dmari_rmrr[i]; + if (list_is_empty(lp)) + break; + + while ((rmrr = list_head(lp)) != NULL) { + list_remove(lp, rmrr); + release_dev_scope(&rmrr->ri_dev_list); + list_destroy(&rmrr->ri_dev_list); + kmem_free(rmrr, sizeof (rmrr_info_t)); + } + } +} + +/* + * intel_iommu_release_dmar_info() + * global function, which is called to release dmar_info + * when the dmar_intel_iommu_supportinfo is not + * needed any more. 
+ */ +void +intel_iommu_release_dmar_info(void) +{ + int i; + + intel_iommu_support = B_FALSE; + release_drhd_info(); + release_rmrr_info(); + + /* + * destroy the drhd and rmrr list + */ + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + list_destroy(&dmar_info->dmari_drhd[i]); + list_destroy(&dmar_info->dmari_rmrr[i]); + } + + kmem_free(dmar_info, sizeof (intel_dmar_info_t)); +} + +/* + * create_dmar_devi() + * + * create the dev_info node in the device tree, + * the info node is a nuxus child of the root + * nexus + */ +static void +create_dmar_devi(void) +{ + dev_info_t *dip; + drhd_info_t *drhd; + struct regspec reg; + struct ddi_parent_private_data *pdptr; + char nodename[64]; + int i, j; + + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + + /* + * ignore the empty list + */ + if (list_is_empty(&dmar_info->dmari_drhd[i])) + break; + + /* + * alloc dev_info per drhd unit + */ + j = 0; + for_each_in_list(&dmar_info->dmari_drhd[i], drhd) { + (void) snprintf(nodename, sizeof (nodename), + "dmar%d,%d", drhd->di_segment, j++); + ndi_devi_alloc_sleep(ddi_root_node(), nodename, + DEVI_SID_NODEID, &dip); + drhd->di_dip = dip; + reg.regspec_bustype = 0; + reg.regspec_addr = drhd->di_reg_base; + reg.regspec_size = IOMMU_REG_SIZE; + + /* + * update the reg properties + * + * reg property will be used for register + * set access + * + * refer to the bus_map of root nexus driver + * I/O or memory mapping: + * + * <bustype=0, addr=x, len=x>: memory + * <bustype=1, addr=x, len=x>: i/o + * <bustype>1, addr=0, len=x>: x86-compatibility i/o + */ + (void) ndi_prop_update_int_array(DDI_DEV_T_NONE, + dip, "reg", (int *)®, + sizeof (struct regspec) / sizeof (int)); + + pdptr = (struct ddi_parent_private_data *) + kmem_zalloc(sizeof (struct ddi_parent_private_data) + + sizeof (struct regspec), KM_SLEEP); + pdptr->par_nreg = 1; + pdptr->par_reg = (struct regspec *)(pdptr + 1); + pdptr->par_reg->regspec_bustype = 0; + pdptr->par_reg->regspec_addr = drhd->di_reg_base; + pdptr->par_reg->regspec_size = IOMMU_REG_SIZE; + ddi_set_parent_data(dip, pdptr); + } + } +} + +/* + * parse_dmar_dev_scope() + * parse the device scope attached to drhd or rmrr + */ +static int +parse_dmar_dev_scope(dmar_acpi_dev_scope_t *scope, pci_dev_scope_t **devs) +{ + int depth; + int bus, dev, func; + pci_dev_scope_t *entry; + + struct path_to_dev { + uint8_t device; + uint8_t function; + } *path; + + path = (struct path_to_dev *)(scope + 1); + depth = (scope->ds_length - 6)/2; + bus = scope->ds_sbusnum; + dev = path->device; + func = path->function; + + while (--depth) { + path++; + bus = pci_getb(bus, dev, func, PCI_BCNF_SECBUS); + dev = path->device; + func = path->function; + } + + entry = (pci_dev_scope_t *)kmem_zalloc( + sizeof (pci_dev_scope_t), KM_SLEEP); + entry->pds_bus = bus; + entry->pds_dev = dev; + entry->pds_func = func; + entry->pds_type = scope->ds_type; + + *devs = entry; + return (PARSE_DMAR_SUCCESS); +} + +/* + * parse_dmar_rmrr() + * parse the rmrr units in dmar table + */ +static int +parse_dmar_rmrr(dmar_acpi_unit_head_t *head) +{ + dmar_acpi_rmrr_t *rmrr; + rmrr_info_t *rinfo; + dmar_acpi_dev_scope_t *scope; + pci_dev_scope_t *devs; + + rmrr = (dmar_acpi_rmrr_t *)head; + ASSERT(head->uh_type == DMAR_UNIT_TYPE_RMRR); + ASSERT(rmrr->rm_segment <= DMAR_MAX_SEGMENT); + + /* + * for each rmrr, limiaddr must > baseaddr + */ + if (rmrr->rm_baseaddr >= rmrr->rm_limiaddr) { + cmn_err(CE_WARN, "parse_dmar_rmrr: buggy rmrr," + " baseaddr = 0x%" PRIx64 + ", limiaddr = 0x%" PRIx64 "", + rmrr->rm_baseaddr, rmrr->rm_limiaddr); + return 
(PARSE_DMAR_FAIL); + } + + /* + * allocate and setup the device info structure + */ + rinfo = (rmrr_info_t *)kmem_zalloc(sizeof (rmrr_info_t), + KM_SLEEP); + rinfo->ri_segment = rmrr->rm_segment; + rinfo->ri_baseaddr = rmrr->rm_baseaddr; + rinfo->ri_limiaddr = rmrr->rm_limiaddr; + list_create(&rinfo->ri_dev_list, sizeof (pci_dev_scope_t), + offsetof(pci_dev_scope_t, node)); + + /* + * parse the device scope + */ + scope = (dmar_acpi_dev_scope_t *)(rmrr + 1); + while ((unsigned long)scope < ((unsigned long)rmrr + head->uh_length)) { + if (parse_dmar_dev_scope(scope, &devs) + != PARSE_DMAR_SUCCESS) { + return (PARSE_DMAR_FAIL); + } + + list_insert_tail(&rinfo->ri_dev_list, devs); + scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope + + scope->ds_length); + } + + /* + * save this info structure + */ + list_insert_tail(&dmar_info->dmari_rmrr[rinfo->ri_segment], rinfo); + return (PARSE_DMAR_SUCCESS); +} + +/* + * parse_dmar_drhd() + * parse the drhd uints in dmar table + */ +static int +parse_dmar_drhd(dmar_acpi_unit_head_t *head) +{ + dmar_acpi_drhd_t *drhd; + drhd_info_t *dinfo; + dmar_acpi_dev_scope_t *scope; + list_t *lp; + pci_dev_scope_t *devs; + + drhd = (dmar_acpi_drhd_t *)head; + ASSERT(head->uh_type == DMAR_UNIT_TYPE_DRHD); + + /* + * assert the segment boundary + */ + ASSERT(drhd->dr_segment <= DMAR_MAX_SEGMENT); + + /* + * allocate and setup the info structure + */ + dinfo = (drhd_info_t *)kmem_zalloc(sizeof (drhd_info_t), KM_SLEEP); + dinfo->di_segment = drhd->dr_segment; + dinfo->di_reg_base = drhd->dr_baseaddr; + dinfo->di_include_all = (drhd->dr_flags & INCLUDE_PCI_ALL) ? + B_TRUE : B_FALSE; + list_create(&dinfo->di_dev_list, sizeof (pci_dev_scope_t), + offsetof(pci_dev_scope_t, node)); + + /* + * parse the device scope + */ + scope = (dmar_acpi_dev_scope_t *)(drhd + 1); + while ((unsigned long)scope < ((unsigned long)drhd + + head->uh_length)) { + + if (parse_dmar_dev_scope(scope, &devs) + != PARSE_DMAR_SUCCESS) { + return (PARSE_DMAR_FAIL); + } + + list_insert_tail(&dinfo->di_dev_list, devs); + scope = (dmar_acpi_dev_scope_t *)((unsigned long)scope + + scope->ds_length); + } + + lp = &dmar_info->dmari_drhd[dinfo->di_segment]; + list_insert_tail(lp, dinfo); + return (PARSE_DMAR_SUCCESS); +} + +/* + * parse_dmar() + * parse the dmar table + */ +static int +parse_dmar(void) +{ + dmar_acpi_head_t *dmar_head; + dmar_acpi_unit_head_t *unit_head; + drhd_info_t *drhd; + int i; + + dmar_head = (dmar_acpi_head_t *)dmart; + + /* + * do a sanity check + */ + if (!dmar_head || strncmp(dmar_head->dh_sig, "DMAR", 4)) { + dcmn_err(CE_CONT, "wrong DMAR signature: %c%c%c%c", + dmar_head->dh_sig[0], dmar_head->dh_sig[1], + dmar_head->dh_sig[2], dmar_head->dh_sig[3]); + return (PARSE_DMAR_FAIL); + } + + dmar_info->dmari_haw = dmar_head->dh_haw + 1; + dmar_info->dmari_intr_remap = dmar_head->dh_flags & 0x1 ? 
+ B_TRUE : B_FALSE; + + /* + * parse each unit + * only DRHD and RMRR are parsed, others are ignored + */ + unit_head = (dmar_acpi_unit_head_t *)(dmar_head + 1); + while ((unsigned long)unit_head < (unsigned long)dmar_head + + dmar_head->dh_len) { + switch (unit_head->uh_type) { + case DMAR_UNIT_TYPE_DRHD: + if (parse_dmar_drhd(unit_head) != + PARSE_DMAR_SUCCESS) { + + /* + * iommu_detect_parse() will release + * all drhd info structure, just + * return false here + */ + return (PARSE_DMAR_FAIL); + } + break; + case DMAR_UNIT_TYPE_RMRR: + if (parse_dmar_rmrr(unit_head) != + PARSE_DMAR_SUCCESS) + return (PARSE_DMAR_FAIL); + break; + default: + cmn_err(CE_WARN, + "unit type %d ignored\n", unit_head->uh_type); + } + unit_head = (dmar_acpi_unit_head_t *) + ((unsigned long)unit_head + + unit_head->uh_length); + } + +#ifdef DEBUG + /* + * make sure the include_all drhd is the + * last drhd in the list, this is only for + * debug + */ + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + if (list_is_empty(&dmar_info->dmari_drhd[i])) + break; + + for_each_in_list(&dmar_info->dmari_drhd[i], drhd) { + if (drhd->di_include_all && + list_next(&dmar_info->dmari_drhd[i], drhd) + != NULL) { + list_remove(&dmar_info->dmari_drhd[i], drhd); + list_insert_tail(&dmar_info->dmari_drhd[i], + drhd); + dcmn_err(CE_CONT, + "include_all drhd is adjusted\n"); + } + } + } +#endif + + return (PARSE_DMAR_SUCCESS); +} + +/* + * detect_dmar() + * detect the dmar acpi table + */ +static boolean_t +detect_dmar(void) +{ + int len; + char *intel_iommu; + + /* + * if "intel-iommu = no" boot property is set, + * ignore intel iommu + */ + if ((len = do_bsys_getproplen(NULL, "intel-iommu")) > 0) { + intel_iommu = kmem_alloc(len, KM_SLEEP); + (void) do_bsys_getprop(NULL, "intel-iommu", intel_iommu); + if (strcmp(intel_iommu, "no") == 0) { + dcmn_err(CE_CONT, "\"intel-iommu=no\" was set\n"); + kmem_free(intel_iommu, len); + return (B_FALSE); + } + kmem_free(intel_iommu, len); + } + + /* + * get dmar-table from system properties + */ + if ((len = do_bsys_getproplen(NULL, DMAR_TABLE_PROPNAME)) <= 0) { + dcmn_err(CE_CONT, "dmar-table getprop failed\n"); + return (B_FALSE); + } + dcmn_err(CE_CONT, "dmar-table length = %d\n", len); + dmart = kmem_alloc(len, KM_SLEEP); + (void) do_bsys_getprop(NULL, DMAR_TABLE_PROPNAME, dmart); + + return (B_TRUE); +} + +/* + * print dmar_info for debug + */ +static void +print_dmar_info(void) +{ + drhd_info_t *drhd; + rmrr_info_t *rmrr; + pci_dev_scope_t *dev; + int i; + + /* print the title */ + cmn_err(CE_CONT, "dmar_info->:\n"); + cmn_err(CE_CONT, "\thaw = %d\n", dmar_info->dmari_haw); + cmn_err(CE_CONT, "\tintr_remap = %d\n", + dmar_info->dmari_intr_remap ? 1 : 0); + + /* print drhd info list */ + cmn_err(CE_CONT, "\ndrhd list:\n"); + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + if (list_is_empty(&dmar_info->dmari_drhd[i])) + break; + for (drhd = list_head(&dmar_info->dmari_drhd[i]); + drhd != NULL; drhd = list_next(&dmar_info->dmari_drhd[i], + drhd)) { + cmn_err(CE_CONT, "\n\tsegment = %d\n", + drhd->di_segment); + cmn_err(CE_CONT, "\treg_base = 0x%" PRIx64 "\n", + drhd->di_reg_base); + cmn_err(CE_CONT, "\tinclude_all = %s\n", + drhd->di_include_all ? 
"yes" : "no"); + cmn_err(CE_CONT, "\tdip = 0x%p\n", + (void *)drhd->di_dip); + cmn_err(CE_CONT, "\tdevice list:\n"); + for (dev = list_head(&drhd->di_dev_list); + dev != NULL; dev = list_next(&drhd->di_dev_list, + dev)) { + cmn_err(CE_CONT, "\n\t\tbus = %d\n", + dev->pds_bus); + cmn_err(CE_CONT, "\t\tdev = %d\n", + dev->pds_dev); + cmn_err(CE_CONT, "\t\tfunc = %d\n", + dev->pds_func); + cmn_err(CE_CONT, "\t\ttype = %d\n", + dev->pds_type); + } + } + } + + /* print rmrr info list */ + cmn_err(CE_CONT, "\nrmrr list:\n"); + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + if (list_is_empty(&dmar_info->dmari_rmrr[i])) + break; + for (rmrr = list_head(&dmar_info->dmari_rmrr[i]); + rmrr != NULL; rmrr = list_next(&dmar_info->dmari_rmrr[i], + rmrr)) { + cmn_err(CE_CONT, "\n\tsegment = %d\n", + rmrr->ri_segment); + cmn_err(CE_CONT, "\tbaseaddr = 0x%" PRIx64 "\n", + rmrr->ri_baseaddr); + cmn_err(CE_CONT, "\tlimiaddr = 0x%" PRIx64 "\n", + rmrr->ri_limiaddr); + cmn_err(CE_CONT, "\tdevice list:\n"); + for (dev = list_head(&rmrr->ri_dev_list); + dev != NULL; + dev = list_next(&rmrr->ri_dev_list, dev)) { + cmn_err(CE_CONT, "\n\t\tbus = %d\n", + dev->pds_bus); + cmn_err(CE_CONT, "\t\tdev = %d\n", + dev->pds_dev); + cmn_err(CE_CONT, "\t\tfunc = %d\n", + dev->pds_func); + cmn_err(CE_CONT, "\t\ttype = %d\n", + dev->pds_type); + } + } + } +} + +/* + * intel_iommu_probe_and_parse() + * called from rootnex driver + */ +void +intel_iommu_probe_and_parse(void) +{ + int i, len; + char *opt; + + intel_iommu_support = B_FALSE; + dmar_info = NULL; + + /* + * retrieve the print-dmar-acpi boot option + */ + if ((len = do_bsys_getproplen(NULL, "print-dmar-acpi")) > 0) { + opt = kmem_alloc(len, KM_SLEEP); + (void) do_bsys_getprop(NULL, "print-dmar-acpi", opt); + if (strcmp(opt, "yes") == 0) { + intel_dmar_acpi_debug = 1; + cmn_err(CE_CONT, "\"print-dmar-acpi=yes\" was set\n"); + } else if (strcmp(opt, "no") == 0) { + intel_dmar_acpi_debug = 0; + cmn_err(CE_CONT, "\"print-dmar-acpi=no\" was set\n"); + } + kmem_free(opt, len); + } + + dcmn_err(CE_CONT, "intel iommu detect start\n"); + + if (detect_dmar() == B_FALSE) { + dcmn_err(CE_CONT, "no intel iommu detected\n"); + return; + } + + /* + * the platform has intel iommu, setup globals + */ + intel_iommu_support = B_TRUE; + dmar_info = kmem_zalloc(sizeof (intel_dmar_info_t), + KM_SLEEP); + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + list_create(&(dmar_info->dmari_drhd[i]), sizeof (drhd_info_t), + offsetof(drhd_info_t, node)); + list_create(&(dmar_info->dmari_rmrr[i]), sizeof (rmrr_info_t), + offsetof(rmrr_info_t, node)); + } + + /* + * parse dmar acpi table + */ + if (parse_dmar() != PARSE_DMAR_SUCCESS) { + intel_iommu_release_dmar_info(); + dcmn_err(CE_CONT, "DMAR parse failed\n"); + return; + } + + /* + * create dev_info structure per hrhd + * and prepare it for binding driver + */ + create_dmar_devi(); + + /* + * print the dmar info if the debug + * is set + */ + if (intel_dmar_acpi_debug) + print_dmar_info(); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/io/intel_iommu.c Sun Sep 14 19:52:20 2008 -0700 @@ -0,0 +1,2959 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Portions Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2008, Intel Corporation. + * All rights reserved. + */ + +/* + * Intel IOMMU implementaion + */ +#include <sys/conf.h> +#include <sys/modctl.h> +#include <sys/pci.h> +#include <sys/pci_impl.h> +#include <sys/sysmacros.h> +#include <sys/ddi.h> +#include <sys/ddidmareq.h> +#include <sys/ddi_impldefs.h> +#include <sys/ddifm.h> +#include <sys/sunndi.h> +#include <sys/debug.h> +#include <sys/fm/protocol.h> +#include <sys/note.h> +#include <sys/apic.h> +#include <vm/hat_i86.h> +#include <sys/smp_impldefs.h> +#include <sys/spl.h> +#include <sys/archsystm.h> +#include <sys/x86_archext.h> +#include <sys/rootnex.h> +#include <sys/avl.h> +#include <sys/bootconf.h> +#include <sys/bootinfo.h> +#include <sys/intel_iommu.h> +#include <sys/atomic.h> + +/* + * internal variables + * iommu_state - the list of iommu structures + * reserve_memory - the list of reserved regions + * page_num - the count of pages for iommu page tables + */ +static list_t iommu_states; +static list_t reserve_memory; +static uint_t page_num; + +/* + * record some frequently used dips + */ +static dev_info_t *pci_top_devinfo = NULL; +static dev_info_t *isa_top_devinfo = NULL; +static dev_info_t *lpc_devinfo = NULL; + +/* + * dvma cache related variables + */ +static uint_t dvma_cache_high = 64; +static dvma_cookie_head_t cookie_cache[MAX_COOKIE_CACHE_SIZE]; + +/* + * switch to turn on/off the gfx dma remapping unit, + * this is used when there is a dedicated drhd for the + * gfx + */ +int gfx_drhd_disable = 0; +static dev_info_t *gfx_devinfo = NULL; + +/* + * switch to disable dmar remapping unit, even the initiation work has + * been finished + */ +int dmar_drhd_disable = 0; + +static char *dmar_fault_reason[] = { + "Reserved", + "The present field in root-entry is Clear", + "The present field in context-entry is Clear", + "Hardware detected invalid programming of a context-entry", + "The DMA request attempted to access an address beyond max support", + "The Write field in a page-table entry is Clear when DMA write", + "The Read field in a page-table entry is Clear when DMA read", + "Access the next level page table resulted in error", + "Access the root-entry table resulted in error", + "Access the context-entry table resulted in error", + "Reserved field not initialized to zero in a present root-entry", + "Reserved field not initialized to zero in a present context-entry", + "Reserved field not initialized to zero in a present 
page-table entry", + "DMA blocked due to the Translation Type field in context-entry", + "Incorrect fault event reason number" +}; + +#define DMAR_MAX_REASON_NUMBER (14) + +/* + * cpu_clflush() + * flush the cpu cache line + */ +static void +cpu_clflush(caddr_t addr, uint_t size) +{ + uint_t i; + + for (i = 0; i < size; i += x86_clflush_size) { + clflush_insn(addr+i); + } + + mfence_insn(); +} + +/* + * iommu_page_init() + * do some init work for the iommu page allocator + */ +static void +iommu_page_init(void) +{ + page_num = 0; +} + +/* + * iommu_get_page() + * get a 4k iommu page, and zero out it + */ +static paddr_t +iommu_get_page(intel_iommu_state_t *iommu, int kmflag) +{ + paddr_t paddr; + caddr_t vaddr; + + paddr = iommu_page_alloc(kmflag); + vaddr = iommu_page_map(paddr); + bzero(vaddr, IOMMU_PAGE_SIZE); + iommu->iu_dmar_ops->do_clflush(vaddr, IOMMU_PAGE_SIZE); + iommu_page_unmap(vaddr); + + page_num++; + + return (paddr); +} + +/* + * iommu_free_page() + * free the iommu page allocated with iommu_get_page + */ +static void +iommu_free_page(paddr_t paddr) +{ + iommu_page_free(paddr); + page_num--; +} + +#define iommu_get_reg32(iommu, offset) ddi_get32((iommu)->iu_reg_handle, \ + (uint32_t *)(iommu->iu_reg_address + (offset))) +#define iommu_get_reg64(iommu, offset) ddi_get64((iommu)->iu_reg_handle, \ + (uint64_t *)(iommu->iu_reg_address + (offset))) +#define iommu_put_reg32(iommu, offset, val) ddi_put32\ + ((iommu)->iu_reg_handle, \ + (uint32_t *)(iommu->iu_reg_address + (offset)), val) +#define iommu_put_reg64(iommu, offset, val) ddi_put64\ + ((iommu)->iu_reg_handle, \ + (uint64_t *)(iommu->iu_reg_address + (offset)), val) + +/* + * calculate_agaw() + * calculate agaw from gaw + */ +static int +calculate_agaw(int gaw) +{ + int r, agaw; + + r = (gaw - 12) % 9; + + if (r == 0) + agaw = gaw; + else + agaw = gaw + 9 - r; + + if (agaw > 64) + agaw = 64; + + return (agaw); +} + +/* + * destroy_iommu_state() + * destory an iommu state + */ +static void +destroy_iommu_state(intel_iommu_state_t *iommu) +{ + iommu_free_page(iommu->iu_root_entry_paddr); + iommu_rscs_fini(&(iommu->iu_domain_id_hdl)); + mutex_destroy(&(iommu->iu_reg_lock)); + mutex_destroy(&(iommu->iu_root_context_lock)); + ddi_regs_map_free(&(iommu->iu_reg_handle)); + kmem_free(iommu->iu_dmar_ops, sizeof (struct dmar_ops)); + kmem_free(iommu, sizeof (intel_iommu_state_t)); +} + +/* + * iommu_update_stats - update iommu private kstat counters + * + * This routine will dump and reset the iommu's internal + * statistics counters. The current stats dump values will + * be sent to the kernel status area. + */ +static int +iommu_update_stats(kstat_t *ksp, int rw) +{ + intel_iommu_state_t *iommu; + iommu_kstat_t *iommu_ksp; + const char *state; + + if (rw == KSTAT_WRITE) + return (EACCES); + + iommu = (intel_iommu_state_t *)ksp->ks_private; + ASSERT(iommu != NULL); + iommu_ksp = (iommu_kstat_t *)ksp->ks_data; + ASSERT(iommu_ksp != NULL); + + state = iommu->iu_enabled ? 
"enabled" : "disabled"; + (void) strcpy(iommu_ksp->is_enabled.value.c, state); + iommu_ksp->is_iotlb_psi.value.ui64 = + iommu->iu_statistics.st_iotlb_psi; + iommu_ksp->is_iotlb_domain.value.ui64 = + iommu->iu_statistics.st_iotlb_domain; + iommu_ksp->is_iotlb_global.value.ui64 = + iommu->iu_statistics.st_iotlb_global; + iommu_ksp->is_write_buffer.value.ui64 = + iommu->iu_statistics.st_write_buffer; + iommu_ksp->is_context_cache.value.ui64 = + iommu->iu_statistics.st_context_cache; + iommu_ksp->is_wait_complete_us.value.ui64 = + drv_hztousec(iommu->iu_statistics.st_wait_complete_us); + iommu_ksp->is_domain_alloc.value.ui64 = + iommu->iu_statistics.st_domain_alloc; + iommu_ksp->is_page_used.value.ui64 = page_num; + + return (0); +} + +/* + * iommu_init_stats - initialize kstat data structures + * + * This routine will create and initialize the iommu private + * statistics counters. + */ +int +iommu_init_stats(intel_iommu_state_t *iommu) +{ + kstat_t *ksp; + iommu_kstat_t *iommu_ksp; + + /* + * Create and init kstat + */ + ksp = kstat_create("rootnex", 0, + ddi_node_name(iommu->iu_drhd->di_dip), + "misc", KSTAT_TYPE_NAMED, + sizeof (iommu_kstat_t) / sizeof (kstat_named_t), 0); + + if (ksp == NULL) { + cmn_err(CE_WARN, + "Could not create kernel statistics for %s", + ddi_node_name(iommu->iu_drhd->di_dip)); + return (DDI_FAILURE); + } + + iommu->iu_kstat = ksp; + iommu_ksp = (iommu_kstat_t *)ksp->ks_data; + + /* + * Initialize all the statistics + */ + kstat_named_init(&(iommu_ksp->is_enabled), "iommu_enable", + KSTAT_DATA_CHAR); + kstat_named_init(&(iommu_ksp->is_iotlb_psi), "iotlb_psi", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_iotlb_domain), "iotlb_domain", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_iotlb_global), "iotlb_global", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_write_buffer), "write_buffer", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_context_cache), "context_cache", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_wait_complete_us), "wait_complete_us", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_page_used), "physical_page_used", + KSTAT_DATA_UINT64); + kstat_named_init(&(iommu_ksp->is_domain_alloc), "domain_allocated", + KSTAT_DATA_UINT64); + + /* + * Function to provide kernel stat update on demand + */ + ksp->ks_update = iommu_update_stats; + + /* + * Pointer into provider's raw statistics + */ + ksp->ks_private = (void *)iommu; + + /* + * Add kstat to systems kstat chain + */ + kstat_install(ksp); + + return (DDI_SUCCESS); +} + +/* + * iommu_intr_handler() + * the fault event handler for a single drhd + */ +static int +iommu_intr_handler(intel_iommu_state_t *iommu) +{ + uint32_t status; + int index, fault_reg_offset; + int sindex, max_fault_index; + + mutex_enter(&(iommu->iu_reg_lock)); + + /* read the fault status */ + status = iommu_get_reg32(iommu, IOMMU_REG_FAULT_STS); + + /* check if we have a pending fault for this IOMMU */ + if (!(status & IOMMU_FAULT_STS_PPF)) { + mutex_exit(&(iommu->iu_reg_lock)); + return (0); + } + + /* + * handle all primary pending faults + */ + sindex = index = IOMMU_FAULT_GET_INDEX(status); + max_fault_index = IOMMU_CAP_GET_NFR(iommu->iu_capability) - 1; + fault_reg_offset = IOMMU_CAP_GET_FRO(iommu->iu_capability); + + /* + * don't loop forever for a misbehaving IOMMU. Return after 1 loop + * so that we some progress. 
+ */ + do { + uint64_t val; + uint8_t fault_reason; + uint8_t fault_type; + uint16_t sid; + uint64_t pg_addr; + + if (index > max_fault_index) + index = 0; + + /* read the higher 64bits */ + val = iommu_get_reg64(iommu, + fault_reg_offset + index * 16 + 8); + + /* check if pending fault */ + if (!IOMMU_FRR_GET_F(val)) + break; + + /* get the fault reason, fault type and sid */ + fault_reason = IOMMU_FRR_GET_FR(val); + fault_type = IOMMU_FRR_GET_FT(val); + sid = IOMMU_FRR_GET_SID(val); + + /* read the first 64bits */ + val = iommu_get_reg64(iommu, + fault_reg_offset + index * 16); + pg_addr = val & IOMMU_PAGE_MASK; + + /* clear the fault */ + iommu_put_reg32(iommu, fault_reg_offset + index * 16 + 12, + (((uint32_t)1) << 31)); + + /* report the fault info */ + cmn_err(CE_WARN, + "%s generated a fault event when translating DMA %s\n" + "\t on address 0x%" PRIx64 " for PCI(%d, %d, %d), " + "the reason is:\n\t %s", + ddi_node_name(iommu->iu_drhd->di_dip), + fault_type ? "read" : "write", pg_addr, + (sid >> 8) & 0xff, (sid >> 3) & 0x1f, sid & 0x7, + dmar_fault_reason[MIN(fault_reason, + DMAR_MAX_REASON_NUMBER)]); + + } while (++index < sindex); + + /* + * At this point we have cleared the overflow if any + */ + status = iommu_get_reg32(iommu, IOMMU_REG_FAULT_STS); + + /* clear over flow */ + if (status & IOMMU_FAULT_STS_PFO) { +#ifdef DEBUG + cmn_err(CE_WARN, "Primary Fault logging overflow detected. " + "Clearing fault overflow"); +#endif + iommu_put_reg32(iommu, IOMMU_REG_FAULT_STS, 1); + } + + mutex_exit(&(iommu->iu_reg_lock)); + + return (1); +} + +/* + * intel_iommu_intr_handler() + * call iommu_intr_handler for each iommu + */ +static uint_t +intel_iommu_intr_handler(caddr_t arg) +{ + int claimed = 0; + intel_iommu_state_t *iommu; + list_t *lp = (list_t *)arg; + + for_each_in_list(lp, iommu) { + claimed |= iommu_intr_handler(iommu); + } + + return (claimed ? 
DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED); +} + +/* + * intel_iommu_add_intr() + * the interface to hook dmar interrupt handler + */ +static void +intel_iommu_add_intr(void) +{ + int ipl, irq, vect; + intel_iommu_state_t *iommu; + uint32_t msi_addr, msi_data; + ipl = IOMMU_INTR_IPL; + + irq = psm_get_ipivect(ipl, -1); + vect = apic_irq_table[irq]->airq_vector; + (void) add_avintr((void *)NULL, ipl, (avfunc)(intel_iommu_intr_handler), + "iommu intr", irq, (caddr_t)&iommu_states, NULL, NULL, NULL); + + msi_addr = (MSI_ADDR_HDR | + (MSI_ADDR_RH_FIXED << MSI_ADDR_RH_SHIFT) | + (MSI_ADDR_DM_PHYSICAL << MSI_ADDR_DM_SHIFT) | + apic_cpus[0].aci_local_id); + msi_data = ((MSI_DATA_TM_EDGE << MSI_DATA_TM_SHIFT) | vect); + + for_each_in_list(&iommu_states, iommu) { + (void) iommu_intr_handler(iommu); + mutex_enter(&(iommu->iu_reg_lock)); + iommu_put_reg32(iommu, IOMMU_REG_FEVNT_ADDR, msi_addr); + iommu_put_reg32(iommu, IOMMU_REG_FEVNT_UADDR, 0); + iommu_put_reg32(iommu, IOMMU_REG_FEVNT_DATA, msi_data); + iommu_put_reg32(iommu, IOMMU_REG_FEVNT_CON, 0); + mutex_exit(&(iommu->iu_reg_lock)); + } +} + +/* + * wait max 60s for the hardware completion + */ +#define IOMMU_WAIT_TIME 60000000 +#define iommu_wait_completion(iommu, offset, getf, completion, status) \ +{ \ + clock_t stick = ddi_get_lbolt(); \ + clock_t ntick; \ + _NOTE(CONSTCOND) \ + while (1) { \ + status = getf(iommu, offset); \ + ntick = ddi_get_lbolt(); \ + if (completion) {\ + atomic_add_64\ + (&(iommu->iu_statistics.st_wait_complete_us),\ + ntick - stick);\ + break; \ + } \ + if (ntick - stick >= drv_usectohz(IOMMU_WAIT_TIME)) { \ + cmn_err(CE_PANIC, \ + "iommu wait completion time out\n"); \ + } else { \ + iommu_cpu_nop();\ + }\ + }\ +} + +/* + * dmar_flush_write_buffer() + * flush the write buffer + */ +static void +dmar_flush_write_buffer(intel_iommu_state_t *iommu) +{ + uint32_t status; + + mutex_enter(&(iommu->iu_reg_lock)); + iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD, + iommu->iu_global_cmd_reg | IOMMU_GCMD_WBF); + iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS, + iommu_get_reg32, !(status & IOMMU_GSTS_WBFS), status); + mutex_exit(&(iommu->iu_reg_lock)); + + /* record the statistics */ + atomic_inc_64(&(iommu->iu_statistics.st_write_buffer)); +} + +/* + * dmar_flush_iotlb_common() + * flush the iotlb cache + */ +static void +dmar_flush_iotlb_common(intel_iommu_state_t *iommu, uint_t domain_id, + uint64_t addr, uint_t am, uint_t hint, tlb_inv_g_t type) +{ + uint64_t command = 0, iva = 0, status; + uint_t iva_offset, iotlb_offset; + + iva_offset = IOMMU_ECAP_GET_IRO(iommu->iu_excapability); + iotlb_offset = iva_offset + 8; + + /* + * prepare drain read/write command + */ + if (IOMMU_CAP_GET_DWD(iommu->iu_capability)) { + command |= TLB_INV_DRAIN_WRITE; + } + + if (IOMMU_CAP_GET_DRD(iommu->iu_capability)) { + command |= TLB_INV_DRAIN_READ; + } + + /* + * if the hardward doesn't support page selective invalidation, we + * will use domain type. 
Otherwise, use global type + */ + switch (type) { + case TLB_INV_G_PAGE: + if (!IOMMU_CAP_GET_PSI(iommu->iu_capability) || + am > IOMMU_CAP_GET_MAMV(iommu->iu_capability) || + addr & IOMMU_PAGE_OFFSET) { + goto ignore_psi; + } + command |= TLB_INV_PAGE | TLB_INV_IVT | + TLB_INV_DID(domain_id); + iva = addr | am | TLB_IVA_HINT(hint); + break; +ignore_psi: + case TLB_INV_G_DOMAIN: + command |= TLB_INV_DOMAIN | TLB_INV_IVT | + TLB_INV_DID(domain_id); + break; + case TLB_INV_G_GLOBAL: + command |= TLB_INV_GLOBAL | TLB_INV_IVT; + break; + default: + cmn_err(CE_WARN, "incorrect iotlb flush type"); + return; + } + + /* + * do the actual flush + */ + mutex_enter(&(iommu->iu_reg_lock)); + /* verify there is no pending command */ + iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64, + !(status & TLB_INV_IVT), status); + if (iva) + iommu_put_reg64(iommu, iva_offset, iva); + iommu_put_reg64(iommu, iotlb_offset, command); + iommu_wait_completion(iommu, iotlb_offset, iommu_get_reg64, + !(status & TLB_INV_IVT), status); + mutex_exit(&(iommu->iu_reg_lock)); + + /* + * check the result and record the statistics + */ + switch (TLB_INV_GET_IAIG(status)) { + /* global */ + case 1: + atomic_inc_64(&(iommu->iu_statistics.st_iotlb_global)); + break; + /* domain */ + case 2: + atomic_inc_64(&(iommu->iu_statistics.st_iotlb_domain)); + break; + /* psi */ + case 3: + atomic_inc_64(&(iommu->iu_statistics.st_iotlb_psi)); + break; + default: + break; + } +} + +/* + * dmar_flush_iotlb_psi() + * register based iotlb psi invalidation + */ +static void +dmar_flush_iotlb_psi(intel_iommu_state_t *iommu, uint_t domain_id, + uint64_t dvma, uint_t count, uint_t hint) +{ + uint_t am = 0; + uint_t max_am = 0; + uint64_t align = 0; + uint64_t dvma_pg = 0; + uint_t used_count = 0; + + /* choose page specified invalidation */ + if (IOMMU_CAP_GET_PSI(iommu->iu_capability)) { + /* MAMV is valid only if PSI is set */ + max_am = IOMMU_CAP_GET_MAMV(iommu->iu_capability); + while (count != 0) { + /* First calculate alignment of DVMA */ + dvma_pg = IOMMU_BTOP(dvma); + ASSERT(dvma_pg != NULL); + ASSERT(count >= 1); + for (align = 1; (dvma_pg & align) == 0; align <<= 1) + ; + /* truncate count to the nearest power of 2 */ + for (used_count = 1, am = 0; count >> used_count != 0; + used_count <<= 1, am++) + ; + if (am > max_am) { + am = max_am; + used_count = 1 << am; + } + if (align >= used_count) { + dmar_flush_iotlb_common(iommu, domain_id, + dvma, am, hint, TLB_INV_G_PAGE); + } else { + /* align < used_count */ + used_count = align; + for (am = 0; (1 << am) != used_count; am++) + ; + dmar_flush_iotlb_common(iommu, domain_id, + dvma, am, hint, TLB_INV_G_PAGE); + } + count -= used_count; + dvma = (dvma_pg + used_count) << IOMMU_PAGE_SHIFT; + } + /* choose domain invalidation */ + } else { + dmar_flush_iotlb_common(iommu, domain_id, dvma, + 0, 0, TLB_INV_G_DOMAIN); + } +} + +/* + * dmar_flush_iotlb_dsi() + * flush dsi iotlb + */ +static void +dmar_flush_iotlb_dsi(intel_iommu_state_t *iommu, uint_t domain_id) +{ + dmar_flush_iotlb_common(iommu, domain_id, 0, 0, 0, TLB_INV_G_DOMAIN); +} + +/* + * dmar_flush_iotlb_glb() + * flush global iotbl + */ +static void +dmar_flush_iotlb_glb(intel_iommu_state_t *iommu) +{ + dmar_flush_iotlb_common(iommu, 0, 0, 0, 0, TLB_INV_G_GLOBAL); +} + + +/* + * dmar_flush_context_cache() + * flush the context cache + */ +static void +dmar_flush_context_cache(intel_iommu_state_t *iommu, uint8_t function_mask, + uint16_t source_id, uint_t domain_id, ctt_inv_g_t type) +{ + uint64_t command = 0, status; + + /* + 
* define the command + */ + switch (type) { + case CTT_INV_G_DEVICE: + command |= CCMD_INV_ICC | CCMD_INV_DEVICE + | CCMD_INV_DID(domain_id) + | CCMD_INV_SID(source_id) | CCMD_INV_FM(function_mask); + break; + case CTT_INV_G_DOMAIN: + command |= CCMD_INV_ICC | CCMD_INV_DOMAIN + | CCMD_INV_DID(domain_id); + break; + case CTT_INV_G_GLOBAL: + command |= CCMD_INV_ICC | CCMD_INV_GLOBAL; + break; + default: + cmn_err(CE_WARN, "incorrect context cache flush type"); + return; + } + + mutex_enter(&(iommu->iu_reg_lock)); + /* verify there is no pending command */ + iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64, + !(status & CCMD_INV_ICC), status); + iommu_put_reg64(iommu, IOMMU_REG_CONTEXT_CMD, command); + iommu_wait_completion(iommu, IOMMU_REG_CONTEXT_CMD, iommu_get_reg64, + !(status & CCMD_INV_ICC), status); + mutex_exit(&(iommu->iu_reg_lock)); + + /* record the context cache statistics */ + atomic_inc_64(&(iommu->iu_statistics.st_context_cache)); +} + +/* + * dmar_flush_context_fsi() + * function based context cache flush + */ +static void +dmar_flush_context_fsi(intel_iommu_state_t *iommu, uint8_t function_mask, + uint16_t source_id, uint_t domain_id) +{ + dmar_flush_context_cache(iommu, function_mask, source_id, + domain_id, CTT_INV_G_DEVICE); +} + +/* + * dmar_flush_context_dsi() + * domain based context cache flush + */ +static void +dmar_flush_context_dsi(intel_iommu_state_t *iommu, uint_t domain_id) +{ + dmar_flush_context_cache(iommu, 0, 0, domain_id, CTT_INV_G_DOMAIN); +} + +/* + * dmar_flush_context_gbl() + * flush global context cache + */ +static void +dmar_flush_context_gbl(intel_iommu_state_t *iommu) +{ + dmar_flush_context_cache(iommu, 0, 0, 0, CTT_INV_G_GLOBAL); +} + +/* + * dmar_set_root_entry_table() + * set root entry table + */ +static void +dmar_set_root_table(intel_iommu_state_t *iommu) +{ + uint32_t status; + + mutex_enter(&(iommu->iu_reg_lock)); + iommu_put_reg64(iommu, IOMMU_REG_ROOTENTRY, + iommu->iu_root_entry_paddr); + iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD, + iommu->iu_global_cmd_reg | IOMMU_GCMD_SRTP); + iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS, + iommu_get_reg32, (status & IOMMU_GSTS_RTPS), status); + mutex_exit(&(iommu->iu_reg_lock)); +} + +/* + * dmar_enable_unit() + * enable the dmar unit + */ +static void +dmar_enable_unit(intel_iommu_state_t *iommu) +{ + uint32_t status; + + mutex_enter(&(iommu->iu_reg_lock)); + iommu_put_reg32(iommu, IOMMU_REG_GLOBAL_CMD, + IOMMU_GCMD_TE); + iommu_wait_completion(iommu, IOMMU_REG_GLOBAL_STS, + iommu_get_reg32, (status & IOMMU_GSTS_TES), status); + mutex_exit(&(iommu->iu_reg_lock)); + iommu->iu_global_cmd_reg |= IOMMU_GCMD_TE; + cmn_err(CE_CONT, "?\t%s enabled\n", + ddi_node_name(iommu->iu_drhd->di_dip)); +} + +/* + * iommu_bringup_unit() + * the processes to bring up a dmar unit + */ +static void +iommu_bringup_unit(intel_iommu_state_t *iommu) +{ + /* + * flush the iommu write buffer + */ + iommu->iu_dmar_ops->do_flwb(iommu); + + /* + * set root entry table + */ + iommu->iu_dmar_ops->do_set_root_table(iommu); + + /* + * flush the context cache + */ + iommu->iu_dmar_ops->do_context_gbl(iommu); + + /* + * flush the iotlb cache + */ + iommu->iu_dmar_ops->do_iotlb_gbl(iommu); + + /* + * at last enable the unit + */ + iommu->iu_dmar_ops->do_enable(iommu); +} + +/* + * iommu_dvma_cache_get() + * get a dvma from the cache + */ +static uint64_t +iommu_dvma_cache_get(dmar_domain_state_t *domain, + size_t size, size_t align, size_t nocross) +{ + dvma_cache_node_t *cache_node = NULL; + dvma_cache_head_t 
*cache_head; + uint_t index = IOMMU_BTOP(size) - 1; + uint64_t ioaddr; + + if (index >= DVMA_CACHE_HEAD_CNT) + return (0); + + cache_head = &(domain->dm_dvma_cache[index]); + mutex_enter(&(cache_head->dch_free_lock)); + for_each_in_list(&(cache_head->dch_free_list), cache_node) { + if ((cache_node->dcn_align >= align) && + ((nocross == 0) || + ((cache_node->dcn_dvma ^ (cache_node->dcn_dvma + size - 1)) + < (nocross - 1)))) { + list_remove(&(cache_head->dch_free_list), + cache_node); + cache_head->dch_free_count--; + break; + } + } + mutex_exit(&(cache_head->dch_free_lock)); + + if (cache_node) { + ioaddr = cache_node->dcn_dvma; + mutex_enter(&(cache_head->dch_mem_lock)); + list_insert_head(&(cache_head->dch_mem_list), cache_node); + mutex_exit(&(cache_head->dch_mem_lock)); + return (ioaddr); + } + + return (0); +} + +/* + * iommu_dvma_cache_put() + * put a dvma to the cache after use + */ +static void +iommu_dvma_cache_put(dmar_domain_state_t *domain, uint64_t dvma, + size_t size, size_t align) +{ + dvma_cache_node_t *cache_node = NULL; + dvma_cache_head_t *cache_head; + uint_t index = IOMMU_BTOP(size) - 1; + boolean_t shrink = B_FALSE; + + /* out of cache range */ + if (index >= DVMA_CACHE_HEAD_CNT) { + vmem_xfree(domain->dm_dvma_map, + (void *)(intptr_t)dvma, size); + return; + } + + cache_head = &(domain->dm_dvma_cache[index]); + + /* get a node block */ + mutex_enter(&(cache_head->dch_mem_lock)); + cache_node = list_head(&(cache_head->dch_mem_list)); + if (cache_node) { + list_remove(&(cache_head->dch_mem_list), cache_node); + } + mutex_exit(&(cache_head->dch_mem_lock)); + + /* no cache, alloc one */ + if (cache_node == NULL) { + cache_node = kmem_alloc(sizeof (dvma_cache_node_t), KM_SLEEP); + } + + /* initialize this node */ + cache_node->dcn_align = align; + cache_node->dcn_dvma = dvma; + + /* insert into the free list */ + mutex_enter(&(cache_head->dch_free_lock)); + list_insert_head(&(cache_head->dch_free_list), cache_node); + + /* shrink the cache list */ + if (cache_head->dch_free_count++ > dvma_cache_high) { + cache_node = list_tail(&(cache_head->dch_free_list)); + list_remove(&(cache_head->dch_free_list), cache_node); + shrink = B_TRUE; + cache_head->dch_free_count--; + } + mutex_exit(&(cache_head->dch_free_lock)); + + if (shrink) { + ASSERT(cache_node); + vmem_xfree(domain->dm_dvma_map, + (void *)(intptr_t)(cache_node->dcn_dvma), size); + kmem_free(cache_node, sizeof (dvma_cache_node_t)); + } +} + +/* + * iommu_dvma_cache_flush() + * flush the dvma caches when vmem_xalloc() failed + */ +static void +iommu_dvma_cache_flush(dmar_domain_state_t *domain, dev_info_t *dip) +{ + dvma_cache_node_t *cache_node; + dvma_cache_head_t *cache_head; + uint_t index; + + cmn_err(CE_NOTE, "domain dvma cache for %s flushed", + ddi_node_name(dip)); + + for (index = 0; index < DVMA_CACHE_HEAD_CNT; index++) { + cache_head = &(domain->dm_dvma_cache[index]); + mutex_enter(&(cache_head->dch_free_lock)); + cache_node = list_head(&(cache_head->dch_free_list)); + while (cache_node) { + list_remove(&(cache_head->dch_free_list), cache_node); + vmem_xfree(domain->dm_dvma_map, + (void *)(intptr_t)(cache_node->dcn_dvma), + IOMMU_PTOB(index + 1)); + kmem_free(cache_node, sizeof (dvma_cache_node_t)); + cache_head->dch_free_count--; + cache_node = list_head(&(cache_head->dch_free_list)); + } + ASSERT(cache_head->dch_free_count == 0); + mutex_exit(&(cache_head->dch_free_lock)); + } +} + +/* + * get_dvma_cookie_array() + * get a dvma cookie array from the cache or allocate + */ +static iommu_dvma_cookie_t * 
+get_dvma_cookie_array(uint_t array_size) +{ + dvma_cookie_head_t *cache_head; + iommu_dvma_cookie_t *cookie = NULL; + + if (array_size > MAX_COOKIE_CACHE_SIZE) { + return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size, + KM_SLEEP)); + } + + cache_head = &(cookie_cache[array_size - 1]); + mutex_enter(&(cache_head->dch_lock)); + /* LINTED E_EQUALITY_NOT_ASSIGNMENT */ + if (cookie = cache_head->dch_next) { + cache_head->dch_next = cookie->dc_next; + cache_head->dch_count--; + } + mutex_exit(&(cache_head->dch_lock)); + + if (cookie) { + return (cookie); + } else { + return (kmem_alloc(sizeof (iommu_dvma_cookie_t) * array_size, + KM_SLEEP)); + } +} + +/* + * put_dvma_cookie_array() + * put a dvma cookie array to the cache or free + */ +static void +put_dvma_cookie_array(iommu_dvma_cookie_t *dcookies, uint_t array_size) +{ + dvma_cookie_head_t *cache_head; + + if (array_size > MAX_COOKIE_CACHE_SIZE) { + kmem_free(dcookies, sizeof (iommu_dvma_cookie_t) * array_size); + return; + } + + cache_head = &(cookie_cache[array_size - 1]); + mutex_enter(&(cache_head->dch_lock)); + dcookies->dc_next = cache_head->dch_next; + cache_head->dch_next = dcookies; + cache_head->dch_count++; + mutex_exit(&(cache_head->dch_lock)); +} + +/* + * dmar_reg_plant_wait() + * the plant wait operation for register based cache invalidation + */ +static void +dmar_reg_plant_wait(intel_iommu_state_t *iommu, iommu_dvma_cookie_t *dcookies, + uint_t count, uint_t array_size) +{ + iotlb_pend_node_t *node = NULL; + iotlb_pend_head_t *head; + + head = &(iommu->iu_pend_head); + + /* get a node */ + mutex_enter(&(head->ich_mem_lock)); + node = list_head(&(head->ich_mem_list)); + if (node) { + list_remove(&(head->ich_mem_list), node); + } + mutex_exit(&(head->ich_mem_lock)); + + /* no cache, alloc one */ + if (node == NULL) { + node = kmem_alloc(sizeof (iotlb_pend_node_t), KM_SLEEP); + } + + /* initialize this node */ + node->icn_dcookies = dcookies; + node->icn_count = count; + node->icn_array_size = array_size; + + /* insert into the pend list */ + mutex_enter(&(head->ich_pend_lock)); + list_insert_tail(&(head->ich_pend_list), node); + head->ich_pend_count++; + mutex_exit(&(head->ich_pend_lock)); +} + +/* + * dmar_release_dvma_cookie() + * release the dvma cookie + */ +static void +dmar_release_dvma_cookie(iommu_dvma_cookie_t *dcookies, + uint_t count, uint_t array_size) +{ + uint_t i; + + /* free dvma */ + for (i = 0; i < count; i++) { + iommu_dvma_cache_put(dcookies[i].dc_domain, + dcookies[i].dc_addr, dcookies[i].dc_size, + dcookies[i].dc_align); + } + + /* free the cookie array */ + put_dvma_cookie_array(dcookies, array_size); +} + +/* + * dmar_reg_reap_wait() + * the reap wait operation for register based cache invalidation + */ +static void +dmar_reg_reap_wait(intel_iommu_state_t *iommu) +{ + iotlb_pend_node_t *node; + iotlb_pend_head_t *head; + + head = &(iommu->iu_pend_head); + mutex_enter(&(head->ich_pend_lock)); + node = list_head(&(head->ich_pend_list)); + if (node) { + list_remove(&(head->ich_pend_list), node); + head->ich_pend_count--; + } + mutex_exit(&(head->ich_pend_lock)); + + if (node) { + dmar_release_dvma_cookie(node->icn_dcookies, + node->icn_count, node->icn_array_size); + /* put the node into the node cache */ + mutex_enter(&(head->ich_mem_lock)); + list_insert_head(&(head->ich_mem_list), node); + mutex_exit(&(head->ich_mem_lock)); + } +} + +/* + * dmar_init_ops() + * init dmar ops + */ +static void +dmar_init_ops(intel_iommu_state_t *iommu) +{ + struct dmar_ops *ops; + + ASSERT(iommu); + ops = 
kmem_alloc(sizeof (struct dmar_ops), KM_SLEEP); + + /* initialize the dmar operations */ + ops->do_enable = dmar_enable_unit; + ops->do_fault = iommu_intr_handler; + + /* cpu clflush */ + if (iommu->iu_coherency) { + ops->do_clflush = (void (*)(caddr_t, uint_t))return_instr; + } else { + ASSERT(x86_feature & X86_CLFSH); + ops->do_clflush = cpu_clflush; + } + + /* write buffer */ + if (IOMMU_CAP_GET_RWBF(iommu->iu_capability)) { + ops->do_flwb = dmar_flush_write_buffer; + } else { + ops->do_flwb = (void (*)(intel_iommu_state_t *))return_instr; + } + + /* cache related functions */ + ops->do_iotlb_psi = dmar_flush_iotlb_psi; + ops->do_iotlb_dsi = dmar_flush_iotlb_dsi; + ops->do_iotlb_gbl = dmar_flush_iotlb_glb; + ops->do_context_fsi = dmar_flush_context_fsi; + ops->do_context_dsi = dmar_flush_context_dsi; + ops->do_context_gbl = dmar_flush_context_gbl; + ops->do_plant_wait = dmar_reg_plant_wait; + ops->do_reap_wait = dmar_reg_reap_wait; + + ops->do_set_root_table = dmar_set_root_table; + + iommu->iu_dmar_ops = ops; +} + +/* + * create_iommu_state() + * alloc and setup the iommu state + */ +static int +create_iommu_state(drhd_info_t *drhd) +{ + intel_iommu_state_t *iommu; + int mgaw, sagaw, agaw; + int bitnum; + int ret; + + static ddi_device_acc_attr_t ioattr = { + DDI_DEVICE_ATTR_V0, + DDI_NEVERSWAP_ACC, + DDI_STRICTORDER_ACC, + }; + + iommu = kmem_alloc(sizeof (intel_iommu_state_t), KM_SLEEP); + drhd->di_iommu = (void *)iommu; + iommu->iu_drhd = drhd; + + /* + * map the register address space + */ + ret = ddi_regs_map_setup(iommu->iu_drhd->di_dip, 0, + (caddr_t *)&(iommu->iu_reg_address), (offset_t)0, + (offset_t)IOMMU_REG_SIZE, &ioattr, + &(iommu->iu_reg_handle)); + + if (ret != DDI_SUCCESS) { + cmn_err(CE_WARN, "iommu register map failed: %d", ret); + kmem_free(iommu, sizeof (intel_iommu_state_t)); + return (DDI_FAILURE); + } + + mutex_init(&(iommu->iu_reg_lock), NULL, MUTEX_DRIVER, + (void *)ipltospl(IOMMU_INTR_IPL)); + mutex_init(&(iommu->iu_root_context_lock), NULL, MUTEX_DRIVER, NULL); + + /* + * get the register value + */ + iommu->iu_capability = iommu_get_reg64(iommu, IOMMU_REG_CAP); + iommu->iu_excapability = iommu_get_reg64(iommu, IOMMU_REG_EXCAP); + + /* + * if the hardware access is non-coherent, we need clflush + */ + if (IOMMU_ECAP_GET_C(iommu->iu_excapability)) { + iommu->iu_coherency = B_TRUE; + } else { + iommu->iu_coherency = B_FALSE; + if (!(x86_feature & X86_CLFSH)) { + cmn_err(CE_WARN, "drhd can't be enabled due to " + "missing clflush functionality"); + ddi_regs_map_free(&(iommu->iu_reg_handle)); + kmem_free(iommu, sizeof (intel_iommu_state_t)); + return (DDI_FAILURE); + } + } + + /* + * retrieve the maximum number of domains + */ + iommu->iu_max_domain = IOMMU_CAP_ND(iommu->iu_capability); + + /* + * setup the domain id allocator + * domain id 0 is reserved by the architecture + */ + iommu_rscs_init(1, iommu->iu_max_domain, &(iommu->iu_domain_id_hdl)); + + /* + * calculate the agaw + */ + mgaw = IOMMU_CAP_MGAW(iommu->iu_capability); + sagaw = IOMMU_CAP_SAGAW(iommu->iu_capability); + iommu->iu_gaw = mgaw; + agaw = calculate_agaw(iommu->iu_gaw); + bitnum = (agaw - 30) / 9; + + while (bitnum < 5) { + if (sagaw & (1 << bitnum)) + break; + else + bitnum++; + } + + if (bitnum >= 5) { + cmn_err(CE_PANIC, "can't determine agaw"); + /*NOTREACHED*/ + return (DDI_FAILURE); + } else { + iommu->iu_agaw = 30 + bitnum * 9; + if (iommu->iu_agaw > 64) + iommu->iu_agaw = 64; + iommu->iu_level = bitnum + 2; + } + + /* + * the iommu is orginally disabled + */ + iommu->iu_enabled = 
B_FALSE; + iommu->iu_global_cmd_reg = 0; + + /* + * init kstat + */ + (void) iommu_init_stats(iommu); + bzero(&(iommu->iu_statistics), sizeof (iommu_stat_t)); + + /* + * init dmar ops + */ + dmar_init_ops(iommu); + + /* + * alloc root entry table, this should put after init ops + */ + iommu->iu_root_entry_paddr = iommu_get_page(iommu, KM_SLEEP); + + /* + * initialize the iotlb pending list and cache + */ + mutex_init(&(iommu->iu_pend_head.ich_pend_lock), NULL, + MUTEX_DRIVER, NULL); + list_create(&(iommu->iu_pend_head.ich_pend_list), + sizeof (iotlb_pend_node_t), + offsetof(iotlb_pend_node_t, node)); + iommu->iu_pend_head.ich_pend_count = 0; + + mutex_init(&(iommu->iu_pend_head.ich_mem_lock), NULL, + MUTEX_DRIVER, NULL); + list_create(&(iommu->iu_pend_head.ich_mem_list), + sizeof (iotlb_pend_node_t), + offsetof(iotlb_pend_node_t, node)); + + /* + * insert this iommu into the list + */ + list_insert_tail(&iommu_states, iommu); + + /* + * report this unit + */ + cmn_err(CE_CONT, "?\t%s state structure created\n", + ddi_node_name(iommu->iu_drhd->di_dip)); + + return (DDI_SUCCESS); +} + +#define IS_OVERLAP(new, old) (((new)->rm_pfn_start <= (old)->rm_pfn_end) && \ + ((new)->rm_pfn_end >= (old)->rm_pfn_start)) + +/* + * memory_region_overlap() + * handle the pci mmio pages overlap condition + */ +static boolean_t +memory_region_overlap(dmar_reserve_pages_t *rmem) +{ + dmar_reserve_pages_t *temp; + + for_each_in_list(&reserve_memory, temp) { + if (IS_OVERLAP(rmem, temp)) { + temp->rm_pfn_start = MIN(temp->rm_pfn_start, + rmem->rm_pfn_start); + temp->rm_pfn_end = MAX(temp->rm_pfn_end, + rmem->rm_pfn_end); + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * collect_pci_mmio_walk + * reserve a single dev mmio resources + */ +static int +collect_pci_mmio_walk(dev_info_t *dip, void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + int i, length, account; + pci_regspec_t *assigned; + uint64_t mmio_hi, mmio_lo, mmio_size; + dmar_reserve_pages_t *rmem; + + /* + * ingore the devices which have no assigned-address + * properties + */ + if (ddi_getlongprop(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, + "assigned-addresses", (caddr_t)&assigned, + &length) != DDI_PROP_SUCCESS) + return (DDI_WALK_CONTINUE); + + account = length / sizeof (pci_regspec_t); + + for (i = 0; i < account; i++) { + + /* + * check the memory io assigned-addresses + * refer to pci.h for bits defination of + * pci_phys_hi + */ + if (((assigned[i].pci_phys_hi & PCI_ADDR_MASK) + == PCI_ADDR_MEM32) || + ((assigned[i].pci_phys_hi & PCI_ADDR_MASK) + == PCI_ADDR_MEM64)) { + mmio_lo = (((uint64_t)assigned[i].pci_phys_mid) << 32) | + (uint64_t)assigned[i].pci_phys_low; + mmio_size = + (((uint64_t)assigned[i].pci_size_hi) << 32) | + (uint64_t)assigned[i].pci_size_low; + mmio_hi = mmio_lo + mmio_size - 1; + + rmem = kmem_alloc(sizeof (dmar_reserve_pages_t), + KM_SLEEP); + rmem->rm_pfn_start = IOMMU_BTOP(mmio_lo); + rmem->rm_pfn_end = IOMMU_BTOP(mmio_hi); + if (!memory_region_overlap(rmem)) { + list_insert_tail(&reserve_memory, rmem); + } + } + } + + kmem_free(assigned, length); + + return (DDI_WALK_CONTINUE); +} + +/* + * collect_pci_mmio() + * walk through the pci device tree, and collect the mmio resources + */ +static int +collect_pci_mmio(dev_info_t *pdip) +{ + int count; + ASSERT(pdip); + + /* + * walk through the device tree under pdip + * normally, pdip should be the pci root nexus + */ + ndi_devi_enter(pdip, &count); + ddi_walk_devs(ddi_get_child(pdip), + collect_pci_mmio_walk, NULL); + ndi_devi_exit(pdip, count); + + return (DDI_SUCCESS); +} + 
+/* + * iommu_collect_reserve_memory() + * collect the reserved memory region + */ +static void +iommu_collect_reserve_memory(void) +{ + dmar_reserve_pages_t *rmem; + + /* + * reserve pages for pci memory mapped io + */ + (void) collect_pci_mmio(pci_top_devinfo); + + /* + * reserve pages for ioapic + */ + rmem = kmem_alloc(sizeof (dmar_reserve_pages_t), KM_SLEEP); + rmem->rm_pfn_start = IOMMU_BTOP(IOAPIC_REGION_START); + rmem->rm_pfn_end = IOMMU_BTOP(IOAPIC_REGION_END); + list_insert_tail(&reserve_memory, rmem); +} + +/* + * match_dip_sbdf() + * walk function for get_dip_from_info() + */ +static int +match_dip_sbdf(dev_info_t *dip, void *arg) +{ + iommu_private_t *private = DEVI(dip)->devi_iommu_private; + pci_dev_info_t *info = arg; + + if (private && + (info->pdi_seg == private->idp_seg) && + (info->pdi_bus == private->idp_bus) && + (info->pdi_devfn == private->idp_devfn)) { + info->pdi_dip = dip; + return (DDI_WALK_TERMINATE); + } + return (DDI_WALK_CONTINUE); +} + +/* + * get_dip_from_info() + * get the dev_info structure by pass a bus/dev/func + */ +static int +get_dip_from_info(pci_dev_info_t *info) +{ + int count; + info->pdi_dip = NULL; + + ndi_devi_enter(pci_top_devinfo, &count); + ddi_walk_devs(ddi_get_child(pci_top_devinfo), + match_dip_sbdf, info); + ndi_devi_exit(pci_top_devinfo, count); + + if (info->pdi_dip) + return (DDI_SUCCESS); + else + return (DDI_FAILURE); +} + +/* + * get_pci_top_bridge() + * get the top level bridge for a pci device + */ +static dev_info_t * +get_pci_top_bridge(dev_info_t *dip) +{ + iommu_private_t *private; + dev_info_t *tmp, *pdip; + + tmp = NULL; + pdip = ddi_get_parent(dip); + while (pdip != pci_top_devinfo) { + private = DEVI(pdip)->devi_iommu_private; + if ((private->idp_bbp_type == IOMMU_PPB_PCIE_PCI) || + (private->idp_bbp_type == IOMMU_PPB_PCI_PCI)) + tmp = pdip; + pdip = ddi_get_parent(pdip); + } + + return (tmp); +} + +/* + * domain_vmem_init_reserve() + * dish out the reserved pages + */ +static void +domain_vmem_init_reserve(dmar_domain_state_t *domain) +{ + dmar_reserve_pages_t *rmem; + uint64_t lo, hi; + size_t size; + + for_each_in_list(&reserve_memory, rmem) { + lo = IOMMU_PTOB(rmem->rm_pfn_start); + hi = IOMMU_PTOB(rmem->rm_pfn_end + 1); + size = hi - lo; + + if (vmem_xalloc(domain->dm_dvma_map, + size, /* size */ + IOMMU_PAGE_SIZE, /* align/quantum */ + 0, /* phase */ + 0, /* nocross */ + (void *)(uintptr_t)lo, /* minaddr */ + (void *)(uintptr_t)hi, /* maxaddr */ + VM_NOSLEEP) == NULL) { + cmn_err(CE_WARN, + "region [%" PRIx64 ",%" PRIx64 ") not reserved", + lo, hi); + } + } +} + +/* + * domain_vmem_init() + * initiate the domain vmem + */ +static void +domain_vmem_init(dmar_domain_state_t *domain) +{ + char vmem_name[64]; + uint64_t base, size; + static uint_t vmem_instance = 0; + + /* + * create the whole available virtual address and + * dish out the reserved memory regions with xalloc + */ + (void) snprintf(vmem_name, sizeof (vmem_name), + "domain_vmem_%d", vmem_instance++); + base = IOMMU_PAGE_SIZE; + size = IOMMU_SIZE_4G - base; + + domain->dm_dvma_map = vmem_create(vmem_name, + (void *)(uintptr_t)base, /* base */ + size, /* size */ + IOMMU_PAGE_SIZE, /* quantum */ + NULL, /* afunc */ + NULL, /* ffunc */ + NULL, /* source */ + 0, /* qcache_max */ + VM_SLEEP); + + /* + * dish out the reserved pages + */ + domain_vmem_init_reserve(domain); +} + +/* + * iommu_domain_init() + * initiate a domain + */ +static int +iommu_domain_init(dmar_domain_state_t *domain) +{ + uint_t i; + + /* + * allocate the domain id + */ + if 
(iommu_rscs_alloc(domain->dm_iommu->iu_domain_id_hdl, + &(domain->dm_domain_id)) != DDI_SUCCESS) { + cmn_err(CE_WARN, "domain id exhausted %p, assign 1", + (void *)domain->dm_iommu); + domain->dm_domain_id = 1; + } + + /* + * record the domain statistics + */ + atomic_inc_64(&(domain->dm_iommu->iu_statistics.st_domain_alloc)); + + /* + * create vmem map + */ + domain_vmem_init(domain); + + /* + * create the first level page table + */ + domain->dm_page_table_paddr = + iommu_get_page(domain->dm_iommu, KM_SLEEP); + + /* + * init the CPU available page tables + */ + domain->dm_pt_tree.vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP); + domain->dm_pt_tree.pp = iommu_page_map(domain->dm_page_table_paddr); + domain->dm_identity = B_FALSE; + + /* + * init the dvma cache + */ + for (i = 0; i < DVMA_CACHE_HEAD_CNT; i++) { + /* init the free list */ + mutex_init(&(domain->dm_dvma_cache[i].dch_free_lock), + NULL, MUTEX_DRIVER, NULL); + list_create(&(domain->dm_dvma_cache[i].dch_free_list), + sizeof (dvma_cache_node_t), + offsetof(dvma_cache_node_t, node)); + domain->dm_dvma_cache[i].dch_free_count = 0; + + /* init the memory cache list */ + mutex_init(&(domain->dm_dvma_cache[i].dch_mem_lock), + NULL, MUTEX_DRIVER, NULL); + list_create(&(domain->dm_dvma_cache[i].dch_mem_list), + sizeof (dvma_cache_node_t), + offsetof(dvma_cache_node_t, node)); + } + + return (DDI_SUCCESS); +} + +/* + * dmar_check_sub() + * check to see if the device is under scope of a p2p bridge + */ +static boolean_t +dmar_check_sub(dev_info_t *dip, pci_dev_scope_t *devs) +{ + dev_info_t *pdip, *pci_root; + iommu_private_t *private; + int bus = devs->pds_bus; + int devfn = ((devs->pds_dev << 3) | devs->pds_func); + + pdip = ddi_get_parent(dip); + pci_root = pci_top_devinfo; + while (pdip != pci_root) { + private = DEVI(pdip)->devi_iommu_private; + if (private && (private->idp_bus == bus) && + (private->idp_devfn == devfn)) + return (B_TRUE); + pdip = ddi_get_parent(pdip); + } + + return (B_FALSE); +} + +/* + * iommu_get_dmar() + * get the iommu structure for a device + */ +static intel_iommu_state_t * +iommu_get_dmar(dev_info_t *dip) +{ + iommu_private_t *private = + DEVI(dip)->devi_iommu_private; + int seg = private->idp_seg; + int bus = private->idp_bus; + int dev = private->idp_devfn >> 3; + int func = private->idp_devfn & 7; + pci_dev_scope_t *devs; + drhd_info_t *drhd; + + /* + * walk the drhd list for a match + */ + for_each_in_list(&(dmar_info->dmari_drhd[seg]), drhd) { + + /* + * match the include all + */ + if (drhd->di_include_all) + return ((intel_iommu_state_t *) + drhd->di_iommu); + + /* + * try to match the device scope + */ + for_each_in_list(&(drhd->di_dev_list), devs) { + + /* + * get a perfect match + */ + if (devs->pds_bus == bus && + devs->pds_dev == dev && + devs->pds_func == func) { + return ((intel_iommu_state_t *) + (drhd->di_iommu)); + } + + /* + * maybe under a scope of a p2p + */ + if (devs->pds_type == 0x2 && + dmar_check_sub(dip, devs)) + return ((intel_iommu_state_t *) + (drhd->di_iommu)); + } + } + + /* + * shouldn't get here + */ + cmn_err(CE_PANIC, "can't match iommu for %s\n", + ddi_node_name(dip)); + + return (NULL); +} + +/* + * domain_set_root_context + * set root context for a single device + */ +static void +domain_set_root_context(dmar_domain_state_t *domain, + pci_dev_info_t *info, uint_t agaw) +{ + caddr_t root, context; + paddr_t paddr; + iorce_t rce; + uint_t bus, devfn; + intel_iommu_state_t *iommu; + uint_t aw_code; + + ASSERT(domain); + iommu = domain->dm_iommu; + ASSERT(iommu); + bus = 
info->pdi_bus; + devfn = info->pdi_devfn; + aw_code = (agaw - 30) / 9; + + /* + * set root entry + */ + root = iommu_page_map(iommu->iu_root_entry_paddr); + rce = (iorce_t)root + bus; + mutex_enter(&(iommu->iu_root_context_lock)); + if (!ROOT_ENTRY_GET_P(rce)) { + paddr = iommu_get_page(iommu, KM_SLEEP); + ROOT_ENTRY_SET_P(rce); + ROOT_ENTRY_SET_CTP(rce, paddr); + iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce)); + context = iommu_page_map(paddr); + } else { + paddr = ROOT_ENTRY_GET_CTP(rce); + context = iommu_page_map(paddr); + } + + /* set context entry */ + rce = (iorce_t)context + devfn; + if (!CONT_ENTRY_GET_P(rce)) { + paddr = domain->dm_page_table_paddr; + CONT_ENTRY_SET_P(rce); + CONT_ENTRY_SET_ASR(rce, paddr); + CONT_ENTRY_SET_AW(rce, aw_code); + CONT_ENTRY_SET_DID(rce, domain->dm_domain_id); + iommu->iu_dmar_ops->do_clflush((caddr_t)rce, sizeof (*rce)); + } else if (CONT_ENTRY_GET_ASR(rce) != + domain->dm_page_table_paddr) { + cmn_err(CE_WARN, "root context entries for" + " %d, %d, %d has been set", bus, + devfn >>3, devfn & 0x7); + } + + mutex_exit(&(iommu->iu_root_context_lock)); + iommu_page_unmap(root); + iommu_page_unmap(context); + + /* cache mode set, flush context cache */ + if (IOMMU_CAP_GET_CM(iommu->iu_capability)) { + iommu->iu_dmar_ops->do_context_fsi(iommu, 0, + (bus << 8) | devfn, domain->dm_domain_id); + iommu->iu_dmar_ops->do_iotlb_dsi(iommu, domain->dm_domain_id); + /* cache mode not set, flush write buffer */ + } else { + iommu->iu_dmar_ops->do_flwb(iommu); + } +} + +/* + * setup_single_context() + * setup the root context entry + */ +static void +setup_single_context(dmar_domain_state_t *domain, + int seg, int bus, int devfn) +{ + pci_dev_info_t info; + + info.pdi_seg = seg; + info.pdi_bus = bus; + info.pdi_devfn = devfn; + + domain_set_root_context(domain, &info, + domain->dm_iommu->iu_agaw); +} + +/* + * setup_context_walk() + * the walk function to set up the possible context entries + */ +static int +setup_context_walk(dev_info_t *dip, void *arg) +{ + dmar_domain_state_t *domain = arg; + iommu_private_t *private; + + private = DEVI(dip)->devi_iommu_private; + ASSERT(private); + + setup_single_context(domain, private->idp_seg, + private->idp_bus, private->idp_devfn); + return (DDI_WALK_PRUNECHILD); +} + +/* + * setup_possible_contexts() + * set up all the possible context entries for a device under ppb + */ +static void +setup_possible_contexts(dmar_domain_state_t *domain, dev_info_t *dip) +{ + int count; + iommu_private_t *private; + private = DEVI(dip)->devi_iommu_private; + + /* for pci-pci bridge */ + if (private->idp_bbp_type == IOMMU_PPB_PCI_PCI) { + setup_single_context(domain, private->idp_seg, + private->idp_bus, private->idp_devfn); + return; + } + + /* for pcie-pci bridge */ + setup_single_context(domain, private->idp_seg, + private->idp_bus, private->idp_devfn); + setup_single_context(domain, private->idp_seg, + private->idp_sec, 0); + + /* for functions under pcie-pci bridge */ + ndi_devi_enter(dip, &count); + ddi_walk_devs(ddi_get_child(dip), setup_context_walk, domain); + ndi_devi_exit(dip, count); +} + +/* + * iommu_alloc_domain() + * allocate a domain for device, the result is returned in domain parameter + */ +static int +iommu_alloc_domain(dev_info_t *dip, dmar_domain_state_t **domain) +{ + iommu_private_t *private, *b_private; + dmar_domain_state_t *new; + pci_dev_info_t info; + dev_info_t *bdip = NULL; + uint_t need_to_set_parent; + int count, pcount; + + need_to_set_parent = 0; + private = DEVI(dip)->devi_iommu_private; + if 
(private == NULL) { + cmn_err(CE_PANIC, "iommu private is NULL (%s)\n", + ddi_node_name(dip)); + } + + /* + * check if the domain has already allocated + */ + if (private->idp_domain) { + *domain = private->idp_domain; + return (DDI_SUCCESS); + } + + /* + * we have to assign a domain for this device, + */ + + ndi_hold_devi(dip); + bdip = get_pci_top_bridge(dip); + if (bdip != NULL) { + ndi_devi_enter(ddi_get_parent(bdip), &pcount); + } + + /* + * hold the parent for modifying its children + */ + ndi_devi_enter(ddi_get_parent(dip), &count); + + /* + * check to see if it is under a pci bridge + */ + if (bdip != NULL) { + b_private = DEVI(bdip)->devi_iommu_private; + if (b_private->idp_domain) { + new = b_private->idp_domain; + goto get_domain_finish; + } else { + need_to_set_parent = 1; + } + } + +get_domain_alloc: + /* + * OK, we have to allocate a new domain + */ + new = kmem_alloc(sizeof (dmar_domain_state_t), KM_SLEEP); + new->dm_iommu = iommu_get_dmar(dip); + + /* + * setup the domain + */ + if (iommu_domain_init(new) != DDI_SUCCESS) { + ndi_devi_exit(ddi_get_parent(dip), count); + if (need_to_set_parent) + ndi_devi_exit(ddi_get_parent(bdip), pcount); + return (DDI_FAILURE); + } + +get_domain_finish: + /* + * add the device to the domain's device list + */ + private->idp_domain = new; + ndi_devi_exit(ddi_get_parent(dip), count); + + if (need_to_set_parent) { + b_private->idp_domain = new; + ndi_devi_exit(ddi_get_parent(bdip), pcount); + setup_possible_contexts(new, bdip); + } else if (bdip == NULL) { + info.pdi_seg = private->idp_seg; + info.pdi_bus = private->idp_bus; + info.pdi_devfn = private->idp_devfn; + domain_set_root_context(new, &info, + new->dm_iommu->iu_agaw); + } else { + ndi_devi_exit(ddi_get_parent(bdip), pcount); + } + + /* + * return new domain + */ + *domain = new; + return (DDI_SUCCESS); +} + +/* + * iommu_get_domain() + * get a iommu domain for dip, and the result is returned in domain + */ +static int +iommu_get_domain(dev_info_t *dip, dmar_domain_state_t **domain) +{ + iommu_private_t *private; + dev_info_t *pdip; + private = DEVI(dip)->devi_iommu_private; + + ASSERT(domain); + + /* + * for isa devices attached under lpc + */ + if (ddi_get_parent(dip) == isa_top_devinfo) { + if (lpc_devinfo) { + return (iommu_alloc_domain(lpc_devinfo, domain)); + } else { + *domain = NULL; + return (DDI_FAILURE); + } + } + + /* + * for gart, use the real graphic devinfo + */ + if (strcmp(ddi_node_name(dip), "agpgart") == 0) { + if (gfx_devinfo) { + return (iommu_alloc_domain(gfx_devinfo, domain)); + } else { + *domain = NULL; + return (DDI_FAILURE); + } + } + + /* + * if iommu private is NULL, we share + * the domain with the parent + */ + if (private == NULL) { + pdip = ddi_get_parent(dip); + return (iommu_alloc_domain(pdip, domain)); + } + + /* + * check if the domain has already allocated + */ + if (private->idp_domain) { + *domain = private->idp_domain; + return (DDI_SUCCESS); + } + + /* + * allocate a domain for this device + */ + return (iommu_alloc_domain(dip, domain)); +} + +/* + * helper functions to manipulate iommu pte + */ +static inline void +set_pte(iopte_t pte, uint_t rw, paddr_t addr) +{ + *pte |= (rw & 0x3); + *pte |= (addr & IOMMU_PAGE_MASK); +} + +static inline paddr_t +pte_get_paddr(iopte_t pte) +{ + return (*pte & IOMMU_PAGE_MASK); +} + +/* + * dvma_level_offset() + * get the page table offset by specifying a dvma and level + */ +static inline uint_t +dvma_level_offset(uint64_t dvma_pn, uint_t level) +{ + uint_t start_bit, offset; + + start_bit = (level - 1) * 
IOMMU_LEVEL_STRIDE; + offset = (dvma_pn >> start_bit) & IOMMU_LEVEL_OFFSET; + + return (offset); +} + +/* + * iommu_setup_level_table() + * setup the page table for a level + */ +static iovpte_t +iommu_setup_level_table(dmar_domain_state_t *domain, + iovpte_t pvpte, uint_t offset) +{ + iopte_t pte; + iovpte_t vpte; + paddr_t child; + + vpte = (iovpte_t)(pvpte->vp) + offset; + pte = (iopte_t)(pvpte->pp) + offset; + + /* + * the pte is nonpresent, alloc new page + */ + if (*pte == NULL) { + child = iommu_get_page(domain->dm_iommu, KM_SLEEP); + set_pte(pte, IOMMU_PAGE_PROP_RW, child); + domain->dm_iommu->iu_dmar_ops->do_clflush((caddr_t)pte, + sizeof (*pte)); + vpte->vp = kmem_zalloc(IOMMU_PAGE_SIZE << 1, KM_SLEEP); + vpte->pp = iommu_page_map(child); + } + + return (vpte); +} + +/* + * iommu_setup_page_table() + * setup the page table for a dvma + */ +static caddr_t +iommu_setup_page_table(dmar_domain_state_t *domain, uint64_t dvma) +{ + iovpte_t vpte; + uint_t level; + uint_t offset; + int i; + + level = domain->dm_iommu->iu_level; + vpte = &(domain->dm_pt_tree); + + for (i = level; i > 1; i--) { + offset = dvma_level_offset(IOMMU_BTOP(dvma), i); + vpte = iommu_setup_level_table(domain, vpte, offset); + } + + return (vpte->pp); +} + +/* + * iommu_map_page_range() + * map a range of pages for iommu translation + * + * domain: the device domain + * dvma: the start dvma for mapping + * start: the start physcial address + * end: the end physical address + * flags: misc flag + */ +static int +iommu_map_page_range(dmar_domain_state_t *domain, uint64_t dvma, + uint64_t start, uint64_t end, int flags) +{ + uint_t offset; + iopte_t pte; + caddr_t vaddr, dirt; + uint64_t paddr = start & IOMMU_PAGE_MASK; + uint64_t epaddr = end & IOMMU_PAGE_MASK; + uint64_t ioaddr = dvma & IOMMU_PAGE_MASK; + uint_t count; + + while (paddr <= epaddr) { + vaddr = iommu_setup_page_table(domain, ioaddr); + offset = dvma_level_offset(IOMMU_BTOP(ioaddr), 1); + + count = 0; + dirt = (caddr_t)((iopte_t)vaddr + offset); + while ((paddr <= epaddr) && (offset < IOMMU_PTE_MAX)) { + pte = (iopte_t)vaddr + offset; + if (*pte != NULL) { + if (pte_get_paddr(pte) != paddr) { + cmn_err(CE_WARN, "try to set " + "non-NULL pte"); + } + } else { + set_pte(pte, IOMMU_PAGE_PROP_RW, paddr); + } + paddr += IOMMU_PAGE_SIZE; + offset++; + count++; + } + + /* flush cpu and iotlb cache */ + domain->dm_iommu->iu_dmar_ops->do_clflush(dirt, + count * sizeof (uint64_t)); + + if (!(flags & IOMMU_PAGE_PROP_NOSYNC)) { + /* cache mode set, flush iotlb */ + if (IOMMU_CAP_GET_CM(domain->dm_iommu->iu_capability)) { + domain->dm_iommu->iu_dmar_ops-> + do_iotlb_psi(domain->dm_iommu, + 0, ioaddr, count, TLB_IVA_WHOLE); + /* cache mode not set, flush write buffer */ + } else { + domain->dm_iommu->iu_dmar_ops-> + do_flwb(domain->dm_iommu); + } + } + + ioaddr += IOMMU_PTOB(count); + } + + return (DDI_SUCCESS); +} + +/* + * build_single_rmrr_identity_map() + * build identity map for a single rmrr unit + */ +static void +build_single_rmrr_identity_map(rmrr_info_t *rmrr) +{ + pci_dev_scope_t *devs; + pci_dev_info_t info; + uint64_t start, end, size; + dmar_domain_state_t *domain; + + info.pdi_seg = rmrr->ri_segment; + for_each_in_list(&(rmrr->ri_dev_list), devs) { + info.pdi_bus = devs->pds_bus; + info.pdi_devfn = (devs->pds_dev << 3) | + devs->pds_func; + + if (get_dip_from_info(&info) != DDI_SUCCESS) { + cmn_err(CE_WARN, "rmrr: get dip for %d,%d failed", + info.pdi_bus, info.pdi_devfn); + continue; + } + + if (iommu_get_domain(info.pdi_dip, &domain) != DDI_SUCCESS) 
{ + cmn_err(CE_WARN, "rmrr: get domain for %s failed", + ddi_node_name(info.pdi_dip)); + continue; + } + + start = rmrr->ri_baseaddr; + end = rmrr->ri_limiaddr; + size = end - start + 1; + + /* + * setup the page tables + */ + if ((vmem_xalloc(domain->dm_dvma_map, + size, /* size */ + IOMMU_PAGE_SIZE, /* align/quantum */ + 0, /* phase */ + 0, /* nocross */ + (void *)(uintptr_t)start, /* minaddr */ + (void *)(uintptr_t)(end + 1), /* maxaddr */ + VM_NOSLEEP) != NULL)) { + (void) iommu_map_page_range(domain, + start, start, end, + DDI_DMA_READ | DDI_DMA_WRITE | + IOMMU_PAGE_PROP_NOSYNC); + } else { + cmn_err(CE_WARN, "Can't reserve 0x%" PRIx64 + " ~ 0x%" PRIx64 " for %s", start, end, + ddi_node_name(info.pdi_dip)); + } + } +} + +/* + * build_rmrr_identity_map() + * build identity mapping for devices under rmrr scopes + */ +static void +build_rmrr_identity_map(void) +{ + rmrr_info_t *rmrr; + int i; + + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + if (list_is_empty(&(dmar_info->dmari_rmrr[i]))) + break; + for_each_in_list(&(dmar_info->dmari_rmrr[i]), rmrr) { + build_single_rmrr_identity_map(rmrr); + } + } +} + +/* + * drhd_only_for_gfx() + * return TRUE, if the drhd is only for gfx + */ +static boolean_t +drhd_only_for_gfx(intel_iommu_state_t *iommu) +{ + drhd_info_t *drhd = iommu->iu_drhd; + pci_dev_scope_t *devs; + pci_dev_info_t info; + int dev_num; + + if (drhd->di_include_all) + return (B_FALSE); + + /* get the device number attached to this drhd */ + dev_num = 0; + for_each_in_list(&(drhd->di_dev_list), devs) { + dev_num++; + } + + if (dev_num == 1) { + iommu_private_t *private; + devs = list_head(&(drhd->di_dev_list)); + info.pdi_seg = drhd->di_segment; + info.pdi_bus = devs->pds_bus; + info.pdi_devfn = (devs->pds_dev << 3) + + (devs->pds_func & 0x7); + + if (get_dip_from_info(&info) != DDI_SUCCESS) { + return (B_FALSE); + } + + private = DEVI(info.pdi_dip)->devi_iommu_private; + if (private->idp_is_display) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * build_gfx_identity_map() + * build identity map for the gfx device + */ +static void +build_gfx_identity_map(dev_info_t *dip) +{ + struct memlist *mp; + dmar_domain_state_t *domain; + + if (iommu_get_domain(dip, &domain) != DDI_SUCCESS) { + cmn_err(CE_WARN, "build identity map for %s failed," + "this device may not be functional", + ddi_node_name(dip)); + return; + } + + gfx_devinfo = dip; + + ASSERT(bootops != NULL); + ASSERT(!modrootloaded); + mp = bootops->boot_mem->physinstalled; + while (mp != 0) { + (void) iommu_map_page_range(domain, + mp->address & IOMMU_PAGE_MASK, + mp->address & IOMMU_PAGE_MASK, + (mp->address + mp->size - 1) & IOMMU_PAGE_MASK, + DDI_DMA_READ | DDI_DMA_WRITE | + IOMMU_PAGE_PROP_NOSYNC); + mp = mp->next; + } + + /* + * record the identity map for domain, any device + * which uses this domain will needn't any further + * map + */ + domain->dm_identity = B_TRUE; +} + +/* + * build_isa_gfx_identity_walk() + * the walk function for build_isa_gfx_identity_map() + */ +static int +build_isa_gfx_identity_walk(dev_info_t *dip, void *arg) +{ + _NOTE(ARGUNUSED(arg)) + + iommu_private_t *private; + private = DEVI(dip)->devi_iommu_private; + + /* ignore the NULL private device */ + if (!private) + return (DDI_WALK_CONTINUE); + + /* fix the gfx and fd */ + if (private->idp_is_display) + build_gfx_identity_map(dip); + else if (private->idp_is_lpc) + lpc_devinfo = dip; + + return (DDI_WALK_CONTINUE); +} + +/* + * build_isa_gfx_identity_map() + * build identity map for isa and gfx devices + */ +static void 
+build_isa_gfx_identity_map(void) +{ + int count; + + /* + * walk through the device tree from pdip + * normally, pdip should be the pci root + */ + ndi_devi_enter(pci_top_devinfo, &count); + ddi_walk_devs(ddi_get_child(pci_top_devinfo), + build_isa_gfx_identity_walk, NULL); + ndi_devi_exit(pci_top_devinfo, count); +} + +/* + * dmar_check_boot_option() + * check the intel iommu boot option + */ +static void +dmar_check_boot_option(char *opt, int *var) +{ + int len; + char *boot_option; + + if ((len = do_bsys_getproplen(NULL, opt)) > 0) { + boot_option = kmem_alloc(len, KM_SLEEP); + (void) do_bsys_getprop(NULL, opt, boot_option); + if (strcmp(boot_option, "yes") == 0) { + cmn_err(CE_CONT, "\"%s=yes\" was set\n", + opt); + *var = 1; + } else if (strcmp(boot_option, "no") == 0) { + cmn_err(CE_CONT, "\"%s=no\" was set\n", + opt); + *var = 0; + } + kmem_free(boot_option, len); + } +} + +extern void (*rootnex_iommu_add_intr)(void); + +/* + * intel_iommu_attach_dmar_nodes() + * attach intel iommu nodes + */ +int +intel_iommu_attach_dmar_nodes(void) +{ + drhd_info_t *drhd; + intel_iommu_state_t *iommu; + dmar_reserve_pages_t *rmem; + int i; + + /* + * retrieve the dmar boot options + */ + cmn_err(CE_CONT, "?Start to check dmar related boot options\n"); + dmar_check_boot_option("dmar-gfx-disable", &gfx_drhd_disable); + dmar_check_boot_option("dmar-drhd-disable", &dmar_drhd_disable); + + /* + * init the lists + */ + list_create(&iommu_states, sizeof (intel_iommu_state_t), + offsetof(intel_iommu_state_t, node)); + list_create(&reserve_memory, sizeof (dmar_reserve_pages_t), + offsetof(dmar_reserve_pages_t, node)); + + pci_top_devinfo = ddi_find_devinfo("pci", -1, 0); + isa_top_devinfo = ddi_find_devinfo("isa", -1, 0); + if (pci_top_devinfo == NULL) { + cmn_err(CE_WARN, "can't get pci top devinfo"); + return (DDI_FAILURE); + } + + iommu_page_init(); + + /* + * initiate each iommu unit + */ + cmn_err(CE_CONT, "?Start to create iommu state structures\n"); + for (i = 0; i < DMAR_MAX_SEGMENT; i++) { + for_each_in_list(&(dmar_info->dmari_drhd[i]), drhd) { + if (create_iommu_state(drhd) != DDI_SUCCESS) + goto iommu_init_fail; + } + } + + /* + * collect the reserved memory pages + */ + cmn_err(CE_CONT, "?Start to collect the reserved memory\n"); + iommu_collect_reserve_memory(); + + /* + * build identity map for devices in the rmrr scope + */ + cmn_err(CE_CONT, "?Start to prepare identity map for rmrr\n"); + build_rmrr_identity_map(); + + /* + * build identity map for isa and gfx devices + */ + cmn_err(CE_CONT, "?Start to prepare identity map for gfx\n"); + build_isa_gfx_identity_map(); + + /* + * initialize the dvma cookie cache + */ + for (i = 0; i < MAX_COOKIE_CACHE_SIZE; i++) { + mutex_init(&(cookie_cache[i].dch_lock), NULL, + MUTEX_DRIVER, NULL); + cookie_cache[i].dch_count = 0; + cookie_cache[i].dch_next = NULL; + } + + /* + * regist the intr add function + */ + rootnex_iommu_add_intr = intel_iommu_add_intr; + + /* + * enable dma remapping + */ + cmn_err(CE_CONT, "?Start to enable the dmar units\n"); + if (!dmar_drhd_disable) { + for_each_in_list(&iommu_states, iommu) { + if (gfx_drhd_disable && + drhd_only_for_gfx(iommu)) + continue; + iommu_bringup_unit(iommu); + iommu->iu_enabled = B_TRUE; + } + } + + return (DDI_SUCCESS); + +iommu_init_fail: + /* + * free the reserve memory list + */ + while (rmem = list_head(&reserve_memory)) { + list_remove(&reserve_memory, rmem); + kmem_free(rmem, sizeof (dmar_reserve_pages_t)); + } + list_destroy(&reserve_memory); + + /* + * free iommu state structure + */ + 
while (iommu = list_head(&iommu_states)) { + list_remove(&iommu_states, iommu); + destroy_iommu_state(iommu); + } + list_destroy(&iommu_states); + + return (DDI_FAILURE); +} + +/* + * get_level_table() + * get level n page table, NULL is returned if + * failure encountered + */ +static caddr_t +get_level_table(dmar_domain_state_t *domain, + uint64_t dvma_pn, uint_t n) +{ + iovpte_t vpte; + uint_t level; + uint_t i, offset; + + level = domain->dm_iommu->iu_level; + ASSERT(level >= n); + vpte = &(domain->dm_pt_tree); + + /* walk to the level n page table */ + for (i = level; i > n; i--) { + offset = dvma_level_offset(dvma_pn, i); + vpte = (iovpte_t)(vpte->vp) + offset; + } + + return (vpte->pp); +} + +/* + * iommu_alloc_cookie_array() + * allocate the cookie array which is needed by map sgl + */ +static int +iommu_alloc_cookie_array(rootnex_dma_t *dma, + struct ddi_dma_req *dmareq, uint_t prealloc) +{ + int kmflag; + rootnex_sglinfo_t *sinfo = &(dma->dp_sglinfo); + + /* figure out the rough estimate of array size */ + sinfo->si_max_pages = + (dmareq->dmar_object.dmao_size + IOMMU_PAGE_OFFSET) / + sinfo->si_max_cookie_size + 1; + + /* the preallocated buffer fit this size */ + if (sinfo->si_max_pages <= prealloc) { + dma->dp_cookies = (ddi_dma_cookie_t *)dma->dp_prealloc_buffer; + dma->dp_need_to_free_cookie = B_FALSE; + /* we need to allocate new array */ + } else { + /* convert the sleep flags */ + if (dmareq->dmar_fp == DDI_DMA_SLEEP) { + kmflag = KM_SLEEP; + } else { + kmflag = KM_NOSLEEP; + } + + dma->dp_cookie_size = sinfo->si_max_pages * + sizeof (ddi_dma_cookie_t); + dma->dp_cookies = kmem_alloc(dma->dp_cookie_size, kmflag); + if (dma->dp_cookies == NULL) { + return (IOMMU_SGL_NORESOURCES); + } + dma->dp_need_to_free_cookie = B_TRUE; + } + + /* allocate the dvma cookie array */ + dma->dp_dvma_cookies = get_dvma_cookie_array(sinfo->si_max_pages); + + return (IOMMU_SGL_SUCCESS); +} + +/* + * iommu_alloc_dvma() + * alloc a dvma range for the caller + */ +static int +iommu_alloc_dvma(dmar_domain_state_t *domain, uint_t size, + ddi_dma_impl_t *hp, uint64_t *dvma, uint_t cnt) +{ + rootnex_dma_t *dma; + ddi_dma_attr_t *dma_attr; + iommu_dvma_cookie_t *dcookie; + uint64_t ioaddr; + size_t xsize, align, nocross; + uint64_t minaddr, maxaddr; + + /* shotcuts */ + dma = (rootnex_dma_t *)hp->dmai_private; + dma_attr = &(hp->dmai_attr); + dcookie = dma->dp_dvma_cookies; + + /* parameters */ + xsize = (size + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK; + align = MAX((size_t)(dma_attr->dma_attr_align), IOMMU_PAGE_SIZE); + nocross = (size_t)(dma_attr->dma_attr_seg + 1); + minaddr = dma_attr->dma_attr_addr_lo; + maxaddr = dma_attr->dma_attr_addr_hi + 1; + + /* handle the rollover cases */ + if (maxaddr < dma_attr->dma_attr_addr_hi) { + maxaddr = dma_attr->dma_attr_addr_hi; + } + + /* get from cache first */ + ioaddr = iommu_dvma_cache_get(domain, xsize, align, nocross); + + if (ioaddr == NULL) { + /* allocate from vmem arena */ + ioaddr = (uint64_t)(uintptr_t)vmem_xalloc(domain->dm_dvma_map, + xsize, align, 0, nocross, + (void *)(uintptr_t)minaddr, + (void *)(uintptr_t)maxaddr, + VM_NOSLEEP); + + /* if xalloc failed, we have to flush the cache and retry */ + if (ioaddr == NULL) { + iommu_dvma_cache_flush(domain, dma->dp_dip); + ioaddr = (uint64_t)(uintptr_t)vmem_xalloc( + domain->dm_dvma_map, + xsize, align, 0, nocross, + (void *)(uintptr_t)minaddr, + (void *)(uintptr_t)maxaddr, + VM_NOSLEEP); + ASSERT(ioaddr); + } + } + + ASSERT(ioaddr >= minaddr); + ASSERT(ioaddr + size - 1 < maxaddr); + + *dvma = ioaddr; 
+ + /* + * save the dvma range in the device dvma cookie + */ + dcookie[cnt].dc_addr = ioaddr; + dcookie[cnt].dc_size = xsize; + dcookie[cnt].dc_domain = domain; + dcookie[cnt].dc_align = align; + + return (DDI_SUCCESS); +} + +/* + * iommu_map_dvma() + * map dvma to the physical addresses, the actual + * mapped dvma page number is returned + */ +static int +iommu_map_dvma(dmar_domain_state_t *domain, uint64_t dvma, + uint64_t paddr, uint_t psize, struct ddi_dma_req *dmareq) +{ + uint64_t start, end; + int flags; + + start = paddr & IOMMU_PAGE_MASK; + end = (paddr + psize - 1) & IOMMU_PAGE_MASK; + flags = dmareq->dmar_flags & DDI_DMA_RDWR; + + /* map each physical address */ + (void) iommu_map_page_range(domain, dvma, start, end, flags); + return (IOMMU_BTOP(end - start) + 1); +} + +/* + * intel_iommu_map_sgl() + * called from rootnex_dma_bindhdl(), to build dma + * cookies when iommu is enabled + */ +int +intel_iommu_map_sgl(ddi_dma_handle_t handle, + struct ddi_dma_req *dmareq, uint_t prealloc) +{ + ddi_dma_atyp_t buftype; + uint64_t offset; + page_t **pparray; + uint64_t paddr; + uint64_t dvma; + uint_t psize; + uint_t size; + uint64_t maxseg; + caddr_t vaddr; + uint_t pcnt, cnt; + page_t *page; + ddi_dma_cookie_t *sgl; + rootnex_sglinfo_t *sglinfo; + ddi_dma_obj_t *dmar_object; + ddi_dma_impl_t *hp; + rootnex_dma_t *dma; + dmar_domain_state_t *domain; + int e; + + hp = (ddi_dma_impl_t *)handle; + dma = (rootnex_dma_t *)hp->dmai_private; + sglinfo = &(dma->dp_sglinfo); + dmar_object = &(dmareq->dmar_object); + maxseg = sglinfo->si_max_cookie_size; + pparray = dmar_object->dmao_obj.virt_obj.v_priv; + vaddr = dmar_object->dmao_obj.virt_obj.v_addr; + buftype = dmar_object->dmao_type; + size = dmar_object->dmao_size; + + /* get domain for the dma request */ + if (iommu_get_domain(dma->dp_dip, &domain) != DDI_SUCCESS) { + cmn_err(CE_WARN, "get domain for %s failed", + ddi_node_name(dma->dp_dip)); + return (IOMMU_SGL_NORESOURCES); + } + + /* direct return if drhd is disabled */ + if (!(domain->dm_iommu->iu_enabled) || + domain->dm_identity) + return (IOMMU_SGL_DISABLE); + + /* + * allocate the cookies arrays, if the pre-allocated + * space is not enough, we should reallocate it + */ + if (iommu_alloc_cookie_array(dma, dmareq, prealloc) + != IOMMU_SGL_SUCCESS) + return (IOMMU_SGL_NORESOURCES); + hp->dmai_cookie = dma->dp_cookies; + sgl = dma->dp_cookies; + + pcnt = 0; + cnt = 0; + + /* retrieve paddr, psize, offset from dmareq */ + if (buftype == DMA_OTYP_PAGES) { + page = dmar_object->dmao_obj.pp_obj.pp_pp; + ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page)); + offset = dmar_object->dmao_obj.pp_obj.pp_offset & + MMU_PAGEOFFSET; + paddr = pfn_to_pa(page->p_pagenum) + offset; + psize = MIN((MMU_PAGESIZE - offset), size); + sglinfo->si_asp = NULL; + page = page->p_next; + } else { + ASSERT((buftype == DMA_OTYP_VADDR) || + (buftype == DMA_OTYP_BUFVADDR)); + sglinfo->si_asp = dmar_object->dmao_obj.virt_obj.v_as; + if (sglinfo->si_asp == NULL) { + sglinfo->si_asp = &kas; + } + offset = (uintptr_t)vaddr & MMU_PAGEOFFSET; + + if (pparray != NULL) { + ASSERT(!PP_ISFREE(pparray[pcnt])); + paddr = pfn_to_pa(pparray[pcnt]->p_pagenum) + offset; + psize = MIN((MMU_PAGESIZE - offset), size); + pcnt++; + } else { + paddr = pfn_to_pa(hat_getpfnum(sglinfo->si_asp->a_hat, + vaddr)) + offset; + psize = MIN(size, (MMU_PAGESIZE - offset)); + vaddr += psize; + } + } + + /* save the iommu page offset */ + sglinfo->si_buf_offset = offset & IOMMU_PAGE_OFFSET; + + /* + * allocate the dvma and map [paddr, paddr+psize) + */ + 
e = iommu_alloc_dvma(domain, MIN(size + sglinfo->si_buf_offset, + maxseg), hp, &dvma, cnt); + if (e != DDI_SUCCESS) + return (IOMMU_SGL_NORESOURCES); + e = iommu_map_dvma(domain, dvma, paddr, psize, dmareq); + + /* + * setup the first cookie with the dvma of the page + * and the its size, we don't take account in the + * offset into the first page now + */ + sgl[cnt].dmac_laddress = dvma; + sgl[cnt].dmac_size = psize + sglinfo->si_buf_offset; + sgl[cnt].dmac_type = 0; + dvma += IOMMU_PTOB(e); + + size -= psize; + while (size > 0) { + /* get the size for this page (i.e. partial or full page) */ + psize = MIN(size, MMU_PAGESIZE); + if (buftype == DMA_OTYP_PAGES) { + /* get the paddr from the page_t */ + ASSERT(!PP_ISFREE(page) && PAGE_LOCKED(page)); + paddr = pfn_to_pa(page->p_pagenum); + page = page->p_next; + } else if (pparray != NULL) { + /* index into the array of page_t's to get the paddr */ + ASSERT(!PP_ISFREE(pparray[pcnt])); + paddr = pfn_to_pa(pparray[pcnt]->p_pagenum); + pcnt++; + } else { + /* call into the VM to get the paddr */ + paddr = pfn_to_pa(hat_getpfnum + (sglinfo->si_asp->a_hat, vaddr)); + vaddr += psize; + } + + /* + * check to see if this page would put us + * over the max cookie size + */ + if ((sgl[cnt].dmac_size + psize) > maxseg) { + /* use the next cookie */ + cnt++; + + /* allocate the dvma and map [paddr, paddr+psize) */ + e = iommu_alloc_dvma(domain, MIN(size, maxseg), + hp, &dvma, cnt); + if (e != DDI_SUCCESS) + return (IOMMU_SGL_NORESOURCES); + e = iommu_map_dvma(domain, dvma, paddr, psize, dmareq); + + /* save the cookie information */ + sgl[cnt].dmac_laddress = dvma; + sgl[cnt].dmac_size = psize; + sgl[cnt].dmac_type = 0; + dvma += IOMMU_PTOB(e); + + /* + * we can add this page in the current cookie + */ + } else { + e = iommu_map_dvma(domain, dvma, paddr, psize, dmareq); + sgl[cnt].dmac_size += psize; + dvma += IOMMU_PTOB(e); + } + + size -= psize; + } + + /* take account in the offset into the first page */ + sgl[0].dmac_laddress += sglinfo->si_buf_offset; + sgl[0].dmac_size -= sglinfo->si_buf_offset; + + /* save away how many cookies we have */ + sglinfo->si_sgl_size = cnt + 1; + + return (IOMMU_SGL_SUCCESS); +} + +/* + * iommu_clear_leaf_pte() + * clear a single leaf pte + */ +static void +iommu_clear_leaf_pte(dmar_domain_state_t *domain, uint64_t dvma, uint64_t size) +{ + iopte_t pte; + uint_t offset; + caddr_t leaf_table, dirt; + uint64_t csize = 0; + uint64_t cdvma = dvma & IOMMU_PAGE_MASK; + int count; + + while (csize < size) { + + /* retrieve the leaf page table */ + leaf_table = get_level_table(domain, IOMMU_BTOP(cdvma), 1); + if (!leaf_table) { + cmn_err(CE_WARN, "get level 1 table for 0x%" + PRIx64 "failed", cdvma); + return; + } + + /* map the leaf page and walk to the pte */ + offset = dvma_level_offset(IOMMU_BTOP(cdvma), 1); + + /* clear the ptes */ + count = 0; + dirt = (caddr_t)((iopte_t)leaf_table + offset); + while ((csize < size) && + (offset < IOMMU_PTE_MAX)) { + pte = (iopte_t)leaf_table + offset; + if (!*pte) { + cmn_err(CE_WARN, "try to clear NULL pte"); + } else { + *pte = 0; + } + csize += IOMMU_PAGE_SIZE; + offset++; + count++; + } + + /* flush cpu and iotlb cache */ + domain->dm_iommu->iu_dmar_ops->do_clflush(dirt, + count * sizeof (uint64_t)); + domain->dm_iommu->iu_dmar_ops->do_iotlb_psi(domain->dm_iommu, + domain->dm_domain_id, cdvma, count, TLB_IVA_LEAF); + + /* unmap the leaf page */ + cdvma += IOMMU_PTOB(count); + } +} + +/* + * intel_iommu_unmap_sgl() + * called from rootnex_dma_unbindhdl(), to unbind dma + * cookies 
when iommu is enabled + */ +void +intel_iommu_unmap_sgl(ddi_dma_handle_t handle) +{ + ddi_dma_impl_t *hp; + rootnex_dma_t *dma; + dmar_domain_state_t *domain; + iommu_dvma_cookie_t *dcookies; + rootnex_sglinfo_t *sinfo; + uint64_t i; + + hp = (ddi_dma_impl_t *)handle; + dma = (rootnex_dma_t *)hp->dmai_private; + dcookies = dma->dp_dvma_cookies; + sinfo = &(dma->dp_sglinfo); + + /* get the device domain, no return check needed here */ + (void) iommu_get_domain(dma->dp_dip, &domain); + + /* if the drhd is disabled, nothing will be done */ + if (!(domain->dm_iommu->iu_enabled) || + domain->dm_identity) + return; + + /* the drhd is enabled */ + for (i = 0; i < sinfo->si_sgl_size; i++) { + /* clear leaf ptes */ + iommu_clear_leaf_pte(domain, dcookies[i].dc_addr, + dcookies[i].dc_size); + } + + domain->dm_iommu->iu_dmar_ops->do_reap_wait(domain->dm_iommu); + domain->dm_iommu->iu_dmar_ops->do_plant_wait(domain->dm_iommu, + dcookies, sinfo->si_sgl_size, sinfo->si_max_pages); +}
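The address translation in intel_iommu.c above hinges on dvma_level_offset(): with IOMMU_LEVEL_STRIDE of 9 and 4K pages, each page-table level consumes one 9-bit slice of the DVMA page number, so the level-N table index is bits [(N-1)*9+12 .. N*9+11] of the DVMA. A small user-space sketch of that arithmetic, with constants copied from intel_iommu.h and a sample address chosen purely for illustration:

#include <stdio.h>
#include <stdint.h>

#define	LEVEL_STRIDE	9			/* IOMMU_LEVEL_STRIDE */
#define	LEVEL_OFFSET	((1 << LEVEL_STRIDE) - 1)	/* 511 */
#define	PAGE_SHIFT	12			/* IOMMU_PAGE_SHIFT */

static unsigned int
level_offset(uint64_t dvma_pn, unsigned int level)
{
	unsigned int start_bit = (level - 1) * LEVEL_STRIDE;

	return ((unsigned int)(dvma_pn >> start_bit) & LEVEL_OFFSET);
}

int
main(void)
{
	uint64_t dvma = 0x12345678000ULL;	/* sample dvma */
	uint64_t pn = dvma >> PAGE_SHIFT;
	int level;

	/* a 3-level table covers a 39-bit agaw (3 * 9 + 12) */
	for (level = 3; level >= 1; level--)
		printf("level %d index %u\n", level, level_offset(pn, level));
	return (0);
}

This also explains the aw_code = (agaw - 30) / 9 computation in domain_set_root_context(): a 3-level (39-bit) table yields aw_code 1, a 4-level (48-bit) table yields aw_code 2.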
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/io/iommu_rscs.c Sun Sep 14 19:52:20 2008 -0700 @@ -0,0 +1,340 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#include <sys/conf.h> +#include <sys/autoconf.h> +#include <sys/sysmacros.h> +#include <sys/debug.h> +#include <sys/psw.h> +#include <sys/ddidmareq.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <vm/seg.h> +#include <vm/seg_kmem.h> +#include <vm/seg_kpm.h> +#include <vm/seg_dev.h> +#include <sys/vmem.h> +#include <vm/hat.h> +#include <vm/as.h> +#include <vm/page.h> +#include <sys/avintr.h> +#include <sys/errno.h> +#include <sys/modctl.h> +#include <sys/ddi_impldefs.h> +#include <sys/sunddi.h> +#include <sys/sunndi.h> +#include <sys/mach_intr.h> +#include <vm/hat_i86.h> +#include <sys/machsystm.h> +#include <sys/iommu_rscs.h> + + + +typedef struct iommu_rscs_s { + /* + * Bounds of resource allocation. We will start allocating at rs_min + * and rollover at rs_max+1 (rs_max is included). e.g. for rs_min=0 + * and rs_max=7, we will have 8 total resources which can be alloced. + */ + uint_t rs_min; + uint_t rs_max; + + /* + * rs_free points to an array of 64-bit values used to track resource + * allocation. rs_free_size is the free buffer size in bytes. + */ + uint64_t *rs_free; + uint_t rs_free_size; + + /* + * last tracks the last alloc'd resource. This allows us to do a round + * robin allocation. 
+ */ + uint_t rs_last; + + kmutex_t rs_mutex; +} iommu_rscs_state_t; + + +/* + * iommu_page_alloc() + * + */ +paddr_t +iommu_page_alloc(int kmflag) +{ + paddr_t paddr; + page_t *pp; + + ASSERT(kmflag == KM_SLEEP || kmflag == KM_NOSLEEP); + + pp = page_get_physical(kmflag); + if (pp == NULL) { + return (NULL); + } + + paddr = pa_to_ma((uint64_t)pp->p_pagenum << PAGESHIFT); + + return (paddr); +} + + +/* + * iommu_page_free() + */ +void +iommu_page_free(paddr_t paddr) +{ + page_t *pp; + + pp = page_numtopp_nolock(ma_to_pa(paddr) >> PAGESHIFT); + page_free_physical(pp); +} + + +/* + * iommu_page_map() + * + */ +caddr_t +iommu_page_map(paddr_t addr) +{ + paddr_t paddr; + caddr_t kva; + page_t *pp; + + paddr = ma_to_pa(addr); + + if (kpm_enable) { + kva = hat_kpm_pfn2va((pfn_t)btop(paddr)); + } else { + kva = vmem_alloc(heap_arena, PAGESIZE, VM_SLEEP); + if (kva == NULL) { + return (NULL); + } + pp = page_numtopp_nolock(paddr >> PAGESHIFT); + hat_memload(kas.a_hat, kva, pp, + PROT_READ | PROT_WRITE, HAT_LOAD_LOCK); + } + + return (kva); +} + + +/* + * iommu_page_unmap() + * + */ +void +iommu_page_unmap(caddr_t kva) +{ + if (!kpm_enable) { + hat_unload(kas.a_hat, kva, PAGESIZE, HAT_UNLOAD_UNLOCK); + vmem_free(heap_arena, kva, PAGESIZE); + } +} + + + +/* + * iommu_rscs_init() + * Initialize the resource structure. init() returns a handle to be + * used for the rest of the resource functions. This code is written assuming + * that min_val will be close to 0. Therefore, we will allocate the free + * buffer only taking max_val into account. + */ +void +iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle) +{ + iommu_rscs_state_t *rstruct; + uint_t array_size; + uint_t index; + + + ASSERT(handle != NULL); + ASSERT(min_val < max_val); + + /* alloc space for resource structure */ + rstruct = kmem_alloc(sizeof (iommu_rscs_state_t), KM_SLEEP); + + /* + * Test to see if the max value is 64-bit aligned. If so, we don't need + * to allocate an extra 64-bit word. alloc space for free buffer + * (8 bytes per uint64_t). + */ + if ((max_val & 0x3F) == 0) { + rstruct->rs_free_size = (max_val >> 6) * 8; + } else { + rstruct->rs_free_size = ((max_val >> 6) + 1) * 8; + } + rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP); + + /* Initialize resource structure */ + rstruct->rs_min = min_val; + rstruct->rs_last = min_val; + rstruct->rs_max = max_val; + mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER, NULL); + + /* Mark all resources as free */ + array_size = rstruct->rs_free_size >> 3; + for (index = 0; index < array_size; index++) { + rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF; + } + + /* setup handle which is returned from this function */ + *handle = rstruct; +} + + +/* + * iommu_rscs_fini() + * Frees up the space allocated in init(). Notice that a pointer to the + * handle is used for the parameter. fini() will set the handle to NULL + * before returning. + */ +void +iommu_rscs_fini(iommu_rscs_t *handle) +{ + iommu_rscs_state_t *rstruct; + + + ASSERT(handle != NULL); + + rstruct = (iommu_rscs_state_t *)*handle; + + mutex_destroy(&rstruct->rs_mutex); + kmem_free(rstruct->rs_free, rstruct->rs_free_size); + kmem_free(rstruct, sizeof (iommu_rscs_state_t)); + + /* set handle to null. This helps catch bugs. */ + *handle = NULL; +} + + +/* + * iommu_rscs_alloc() + * alloc a resource. If alloc fails, we are out of resources. 
+ */ +int +iommu_rscs_alloc(iommu_rscs_t handle, uint_t *resource) +{ + iommu_rscs_state_t *rstruct; + uint_t array_idx; + uint64_t free; + uint_t index; + uint_t last; + uint_t min; + uint_t max; + + + ASSERT(handle != NULL); + ASSERT(resource != NULL); + + rstruct = (iommu_rscs_state_t *)handle; + + mutex_enter(&rstruct->rs_mutex); + min = rstruct->rs_min; + max = rstruct->rs_max; + + /* + * Find a free resource. This will return out of the loop once it finds + * a free resource. There are a total of 'max'-'min'+1 resources. + * Performs a round robin allocation. + */ + for (index = min; index <= max; index++) { + + array_idx = rstruct->rs_last >> 6; + free = rstruct->rs_free[array_idx]; + last = rstruct->rs_last & 0x3F; + + /* if the next resource to check is free */ + if ((free & ((uint64_t)1 << last)) != 0) { + /* we are using this resource */ + *resource = rstruct->rs_last; + + /* take it out of the free list */ + rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last); + + /* + * increment the last count so we start checking the + * next resource on the next alloc(). Note the rollover + * at 'max'+1. + */ + rstruct->rs_last++; + if (rstruct->rs_last > max) { + rstruct->rs_last = rstruct->rs_min; + } + + /* unlock the resource structure */ + mutex_exit(&rstruct->rs_mutex); + + return (DDI_SUCCESS); + } + + /* + * This resource is not free, lets go to the next one. Note the + * rollover at 'max'. + */ + rstruct->rs_last++; + if (rstruct->rs_last > max) { + rstruct->rs_last = rstruct->rs_min; + } + } + + mutex_exit(&rstruct->rs_mutex); + + return (DDI_FAILURE); +} + + +/* + * iommu_rscs_free() + * Free the previously alloc'd resource. Once a resource has been free'd, + * it can be used again when alloc is called. + */ +void +iommu_rscs_free(iommu_rscs_t handle, uint_t resource) +{ + iommu_rscs_state_t *rstruct; + uint_t array_idx; + uint_t offset; + + + ASSERT(handle != NULL); + + rstruct = (iommu_rscs_state_t *)handle; + ASSERT(resource >= rstruct->rs_min); + ASSERT(resource <= rstruct->rs_max); + + mutex_enter(&rstruct->rs_mutex); + + /* Put the resource back in the free list */ + array_idx = resource >> 6; + offset = resource & 0x3F; + rstruct->rs_free[array_idx] |= ((uint64_t)1 << offset); + + mutex_exit(&rstruct->rs_mutex); +}
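iommu_rscs_alloc() above is a round-robin search over a bitmap of 64-bit words: rs_last selects the word (rs_last >> 6) and the bit (rs_last & 0x3F) to test next. A condensed sketch of that search with the mutex and DDI plumbing stripped out; the simplified struct is an assumption for illustration only:

#include <stdint.h>

typedef struct rscs {
	unsigned int	min;	/* inclusive lower bound */
	unsigned int	max;	/* inclusive upper bound */
	unsigned int	last;	/* round-robin cursor */
	uint64_t	*free;	/* bit set => resource free */
} rscs_t;

/* returns 0 and stores an id in *out on success, -1 when exhausted */
static int
rscs_alloc(rscs_t *rs, unsigned int *out)
{
	unsigned int i, word, bit;

	for (i = rs->min; i <= rs->max; i++) {
		word = rs->last >> 6;
		bit = rs->last & 0x3F;

		if (rs->free[word] & ((uint64_t)1 << bit)) {
			rs->free[word] &= ~((uint64_t)1 << bit);
			*out = rs->last;
			if (++rs->last > rs->max)	/* rollover */
				rs->last = rs->min;
			return (0);
		}
		if (++rs->last > rs->max)
			rs->last = rs->min;
	}
	return (-1);
}

iommu_rscs_free() is the mirror image: it sets bit (resource & 0x3F) of word (resource >> 6) again.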
--- a/usr/src/uts/i86pc/io/rootnex.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/io/rootnex.c Sun Sep 14 19:52:20 2008 -0700 @@ -69,6 +69,13 @@ #include <vm/kboot_mmu.h> #endif +#include <sys/intel_iommu.h> + +/* + * add to support dmar fault interrupt, will change soon + */ +char _depends_on[] = "mach/pcplusmp"; + /* * enable/disable extra checking of function parameters. Useful for debugging * drivers. @@ -399,6 +406,7 @@ rootnex_state->r_err_ibc = (ddi_iblock_cookie_t)ipltospl(15); rootnex_state->r_reserved_msg_printed = B_FALSE; rootnex_cnt = &rootnex_state->r_counters[0]; + rootnex_state->r_intel_iommu_enabled = B_FALSE; /* * Set minimum fm capability level for i86pc platforms and then @@ -426,6 +434,20 @@ /* Initialize rootnex event handle */ i_ddi_rootnex_init_events(dip); +#if defined(__amd64) + /* probe intel iommu */ + intel_iommu_probe_and_parse(); + + /* attach the iommu nodes */ + if (intel_iommu_support) { + if (intel_iommu_attach_dmar_nodes() == DDI_SUCCESS) { + rootnex_state->r_intel_iommu_enabled = B_TRUE; + } else { + intel_iommu_release_dmar_info(); + } + } +#endif + return (DDI_SUCCESS); } @@ -1757,6 +1779,34 @@ /* save away the original bind info */ dma->dp_dma = dmareq->dmar_object; + if (rootnex_state->r_intel_iommu_enabled) { + e = intel_iommu_map_sgl(handle, dmareq, + rootnex_state->r_prealloc_cookies); + + switch (e) { + case IOMMU_SGL_SUCCESS: + goto rootnex_sgl_end; + + case IOMMU_SGL_DISABLE: + goto rootnex_sgl_start; + + case IOMMU_SGL_NORESOURCES: + cmn_err(CE_WARN, "iommu map sgl failed for %s", + ddi_node_name(dma->dp_dip)); + rootnex_clean_dmahdl(hp); + return (DDI_DMA_NORESOURCES); + + default: + cmn_err(CE_WARN, + "undefined value returned from" + " intel_iommu_map_sgl: %d", + e); + rootnex_clean_dmahdl(hp); + return (DDI_DMA_NORESOURCES); + } + } + +rootnex_sgl_start: /* * Figure out a rough estimate of what maximum number of pages this * buffer could use (a high estimate of course). @@ -1818,8 +1868,9 @@ */ rootnex_get_sgl(&dmareq->dmar_object, dma->dp_cookies, &dma->dp_sglinfo); + +rootnex_sgl_end: ASSERT(sinfo->si_sgl_size <= sinfo->si_max_pages); - /* if we don't need a copy buffer, we don't need to sync */ if (sinfo->si_copybuf_req == 0) { hp->dmai_rflags |= DMP_NOSYNC; @@ -1970,6 +2021,13 @@ rootnex_teardown_windows(dma); /* + * If intel iommu enabled, clean up the page tables and free the dvma + */ + if (rootnex_state->r_intel_iommu_enabled) { + intel_iommu_unmap_sgl(handle); + } + + /* * If we had to allocate space to for the worse case sgl (it didn't * fit into our pre-allocate buffer), free that up now */
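For context, nothing changes for a leaf driver with this diff: the IOMMU is interposed entirely inside the rootnex bind path, so a driver reaches intel_iommu_map_sgl() and intel_iommu_unmap_sgl() through the ordinary DDI DMA calls. A hedged sketch of that caller side; my_start_io, my_dip, my_attr, buf and len are placeholders, not names from this changeset:

static int
my_start_io(dev_info_t *my_dip, ddi_dma_attr_t *my_attr,
    caddr_t buf, size_t len)
{
	ddi_dma_handle_t h;
	ddi_dma_cookie_t cookie;
	uint_t ccount;

	if (ddi_dma_alloc_handle(my_dip, my_attr, DDI_DMA_SLEEP, NULL,
	    &h) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/* rootnex_dma_bindhdl() -> intel_iommu_map_sgl() when enabled */
	if (ddi_dma_addr_bind_handle(h, NULL, buf, len,
	    DDI_DMA_READ | DDI_DMA_STREAMING, DDI_DMA_SLEEP, NULL,
	    &cookie, &ccount) != DDI_DMA_MAPPED) {
		ddi_dma_free_handle(&h);
		return (DDI_FAILURE);
	}

	/* ... program the device with cookie.dmac_laddress ... */

	(void) ddi_dma_unbind_handle(h);	/* -> intel_iommu_unmap_sgl() */
	ddi_dma_free_handle(&h);
	return (DDI_SUCCESS);
}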
--- a/usr/src/uts/i86pc/os/acpi_fw.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/os/acpi_fw.h Sun Sep 14 19:52:20 2008 -0700 @@ -26,8 +26,6 @@ #ifndef _ACPI_FW_H #define _ACPI_FW_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -163,6 +161,14 @@ */ extern struct slit *slit_ptr; +struct dmar { + struct table_header hdr; + uint8_t width; + uint8_t flags; + uint8_t rsvd[10]; +}; + + /* * Arbitrary limit on number of localities we handle; if * this limit is raised to more than UINT16_MAX, make sure
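The struct dmar added above mirrors the fixed portion of the ACPI DMAR table that follows the common table header. Per my reading of the VT-d specification (an assumption, not something this changeset states), the width field holds the host address width minus one and bit 0 of flags advertises interrupt remapping support. A small sketch of decoding those fields; the function name is hypothetical:

static void
decode_dmar(struct dmar *tp)
{
	/* spec encodes HAW as N-1; see the VT-d DMAR table definition */
	int haw = tp->width + 1;
	int intr_remap = (tp->flags & 0x1) != 0;

	cmn_err(CE_CONT, "?DMAR: %d-bit DMA addressability, interrupt "
	    "remapping %ssupported\n", haw, intr_remap ? "" : "not ");
}

These two fields are what dmar_acpi.h later records as dmari_haw and dmari_intr_remap.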
--- a/usr/src/uts/i86pc/os/cpuid.c	Sun Sep 14 17:28:06 2008 -0700
+++ b/usr/src/uts/i86pc/os/cpuid.c	Sun Sep 14 19:52:20 2008 -0700
@@ -105,6 +105,7 @@
 uint_t x86_feature = 0;
 uint_t x86_vendor = X86_VENDOR_IntelClone;
 uint_t x86_type = X86_TYPE_OTHER;
+uint_t x86_clflush_size = 0;
 
 uint_t pentiumpro_bug4046376;
 uint_t pentiumpro_bug4064495;
@@ -780,6 +781,15 @@
 			feature |= X86_MWAIT;
 		}
 
+		/*
+		 * Only need to capture this once, on the boot cpu;
+		 * the remaining cpus are assumed to match.
+		 */
+		if (cp->cp_edx & CPUID_INTC_EDX_CLFSH) {
+			feature |= X86_CLFSH;
+			x86_clflush_size = (BITX(cp->cp_ebx, 15, 8) * 8);
+		}
+
 		if (feature & X86_PAE)
 			cpi->cpi_pabits = 36;
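The decode above follows the CPUID leaf 1 definition: EBX bits 15:8 report the CLFLUSH line size in 8-byte units, valid only when the CLFSH feature bit (EDX bit 19) is set, so EBX[15:8] = 8 means 64-byte lines. A user-space sketch of the same decode; the inline-asm CPUID is for illustration on gcc/x86 only, the kernel goes through its own __cpuid_insn() plumbing:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t a = 1, b, c, d;

	__asm__ __volatile__("cpuid"
	    : "+a" (a), "=b" (b), "=c" (c), "=d" (d));

	if (d & (1u << 19))	/* CLFSH feature bit, EDX bit 19 */
		printf("clflush line size: %u bytes\n",
		    ((b >> 8) & 0xff) * 8);
	return (0);
}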
--- a/usr/src/uts/i86pc/os/fakebop.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/os/fakebop.c Sun Sep 14 19:52:20 2008 -0700 @@ -57,6 +57,7 @@ #endif #include <vm/kboot_mmu.h> #include <vm/hat_pte.h> +#include <sys/dmar_acpi.h> #include "acpi_fw.h" static int have_console = 0; /* set once primitive console is initialized */ @@ -2015,6 +2016,14 @@ bsetprop(SLIT_PROPNAME, strlen(SLIT_PROPNAME), &tp->entry, tp->number * tp->number); } + +static void +process_dmar(struct dmar *tp) +{ + bsetprop(DMAR_TABLE_PROPNAME, strlen(DMAR_TABLE_PROPNAME), + tp, tp->hdr.len); +} + #else /* __xpv */ static void enumerate_xen_cpus() @@ -2056,6 +2065,9 @@ if (slit_ptr = (struct slit *)find_fw_table("SLIT")) process_slit(slit_ptr); + + if (tp = find_fw_table("DMAR")) + process_dmar((struct dmar *)tp); #else /* __xpv */ enumerate_xen_cpus(); #endif /* __xpv */
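process_dmar() above publishes the whole firmware table as the boot property named by DMAR_TABLE_PROPNAME. The consumer side can read it back through the same do_bsys_getproplen()/do_bsys_getprop() pair that dmar_check_boot_option() in intel_iommu.c already uses. A sketch of that retrieval; the function name is hypothetical and error handling is abbreviated:

static dmar_acpi_head_t *
fetch_dmar_table(void)
{
	int len;
	void *tbl;

	if ((len = do_bsys_getproplen(NULL, DMAR_TABLE_PROPNAME)) <= 0)
		return (NULL);		/* no DMAR table => no VT-d */

	tbl = kmem_alloc(len, KM_SLEEP);
	(void) do_bsys_getprop(NULL, DMAR_TABLE_PROPNAME, tbl);
	return ((dmar_acpi_head_t *)tbl);
}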
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/i86pc/sys/dmar_acpi.h	Sun Sep 14 19:52:20 2008 -0700
@@ -0,0 +1,223 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Portions Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2008, Intel Corporation.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_DMAR_ACPI_H
+#define	_SYS_DMAR_ACPI_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	DMAR_TABLE_PROPNAME	"dmar-table"
+
+#define	DMAR_UNIT_TYPE_DRHD	0
+#define	DMAR_UNIT_TYPE_RMRR	1
+#define	DMAR_UNIT_TYPE_ATSR	2
+
+#define	DEV_SCOPE_ENDPOINT	1
+#define	DEV_SCOPE_P2P		2
+#define	DEV_SCOPE_IOAPIC	3
+#define	DEV_SCOPE_HPET		4
+
+#define	INCLUDE_PCI_ALL		0x01
+#define	DMAR_MAX_SEGMENT	1
+
+#define	IOMMU_PAGE_SIZE_4K	(1UL << 12)
+#define	IOMMU_REG_SIZE		(1UL << 12)
+#define	PARSE_DMAR_SUCCESS	1
+#define	PARSE_DMAR_FAIL		0
+
+#define	for_each_in_list(list, node) \
+	for (node = list_head(list); node != NULL; \
+	    node = list_next(list, node))
+
+/*
+ * The following structures describe the format of the DMAR ACPI
+ * table; they are used to parse it.
+ *
+ * Read the spec for the meaning of each member.
+ */
+
+/* DMAR ACPI table header */
+typedef struct dmar_acpi_head {
+	char		dh_sig[4];
+	uint32_t	dh_len;
+	uint8_t		dh_rev;
+	uint8_t		dh_checksum;
+	char		dh_oemid[6];
+	char		dh_oemtblid[8];
+	uint32_t	dh_oemrev;
+	char		dh_asl[4];
+	uint32_t	dh_aslrev;
+	uint8_t		dh_haw;
+	uint8_t		dh_flags;
+	uint8_t		dh_reserved[10];
+} dmar_acpi_head_t;
+
+/* Remapping structure header */
+typedef struct dmar_acpi_unit_head {
+	uint16_t	uh_type;
+	uint16_t	uh_length;
+} dmar_acpi_unit_head_t;
+
+/* DRHD unit structure */
+typedef struct dmar_acpi_drhd {
+	dmar_acpi_unit_head_t	dr_header;
+	uint8_t		dr_flags;
+	uint8_t		dr_reserved;
+	uint16_t	dr_segment;
+	uint64_t	dr_baseaddr;
+} dmar_acpi_drhd_t;
+
+/* Device scope structure */
+typedef struct dmar_acpi_dev_scope {
+	uint8_t		ds_type;
+	uint8_t		ds_length;
+	uint8_t		ds_reserved[2];
+	uint8_t		ds_enumid;
+	uint8_t		ds_sbusnum;
+} dmar_acpi_dev_scope_t;
+
+/* RMRR unit structure */
+typedef struct dmar_acpi_rmrr {
+	dmar_acpi_unit_head_t	rm_header;
+	uint8_t		rm_reserved[2];
+	uint16_t	rm_segment;
+	uint64_t	rm_baseaddr;
+	uint64_t	rm_limiaddr;
+} dmar_acpi_rmrr_t;
+
+/*
+ * The following structures describe the kernel-recorded
+ * information about the DRHD and RMRR units.
+ */
+
+/*
+ * DRHD information structure
+ *
+ * node		- the drhd info structure is inserted in the
+ *		  list embedded in the intel_dmar_info
+ * di_segment	- the pci segment associated with this drhd
+ * di_reg_base	- base address of the register set; the size
+ *		  of this set is 4K
+ * di_include_all	- is it an include_all unit
+ * di_dev_list	- the device list built from the device scope
+ *		  entries; each node is a pci_dev_scope_t,
+ *		  which represents a single pci device
+ * di_dip	- pointer to the dev_info for this drhd in the
+ *		  device tree
+ * di_iommu	- link to the iommu state structure
+ */
+typedef struct drhd_info {
+	list_node_t	node;
+	uint16_t	di_segment;
+	uint64_t	di_reg_base;
+	boolean_t	di_include_all;
+	list_t		di_dev_list;
+	dev_info_t	*di_dip;
+	void		*di_iommu;
+} drhd_info_t;
+
+/*
+ * RMRR information structure
+ *
+ * node		- the rmrr info structure is inserted in the
+ *		  list embedded in the intel_dmar_info
+ * ri_segment	- the pci segment associated with this rmrr
+ * ri_baseaddr	- the low address of the reserved range
+ * ri_limiaddr	- the high address of the reserved range
+ * ri_dev_list	- the device list built from the device scope
+ *		  entries; each node is a pci_dev_scope_t,
+ *		  which represents a single pci device
+ */
+typedef struct rmrr_info {
+	list_node_t	node;
+	uint16_t	ri_segment;
+	uint64_t	ri_baseaddr;
+	uint64_t	ri_limiaddr;
+	list_t		ri_dev_list;
+} rmrr_info_t;
+
+/*
+ * Intel IOMMU information structure
+ *
+ * dmari_haw	- haw (host address width) indicates the
+ *		  maximum DMA physical addressability of this
+ *		  platform
+ * dmari_intr_remap	- does this platform support intr remapping
+ * dmari_drhd	- the list array of drhd units with the
+ *		  segment number as the index into this array
+ * dmari_rmrr	- list array for the rmrr
+ */
+typedef struct intel_dmar_info {
+	uint8_t		dmari_haw;
+	boolean_t	dmari_intr_remap;
+	list_t		dmari_drhd[DMAR_MAX_SEGMENT];
+	list_t		dmari_rmrr[DMAR_MAX_SEGMENT];
+} intel_dmar_info_t;
+
+/*
+ * The pci device node in the dev_list of drhd_info and
+ * rmrr_info
+ *
+ * node		- list node
+ * pds_bus, pds_dev, pds_func
+ *		- bus, device and function number of this
+ *		  pci device
+ * pds_type	- type of this device:
+ *		  0x01 : pci endpoint
+ *		  0x02 : pci p2p bridge
+ *		  0x03 : ioapic
+ *		  0x04 : msi capable hpet
+ *
+ * A bridge's secondary and subordinate bus numbers are tracked
+ * in the per-device iommu private data rather than here.
+ */
+typedef struct pci_dev_scope {
+	list_node_t	node;
+	uint8_t		pds_bus;
+	uint8_t		pds_dev;
+	uint8_t		pds_func;
+	uint8_t		pds_type;
+} pci_dev_scope_t;
+
+extern boolean_t intel_iommu_support;
+extern intel_dmar_info_t *dmar_info;
+extern void intel_iommu_release_dmar_info(void);
+extern void intel_iommu_probe_and_parse(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMAR_ACPI_H */
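The for_each_in_list() macro above is the traversal idiom for every list_t in these structures. As an illustrative fragment, dumping the parsed DRHD units of segment 0 and their device scopes would look like this (assumes dmar_info has already been populated by the ACPI parser and that <sys/int_fmtio.h> is included for PRIx64, as dmar_acpi.c does):

	drhd_info_t *drhd;
	pci_dev_scope_t *devs;

	for_each_in_list(&dmar_info->dmari_drhd[0], drhd) {
		cmn_err(CE_CONT, "?drhd: regs at 0x%" PRIx64 "%s\n",
		    drhd->di_reg_base,
		    drhd->di_include_all ? " (include-all)" : "");

		for_each_in_list(&drhd->di_dev_list, devs) {
			cmn_err(CE_CONT, "?  scope %d/%d/%d type %d\n",
			    devs->pds_bus, devs->pds_dev, devs->pds_func,
			    devs->pds_type);
		}
	}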
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/intel_iommu.h Sun Sep 14 19:52:20 2008 -0700 @@ -0,0 +1,544 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Portions Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2008, Intel Corporation. + * All rights reserved. + */ + +#ifndef _SYS_INTEL_IOMMU_H +#define _SYS_INTEL_IOMMU_H + +/* + * Intel IOMMU implementation specific state + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/dmar_acpi.h> +#include <sys/iommu_rscs.h> +#include <sys/cpu.h> +#include <sys/kstat.h> + +/* extern functions */ +extern int intel_iommu_attach_dmar_nodes(void); +extern int intel_iommu_map_sgl(ddi_dma_handle_t handle, + struct ddi_dma_req *dmareq, uint_t prealloc); +extern void intel_iommu_unmap_sgl(ddi_dma_handle_t handle); +extern void return_instr(void); + +/* define the return value for iommu_map_sgl */ +#define IOMMU_SGL_SUCCESS 0 +#define IOMMU_SGL_DISABLE 1 +#define IOMMU_SGL_NORESOURCES 2 + +/* register offset */ +#define IOMMU_REG_VERSION (0x00) /* Version Rigister, 32 bit */ +#define IOMMU_REG_CAP (0x08) /* Capability Register, 64 bit */ +#define IOMMU_REG_EXCAP (0x10) /* Extended Capability Reg, 64 bit */ +#define IOMMU_REG_GLOBAL_CMD (0x18) /* Global Command Register, 32 bit */ +#define IOMMU_REG_GLOBAL_STS (0x1C) /* Global Status Register, 32 bit */ +#define IOMMU_REG_ROOTENTRY (0x20) /* Root-Entry Table Addr Reg, 64 bit */ +#define IOMMU_REG_CONTEXT_CMD (0x28) /* Context Comand Register, 64 bit */ +#define IOMMU_REG_FAULT_STS (0x34) /* Fault Status Register, 32 bit */ +#define IOMMU_REG_FEVNT_CON (0x38) /* Fault Event Control Reg, 32 bit */ +#define IOMMU_REG_FEVNT_DATA (0x3C) /* Fault Event Data Register, 32 bit */ +#define IOMMU_REG_FEVNT_ADDR (0x40) /* Fault Event Address Reg, 32 bit */ +#define IOMMU_REG_FEVNT_UADDR (0x44) /* Fault Event Upper Addr Reg, 32 bit */ +#define IOMMU_REG_AFAULT_LOG (0x58) /* Advanced Fault Log Reg, 64 bit */ +#define IOMMU_REG_PMER (0x64) /* Protected Memory Enble Reg, 32 bit */ +#define IOMMU_REG_PLMBR (0x68) /* Protected Low Mem Base Reg, 32 bit */ +#define IOMMU_REG_PLMLR (0x6C) /* Protected Low Mem Lim Reg, 32 bit */ +#define IOMMU_REG_PHMBR (0X70) /* Protectd High Mem Base Reg, 64 bit */ +#define IOMMU_REG_PHMLR (0x78) /* Protected High Mem Lim Reg, 64 bit */ +#define IOMMU_REG_INVAL_QH (0x80) /* Invalidation Queue Head, 64 bit */ +#define IOMMU_REG_INVAL_QT (0x88) /* Invalidation Queue Tail, 64 bit */ +#define IOMMU_REG_INVAL_QAR (0x90) /* Invalidtion Queue Addr Reg, 64 bit */ +#define IOMMU_REG_INVAL_CSR (0x9C) /* Inval Compl Status Reg, 32 bit */ +#define IOMMU_REG_INVAL_CECR (0xA0) /* 
Inval Compl Evnt Ctrl Reg, 32 bit */ +#define IOMMU_REG_INVAL_CEDR (0xA4) /* Inval Compl Evnt Data Reg, 32 bit */ +#define IOMMU_REG_INVAL_CEAR (0xA8) /* Inval Compl Event Addr Reg, 32 bit */ +#define IOMMU_REG_INVAL_CEUAR (0xAC) /* Inval Comp Evnt Up Addr reg, 32bit */ +#define IOMMU_REG_IRTAR (0xB8) /* INTR Remap Tbl Addr Reg, 64 bit */ + +/* ioapic memory region */ +#define IOAPIC_REGION_START (0xfee00000) +#define IOAPIC_REGION_END (0xfeefffff) + +/* iommu page */ +#define IOMMU_LEVEL_STRIDE (9) +#define IOMMU_LEVEL_SIZE ((uint64_t)1 << IOMMU_LEVEL_STRIDE) +#define IOMMU_LEVEL_OFFSET (IOMMU_LEVEL_SIZE - 1) +#define IOMMU_PAGE_SHIFT (12) +#define IOMMU_PAGE_SIZE (uint64_t)((uint64_t)1 << IOMMU_PAGE_SHIFT) +#define IOMMU_PAGE_MASK ~(IOMMU_PAGE_SIZE - 1) +#define IOMMU_PAGE_OFFSET (IOMMU_PAGE_SIZE - 1) +#define IOMMU_PAGE_ROUND(x) (((x) + IOMMU_PAGE_OFFSET) & IOMMU_PAGE_MASK) +#define IOMMU_PTOB(x) (((uint64_t)(x)) << IOMMU_PAGE_SHIFT) +#define IOMMU_BTOP(x) ((x) >> IOMMU_PAGE_SHIFT) +#define IOMMU_BTOPR(x) IOMMU_BTOP((x) + IOMMU_PAGE_OFFSET) +#define IOMMU_LEVEL_TO_AGAW(x) ((x) * 9 + 12) +#define IOMMU_IOVA_MAX_4G (((uint64_t)1 << 32) - 1) +#define IOMMU_SIZE_4G ((uint64_t)1 << 32) +#define IOMMU_SIZE_2M ((uint64_t)1 << 21) +#define IOMMU_2M_MASK ~(IOMMU_SIZE_2M - 1) +#define IOMMU_PTE_MAX (IOMMU_PAGE_SIZE >> 3) + +/* iommu page entry property */ +#define IOMMU_PAGE_PROP_READ (1) +#define IOMMU_PAGE_PROP_WRITE (2) +#define IOMMU_PAGE_PROP_RW (IOMMU_PAGE_PROP_READ | IOMMU_PAGE_PROP_WRITE) +#define IOMMU_PAGE_PROP_NOSYNC (4) + +/* root context entry */ +#define ROOT_ENTRY_GET_P(x) (((x)->lo) & 0x1) +#define ROOT_ENTRY_SET_P(x) ((x)->lo) |= 0x1 +#define ROOT_ENTRY_GET_CTP(x) (((x)->lo) & IOMMU_PAGE_MASK) +#define ROOT_ENTRY_SET_CTP(x, p) ((x)->lo) |= ((p) & IOMMU_PAGE_MASK) +#define CONT_ENTRY_GET_P(x) (((x)->lo) & 0x1) +#define CONT_ENTRY_SET_P(x) ((x)->lo) |= 0x1 +#define CONT_ENTRY_SET_ASR(x, p) ((x)->lo) |= ((p) & IOMMU_PAGE_MASK) +#define CONT_ENTRY_GET_ASR(x) (((x)->lo) & IOMMU_PAGE_MASK) +#define CONT_ENTRY_SET_AW(x, v) ((x)->hi) |= ((v) & 7) +#define CONT_ENTRY_SET_DID(x, v) ((x)->hi) |= (((v) & ((1 << 16) - 1)) << 8) + +/* fault register */ +#define IOMMU_FAULT_STS_PPF (2) +#define IOMMU_FAULT_STS_PFO (1) +#define IOMMU_FAULT_STS_IQE (1 << 4) +#define IOMMU_FAULT_GET_INDEX(x) (((x) >> 8) & 0xff) +#define IOMMU_FRR_GET_F(x) ((x) >> 63) +#define IOMMU_FRR_GET_FR(x) (((x) >> 32) & 0xff) +#define IOMMU_FRR_GET_FT(x) (((x) >> 62) & 0x1) +#define IOMMU_FRR_GET_SID(x) ((x) & 0xffff) + +/* (ex)capability register */ +#define IOMMU_CAP_GET_NFR(x) ((((x) >> 40) & 0xff) + 1) +#define IOMMU_CAP_GET_DWD(x) (((x) >> 54) & 1) +#define IOMMU_CAP_GET_DRD(x) (((x) >> 55) & 1) +#define IOMMU_CAP_GET_PSI(x) (((x) >> 39) & 1) +#define IOMMU_CAP_GET_MAMV(x) (((x) >> 48) & 0x3f) +#define IOMMU_CAP_GET_CM(x) (((x) >> 7) & 1) +#define IOMMU_CAP_GET_RWBF(x) (((x) >> 4) & 1) +#define IOMMU_CAP_GET_FRO(x) ((((x) >> 24) & 0x3ff) * 16) +#define IOMMU_CAP_MGAW(x) (((((uint64_t)x) >> 16) & 0x3f) + 1) +#define IOMMU_CAP_SAGAW(x) (((x) >> 8) & 0x1f) +#define IOMMU_CAP_ND(x) (1 << (((x) & 0x7) *2 + 4)) -1 +#define IOMMU_ECAP_GET_IRO(x) ((((x) >> 8) & 0x3ff) << 4) +#define IOMMU_ECAP_GET_C(x) ((x) & 0x1) +#define IOMMU_ECAP_GET_IR(x) ((x) & 0x8) +#define IOMMU_ECAP_GET_DI(x) ((x) & 0x4) +#define IOMMU_ECAP_GET_QI(x) ((x) & 0x2) + + +/* iotlb invalidation */ +#define TLB_INV_GLOBAL (((uint64_t)1) << 60) +#define TLB_INV_DOMAIN (((uint64_t)2) << 60) +#define TLB_INV_PAGE (((uint64_t)3) << 60) +#define 
TLB_INV_GET_IAIG(x) (((x) >> 57) & 7) +#define TLB_INV_DRAIN_READ (((uint64_t)1) << 49) +#define TLB_INV_DRAIN_WRITE (((uint64_t)1) << 48) +#define TLB_INV_DID(x) (((uint64_t)((x) & 0xffff)) << 32) +#define TLB_INV_IVT (((uint64_t)1) << 63) +#define TLB_IVA_HINT(x) (((x) & 0x1) << 6) +#define TLB_IVA_LEAF 1 +#define TLB_IVA_WHOLE 0 + +/* context invalidation */ +#define CCMD_INV_ICC (((uint64_t)1) << 63) +#define CCMD_INV_GLOBAL (((uint64_t)1) << 61) +#define CCMD_INV_DOMAIN (((uint64_t)2) << 61) +#define CCMD_INV_DEVICE (((uint64_t)3) << 61) +#define CCMD_INV_DID(x) ((uint64_t)((x) & 0xffff)) +#define CCMD_INV_SID(x) (((uint64_t)((x) & 0xffff)) << 16) +#define CCMD_INV_FM(x) (((uint64_t)((x) & 0x3)) << 32) + +/* global command register */ +#define IOMMU_GCMD_TE (((uint32_t)1) << 31) +#define IOMMU_GCMD_SRTP (((uint32_t)1) << 30) +#define IOMMU_GCMD_SFL (((uint32_t)1) << 29) +#define IOMMU_GCMD_EAFL (((uint32_t)1) << 28) +#define IOMMU_GCMD_WBF (((uint32_t)1) << 27) +#define IOMMU_GCMD_QIE (((uint32_t)1) << 26) +#define IOMMU_GCMD_IRE (((uint32_t)1) << 25) +#define IOMMU_GCMD_SIRTP (((uint32_t)1) << 24) +#define IOMMU_GCMD_CFI (((uint32_t)1) << 23) + +/* global status register */ +#define IOMMU_GSTS_TES (((uint32_t)1) << 31) +#define IOMMU_GSTS_RTPS (((uint32_t)1) << 30) +#define IOMMU_GSTS_FLS (((uint32_t)1) << 29) +#define IOMMU_GSTS_AFLS (((uint32_t)1) << 28) +#define IOMMU_GSTS_WBFS (((uint32_t)1) << 27) +#define IOMMU_GSTS_QIES (((uint32_t)1) << 26) +#define IOMMU_GSTS_IRES (((uint32_t)1) << 25) +#define IOMMU_GSTS_IRTPS (((uint32_t)1) << 24) +#define IOMMU_GSTS_CFIS (((uint32_t)1) << 23) + +/* psi address mask */ +#define ADDR_AM_MAX(m) (((uint_t)1) << (m)) +#define ADDR_AM_OFFSET(n, m) ((n) & (ADDR_AM_MAX(m) - 1)) + +/* dmar fault event */ +#define IOMMU_INTR_IPL (8) +#define IOMMU_REG_FEVNT_CON_IM_SHIFT (31) + +/* page entry structure */ +typedef uint64_t *iopte_t; + +/* root/context entry structure */ +typedef struct iorce { + uint64_t lo; + uint64_t hi; +} *iorce_t; + +/* kernel maintained page table entry */ +typedef struct iovpte { + /* + * pointer to the cpu accessible + * iommu page table + */ + caddr_t vp; + /* + * pointer to the real iommu + * page table + */ + caddr_t pp; +} *iovpte_t; + +/* + * struct iommu_kstat + * kstat structure for iommu + */ +typedef struct iommu_kstat { + + /* hardware dependent */ + kstat_named_t is_enabled; + kstat_named_t is_iotlb_psi; + kstat_named_t is_iotlb_domain; + kstat_named_t is_iotlb_global; + kstat_named_t is_write_buffer; + kstat_named_t is_context_cache; + kstat_named_t is_wait_complete_us; + kstat_named_t is_domain_alloc; + + /* hardware independent */ + kstat_named_t is_page_used; +} iommu_kstat_t; + +/* + * struct iommu_stat + * statistics for iommu + */ +typedef struct iommu_stat { + uint64_t st_iotlb_psi; + uint64_t st_iotlb_domain; + uint64_t st_iotlb_global; + uint64_t st_write_buffer; + uint64_t st_context_cache; + uint64_t st_wait_complete_us; + uint64_t st_domain_alloc; +} iommu_stat_t; + +struct intel_iommu_state; +struct iommu_dvma_cookie; +struct dmar_domain_state; + +/* + * invalidation granularity + */ +typedef enum { + TLB_INV_G_GLOBAL = 1, + TLB_INV_G_DOMAIN, + TLB_INV_G_PAGE +} tlb_inv_g_t; + +typedef enum { + CTT_INV_G_GLOBAL = 1, + CTT_INV_G_DOMAIN, + CTT_INV_G_DEVICE +} ctt_inv_g_t; + +/* + * struct dmar_ops + * dmar hardware operation functions + */ +struct dmar_ops { + /* enable */ + void (*do_enable)(struct intel_iommu_state *iommu); + + /* page fault */ + int (*do_fault)(struct intel_iommu_state *iommu); + + 
/* cache related */ + void (*do_flwb)(struct intel_iommu_state *iommu); + void (*do_iotlb_psi)(struct intel_iommu_state *iommu, uint_t domain_id, + uint64_t dvma, uint_t count, uint_t hint); + void (*do_iotlb_dsi)(struct intel_iommu_state *iommu, uint_t domain_id); + void (*do_iotlb_gbl)(struct intel_iommu_state *iommu); + void (*do_context_fsi)(struct intel_iommu_state *iommu, + uint8_t function_mask, + uint16_t source_id, uint_t domain_id); + void (*do_context_dsi)(struct intel_iommu_state *iommu, + uint_t domain_id); + void (*do_context_gbl)(struct intel_iommu_state *iommu); + void (*do_plant_wait)(struct intel_iommu_state *iommu, + struct iommu_dvma_cookie *dcookies, uint_t count, + uint_t array_size); + void (*do_reap_wait)(struct intel_iommu_state *iommu); + + /* root entry */ + void (*do_set_root_table)(struct intel_iommu_state *iommu); + + /* cpu cache line flush */ + void (*do_clflush)(caddr_t addr, uint_t size); +}; + +/* + * struct iotlb_pend_node + * the pending data for an iotlb flush + */ +typedef struct iotlb_pend_node { + /* node to hook into the list */ + list_node_t node; + /* ptr to dvma cookie array */ + struct iommu_dvma_cookie *icn_dcookies; + /* valid cookie count */ + uint_t icn_count; + /* array size */ + uint_t icn_array_size; +} iotlb_pend_node_t; + +/* + * struct iotlb_pend_head + * the pending head for the iotlb flush + */ +typedef struct iotlb_pend_head { + /* the pending iotlb list */ + kmutex_t ich_pend_lock; + list_t ich_pend_list; + uint_t ich_pend_count; + + /* the pending node cache list */ + kmutex_t ich_mem_lock; + list_t ich_mem_list; +} iotlb_pend_head_t; + +/* + * struct intel_iommu_state + * This structure describes the state information + * of each iommu unit in the platform. It is created + * in the dmarnex driver's attach(), and is used by + * the DMA DDI entry points and the iommu translation + * functions. + * + * node - the list node to hook it in iommu_states + * iu_drhd - the related drhd + * iu_reg_handle - register access handle + * iu_reg_lock - lock to protect register operation + * iu_reg_address - virtual address of the register base address + * iu_capability - copy of the capability register + * iu_excapability - copy of the extended capability register + * iu_root_entry_paddr - root entry page table + * iu_root_context_lock - root context entry lock + * iu_gaw - guest address width + * iu_agaw - adjusted guest address width + * iu_level - the page table level + * iu_global_cmd_reg - global command register save place + * iu_max_domain - the maximum number of domains + * iu_domain_id_hdl - domain id allocator handle + * iu_enabled - the soft state of the iommu + * iu_coherency - hardware access is coherent + * iu_kstat - kstat pointer + * iu_statistics - iommu statistics + * iu_dmar_ops - iommu operation functions + * iu_pend_head - pending iotlb list + */ +typedef struct intel_iommu_state { + list_node_t node; + drhd_info_t *iu_drhd; + ddi_acc_handle_t iu_reg_handle; + kmutex_t iu_reg_lock; + caddr_t iu_reg_address; + uint64_t iu_capability; + uint64_t iu_excapability; + paddr_t iu_root_entry_paddr; + kmutex_t iu_root_context_lock; + int iu_gaw; + int iu_agaw; + int iu_level; + uint32_t iu_global_cmd_reg; + int iu_max_domain; + iommu_rscs_t iu_domain_id_hdl; + boolean_t iu_enabled; + boolean_t iu_coherency; + kstat_t *iu_kstat; + iommu_stat_t iu_statistics; + struct dmar_ops *iu_dmar_ops; + iotlb_pend_head_t iu_pend_head; +} intel_iommu_state_t; + +/* + * struct dvma_cache_node + * dvma cache node + */ +typedef struct dvma_cache_node { + 
list_node_t node; + + /* parameters */ + size_t dcn_align; + uint64_t dcn_dvma; +} dvma_cache_node_t; + +/* + * struct dvma_cache_head + * dvma cache head + */ +typedef struct dvma_cache_head { + /* the list of the free dvma */ + kmutex_t dch_free_lock; + list_t dch_free_list; + uint_t dch_free_count; + + /* the cache for the node memory */ + kmutex_t dch_mem_lock; + list_t dch_mem_list; +} dvma_cache_head_t; + +#define DVMA_CACHE_HEAD_CNT 64 + +/* + * struct dmar_domain_state + * This structure describes the state information + * of an iommu domain. It is created and initialized + * when a driver calls ddi_dma_bind_handle(), and is + * used by the iommu translation functions. + * + * dm_domain_id - the domain id + * dm_iommu - pointer to the iommu this domain belongs to + * dm_dvma_map - dvma map + * dm_dvma_cache - dvma cache lists + * dm_page_table_paddr - page table address for this domain + * dm_pt_tree - the kernel maintained page tables + * dm_identity - whether this domain is identity mapped + */ +typedef struct dmar_domain_state { + uint_t dm_domain_id; + intel_iommu_state_t *dm_iommu; + vmem_t *dm_dvma_map; + dvma_cache_head_t dm_dvma_cache[DVMA_CACHE_HEAD_CNT]; + paddr_t dm_page_table_paddr; + struct iovpte dm_pt_tree; + boolean_t dm_identity; +} dmar_domain_state_t; + +/* + * struct dmar_reserve_pages + * This structure describes the reserved memory regions which + * cannot be allocated by vmem. + * + * node - list node + * rm_pfn_start - the start page frame number + * rm_pfn_end - the end page frame number + */ +typedef struct dmar_reserve_pages { + list_node_t node; + uint64_t rm_pfn_start; + uint64_t rm_pfn_end; +} dmar_reserve_pages_t; + +/* + * struct pci_dev_info + * pci device info structure + */ +typedef struct pci_dev_info { + list_node_t node; + int pdi_seg; + int pdi_bus; + int pdi_devfn; + dev_info_t *pdi_dip; +} pci_dev_info_t; + +/* + * struct iommu_private + * the intel iommu private structure hooked onto dev_info + */ +typedef struct iommu_private { + /* pci seg, bus, dev, func */ + int idp_seg; + int idp_bus; + int idp_devfn; + + /* ppb information */ + boolean_t idp_is_bridge; + int idp_bbp_type; + int idp_sec; + int idp_sub; + + /* identifier for special devices */ + boolean_t idp_is_display; + boolean_t idp_is_lpc; + + /* domain ptr */ + dmar_domain_state_t *idp_domain; +} iommu_private_t; + +#define IOMMU_PPB_NONE 0 +#define IOMMU_PPB_PCIE_PCIE 1 +#define IOMMU_PPB_PCIE_PCI 2 +#define IOMMU_PPB_PCI_PCI 3 + +#define MAX_COOKIE_CACHE_SIZE 20 +/* + * struct iommu_dvma_cookie + * this cookie records the dvma allocated for + * an individual device + */ +typedef struct iommu_dvma_cookie { + uint64_t dc_addr; + uint64_t dc_size; + struct dmar_domain_state *dc_domain; + size_t dc_align; + struct iommu_dvma_cookie *dc_next; +} iommu_dvma_cookie_t; + +/* + * struct dvma_cookie_head + * the cookie cache head + */ +typedef struct dvma_cookie_head { + kmutex_t dch_lock; + iommu_dvma_cookie_t *dch_next; + uint_t dch_count; +} dvma_cookie_head_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INTEL_IOMMU_H */
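As a reading aid for the header above (not part of the changeset): the (ex)capability macros decode the raw 64-bit registers that attach-time code copies into iu_capability and iu_excapability. A minimal sketch; the function name and the cmn_err reporting are illustrative only.

	static void
	iommu_report_caps(intel_iommu_state_t *iommu)
	{
		uint64_t cap = iommu->iu_capability;
		uint64_t ecap = iommu->iu_excapability;

		/* mgaw bounds the guest address width; sagaw lists levels */
		cmn_err(CE_CONT, "iommu: mgaw=%d sagaw=0x%x nfr=%d fro=0x%x\n",
		    (int)IOMMU_CAP_MGAW(cap), (int)IOMMU_CAP_SAGAW(cap),
		    (int)IOMMU_CAP_GET_NFR(cap), (int)IOMMU_CAP_GET_FRO(cap));

		/* coherency decides whether do_clflush must really flush */
		if (IOMMU_ECAP_GET_C(ecap))
			iommu->iu_coherency = B_TRUE;
	}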
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/uts/i86pc/sys/iommu_rscs.h Sun Sep 14 19:52:20 2008 -0700 @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_IOMMU_H +#define _SYS_IOMMU_H + +/* + * XXX + */ + +#include <sys/types.h> +#include <sys/conf.h> +#include <sys/modctl.h> +#include <sys/sunddi.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * iommu_page_alloc() + * allocate a 4K page and map it into KVA + * iommu_page_free() + * unmap and free page from iommu_page_alloc() + * iommu_page_map() + * map page into kva + * iommu_page_unmap() + * unmap page out of kva + */ +paddr_t iommu_page_alloc(int kmflag); +void iommu_page_free(paddr_t paddr); +caddr_t iommu_page_map(paddr_t paddr); +void iommu_page_unmap(caddr_t kva); + + +typedef struct iommu_rscs_s *iommu_rscs_t; + +void iommu_rscs_init(uint_t min_val, uint_t max_val, iommu_rscs_t *handle); +void iommu_rscs_fini(iommu_rscs_t *handle); +int iommu_rscs_alloc(iommu_rscs_t handle, uint_t *rs); +void iommu_rscs_free(iommu_rscs_t handle, uint_t rs); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_IOMMU_H */
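This resource allocator backs iu_domain_id_hdl in intel_iommu.h. A hedged sketch of the intended call pattern; the 1..iu_max_domain range and the DDI_SUCCESS return convention are assumptions, not confirmed by this header.

	iommu_rscs_t dids;
	uint_t id;

	/* id 0 is commonly reserved, so hand out 1..iu_max_domain */
	iommu_rscs_init(1, iommu->iu_max_domain, &dids);
	if (iommu_rscs_alloc(dids, &id) == DDI_SUCCESS) {
		/* program id into a context entry, cf. CONT_ENTRY_SET_DID() */
		iommu_rscs_free(dids, id);
	}
	iommu_rscs_fini(&dids);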
--- a/usr/src/uts/i86pc/sys/machsystm.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/sys/machsystm.h Sun Sep 14 19:52:20 2008 -0700 @@ -27,8 +27,6 @@ #ifndef _SYS_MACHSYSTM_H #define _SYS_MACHSYSTM_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Numerous platform-dependent interfaces that don't seem to belong * in any other header file. @@ -132,7 +130,8 @@ struct memlist; extern void memlist_add(uint64_t, uint64_t, struct memlist *, struct memlist **); -extern page_t *page_get_physical(uintptr_t); +extern page_t *page_get_physical(int flags); +extern void page_free_physical(page_t *); extern int linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp); extern int dtrace_linear_pc(struct regs *rp, proc_t *p, caddr_t *linearp);
--- a/usr/src/uts/i86pc/sys/rootnex.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/sys/rootnex.h Sun Sep 14 19:52:20 2008 -0700 @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ROOTNEX_H #define _SYS_ROOTNEX_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * x86 root nexus implementation specific state */ @@ -294,6 +291,13 @@ * expensive on x86. */ uchar_t *dp_prealloc_buffer; + + /* + * intel iommu related state + * dp_dvma_cookies saves the dvma ranges allocated for this handle; + * it holds si_max_pages entries, is set up at bind time and is + * freed at unbind. + */ + void *dp_dvma_cookies; } rootnex_dma_t; /* @@ -323,6 +327,7 @@ * r_dip - rootnex dip * r_reserved_msg_printed - ctlops reserve message threshold * r_counters - profile/performance counters + * r_intel_iommu_enabled - intel iommu enabled */ typedef struct rootnex_state_s { uint_t r_prealloc_cookies; @@ -334,6 +339,7 @@ ddi_iblock_cookie_t r_err_ibc; boolean_t r_reserved_msg_printed; uint64_t r_counters[ROOTNEX_CNT_LAST]; + boolean_t r_intel_iommu_enabled; } rootnex_state_t;
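One plausible shape for the new dp_dvma_cookies field, sketched from the comment's own description; the sinfo name and the KM_SLEEP policy are assumptions, the real bind path may differ.

	/* at bind: one cookie slot per possible page */
	dma->dp_dvma_cookies = kmem_zalloc(
	    sinfo->si_max_pages * sizeof (iommu_dvma_cookie_t), KM_SLEEP);

	/* at unbind: release the same array */
	kmem_free(dma->dp_dvma_cookies,
	    sinfo->si_max_pages * sizeof (iommu_dvma_cookie_t));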
--- a/usr/src/uts/i86pc/vm/htable.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/vm/htable.c Sun Sep 14 19:52:20 2008 -0700 @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/sysmacros.h> #include <sys/kmem.h> @@ -266,7 +264,7 @@ * A wrapper around page_get_physical(), with some extra checks. */ static pfn_t -ptable_alloc(uintptr_t seed) +ptable_alloc(void) { pfn_t pfn; page_t *pp; @@ -300,13 +298,11 @@ } #endif /* DEBUG */ - pp = page_get_physical(seed); + pp = page_get_physical(KM_NOSLEEP); if (pp == NULL) return (PFN_INVALID); + ASSERT(PAGE_SHARED(pp)); pfn = pp->p_pagenum; - page_downgrade(pp); - ASSERT(PAGE_SHARED(pp)); - if (pfn == PFN_INVALID) panic("ptable_alloc(): Invalid PFN!!"); HATSTAT_INC(hs_ptable_allocs); @@ -330,29 +326,13 @@ atomic_add_32(&active_ptables, -1); if (pp == NULL) panic("ptable_free(): no page for pfn!"); - ASSERT(PAGE_SHARED(pp)); ASSERT(pfn == pp->p_pagenum); ASSERT(!IN_XPV_PANIC()); - - /* - * Get an exclusive lock, might have to wait for a kmem reader. - */ - if (!page_tryupgrade(pp)) { - page_unlock(pp); - /* - * RFE: we could change this to not loop forever - * George Cameron had some idea on how to do that. - * For now looping works - it's just like sfmmu. - */ - while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) - continue; - } #ifdef __xpv if (kpm_vbase && xen_kpm_page(pfn, PT_VALID | PT_WRITABLE) < 0) panic("failure making kpm r/w pfn=0x%lx", pfn); #endif - page_free(pp, 1); - page_unresv(1); + page_free_physical(pp); } /* @@ -680,7 +660,6 @@ return (list); } - /* * This is invoked from kmem when the system is low on memory. We try * to free hments, htables, and ptables to improve the memory situation. @@ -788,7 +767,7 @@ */ if (ht != NULL && !is_bare) { ht->ht_hat = hat; - ht->ht_pfn = ptable_alloc((uintptr_t)ht); + ht->ht_pfn = ptable_alloc(); if (ht->ht_pfn == PFN_INVALID) { if (USE_HAT_RESERVES()) htable_put_reserve(ht); @@ -851,7 +830,7 @@ for (;;) { htable_t *stolen; - hat->hat_user_ptable = ptable_alloc((uintptr_t)ht + 1); + hat->hat_user_ptable = ptable_alloc(); if (hat->hat_user_ptable != PFN_INVALID) break; stolen = htable_steal(1);
--- a/usr/src/uts/i86pc/vm/vm_machdep.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/i86pc/vm/vm_machdep.c Sun Sep 14 19:52:20 2008 -0700 @@ -31,8 +31,6 @@ * under license from the Regents of the University of California. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * UNIX machine dependent virtual memory support. */ @@ -3708,34 +3706,25 @@ * available - this would have a minimal impact on page coloring. */ page_t * -page_get_physical(uintptr_t seed) +page_get_physical(int flags) { page_t *pp; - u_offset_t offset; + u_offset_t offset = (u_offset_t)1 << 41; /* in VA hole */ static struct seg tmpseg; static uintptr_t ctr = 0; + static kmutex_t pgp_mutex; /* * This code is gross, we really need a simpler page allocator. * - * We need assign an offset for the page to call page_create_va(). * To avoid conflicts with other pages, we get creative with the offset. - * For 32 bits, we pick an offset > 4Gig - * For 64 bits, pick an offset somewhere in the VA hole. + * For 32 bits, we need an offset > 4Gig + * For 64 bits, need an offset somewhere in the VA hole. */ - offset = seed; - if (offset > kernelbase) - offset -= kernelbase; - offset <<= MMU_PAGESHIFT; -#if defined(__amd64) - offset += mmu.hole_start; /* something in VA hole */ -#else - offset += 1ULL << 40; /* something > 4 Gig */ -#endif - - if (page_resv(1, KM_NOSLEEP) == 0) + if (page_resv(1, flags & KM_NOSLEEP) == 0) return (NULL); + mutex_enter(&pgp_mutex); #ifdef DEBUG pp = page_exists(&kvp, offset); if (pp != NULL) @@ -3744,9 +3733,32 @@ pp = page_create_va(&kvp, offset, MMU_PAGESIZE, PG_EXCL, &tmpseg, (caddr_t)(ctr += MMU_PAGESIZE)); /* changing VA usage */ - if (pp == NULL) - return (NULL); - page_io_unlock(pp); - page_hashout(pp, NULL); + if (pp != NULL) { + page_io_unlock(pp); + page_hashout(pp, NULL); + page_downgrade(pp); + } + mutex_exit(&pgp_mutex); return (pp); } + +void +page_free_physical(page_t *pp) +{ + /* + * Get an exclusive lock, might have to wait for a kmem reader. + */ + ASSERT(PAGE_SHARED(pp)); + if (!page_tryupgrade(pp)) { + page_unlock(pp); + /* + * RFE: we could change this to not loop forever + * George Cameron had some idea on how to do that. + * For now looping works - it's just like sfmmu. + */ + while (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) + continue; + } + page_free(pp, 1); + page_unresv(1); +}
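The reworked allocator pairs with the new free routine: the page comes back locked SE_SHARED (so kmem readers can inspect it while the hardware uses the pfn), and page_free_physical() upgrades back to SE_EXCL before freeing. A minimal caller, mirroring ptable_alloc()/ptable_free() in htable.c above:

	page_t *pp;
	pfn_t pfn;

	if ((pp = page_get_physical(KM_NOSLEEP)) == NULL)
		return (PFN_INVALID);
	ASSERT(PAGE_SHARED(pp));
	pfn = pp->p_pagenum;
	/* ... hand pfn to the hardware page table ... */
	page_free_physical(pp);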
--- a/usr/src/uts/intel/ia32/ml/i86_subr.s Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/intel/ia32/ml/i86_subr.s Sun Sep 14 19:52:20 2008 -0700 @@ -4212,5 +4212,67 @@ ret SET_SIZE(ftrace_interrupt_enable) -#endif /* __i386 */ +#endif /* __i386 */ #endif /* __lint */ + +#if defined (__lint) + +/*ARGSUSED*/ +void +iommu_cpu_nop(void) +{} + +#else /* __lint */ + + ENTRY(iommu_cpu_nop) + rep; nop + ret + SET_SIZE(iommu_cpu_nop) + +#endif /* __lint */ + +#if defined (__lint) + +/*ARGSUSED*/ +void +clflush_insn(caddr_t addr) +{} + +#else /* __lint */ + +#if defined (__amd64) + ENTRY(clflush_insn) + clflush (%rdi) + ret + SET_SIZE(clflush_insn) +#elif defined (__i386) + ENTRY(clflush_insn) + movl 4(%esp), %eax + clflush (%eax) + ret + SET_SIZE(clflush_insn) + +#endif /* __i386 */ +#endif /* __lint */ + +#if defined (__lint) +/*ARGSUSED*/ +void +mfence_insn(void) +{} + +#else /* __lint */ + +#if defined (__amd64) + ENTRY(mfence_insn) + mfence + ret + SET_SIZE(mfence_insn) +#elif defined (__i386) + ENTRY(mfence_insn) + mfence + ret + SET_SIZE(mfence_insn) + +#endif /* __i386 */ +#endif /* __lint */
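A sketch of how these primitives compose into a range flush, as a do_clflush implementation might look; the function name is illustrative, x86_clflush_size comes from the x86_archext.h change below, and start-address alignment is glossed over.

	static void
	iommu_clflush_range(caddr_t addr, uint_t size)
	{
		uint_t i;

		/* flush every cache line touching [addr, addr + size) */
		for (i = 0; i < size; i += x86_clflush_size)
			clflush_insn(addr + i);
		/* order the flushes before any subsequent iommu fetch */
		mfence_insn();
	}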
--- a/usr/src/uts/intel/io/pci/pci_boot.c Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/intel/io/pci/pci_boot.c Sun Sep 14 19:52:20 2008 -0700 @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stat.h> #include <sys/sunndi.h> @@ -43,6 +41,7 @@ #include <io/hotplug/pciehpc/pciehpc_acpi.h> #include <sys/acpi/acpi.h> #include <sys/acpica.h> +#include <sys/intel_iommu.h> #define pci_getb (*pci_getb_func) #define pci_getw (*pci_getw_func) @@ -1374,6 +1373,7 @@ int pciex = 0; ushort_t is_pci_bridge = 0; struct pci_devfunc *devlist = NULL, *entry = NULL; + iommu_private_t *private; ushort_t deviceid = pci_getw(bus, dev, func, PCI_CONF_DEVID); @@ -1598,6 +1598,35 @@ reprogram = 0; /* don't reprogram pci-ide bridge */ } + /* allocate and set up iommu private */ + private = kmem_alloc(sizeof (iommu_private_t), KM_SLEEP); + private->idp_seg = 0; + private->idp_bus = bus; + private->idp_devfn = (dev << 3) | func; + private->idp_sec = 0; + private->idp_sub = 0; + private->idp_bbp_type = IOMMU_PPB_NONE; + /* record the bridge */ + private->idp_is_bridge = ((basecl == PCI_CLASS_BRIDGE) && + (subcl == PCI_BRIDGE_PCI)); + if (private->idp_is_bridge) { + private->idp_sec = pci_getb(bus, dev, func, PCI_BCNF_SECBUS); + private->idp_sub = pci_getb(bus, dev, func, PCI_BCNF_SUBBUS); + if (pciex && is_pci_bridge) + private->idp_bbp_type = IOMMU_PPB_PCIE_PCI; + else if (pciex) + private->idp_bbp_type = IOMMU_PPB_PCIE_PCIE; + else + private->idp_bbp_type = IOMMU_PPB_PCI_PCI; + } + /* record the special devices */ + private->idp_is_display = (is_display(classcode) ? B_TRUE : B_FALSE); + private->idp_is_lpc = ((basecl == PCI_CLASS_BRIDGE) && + (subcl == PCI_BRIDGE_ISA)); + private->idp_domain = NULL; + /* hook the private to dip */ + DEVI(dip)->devi_iommu_private = private; + if (reprogram && (entry != NULL)) entry->reprogram = B_TRUE; }
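Downstream code can recover the per-device data hung off the dip in one step; a sketch (the identity-domain policy shown is an assumption, though idp_is_display exists for exactly this kind of special-casing):

	iommu_private_t *private =
	    (iommu_private_t *)DEVI(dip)->devi_iommu_private;

	if (private != NULL && private->idp_is_display) {
		/* e.g. leave display devices identity mapped */
	}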
--- a/usr/src/uts/intel/sys/archsystm.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/intel/sys/archsystm.h Sun Sep 14 19:52:20 2008 -0700 @@ -45,6 +45,9 @@ extern ulong_t getcr0(void); extern void setcr0(ulong_t); extern ulong_t getcr2(void); +extern void iommu_cpu_nop(void); +extern void clflush_insn(caddr_t addr); +extern void mfence_insn(void); #if defined(__i386) extern uint16_t getgs(void);
--- a/usr/src/uts/intel/sys/x86_archext.h Sun Sep 14 17:28:06 2008 -0700 +++ b/usr/src/uts/intel/sys/x86_archext.h Sun Sep 14 19:52:20 2008 -0700 @@ -337,6 +337,7 @@ #define X86_SSE4_1 0x04000000 #define X86_SSE4_2 0x08000000 #define X86_1GPG 0x10000000 +#define X86_CLFSH 0x20000000 /* * flags to patch tsc_read routine. @@ -512,6 +513,7 @@ extern uint_t x86_feature; extern uint_t x86_type; extern uint_t x86_vendor; +extern uint_t x86_clflush_size; extern uint_t pentiumpro_bug4046376; extern uint_t pentiumpro_bug4064495;
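A final sketch tying the new feature bit to the flush path (hypothetical caller; the fallback assumes the unit reported coherent accesses via IOMMU_ECAP_GET_C):

	if (x86_feature & X86_CLFSH)
		clflush_insn(addr);	/* line size is x86_clflush_size */
	else
		ASSERT(iommu->iu_coherency);	/* no clflush: must snoop */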