changeset 10588:dc03f981ea18

6438937 if 'zfs destroy' fails, it can leave a zvol device link missing
6573142 zpool destruction/export should better handle stale zvol links
6718816 ZFS volinit fails when ZFS root pool full
6761786 zpool import with 8500 snapshots took 11hours
6604403 replace volinit/volfini with /dev fs vnode ops
6847760 zfs volinit may happen a little too soon during boot
6488792 Warnings on console whenever a volume is created.
6738837 assertion failure in sdev_open
6878496 dmu_objset_own returns EINVAL instead of EROFS in some situations
author Eric Taylor <Eric.Taylor@Sun.COM>
date Mon, 21 Sep 2009 08:55:28 -0600
parents e0d280fab007
children 7b05736960d1
files usr/src/cmd/devfsadm/zfs_link.c usr/src/cmd/svc/milestone/devices-local usr/src/cmd/svc/milestone/fs-usr usr/src/cmd/truss/codes.c usr/src/cmd/zfs/zfs_main.c usr/src/cmd/ztest/ztest.c usr/src/lib/libdiskmgt/common/findevs.c usr/src/lib/libzfs/Makefile.com usr/src/lib/libzfs/common/libzfs.h usr/src/lib/libzfs/common/libzfs_changelist.c usr/src/lib/libzfs/common/libzfs_dataset.c usr/src/lib/libzfs/common/libzfs_impl.h usr/src/lib/libzfs/common/libzfs_mount.c usr/src/lib/libzfs/common/libzfs_pool.c usr/src/lib/libzfs/common/libzfs_sendrecv.c usr/src/lib/libzfs/common/libzfs_util.c usr/src/lib/libzfs/common/mapfile-vers usr/src/lib/pyzfs/common/allow.py usr/src/uts/common/Makefile.files usr/src/uts/common/fs/dev/sdev_subr.c usr/src/uts/common/fs/dev/sdev_vnops.c usr/src/uts/common/fs/dev/sdev_zvolops.c usr/src/uts/common/fs/zfs/dmu_objset.c usr/src/uts/common/fs/zfs/dsl_dataset.c usr/src/uts/common/fs/zfs/spa_misc.c usr/src/uts/common/fs/zfs/sys/dsl_dataset.h usr/src/uts/common/fs/zfs/sys/spa.h usr/src/uts/common/fs/zfs/sys/vdev.h usr/src/uts/common/fs/zfs/sys/zvol.h usr/src/uts/common/fs/zfs/vdev.c usr/src/uts/common/fs/zfs/zfs_ioctl.c usr/src/uts/common/fs/zfs/zvol.c usr/src/uts/common/os/dumpsubr.c usr/src/uts/common/sys/fs/sdev_impl.h usr/src/uts/common/sys/fs/zfs.h usr/src/uts/intel/dev/Makefile usr/src/uts/sparc/dev/Makefile
diffstat 37 files changed, 1246 insertions(+), 1471 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/devfsadm/zfs_link.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/devfsadm/zfs_link.c	Mon Sep 21 08:55:28 2009 -0600
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <regex.h>
 #include <devfsadm.h>
 #include <stdio.h>
@@ -35,9 +32,7 @@
 #include <sys/mkdev.h>
 #include <sys/fs/zfs.h>
 
-/* zfs and zvol name info */
-
-#define	ZVOL_LINK_RE_DEVICES	"zvol/r?dsk/.*/.*$"
+/* zfs name info */
 
 static int zfs(di_minor_t minor, di_node_t node);
 
@@ -52,64 +47,18 @@
 DEVFSADM_CREATE_INIT_V0(zfs_create_cbt);
 
 /*
- * devfs cleanup register
- */
-static devfsadm_remove_t zfs_remove_cbt[] = {
-	{ "pseudo", ZVOL_LINK_RE_DEVICES, RM_HOT | RM_POST,
-	    ILEVEL_0, devfsadm_rm_all },
-};
-DEVFSADM_REMOVE_INIT_V0(zfs_remove_cbt);
-
-/*
- * For the zfs control node:
+ * The zfs control node looks like this:
  *	/dev/zfs -> /devices/pseudo/zfs@0:zfs
- * For zvols:
- *	/dev/zvol/dsk/<pool>/<dataset> -> /devices/pseudo/zfs@0:1
- *	/dev/zvol/rdsk/<pool>/<dataset> -> /devices/pseudo/zfs@0:1,raw
  */
 static int
 zfs(di_minor_t minor, di_node_t node)
 {
-	dev_t	dev;
-	int	err;
 	char mn[MAXNAMELEN + 1];
-	char blkname[MAXNAMELEN + 1];
-	char rawname[MAXNAMELEN + 1];
-	char path[PATH_MAX + 1];
-	char *name;
 
 	(void) strcpy(mn, di_minor_name(minor));
 
 	if (strcmp(mn, ZFS_DRIVER) == 0) {
 		(void) devfsadm_mklink(ZFS_DRIVER, node, minor, 0);
-	} else {
-		dev = di_minor_devt(minor);
-		err = di_prop_lookup_strings(dev, node, ZVOL_PROP_NAME, &name);
-		if (err < 0) {
-			/* property not defined so can't do anything */
-			return (DEVFSADM_CONTINUE);
-		}
-		(void) snprintf(blkname, sizeof (blkname), "%dc",
-		    (int)minor(dev));
-		(void) snprintf(rawname, sizeof (rawname), "%dc,raw",
-		    (int)minor(dev));
-
-		/*
-		 * This is where the actual public name gets constructed.
-		 * Change the snprintf format to change the public
-		 * path that gets constructed.
-		 */
-		if (strcmp(mn, blkname) == 0) {
-			(void) snprintf(path, sizeof (path), "%s/%s",
-			    ZVOL_DEV_DIR, name);
-		} else if (strcmp(mn, rawname) == 0) {
-			(void) snprintf(path, sizeof (path), "%s/%s",
-			    ZVOL_RDEV_DIR, name);
-		} else {
-			return (DEVFSADM_CONTINUE);
-		}
-
-		(void) devfsadm_mklink(path, node, minor, 0);
 	}
 	return (DEVFSADM_CONTINUE);
 }
--- a/usr/src/cmd/svc/milestone/devices-local	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/svc/milestone/devices-local	Mon Sep 21 08:55:28 2009 -0600
@@ -75,13 +75,4 @@
         fi
 fi
 
-# Create any zvol devices
-if [ -x /usr/sbin/zfs ]; then
-	/usr/sbin/zfs volinit || exit $SMF_EXIT_ERR_FATAL
-	#
-	# Add swap again to allow for swapping to zvols.
-	#
-	/sbin/swapadd
-fi
-
 exit $SMF_EXIT_OK
--- a/usr/src/cmd/svc/milestone/fs-usr	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/svc/milestone/fs-usr	Mon Sep 21 08:55:28 2009 -0600
@@ -20,15 +20,13 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T.
 # All rights reserved.
 #
 #
-# ident	"%Z%%M%	%I%	%E% SMI"
-
 . /lib/svc/share/smf_include.sh
 . /lib/svc/share/fs_include.sh
 
@@ -55,16 +53,6 @@
 	# If we have a dedicated dump device, then go ahead and configure it.
 	# 
 	if [ "x$special" != "x$DUMPADM_DEVICE" ]; then
-		if [ -x /usr/sbin/zfs ]; then
-			dataset=`echo $DUMPADM_DEVICE | cut -d'/' -f5-`
-			[ -n "$dataset" ] && \
-			    /usr/sbin/zfs list -t volume $dataset > \
-			    /dev/null 2>&1
-			if [ $? -eq 0 ]; then
-				/usr/sbin/zfs volinit
-			fi
-		fi
-
 		if [ -x /usr/sbin/dumpadm -a -b $DUMPADM_DEVICE ]; then
 			/usr/sbin/dumpadm -u || exit $SMF_EXIT_ERR_CONFIG
 		fi
--- a/usr/src/cmd/truss/codes.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/truss/codes.c	Mon Sep 21 08:55:28 2009 -0600
@@ -1186,10 +1186,6 @@
 		"zfs_cmd_t" },
 	{ (uint_t)ZFS_IOC_SET_PROP,		"ZFS_IOC_SET_PROP",
 		"zfs_cmd_t" },
-	{ (uint_t)ZFS_IOC_CREATE_MINOR,		"ZFS_IOC_CREATE_MINOR",
-		"zfs_cmd_t" },
-	{ (uint_t)ZFS_IOC_REMOVE_MINOR,		"ZFS_IOC_REMOVE_MINOR",
-		"zfs_cmd_t" },
 	{ (uint_t)ZFS_IOC_CREATE,		"ZFS_IOC_CREATE",
 		"zfs_cmd_t" },
 	{ (uint_t)ZFS_IOC_DESTROY,		"ZFS_IOC_DESTROY",
--- a/usr/src/cmd/zfs/zfs_main.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/zfs/zfs_main.c	Mon Sep 21 08:55:28 2009 -0600
@@ -875,7 +875,7 @@
 
 	/*
 	 * Ignore pools (which we've already flagged as an error before getting
-	 * here.
+	 * here).
 	 */
 	if (strchr(zfs_get_name(zhp), '/') == NULL &&
 	    zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) {
@@ -3997,27 +3997,6 @@
 }
 
 static int
-volcheck(zpool_handle_t *zhp, void *data)
-{
-	boolean_t isinit = *((boolean_t *)data);
-
-	if (isinit)
-		return (zpool_create_zvol_links(zhp));
-	else
-		return (zpool_remove_zvol_links(zhp));
-}
-
-/*
- * Iterate over all pools in the system and either create or destroy /dev/zvol
- * links, depending on the value of 'isinit'.
- */
-static int
-do_volcheck(boolean_t isinit)
-{
-	return (zpool_iter(g_zfs, volcheck, &isinit) ? 1 : 0);
-}
-
-static int
 find_command_idx(char *command, int *idx)
 {
 	int i;
@@ -4103,15 +4082,6 @@
 			usage(B_TRUE);
 
 		/*
-		 * 'volinit' and 'volfini' do not appear in the usage message,
-		 * so we have to special case them here.
-		 */
-		if (strcmp(cmdname, "volinit") == 0)
-			return (do_volcheck(B_TRUE));
-		else if (strcmp(cmdname, "volfini") == 0)
-			return (do_volcheck(B_FALSE));
-
-		/*
 		 * Run the appropriate command.
 		 */
 		libzfs_mnttab_cache(g_zfs, B_TRUE);
--- a/usr/src/cmd/ztest/ztest.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/cmd/ztest/ztest.c	Mon Sep 21 08:55:28 2009 -0600
@@ -1712,7 +1712,7 @@
 	error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds);
 	if (error)
 		fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
-	error = dsl_dataset_promote(clone2name);
+	error = dsl_dataset_promote(clone2name, NULL);
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
 		    error);
--- a/usr/src/lib/libdiskmgt/common/findevs.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libdiskmgt/common/findevs.c	Mon Sep 21 08:55:28 2009 -0600
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,6 +31,7 @@
 #include <sys/stat.h>
 #include <sys/sunddi.h>
 #include <sys/types.h>
+#include <sys/mkdev.h>
 #include <ctype.h>
 #include <libgen.h>
 #include <unistd.h>
@@ -579,6 +580,32 @@
 		    }
 		}
 	    }
+	    if (is_zvol(node, minor)) {
+		char zvdsk[MAXNAMELEN];
+		char *str;
+		alias_t *ap;
+
+		if (di_prop_lookup_strings(di_minor_devt(minor),
+		    node, "name", &str) == -1)
+		      return (DI_WALK_CONTINUE);
+		(void) snprintf(zvdsk, MAXNAMELEN, "/dev/zvol/rdsk/%s",
+		    str);
+		if ((ap = find_alias(diskp, kernel_name)) == NULL) {
+			if (new_alias(diskp, kernel_name,
+			    zvdsk, args) != 0) {
+				args->dev_walk_status = ENOMEM;
+		    }
+		} else {
+		    /*
+		     * It is possible that we have already added this devpath.
+		     * Do not add it again. new_devpath will return a 0 if
+		     * found, and not add the path.
+		     */
+		    if (new_devpath(ap, zvdsk) != 0) {
+			args->dev_walk_status = ENOMEM;
+		    }
+		}
+	    }
 
 	    /* Add the devpaths for the drive. */
 	    if (args->dev_walk_status == 0) {
@@ -1537,7 +1564,7 @@
 is_zvol(di_node_t node, di_minor_t minor)
 {
 	if ((strncmp(di_node_name(node), ZFS_DRIVER, 3) == 0) &&
-	    di_minor_devt(minor))
+	    minor(di_minor_devt(minor)))
 		return (1);
 	return (0);
 }
--- a/usr/src/lib/libzfs/Makefile.com	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/Makefile.com	Mon Sep 21 08:55:28 2009 -0600
@@ -48,7 +48,7 @@
 
 C99MODE=	-xc99=%all
 C99LMODE=	-Xc99=%all
-LDLIBS +=	-lc -lm -ldevinfo -ldevid -lgen -lnvpair -luutil -lavl -lefi \
+LDLIBS +=	-lc -lm -ldevid -lgen -lnvpair -luutil -lavl -lefi \
 	-lidmap
 CPPFLAGS +=	$(INCS) -D_REENTRANT
 
--- a/usr/src/lib/libzfs/common/libzfs.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs.h	Mon Sep 21 08:55:28 2009 -0600
@@ -66,7 +66,6 @@
 	EZFS_BADSTREAM,		/* bad backup stream */
 	EZFS_DSREADONLY,	/* dataset is readonly */
 	EZFS_VOLTOOBIG,		/* volume is too large for 32-bit system */
-	EZFS_VOLHASDATA,	/* volume already contains data */
 	EZFS_INVALIDNAME,	/* invalid dataset name */
 	EZFS_BADRESTORE,	/* unable to restore to destination */
 	EZFS_BADBACKUP,		/* backup failed */
@@ -85,7 +84,6 @@
 	EZFS_UMOUNTFAILED,	/* failed to unmount dataset */
 	EZFS_UNSHARENFSFAILED,	/* unshare(1M) failed */
 	EZFS_SHARENFSFAILED,	/* share(1M) failed */
-	EZFS_DEVLINKS,		/* failed to create zvol links */
 	EZFS_PERM,		/* permission denied */
 	EZFS_NOSPC,		/* out of space */
 	EZFS_IO,		/* I/O error */
@@ -582,12 +580,6 @@
  */
 extern int zpool_read_label(int, nvlist_t **);
 
-/*
- * Create and remove zvol /dev links.
- */
-extern int zpool_create_zvol_links(zpool_handle_t *);
-extern int zpool_remove_zvol_links(zpool_handle_t *);
-
 /* is this zvol valid for use as a dump device? */
 extern int zvol_check_dump_config(char *);
 
--- a/usr/src/lib/libzfs/common/libzfs_changelist.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_changelist.c	Mon Sep 21 08:55:28 2009 -0600
@@ -119,18 +119,8 @@
 		if (ZFS_IS_VOLUME(cn->cn_handle)) {
 			switch (clp->cl_realprop) {
 			case ZFS_PROP_NAME:
-				/*
-				 * If this was a rename, unshare the zvol, and
-				 * remove the /dev/zvol links.
-				 */
+				/* If this was a rename, unshare the zvol */
 				(void) zfs_unshare_iscsi(cn->cn_handle);
-
-				if (zvol_remove_link(cn->cn_handle->zfs_hdl,
-				    cn->cn_handle->zfs_name) != 0) {
-					ret = -1;
-					cn->cn_needpost = B_FALSE;
-					(void) zfs_share_iscsi(cn->cn_handle);
-				}
 				break;
 
 			case ZFS_PROP_VOLSIZE:
@@ -235,15 +225,7 @@
 		zfs_refresh_properties(cn->cn_handle);
 
 		if (ZFS_IS_VOLUME(cn->cn_handle)) {
-			/*
-			 * If we're doing a rename, recreate the /dev/zvol
-			 * links.
-			 */
-			if (clp->cl_realprop == ZFS_PROP_NAME &&
-			    zvol_create_link(cn->cn_handle->zfs_hdl,
-			    cn->cn_handle->zfs_name) != 0) {
-				errors++;
-			} else if (cn->cn_shared ||
+			if (cn->cn_shared ||
 			    clp->cl_prop == ZFS_PROP_SHAREISCSI) {
 				if (zfs_prop_get(cn->cn_handle,
 				    ZFS_PROP_SHAREISCSI, shareopts,
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c	Mon Sep 21 08:55:28 2009 -0600
@@ -24,10 +24,8 @@
  * Use is subject to license terms.
  */
 
-#include <assert.h>
 #include <ctype.h>
 #include <errno.h>
-#include <libdevinfo.h>
 #include <libintl.h>
 #include <math.h>
 #include <stdio.h>
@@ -39,7 +37,6 @@
 #include <fcntl.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
-#include <sys/avl.h>
 #include <priv.h>
 #include <pwd.h>
 #include <grp.h>
@@ -58,7 +55,6 @@
 #include "libzfs_impl.h"
 #include "zfs_deleg.h"
 
-static int zvol_create_link_common(libzfs_handle_t *, const char *, int);
 static int userquota_propname_decode(const char *propname, boolean_t zoned,
     zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp);
 
@@ -1262,10 +1258,7 @@
 			break;
 
 		case EBUSY:
-			if (prop == ZFS_PROP_VOLBLOCKSIZE)
-				(void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf);
-			else
-				(void) zfs_standard_error(hdl, EBUSY, errbuf);
+			(void) zfs_standard_error(hdl, EBUSY, errbuf);
 			break;
 
 		case EROFS:
@@ -2636,18 +2629,6 @@
 	/* create the dataset */
 	ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc);
 
-	if (ret == 0 && type == ZFS_TYPE_VOLUME) {
-		ret = zvol_create_link(hdl, path);
-		if (ret) {
-			(void) zfs_standard_error(hdl, errno,
-			    dgettext(TEXT_DOMAIN,
-			    "Volume successfully created, but device links "
-			    "were not created"));
-			zcmd_free_nvlists(&zc);
-			return (-1);
-		}
-	}
-
 	zcmd_free_nvlists(&zc);
 
 	/* check for failure */
@@ -2719,9 +2700,6 @@
 			return (-1);
 		}
 
-		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
-			return (-1);
-
 		zc.zc_objset_type = DMU_OST_ZVOL;
 	} else {
 		zc.zc_objset_type = DMU_OST_ZFS;
@@ -2746,13 +2724,13 @@
 };
 
 static int
-zfs_remove_link_cb(zfs_handle_t *zhp, void *arg)
+zfs_check_snap_cb(zfs_handle_t *zhp, void *arg)
 {
 	struct destroydata *dd = arg;
 	zfs_handle_t *szhp;
 	char name[ZFS_MAXNAMELEN];
 	boolean_t closezhp = dd->closezhp;
-	int rv;
+	int rv = 0;
 
 	(void) strlcpy(name, zhp->zfs_name, sizeof (name));
 	(void) strlcat(name, "@", sizeof (name));
@@ -2764,17 +2742,9 @@
 		zfs_close(szhp);
 	}
 
-	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-		(void) zvol_remove_link(zhp->zfs_hdl, name);
-		/*
-		 * NB: this is simply a best-effort.  We don't want to
-		 * return an error, because then we wouldn't visit all
-		 * the volumes.
-		 */
-	}
-
 	dd->closezhp = B_TRUE;
-	rv = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg);
+	if (!dd->gotone)
+		rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg);
 	if (closezhp)
 		zfs_close(zhp);
 	return (rv);
@@ -2791,7 +2761,7 @@
 	struct destroydata dd = { 0 };
 
 	dd.snapname = snapname;
-	(void) zfs_remove_link_cb(zhp, &dd);
+	(void) zfs_check_snap_cb(zhp, &dd);
 
 	if (!dd.gotone) {
 		return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT,
@@ -2909,70 +2879,11 @@
 			return (zfs_standard_error(zhp->zfs_hdl, errno,
 			    errbuf));
 		}
-	} else if (ZFS_IS_VOLUME(zhp)) {
-		ret = zvol_create_link(zhp->zfs_hdl, target);
 	}
 
 	return (ret);
 }
 
-typedef struct promote_data {
-	char cb_mountpoint[MAXPATHLEN];
-	const char *cb_target;
-	const char *cb_errbuf;
-	uint64_t cb_pivot_txg;
-} promote_data_t;
-
-static int
-promote_snap_cb(zfs_handle_t *zhp, void *data)
-{
-	promote_data_t *pd = data;
-	zfs_handle_t *szhp;
-	char snapname[MAXPATHLEN];
-	int rv = 0;
-
-	/* We don't care about snapshots after the pivot point */
-	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) {
-		zfs_close(zhp);
-		return (0);
-	}
-
-	/* Remove the device link if it's a zvol. */
-	if (ZFS_IS_VOLUME(zhp))
-		(void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name);
-
-	/* Check for conflicting names */
-	(void) strlcpy(snapname, pd->cb_target, sizeof (snapname));
-	(void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname));
-	szhp = make_dataset_handle(zhp->zfs_hdl, snapname);
-	if (szhp != NULL) {
-		zfs_close(szhp);
-		zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN,
-		    "snapshot name '%s' from origin \n"
-		    "conflicts with '%s' from target"),
-		    zhp->zfs_name, snapname);
-		rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf);
-	}
-	zfs_close(zhp);
-	return (rv);
-}
-
-static int
-promote_snap_done_cb(zfs_handle_t *zhp, void *data)
-{
-	promote_data_t *pd = data;
-
-	/* We don't care about snapshots after the pivot point */
-	if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) {
-		/* Create the device link if it's a zvol. */
-		if (ZFS_IS_VOLUME(zhp))
-			(void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
-	}
-
-	zfs_close(zhp);
-	return (0);
-}
-
 /*
  * Promotes the given clone fs to be the clone parent.
  */
@@ -2982,10 +2893,7 @@
 	libzfs_handle_t *hdl = zhp->zfs_hdl;
 	zfs_cmd_t zc = { 0 };
 	char parent[MAXPATHLEN];
-	char *cp;
 	int ret;
-	zfs_handle_t *pzhp;
-	promote_data_t pd;
 	char errbuf[1024];
 
 	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
@@ -3003,29 +2911,7 @@
 		    "not a cloned filesystem"));
 		return (zfs_error(hdl, EZFS_BADTYPE, errbuf));
 	}
-	cp = strchr(parent, '@');
-	*cp = '\0';
-
-	/* Walk the snapshots we will be moving */
-	pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT);
-	if (pzhp == NULL)
-		return (-1);
-	pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG);
-	zfs_close(pzhp);
-	pd.cb_target = zhp->zfs_name;
-	pd.cb_errbuf = errbuf;
-	pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET);
-	if (pzhp == NULL)
-		return (-1);
-	(void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint,
-	    sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE);
-	ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd);
-	if (ret != 0) {
-		zfs_close(pzhp);
-		return (-1);
-	}
-
-	/* issue the ioctl */
+
 	(void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin,
 	    sizeof (zc.zc_value));
 	(void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
@@ -3034,62 +2920,18 @@
 	if (ret != 0) {
 		int save_errno = errno;
 
-		(void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd);
-		zfs_close(pzhp);
-
 		switch (save_errno) {
 		case EEXIST:
-			/*
-			 * There is a conflicting snapshot name.  We
-			 * should have caught this above, but they could
-			 * have renamed something in the mean time.
-			 */
+			/* There is a conflicting snapshot name. */
 			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-			    "conflicting snapshot name from parent '%s'"),
-			    parent);
+			    "conflicting snapshot '%s' from parent '%s'"),
+			    zc.zc_string, parent);
 			return (zfs_error(hdl, EZFS_EXISTS, errbuf));
 
 		default:
 			return (zfs_standard_error(hdl, save_errno, errbuf));
 		}
-	} else {
-		(void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd);
 	}
-
-	zfs_close(pzhp);
-	return (ret);
-}
-
-struct createdata {
-	const char *cd_snapname;
-	int cd_ifexists;
-};
-
-static int
-zfs_create_link_cb(zfs_handle_t *zhp, void *arg)
-{
-	struct createdata *cd = arg;
-	int ret;
-
-	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-		char name[MAXPATHLEN];
-
-		(void) strlcpy(name, zhp->zfs_name, sizeof (name));
-		(void) strlcat(name, "@", sizeof (name));
-		(void) strlcat(name, cd->cd_snapname, sizeof (name));
-		(void) zvol_create_link_common(zhp->zfs_hdl, name,
-		    cd->cd_ifexists);
-		/*
-		 * NB: this is simply a best-effort.  We don't want to
-		 * return an error, because then we wouldn't visit all
-		 * the volumes.
-		 */
-	}
-
-	ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd);
-
-	zfs_close(zhp);
-
 	return (ret);
 }
 
@@ -3153,31 +2995,11 @@
 	 * if it was recursive, the one that actually failed will be in
 	 * zc.zc_name.
 	 */
-	if (ret != 0)
+	if (ret != 0) {
 		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
 		    "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value);
-
-	if (ret == 0 && recursive) {
-		struct createdata cd;
-
-		cd.cd_snapname = delim + 1;
-		cd.cd_ifexists = B_FALSE;
-		(void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd);
+		(void) zfs_standard_error(hdl, errno, errbuf);
 	}
-	if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) {
-		ret = zvol_create_link(zhp->zfs_hdl, path);
-		if (ret != 0) {
-			(void) zfs_standard_error(hdl, errno,
-			    dgettext(TEXT_DOMAIN,
-			    "Volume successfully snapshotted, but device links "
-			    "were not created"));
-			zfs_close(zhp);
-			return (-1);
-		}
-	}
-
-	if (ret != 0)
-		(void) zfs_standard_error(hdl, errno, errbuf);
 
 	zfs_close(zhp);
 
@@ -3280,8 +3102,6 @@
 	 */
 
 	if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
-		if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0)
-			return (-1);
 		if (zfs_which_resv_prop(zhp, &resv_prop) < 0)
 			return (-1);
 		old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
@@ -3319,10 +3139,6 @@
 	 */
 	if ((zhp->zfs_type == ZFS_TYPE_VOLUME) &&
 	    (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) {
-		if (err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name)) {
-			zfs_close(zhp);
-			return (err);
-		}
 		if (restore_resv) {
 			new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE);
 			if (old_volsize != new_volsize)
@@ -3475,7 +3291,6 @@
 	}
 
 	if (recursive) {
-		struct destroydata dd;
 
 		parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name);
 		if (parentname == NULL) {
@@ -3490,15 +3305,6 @@
 			goto error;
 		}
 
-		dd.snapname = delim + 1;
-		dd.gotone = B_FALSE;
-		dd.closezhp = B_TRUE;
-
-		/* We remove any zvol links prior to renaming them */
-		ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, &dd);
-		if (ret) {
-			goto error;
-		}
 	} else {
 		if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL)
 			return (-1);
@@ -3546,27 +3352,10 @@
 		 * On failure, we still want to remount any filesystems that
 		 * were previously mounted, so we don't alter the system state.
 		 */
-		if (recursive) {
-			struct createdata cd;
-
-			/* only create links for datasets that had existed */
-			cd.cd_snapname = delim + 1;
-			cd.cd_ifexists = B_TRUE;
-			(void) zfs_iter_filesystems(zhrp, zfs_create_link_cb,
-			    &cd);
-		} else {
+		if (!recursive)
 			(void) changelist_postfix(cl);
-		}
 	} else {
-		if (recursive) {
-			struct createdata cd;
-
-			/* only create links for datasets that had existed */
-			cd.cd_snapname = strchr(target, '@') + 1;
-			cd.cd_ifexists = B_TRUE;
-			ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb,
-			    &cd);
-		} else {
+		if (!recursive) {
 			changelist_rename(cl, zfs_get_name(zhp), target);
 			ret = changelist_postfix(cl);
 		}
@@ -3585,139 +3374,6 @@
 	return (ret);
 }
 
-/*
- * Given a zvol dataset, issue the ioctl to create the appropriate minor node,
- * poke devfsadm to create the /dev link, and then wait for the link to appear.
- */
-int
-zvol_create_link(libzfs_handle_t *hdl, const char *dataset)
-{
-	return (zvol_create_link_common(hdl, dataset, B_FALSE));
-}
-
-static int
-zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists)
-{
-	zfs_cmd_t zc = { 0 };
-	di_devlink_handle_t dhdl;
-	priv_set_t *priv_effective;
-	int privileged;
-
-	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-	/*
-	 * Issue the appropriate ioctl.
-	 */
-	if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) {
-		switch (errno) {
-		case EEXIST:
-			/*
-			 * Silently ignore the case where the link already
-			 * exists.  This allows 'zfs volinit' to be run multiple
-			 * times without errors.
-			 */
-			return (0);
-
-		case ENOENT:
-			/*
-			 * Dataset does not exist in the kernel.  If we
-			 * don't care (see zfs_rename), then ignore the
-			 * error quietly.
-			 */
-			if (ifexists) {
-				return (0);
-			}
-
-			/* FALLTHROUGH */
-
-		default:
-			return (zfs_standard_error_fmt(hdl, errno,
-			    dgettext(TEXT_DOMAIN, "cannot create device links "
-			    "for '%s'"), dataset));
-		}
-	}
-
-	/*
-	 * If privileged call devfsadm and wait for the links to
-	 * magically appear.
-	 * Otherwise, print out an informational message.
-	 */
-
-	priv_effective = priv_allocset();
-	(void) getppriv(PRIV_EFFECTIVE, priv_effective);
-	privileged = (priv_isfullset(priv_effective) == B_TRUE);
-	priv_freeset(priv_effective);
-
-	if (privileged) {
-		if ((dhdl = di_devlink_init(ZFS_DRIVER,
-		    DI_MAKE_LINK)) == NULL) {
-			zfs_error_aux(hdl, strerror(errno));
-			(void) zfs_error_fmt(hdl, errno,
-			    dgettext(TEXT_DOMAIN, "cannot create device links "
-			    "for '%s'"), dataset);
-			(void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc);
-			return (-1);
-		} else {
-			(void) di_devlink_fini(&dhdl);
-		}
-	} else {
-		char pathname[MAXPATHLEN];
-		struct stat64 statbuf;
-		int i;
-
-#define	MAX_WAIT	10
-
-		/*
-		 * This is the poor mans way of waiting for the link
-		 * to show up.  If after 10 seconds we still don't
-		 * have it, then print out a message.
-		 */
-		(void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s",
-		    dataset);
-
-		for (i = 0; i != MAX_WAIT; i++) {
-			if (stat64(pathname, &statbuf) == 0)
-				break;
-			(void) sleep(1);
-		}
-		if (i == MAX_WAIT)
-			(void) printf(gettext("%s may not be immediately "
-			    "available\n"), pathname);
-	}
-
-	return (0);
-}
-
-/*
- * Remove a minor node for the given zvol and the associated /dev links.
- */
-int
-zvol_remove_link(libzfs_handle_t *hdl, const char *dataset)
-{
-	zfs_cmd_t zc = { 0 };
-
-	(void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name));
-
-	if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) {
-		switch (errno) {
-		case ENXIO:
-			/*
-			 * Silently ignore the case where the link no longer
-			 * exists, so that 'zfs volfini' can be run multiple
-			 * times without errors.
-			 */
-			return (0);
-
-		default:
-			return (zfs_standard_error_fmt(hdl, errno,
-			    dgettext(TEXT_DOMAIN, "cannot remove device "
-			    "links for '%s'"), dataset));
-		}
-	}
-
-	return (0);
-}
-
 nvlist_t *
 zfs_get_user_props(zfs_handle_t *zhp)
 {
--- a/usr/src/lib/libzfs/common/libzfs_impl.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_impl.h	Mon Sep 21 08:55:28 2009 -0600
@@ -172,9 +172,6 @@
 
 int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **);
 
-int zvol_create_link(libzfs_handle_t *, const char *);
-int zvol_remove_link(libzfs_handle_t *, const char *);
-int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *);
 boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *);
 
 void namespace_clear(libzfs_handle_t *);
--- a/usr/src/lib/libzfs/common/libzfs_mount.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_mount.c	Mon Sep 21 08:55:28 2009 -0600
@@ -1215,26 +1215,19 @@
 	return (ret);
 }
 
-
+/*ARGSUSED1*/
 static int
-zvol_cb(const char *dataset, void *data)
+zvol_cb(zfs_handle_t *zhp, void *unused)
 {
-	libzfs_handle_t *hdl = data;
-	zfs_handle_t *zhp;
+	int error = 0;
 
-	/*
-	 * Ignore snapshots and ignore failures from non-existant datasets.
-	 */
-	if (strchr(dataset, '@') != NULL ||
-	    (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL)
-		return (0);
-
-	if (zfs_unshare_iscsi(zhp) != 0)
-		return (-1);
-
+	if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM)
+		(void) zfs_iter_children(zhp, zvol_cb, NULL);
+	if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME)
+		error = zfs_unshare_iscsi(zhp);
 	zfs_close(zhp);
 
-	return (0);
+	return (error);
 }
 
 static int
@@ -1246,6 +1239,8 @@
 	return (strcmp(mountb, mounta));
 }
 
+/* alias for 2002/240 */
+#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 /*
  * Unshare and unmount all datasets within the given pool.  We don't want to
  * rely on traversing the DSL to discover the filesystems within the pool,
@@ -1253,7 +1248,6 @@
  * arbitrarily (on I/O error, for example).  Instead, we walk /etc/mnttab and
  * gather all the filesystems that are currently mounted.
  */
-#pragma weak zpool_unmount_datasets = zpool_disable_datasets
 int
 zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force)
 {
@@ -1261,6 +1255,7 @@
 	struct mnttab entry;
 	size_t namelen;
 	char **mountpoints = NULL;
+	zfs_handle_t *zfp;
 	zfs_handle_t **datasets = NULL;
 	libzfs_handle_t *hdl = zhp->zpool_hdl;
 	int i;
@@ -1270,8 +1265,12 @@
 	/*
 	 * First unshare all zvols.
 	 */
-	if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0)
-		return (-1);
+	zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
+	    ZFS_TYPE_FILESYSTEM);
+	if (zfp != NULL) {
+		(void) zfs_iter_children(zfp, zvol_cb, NULL);
+		zfs_close(zfp);
+	}
 
 	namelen = strlen(zhp->zpool_name);
 
--- a/usr/src/lib/libzfs/common/libzfs_pool.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c	Mon Sep 21 08:55:28 2009 -0600
@@ -24,24 +24,18 @@
  * Use is subject to license terms.
  */
 
-#include <alloca.h>
-#include <assert.h>
 #include <ctype.h>
 #include <errno.h>
 #include <devid.h>
-#include <dirent.h>
 #include <fcntl.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
 #include <unistd.h>
-#include <zone.h>
 #include <sys/efi_partition.h>
 #include <sys/vtoc.h>
 #include <sys/zfs_ioctl.h>
-#include <sys/zio.h>
-#include <strings.h>
 #include <dlfcn.h>
 
 #include "zfs_namecheck.h"
@@ -1004,9 +998,6 @@
 	    ZFS_TYPE_FILESYSTEM)) == NULL)
 		return (-1);
 
-	if (zpool_remove_zvol_links(zhp) != 0)
-		return (-1);
-
 	(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
 
 	if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) {
@@ -1167,9 +1158,6 @@
 	zfs_cmd_t zc = { 0 };
 	char msg[1024];
 
-	if (zpool_remove_zvol_links(zhp) != 0)
-		return (-1);
-
 	(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
 	    "cannot export '%s'"), zhp->zpool_name);
 
@@ -1339,13 +1327,10 @@
 		/*
 		 * This should never fail, but play it safe anyway.
 		 */
-		if (zpool_open_silent(hdl, thename, &zhp) != 0) {
+		if (zpool_open_silent(hdl, thename, &zhp) != 0)
 			ret = -1;
-		} else if (zhp != NULL) {
-			ret = zpool_create_zvol_links(zhp);
+		else if (zhp != NULL)
 			zpool_close(zhp);
-		}
-
 	}
 
 	zcmd_free_nvlists(&zc);
@@ -2344,173 +2329,6 @@
 }
 
 /*
- * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/<pool>
- * hierarchy.
- */
-int
-zpool_iter_zvol(zpool_handle_t *zhp, int (*cb)(const char *, void *),
-    void *data)
-{
-	libzfs_handle_t *hdl = zhp->zpool_hdl;
-	char (*paths)[MAXPATHLEN];
-	size_t size = 4;
-	int curr, fd, base, ret = 0;
-	DIR *dirp;
-	struct dirent *dp;
-	struct stat st;
-
-	if ((base = open("/dev/zvol/dsk", O_RDONLY)) < 0)
-		return (errno == ENOENT ? 0 : -1);
-
-	if (fstatat(base, zhp->zpool_name, &st, 0) != 0) {
-		int err = errno;
-		(void) close(base);
-		return (err == ENOENT ? 0 : -1);
-	}
-
-	/*
-	 * Oddly this wasn't a directory -- ignore that failure since we
-	 * know there are no links lower in the (non-existant) hierarchy.
-	 */
-	if (!S_ISDIR(st.st_mode)) {
-		(void) close(base);
-		return (0);
-	}
-
-	if ((paths = zfs_alloc(hdl, size * sizeof (paths[0]))) == NULL) {
-		(void) close(base);
-		return (-1);
-	}
-
-	(void) strlcpy(paths[0], zhp->zpool_name, sizeof (paths[0]));
-	curr = 0;
-
-	while (curr >= 0) {
-		if (fstatat(base, paths[curr], &st, AT_SYMLINK_NOFOLLOW) != 0)
-			goto err;
-
-		if (S_ISDIR(st.st_mode)) {
-			if ((fd = openat(base, paths[curr], O_RDONLY)) < 0)
-				goto err;
-
-			if ((dirp = fdopendir(fd)) == NULL) {
-				(void) close(fd);
-				goto err;
-			}
-
-			while ((dp = readdir(dirp)) != NULL) {
-				if (dp->d_name[0] == '.')
-					continue;
-
-				if (curr + 1 == size) {
-					paths = zfs_realloc(hdl, paths,
-					    size * sizeof (paths[0]),
-					    size * 2 * sizeof (paths[0]));
-					if (paths == NULL) {
-						(void) closedir(dirp);
-						(void) close(fd);
-						goto err;
-					}
-
-					size *= 2;
-				}
-
-				(void) strlcpy(paths[curr + 1], paths[curr],
-				    sizeof (paths[curr + 1]));
-				(void) strlcat(paths[curr], "/",
-				    sizeof (paths[curr]));
-				(void) strlcat(paths[curr], dp->d_name,
-				    sizeof (paths[curr]));
-				curr++;
-			}
-
-			(void) closedir(dirp);
-
-		} else {
-			if ((ret = cb(paths[curr], data)) != 0)
-				break;
-		}
-
-		curr--;
-	}
-
-	free(paths);
-	(void) close(base);
-
-	return (ret);
-
-err:
-	free(paths);
-	(void) close(base);
-	return (-1);
-}
-
-typedef struct zvol_cb {
-	zpool_handle_t *zcb_pool;
-	boolean_t zcb_create;
-} zvol_cb_t;
-
-/*ARGSUSED*/
-static int
-do_zvol_create(zfs_handle_t *zhp, void *data)
-{
-	int ret = 0;
-
-	if (ZFS_IS_VOLUME(zhp)) {
-		(void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name);
-		ret = zfs_iter_snapshots(zhp, do_zvol_create, NULL);
-	}
-
-	if (ret == 0)
-		ret = zfs_iter_filesystems(zhp, do_zvol_create, NULL);
-
-	zfs_close(zhp);
-
-	return (ret);
-}
-
-/*
- * Iterate over all zvols in the pool and make any necessary minor nodes.
- */
-int
-zpool_create_zvol_links(zpool_handle_t *zhp)
-{
-	zfs_handle_t *zfp;
-	int ret;
-
-	/*
-	 * If the pool is unavailable, just return success.
-	 */
-	if ((zfp = make_dataset_handle(zhp->zpool_hdl,
-	    zhp->zpool_name)) == NULL)
-		return (0);
-
-	ret = zfs_iter_filesystems(zfp, do_zvol_create, NULL);
-
-	zfs_close(zfp);
-	return (ret);
-}
-
-static int
-do_zvol_remove(const char *dataset, void *data)
-{
-	zpool_handle_t *zhp = data;
-
-	return (zvol_remove_link(zhp->zpool_hdl, dataset));
-}
-
-/*
- * Iterate over all zvols in the pool and remove any minor nodes.  We iterate
- * by examining the /dev links so that a corrupted pool doesn't impede this
- * operation.
- */
-int
-zpool_remove_zvol_links(zpool_handle_t *zhp)
-{
-	return (zpool_iter_zvol(zhp, do_zvol_remove, zhp));
-}
-
-/*
  * Convert from a devid string to a path.
  */
 static char *
--- a/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_sendrecv.c	Mon Sep 21 08:55:28 2009 -0600
@@ -27,7 +27,6 @@
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
-#include <libdevinfo.h>
 #include <libintl.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -36,10 +35,6 @@
 #include <stddef.h>
 #include <fcntl.h>
 #include <sys/mount.h>
-#include <sys/mntent.h>
-#include <sys/mnttab.h>
-#include <sys/avl.h>
-#include <stddef.h>
 
 #include <libzfs.h>
 
@@ -1840,12 +1835,6 @@
 				return (-1);
 			}
 		}
-		if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME &&
-		    zvol_remove_link(hdl, zhp->zfs_name) != 0) {
-			zfs_close(zhp);
-			zcmd_free_nvlists(&zc);
-			return (-1);
-		}
 		zfs_close(zhp);
 	} else {
 		/*
@@ -1988,12 +1977,9 @@
 	}
 
 	/*
-	 * Mount or recreate the /dev links for the target filesystem
-	 * (if created, or if we tore them down to do an incremental
-	 * restore), and the /dev links for the new snapshot (if
-	 * created). Also mount any children of the target filesystem
-	 * if we did a replication receive (indicated by stream_avl
-	 * being non-NULL).
+	 * Mount the target filesystem (if created).  Also mount any
+	 * children of the target filesystem if we did a replication
+	 * receive (indicated by stream_avl being non-NULL).
 	 */
 	cp = strchr(zc.zc_value, '@');
 	if (cp && (ioctl_err == 0 || !newfs)) {
@@ -2005,10 +1991,6 @@
 		if (h != NULL) {
 			if (h->zfs_type == ZFS_TYPE_VOLUME) {
 				*cp = '@';
-				err = zvol_create_link(hdl, h->zfs_name);
-				if (err == 0 && ioctl_err == 0)
-					err = zvol_create_link(hdl,
-					    zc.zc_value);
 			} else if (newfs || stream_avl) {
 				/*
 				 * Track the first/top of hierarchy fs,
--- a/usr/src/lib/libzfs/common/libzfs_util.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/libzfs_util.c	Mon Sep 21 08:55:28 2009 -0600
@@ -94,8 +94,6 @@
 	case EZFS_VOLTOOBIG:
 		return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for "
 		    "this system"));
-	case EZFS_VOLHASDATA:
-		return (dgettext(TEXT_DOMAIN, "volume has data"));
 	case EZFS_INVALIDNAME:
 		return (dgettext(TEXT_DOMAIN, "invalid name"));
 	case EZFS_BADRESTORE:
@@ -142,8 +140,6 @@
 		return (dgettext(TEXT_DOMAIN,
 		    "iscsitgt service need to be enabled by "
 		    "a privileged user"));
-	case EZFS_DEVLINKS:
-		return (dgettext(TEXT_DOMAIN, "failed to create /dev links"));
 	case EZFS_PERM:
 		return (dgettext(TEXT_DOMAIN, "permission denied"));
 	case EZFS_NOSPC:
--- a/usr/src/lib/libzfs/common/mapfile-vers	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/libzfs/common/mapfile-vers	Mon Sep 21 08:55:28 2009 -0600
@@ -141,7 +141,6 @@
 	zpool_clear;
 	zpool_close;
 	zpool_create;
-	zpool_create_zvol_links;
 	zpool_destroy;
 	zpool_disable_datasets;
 	zpool_enable_datasets;
@@ -183,7 +182,6 @@
 	zpool_prop_values;
 	zpool_read_label;
 	zpool_refresh_stats;
-	zpool_remove_zvol_links;
 	zpool_scrub;
 	zpool_set_history_str;
 	zpool_set_prop;
--- a/usr/src/lib/pyzfs/common/allow.py	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/lib/pyzfs/common/allow.py	Mon Sep 21 08:55:28 2009 -0600
@@ -204,8 +204,8 @@
 perms_subcmd = dict(
     create=_("Must also have the 'mount' ability"),
     destroy=_("Must also have the 'mount' ability"),
-    snapshot=_("Must also have the 'mount' ability"),
-    rollback=_("Must also have the 'mount' ability"),
+    snapshot="",
+    rollback="",
     clone=_("""Must also have the 'create' ability and 'mount'
 \t\t\t\tability in the origin file system"""),
     promote=_("""Must also have the 'mount'
--- a/usr/src/uts/common/Makefile.files	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/Makefile.files	Mon Sep 21 08:55:28 2009 -0600
@@ -1064,9 +1064,10 @@
 DEVFS_OBJS +=	devfs_subr.o	devfs_vfsops.o	devfs_vnops.o
 
 DEV_OBJS  +=	sdev_subr.o	sdev_vfsops.o	sdev_vnops.o	\
-		sdev_ptsops.o	sdev_comm.o	sdev_profile.o	\
-		sdev_ncache.o	sdev_netops.o	sdev_vtops.o	\
-		sdev_ipnetops.o
+		sdev_ptsops.o	sdev_zvolops.o	sdev_comm.o	\
+		sdev_profile.o	sdev_ncache.o	sdev_netops.o	\
+		sdev_ipnetops.o	\
+		sdev_vtops.o
 
 CTFS_OBJS +=	ctfs_all.o ctfs_cdir.o ctfs_ctl.o ctfs_event.o \
 		ctfs_latest.o ctfs_root.o ctfs_sym.o ctfs_tdir.o ctfs_tmpl.o
--- a/usr/src/uts/common/fs/dev/sdev_subr.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/dev/sdev_subr.c	Mon Sep 21 08:55:28 2009 -0600
@@ -152,7 +152,7 @@
 
 /* static */
 static struct vnodeops *sdev_get_vop(struct sdev_node *);
-static void sdev_set_no_nocache(struct sdev_node *);
+static void sdev_set_no_negcache(struct sdev_node *);
 static fs_operation_def_t *sdev_merge_vtab(const fs_operation_def_t []);
 static void sdev_free_vtab(fs_operation_def_t *);
 
@@ -329,7 +329,7 @@
 		dhl = &(dv->sdev_handle);
 		dhl->dh_data = dv;
 		dhl->dh_args = NULL;
-		sdev_set_no_nocache(dv);
+		sdev_set_no_negcache(dv);
 		dv->sdev_gdir_gen = 0;
 	} else {
 		dv->sdev_flags &= ~SDEV_GLOBAL;
@@ -402,11 +402,8 @@
 		else
 			*dv->sdev_attr = *vap;
 
-		if ((SDEV_IS_PERSIST(dv) && (dv->sdev_attrvp == NULL)) ||
-		    ((SDEVTOV(dv)->v_type == VDIR) &&
-		    (dv->sdev_attrvp == NULL))) {
+		if ((dv->sdev_attrvp == NULL) && SDEV_IS_PERSIST(dv))
 			error = sdev_shadow_node(dv, cred);
-		}
 	}
 
 	if (error == 0) {
@@ -517,6 +514,9 @@
 	{ "vt", devvt_vnodeops_tbl, NULL, &devvt_vnodeops, devvt_validate,
 	SDEV_DYNAMIC | SDEV_VTOR },
 
+	{ "zvol", devzvol_vnodeops_tbl, NULL, &devzvol_vnodeops,
+	devzvol_validate, SDEV_DYNAMIC | SDEV_VTOR | SDEV_SUBDIR },
+
 	{ "zcons", NULL, NULL, NULL, NULL, SDEV_NO_NCACHE },
 
 	{ "net", devnet_vnodeops_tbl, NULL, &devnet_vnodeops, devnet_validate,
@@ -528,6 +528,29 @@
 	{ NULL, NULL, NULL, NULL, NULL, 0}
 };
 
+/*
+ * Match dv to a vtab entry by name or, for SDEV_SUBDIR, by "<vt_name>/" path.
+ */
+struct sdev_vop_table *
+sdev_match(struct sdev_node *dv)
+{
+	char *ptr;
+	int i, vlen;
+
+	for (i = 0; vtab[i].vt_name; i++) {
+		if (strcmp(vtab[i].vt_name, dv->sdev_name) == 0)
+			return (&vtab[i]);
+		if (!(vtab[i].vt_flags & SDEV_SUBDIR))
+			continue;
+		ASSERT(strlen(dv->sdev_path) > 5);
+		ptr = dv->sdev_path + 5;	/* skip the "/dev/" prefix */
+		vlen = strlen(vtab[i].vt_name);
+		if (strncmp(vtab[i].vt_name, ptr, vlen) == 0 &&
+		    ptr[vlen] == '/')	/* in bounds after full match */
+			return (&vtab[i]);
+	}
+	return (NULL);
+}
 
 /*
  *  sets a directory's vnodeops if the directory is in the vtab;
@@ -535,7 +558,7 @@
 static struct vnodeops *
 sdev_get_vop(struct sdev_node *dv)
 {
-	int i;
+	struct sdev_vop_table *vtp;
 	char *path;
 
 	path = dv->sdev_path;
@@ -544,33 +567,31 @@
 	/* gets the relative path to /dev/ */
 	path += 5;
 
-	/* gets the vtab entry if matches */
-	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, path) != 0)
-			continue;
-		dv->sdev_flags |= vtab[i].vt_flags;
-
-		if (vtab[i].vt_vops) {
-			if (vtab[i].vt_global_vops)
-				*(vtab[i].vt_global_vops) = vtab[i].vt_vops;
-			return (vtab[i].vt_vops);
+	/* gets the vtab entry it matches */
+	if ((vtp = sdev_match(dv)) != NULL) {
+		dv->sdev_flags |= vtp->vt_flags;
+
+		if (vtp->vt_vops) {
+			if (vtp->vt_global_vops)
+				*(vtp->vt_global_vops) = vtp->vt_vops;
+			return (vtp->vt_vops);
 		}
 
-		if (vtab[i].vt_service) {
+		if (vtp->vt_service) {
 			fs_operation_def_t *templ;
-			templ = sdev_merge_vtab(vtab[i].vt_service);
-			if (vn_make_ops(vtab[i].vt_name,
+			templ = sdev_merge_vtab(vtp->vt_service);
+			if (vn_make_ops(vtp->vt_name,
 			    (const fs_operation_def_t *)templ,
-			    &vtab[i].vt_vops) != 0) {
+			    &vtp->vt_vops) != 0) {
 				cmn_err(CE_PANIC, "%s: malformed vnode ops\n",
-				    vtab[i].vt_name);
+				    vtp->vt_name);
 				/*NOTREACHED*/
 			}
-			if (vtab[i].vt_global_vops) {
-				*(vtab[i].vt_global_vops) = vtab[i].vt_vops;
+			if (vtp->vt_global_vops) {
+				*(vtp->vt_global_vops) = vtp->vt_vops;
 			}
 			sdev_free_vtab(templ);
-			return (vtab[i].vt_vops);
+			return (vtp->vt_vops);
 		}
 		return (sdev_vnodeops);
 	}
@@ -583,7 +604,7 @@
 }
 
 static void
-sdev_set_no_nocache(struct sdev_node *dv)
+sdev_set_no_negcache(struct sdev_node *dv)
 {
 	int i;
 	char *path;
@@ -603,14 +624,13 @@
 void *
 sdev_get_vtor(struct sdev_node *dv)
 {
-	int i;
-
-	for (i = 0; vtab[i].vt_name; i++) {
-		if (strcmp(vtab[i].vt_name, dv->sdev_name) != 0)
-			continue;
-		return ((void *)vtab[i].vt_vtor);
-	}
-	return (NULL);
+	struct sdev_vop_table *vtp;
+
+	vtp = sdev_match(dv);
+	if (vtp)
+		return ((void *)vtp->vt_vtor);
+	else
+		return (NULL);
 }
 
 /*
@@ -631,7 +651,7 @@
 	return (ino);
 }
 
-static int
+int
 sdev_getlink(struct vnode *linkvp, char **link)
 {
 	int err;
@@ -988,7 +1008,7 @@
 		rw_enter(&dv->sdev_contents, RW_WRITER);
 		if (dv->sdev_state == SDEV_READY) {
 			sdcmn_err9((
-			    "sdev_delete: node %s busy with count %d\n",
+			    "sdev_dirdelete: node %s busy with count %d\n",
 			    dv->sdev_name, vp->v_count));
 			dv->sdev_state = SDEV_ZOMBIE;
 		}
@@ -1287,7 +1307,7 @@
 	}
 }
 
-static struct vattr *
+struct vattr *
 sdev_getdefault_attr(enum vtype type)
 {
 	if (type == VDIR)
@@ -1327,34 +1347,6 @@
 }
 
 /*
- * the junction between devname and devfs
- */
-static struct vnode *
-devname_configure_by_path(char *physpath, struct vattr *vattr)
-{
-	int error = 0;
-	struct vnode *vp;
-
-	ASSERT(strncmp(physpath, "/devices/", sizeof ("/devices/") - 1)
-	    == 0);
-
-	error = devfs_lookupname(physpath + sizeof ("/devices/") - 1,
-	    NULLVPP, &vp);
-	if (error != 0) {
-		if (error == ENODEV) {
-			cmn_err(CE_CONT, "%s: not found (line %d)\n",
-			    physpath, __LINE__);
-		}
-
-		return (NULL);
-	}
-
-	if (vattr)
-		(void) VOP_GETATTR(vp, vattr, 0, kcred, NULL);
-	return (vp);
-}
-
-/*
  * junction between devname and root file system, e.g. ufs
  */
 int
@@ -1418,10 +1410,8 @@
 		if (error || dbuflen == 0)
 			break;
 
-		if (!(ddv->sdev_flags & SDEV_BUILD)) {
-			error = 0;
+		if (!(ddv->sdev_flags & SDEV_BUILD))
 			break;
-		}
 
 		for (dp = dbuf; ((intptr_t)dp <
 		    (intptr_t)dbuf + dbuflen);
@@ -1511,6 +1501,9 @@
 	ASSERT((ddv->sdev_flags & SDEV_BUILD));
 
 	vap = sdev_getdefault_attr(VDIR);
+	gethrestime(&vap->va_atime);
+	vap->va_mtime = vap->va_atime;
+	vap->va_ctime = vap->va_atime;
 	for (i = 0; vtab[i].vt_name != NULL; i++) {
 		nm = vtab[i].vt_name;
 		ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
@@ -1737,66 +1730,12 @@
 {
 	int rv = 0;
 	char *physpath = NULL;
-	struct vnode *rvp = NULL;
 	struct vattr vattr;
 	struct vattr *vap;
-	struct sdev_node *dv = *dvp;
-
-	mutex_enter(&dv->sdev_lookup_lock);
-	SDEV_BLOCK_OTHERS(dv, SDEV_LOOKUP);
-	mutex_exit(&dv->sdev_lookup_lock);
-
-	/* for non-devfsadm devices */
-	if (flags & SDEV_PATH) {
-		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
-		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
-		    NULL);
-		if (rv) {
-			kmem_free(physpath, MAXPATHLEN);
-			return (-1);
-		}
-
-		rvp = devname_configure_by_path(physpath, NULL);
-		if (rvp == NULL) {
-			sdcmn_err3(("devname_configure_by_path: "
-			    "failed for /dev/%s/%s\n",
-			    ddv->sdev_name, nm));
-			kmem_free(physpath, MAXPATHLEN);
-			rv = -1;
-		} else {
-			vap = sdev_getdefault_attr(VLNK);
-			ASSERT(RW_READ_HELD(&ddv->sdev_contents));
-
-			/*
-			 * Sdev_mknode may return back a different sdev_node
-			 * that was created by another thread that
-			 * raced to the directroy cache before this thread.
-			 *
-			 * With current directory cache mechanism
-			 * (linked list with the sdev_node name as
-			 * the entity key), this is a way to make sure
-			 * only one entry exists for the same name
-			 * in the same directory. The outcome is
-			 * the winner wins.
-			 */
-			if (!rw_tryupgrade(&ddv->sdev_contents)) {
-				rw_exit(&ddv->sdev_contents);
-				rw_enter(&ddv->sdev_contents, RW_WRITER);
-			}
-			rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
-			    (void *)physpath, cred, SDEV_READY);
-			rw_downgrade(&ddv->sdev_contents);
-			kmem_free(physpath, MAXPATHLEN);
-			if (rv) {
-				return (rv);
-			} else {
-				mutex_enter(&dv->sdev_lookup_lock);
-				SDEV_UNBLOCK_OTHERS(dv, SDEV_LOOKUP);
-				mutex_exit(&dv->sdev_lookup_lock);
-				return (0);
-			}
-		}
-	} else if (flags & SDEV_VLINK) {
+	struct sdev_node *dv = NULL;
+
+	ASSERT(RW_WRITE_HELD(&ddv->sdev_contents));
+	if (flags & SDEV_VLINK) {
 		physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		rv = callback(ddv, nm, (void *)&physpath, kcred, NULL,
 		    NULL);
@@ -1807,61 +1746,19 @@
 
 		vap = sdev_getdefault_attr(VLNK);
 		vap->va_size = strlen(physpath);
-		ASSERT(RW_READ_HELD(&ddv->sdev_contents));
-
-		if (!rw_tryupgrade(&ddv->sdev_contents)) {
-			rw_exit(&ddv->sdev_contents);
-			rw_enter(&ddv->sdev_contents, RW_WRITER);
-		}
+		gethrestime(&vap->va_atime);
+		vap->va_mtime = vap->va_atime;
+		vap->va_ctime = vap->va_atime;
+
 		rv = sdev_mknode(ddv, nm, &dv, vap, NULL,
 		    (void *)physpath, cred, SDEV_READY);
-		rw_downgrade(&ddv->sdev_contents);
 		kmem_free(physpath, MAXPATHLEN);
 		if (rv)
 			return (rv);
-
-		mutex_enter(&dv->sdev_lookup_lock);
-		SDEV_UNBLOCK_OTHERS(dv, SDEV_LOOKUP);
-		mutex_exit(&dv->sdev_lookup_lock);
-		return (0);
-	} else if (flags & SDEV_VNODE) {
-		/*
-		 * DBNR has its own way to create the device
-		 * and return a backing store vnode in rvp
-		 */
-		ASSERT(callback);
-		rv = callback(ddv, nm, (void *)&rvp, kcred, NULL, NULL);
-		if (rv || (rvp == NULL)) {
-			sdcmn_err3(("devname_lookup_func: SDEV_VNODE "
-			    "callback failed \n"));
-			return (-1);
-		}
-		vap = sdev_getdefault_attr(rvp->v_type);
-		if (vap == NULL)
-			return (-1);
-
-		ASSERT(RW_READ_HELD(&ddv->sdev_contents));
-		if (!rw_tryupgrade(&ddv->sdev_contents)) {
-			rw_exit(&ddv->sdev_contents);
-			rw_enter(&ddv->sdev_contents, RW_WRITER);
-		}
-		rv = sdev_mknode(ddv, nm, &dv, vap, rvp, NULL,
-		    cred, SDEV_READY);
-		rw_downgrade(&ddv->sdev_contents);
-		if (rv)
-			return (rv);
-
-		mutex_enter(&dv->sdev_lookup_lock);
-		SDEV_UNBLOCK_OTHERS(dv, SDEV_LOOKUP);
-		mutex_exit(&dv->sdev_lookup_lock);
-		return (0);
 	} else if (flags & SDEV_VATTR) {
 		/*
 		 * /dev/pts
 		 *
-		 * DBNR has its own way to create the device
-		 * "0" is returned upon success.
-		 *
 		 * callback is responsible to set the basic attributes,
 		 * e.g. va_type/va_uid/va_gid/
 		 *    dev_t if VCHR or VBLK/
@@ -1874,22 +1771,12 @@
 			return (-1);
 		}
 
-		ASSERT(RW_READ_HELD(&ddv->sdev_contents));
-		if (!rw_tryupgrade(&ddv->sdev_contents)) {
-			rw_exit(&ddv->sdev_contents);
-			rw_enter(&ddv->sdev_contents, RW_WRITER);
-		}
 		rv = sdev_mknode(ddv, nm, &dv, &vattr, NULL, NULL,
 		    cred, SDEV_READY);
-		rw_downgrade(&ddv->sdev_contents);
 
 		if (rv)
 			return (rv);
 
-		mutex_enter(&dv->sdev_lookup_lock);
-		SDEV_UNBLOCK_OTHERS(dv, SDEV_LOOKUP);
-		mutex_exit(&dv->sdev_lookup_lock);
-		return (0);
 	} else {
 		impossible(("lookup: %s/%s by %s not supported (%d)\n",
 		    SDEVTOV(ddv)->v_path, nm, curproc->p_user.u_comm,
@@ -1914,7 +1801,6 @@
 	return (0);
 }
 
-
 /*
  * Lookup Order:
  *	sdev_node cache;
@@ -2050,7 +1936,7 @@
 	 */
 	if (parent_state == SDEV_ZOMBIE) {
 		rw_exit(&ddv->sdev_contents);
-		*vpp = NULL;
+		*vpp = NULLVP;
 		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
 		return (ENOENT);
 	}
@@ -2060,13 +1946,12 @@
 	 *	SDEV_PERSIST is default except:
 	 *		1) pts nodes
 	 *		2) non-chmod'ed local nodes
+	 *		3) zvol nodes
 	 */
 	if (SDEV_IS_PERSIST(ddv)) {
 		error = devname_backstore_lookup(ddv, nm, &rvp);
 
 		if (!error) {
-			sdcmn_err3(("devname_backstore_lookup: "
-			    "found attrvp %p for %s\n", (void *)rvp, nm));
 
 			vattr.va_mask = AT_MODE|AT_UID|AT_GID;
 			error = VOP_GETATTR(rvp, &vattr, 0, cred, NULL);
@@ -2133,6 +2018,23 @@
 
 lookup_create_node:
 	/* first thread that is doing the lookup on this node */
+	if (callback) {
+		ASSERT(dv == NULL);
+		if (!rw_tryupgrade(&ddv->sdev_contents)) {
+			rw_exit(&ddv->sdev_contents);
+			rw_enter(&ddv->sdev_contents, RW_WRITER);
+		}
+		error = sdev_call_dircallback(ddv, &dv, nm, callback,
+		    flags, cred);
+		rw_downgrade(&ddv->sdev_contents);
+		if (error == 0) {
+			goto found;
+		} else {
+			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+			rw_exit(&ddv->sdev_contents);
+			goto lookup_failed;
+		}
+	}
 	if (!dv) {
 		if (!rw_tryupgrade(&ddv->sdev_contents)) {
 			rw_exit(&ddv->sdev_contents);
@@ -2149,86 +2051,62 @@
 		}
 		rw_downgrade(&ddv->sdev_contents);
 	}
-	ASSERT(dv);
-	ASSERT(SDEV_HELD(dv));
-
-	if (SDEV_IS_NO_NCACHE(dv)) {
-		failed_flags |= SLF_NO_NCACHE;
-	}
 
 	/*
 	 * (b1) invoking devfsadm once per life time for devfsadm nodes
 	 */
-	if (!callback) {
-
-		if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
-		    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
-		    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
-			ASSERT(SDEV_HELD(dv));
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto nolock_notfound;
-		}
-
-		/*
-		 * filter out known non-existent devices recorded
-		 * during initial reconfiguration boot for which
-		 * reconfig should not be done and lookup may
-		 * be short-circuited now.
-		 */
-		if (sdev_lookup_filter(ddv, nm)) {
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto nolock_notfound;
-		}
-
-		/* bypassing devfsadm internal nodes */
-		if (is_devfsadm_thread(lookup_thread)) {
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto nolock_notfound;
-		}
-
-		if (sdev_reconfig_disable) {
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto nolock_notfound;
-		}
-
-		error = sdev_call_devfsadmd(ddv, dv, nm);
-		if (error == 0) {
-			sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
-			    ddv->sdev_name, nm, curproc->p_user.u_comm));
-			if (sdev_reconfig_verbose) {
-				cmn_err(CE_CONT,
-				    "?lookup of %s/%s by %s: reconfig\n",
-				    ddv->sdev_name, nm, curproc->p_user.u_comm);
-			}
-			retried = 1;
-			failed_flags |= SLF_REBUILT;
-			ASSERT(dv->sdev_state != SDEV_ZOMBIE);
-			SDEV_SIMPLE_RELE(dv);
-			goto tryagain;
-		} else {
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto nolock_notfound;
-		}
+	ASSERT(SDEV_HELD(dv));
+
+	if (SDEV_IS_NO_NCACHE(dv))
+		failed_flags |= SLF_NO_NCACHE;
+	if (sdev_reconfig_boot || !i_ddi_io_initialized() ||
+	    SDEV_IS_DYNAMIC(ddv) || SDEV_IS_NO_NCACHE(dv) ||
+	    ((moddebug & MODDEBUG_FINI_EBUSY) != 0)) {
+		ASSERT(SDEV_HELD(dv));
+		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+		goto nolock_notfound;
 	}
 
 	/*
-	 * (b2) Directory Based Name Resolution (DBNR):
-	 *	ddv	- parent
-	 *	nm	- /dev/(ddv->sdev_name)/nm
-	 *
-	 *	note: module vnode ops take precedence than the build-in ones
+	 * filter out known non-existent devices recorded
+	 * during initial reconfiguration boot for which
+	 * reconfig should not be done and lookup may
+	 * be short-circuited now.
 	 */
-	if (callback) {
-		error = sdev_call_dircallback(ddv, &dv, nm, callback,
-		    flags, cred);
-		if (error == 0) {
-			goto found;
-		} else {
-			SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
-			goto notfound;
+	if (sdev_lookup_filter(ddv, nm)) {
+		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+		goto nolock_notfound;
+	}
+
+	/* bypassing devfsadm internal nodes */
+	if (is_devfsadm_thread(lookup_thread)) {
+		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+		goto nolock_notfound;
+	}
+
+	if (sdev_reconfig_disable) {
+		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+		goto nolock_notfound;
+	}
+
+	error = sdev_call_devfsadmd(ddv, dv, nm);
+	if (error == 0) {
+		sdcmn_err8(("lookup of %s/%s by %s: reconfig\n",
+		    ddv->sdev_name, nm, curproc->p_user.u_comm));
+		if (sdev_reconfig_verbose) {
+			cmn_err(CE_CONT,
+			    "?lookup of %s/%s by %s: reconfig\n",
+			    ddv->sdev_name, nm, curproc->p_user.u_comm);
 		}
+		retried = 1;
+		failed_flags |= SLF_REBUILT;
+		ASSERT(dv->sdev_state != SDEV_ZOMBIE);
+		SDEV_SIMPLE_RELE(dv);
+		goto tryagain;
+	} else {
+		SD_TRACE_FAILED_LOOKUP(ddv, nm, retried);
+		goto nolock_notfound;
 	}
-	ASSERT(rvp);
 
 found:
 	ASSERT(!(dv->sdev_flags & SDEV_STALE));
@@ -2275,7 +2153,6 @@
 			    "dev fs: validator failed: %s(%p)\n",
 			    dv->sdev_name, (void *)dv);
 			break;
-			/*NOTREACHED*/
 		}
 	}
 
@@ -2286,10 +2163,6 @@
 	    dv->sdev_state, nm, rv));
 	return (rv);
 
-notfound:
-	mutex_enter(&dv->sdev_lookup_lock);
-	SDEV_UNBLOCK_OTHERS(dv, SDEV_LOOKUP);
-	mutex_exit(&dv->sdev_lookup_lock);
 nolock_notfound:
 	/*
 	 * Destroy the node that is created for synchronization purposes.
@@ -2668,7 +2541,7 @@
 		/* bypassing pre-matured nodes */
 		if (diroff < soff || (dv->sdev_state != SDEV_READY)) {
 			sdcmn_err3(("sdev_readdir: pre-mature node  "
-			    "%s\n", dv->sdev_name));
+			    "%s %d\n", dv->sdev_name, dv->sdev_state));
 			continue;
 		}
 
--- a/usr/src/uts/common/fs/dev/sdev_vnops.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/dev/sdev_vnops.c	Mon Sep 21 08:55:28 2009 -0600
@@ -85,6 +85,8 @@
 	if (!SDEV_IS_GLOBAL(dv))
 		return (ENOTSUP);
 
+	if ((*vpp)->v_type == VLNK)
+		return (ENOENT);
 	ASSERT((*vpp)->v_type == VREG);
 	if ((*vpp)->v_type != VREG)
 		return (ENOTSUP);
@@ -1015,7 +1017,7 @@
 	rw_exit(&parent->sdev_dotdot->sdev_contents);
 
 	/* execute access is required to search the directory */
-	if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
+	if ((error = VOP_ACCESS(dvp, VEXEC|VWRITE, 0, cred, ct)) != 0)
 		return (error);
 
 	/* check existing name */
@@ -1034,13 +1036,6 @@
 		return (ENOENT);
 	}
 
-	/* write access is required to remove a directory */
-	if ((error = VOP_ACCESS(dvp, VWRITE, 0, cred, ct)) != 0) {
-		rw_exit(&parent->sdev_contents);
-		VN_RELE(vp);
-		return (error);
-	}
-
 	/* some sanity checks */
 	if (vp == dvp || vp == cdir) {
 		rw_exit(&parent->sdev_contents);
@@ -1238,17 +1233,6 @@
 }
 
 static int
-sdev_setfl(struct vnode *vp, int oflags, int nflags, cred_t *cr,
-    caller_context_t *ct)
-{
-	struct sdev_node *dv = VTOSDEV(vp);
-	ASSERT(dv);
-	ASSERT(dv->sdev_attrvp);
-
-	return (VOP_SETFL(dv->sdev_attrvp, oflags, nflags, cr, ct));
-}
-
-static int
 sdev_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
@@ -1288,7 +1272,6 @@
 	VOPNAME_SEEK,		{ .vop_seek = sdev_seek },
 	VOPNAME_FRLOCK,		{ .vop_frlock = sdev_frlock },
 	VOPNAME_PATHCONF,	{ .vop_pathconf = sdev_pathconf },
-	VOPNAME_SETFL,		{ .vop_setfl = sdev_setfl },
 	VOPNAME_SETSECATTR,	{ .vop_setsecattr = sdev_setsecattr },
 	VOPNAME_GETSECATTR,	{ .vop_getsecattr = sdev_getsecattr },
 	NULL,			NULL
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/dev/sdev_zvolops.c	Mon Sep 21 08:55:28 2009 -0600
@@ -0,0 +1,686 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* vnode ops for the /dev/zvol directory */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/ddi.h>
+#include <sys/sunndi.h>
+#include <sys/sunldi.h>
+#include <fs/fs_subr.h>
+#include <sys/fs/dv_node.h>
+#include <sys/fs/sdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/vfs_opreg.h>
+
+/* vnode operations vector for nodes under /dev/zvol */
+struct vnodeops	*devzvol_vnodeops;
+/* cookie exchanged with ZFS_IOC_POOL_CONFIGS; 0 means nothing is cached */
+static uint64_t devzvol_gen = 0;
+/* packed nvlist buffer from the last pool-config ioctl (pointer as integer) */
+static uint64_t devzvol_zclist;
+/* allocation size of the buffer referenced by devzvol_zclist */
+static size_t devzvol_zclist_size;
+/* LDI identifier and handle used to talk to /dev/zfs */
+static ldi_ident_t devzvol_li;
+static ldi_handle_t devzvol_lh;
+/* taken around the ioctl path and updates to the cached pool list */
+static kmutex_t devzvol_mtx;
+/* non-zero while devzvol_open_zfs() has succeeded and not been undone */
+static int devzvol_isopen;
+
+/*
+ * We need to use ddi_mod* since fs/dev gets loaded early on in
+ * startup(), and linking fs/dev to fs/zfs would drag in a lot of
+ * other stuff (like drv/random) before the rest of the system is
+ * ready to go.
+ *
+ * zfs_mod is the handle returned by ddi_modopen("fs/zfs"); szcm and
+ * szn2m are the zvol_create_minor and zvol_name2minor entry points
+ * resolved from it with ddi_modsym().
+ */
+ddi_modhandle_t zfs_mod;
+int (*szcm)(char *);
+int (*szn2m)(char *, minor_t *);
+
+/*
+ * Call zvol_create_minor for 'dsname' through the szcm pointer that
+ * devzvol_open_zfs() resolves, and hand its result back unchanged.
+ */
+int
+sdev_zvol_create_minor(char *dsname)
+{
+	int rc = szcm(dsname);
+
+	return (rc);
+}
+
+/*
+ * Call zvol_name2minor for 'dsname' through the szn2m pointer that
+ * devzvol_open_zfs() resolves, and hand its result back unchanged.
+ * 'minor' is passed straight through; callers in this file pass NULL
+ * when they only care whether the call succeeds.
+ */
+int
+sdev_zvol_name2minor(char *dsname, minor_t *minor)
+{
+	int rc = szn2m(dsname, minor);
+
+	return (rc);
+}
+
+/*
+ * Open /dev/zfs through LDI and resolve the two zvol entry points
+ * (zvol_create_minor, zvol_name2minor) from the fs/zfs module.
+ *
+ * Returns 0 on success and a non-zero value on failure.  On failure
+ * everything acquired here (LDI identifier, LDI handle, module handle,
+ * resolved symbols) is released again so that a later retry starts
+ * from a clean state and the ASSERT below still holds.
+ */
+int
+devzvol_open_zfs()
+{
+	int rc;
+
+	devzvol_li = ldi_ident_from_anon();
+	if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
+	    &devzvol_lh, devzvol_li)) {
+		/* nothing but the identifier is held at this point */
+		ldi_ident_release(devzvol_li);
+		return (-1);
+	}
+	if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
+	    KRTLD_MODE_FIRST, &rc)) == NULL)) {
+		goto fail;
+	}
+	ASSERT(szcm == NULL && szn2m == NULL);
+	if ((szcm = (int (*)(char *))
+	    ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
+		cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
+		goto fail;
+	}
+	if ((szn2m = (int(*)(char *, minor_t *))
+	    ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
+		cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
+		goto fail;
+	}
+	return (0);
+
+fail:
+	/* mirror devzvol_close_zfs() so no handle is leaked */
+	szcm = NULL;
+	szn2m = NULL;
+	(void) ldi_close(devzvol_lh, FREAD | FWRITE, kcred);
+	ldi_ident_release(devzvol_li);
+	if (zfs_mod != NULL) {
+		(void) ddi_modclose(zfs_mod);
+		zfs_mod = NULL;
+	}
+	/* never report success after tearing everything down */
+	return (rc != 0 ? rc : -1);
+}
+
+/*
+ * Undo devzvol_open_zfs(): drop the resolved zvol entry points, close
+ * the LDI handle, release the LDI identifier and, if a module handle
+ * is held, close it and clear zfs_mod.
+ */
+void
+devzvol_close_zfs()
+{
+	szn2m = NULL;
+	szcm = NULL;
+
+	(void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
+	ldi_ident_release(devzvol_li);
+
+	if (zfs_mod == NULL)
+		return;
+	(void) ddi_modclose(zfs_mod);
+	zfs_mod = NULL;
+}
+
+/*
+ * Issue the ZFS ioctl 'cmd' on the /dev/zfs LDI handle, opening the
+ * handle first if it is not open yet (ENXIO if that fails).
+ *
+ * A destination nvlist buffer is allocated here, starting at 8000
+ * bytes.  While the ioctl answers ENOMEM the buffer is reallocated at
+ * the size found in zc_nvlist_dst_size, the caller's original cookie
+ * is restored, and the ioctl is retried.
+ *
+ * If alloc_size is NULL the buffer is freed before returning;
+ * otherwise the buffer is left in zc->zc_nvlist_dst for the caller to
+ * free and its size is stored in *alloc_size.
+ *
+ * devzvol_mtx is taken here for every command except
+ * ZFS_IOC_POOL_CONFIGS; devzvol_create_pool_dirs() issues that command
+ * with the mutex already held.
+ */
+int
+devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
+{
+	const int take_lock = (cmd != ZFS_IOC_POOL_CONFIGS);
+	uint64_t saved_cookie;
+	int bufsz = 8000;
+	int rval;
+	int rc;
+
+	if (take_lock)
+		mutex_enter(&devzvol_mtx);
+	if (devzvol_isopen == 0) {
+		if ((rc = devzvol_open_zfs()) != 0) {
+			if (take_lock)
+				mutex_exit(&devzvol_mtx);
+			return (ENXIO);
+		}
+		devzvol_isopen++;
+	}
+
+	saved_cookie = zc->zc_cookie;
+	for (;;) {
+		int wanted;
+
+		zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(bufsz,
+		    KM_SLEEP);
+		zc->zc_nvlist_dst_size = bufsz;
+		rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
+		    &rval);
+		if (rc != ENOMEM)
+			break;
+
+		/* buffer too small: retry with the size reported back */
+		wanted = zc->zc_nvlist_dst_size;
+		ASSERT(wanted > bufsz);
+		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, bufsz);
+		bufsz = wanted;
+		zc->zc_cookie = saved_cookie;
+	}
+
+	if (alloc_size != NULL)
+		*alloc_size = bufsz;
+	else
+		kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, bufsz);
+	if (take_lock)
+		mutex_exit(&devzvol_mtx);
+	return (rc);
+}
+
+/*
+ * Check whether the objset 'dsname' exists and, if 'type' is non-NULL,
+ * report its type.  Returns 0 when it exists, otherwise the error from
+ * the stats ioctl.
+ *
+ * A name without a '/' is treated as a pool: it is checked with
+ * ZFS_IOC_POOL_STATS and reported as DMU_OST_ZFS.  Any other name is
+ * first tried against the zvol minor table (a hit is reported as
+ * DMU_OST_ZVOL without an ioctl) and then with ZFS_IOC_OBJSET_STATS.
+ */
+int
+devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
+{
+	zfs_cmd_t	*zc;
+	boolean_t	ispool;
+	int rc = 0;
+
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	(void) strlcpy(zc->zc_name, dsname, MAXPATHLEN);
+	ispool = (strchr(dsname, '/') == NULL) ? B_TRUE : B_FALSE;
+
+	if (!ispool && sdev_zvol_name2minor(dsname, NULL) == 0) {
+		sdcmn_err13(("found cached minor node"));
+		if (type)
+			*type = DMU_OST_ZVOL;
+		goto done;
+	}
+
+	if (ispool) {
+		rc = devzvol_handle_ioctl(ZFS_IOC_POOL_STATS, zc, NULL);
+		if (type && rc == 0)
+			*type = DMU_OST_ZFS;
+	} else {
+		rc = devzvol_handle_ioctl(ZFS_IOC_OBJSET_STATS, zc, NULL);
+		if (type && rc == 0)
+			*type = zc->zc_objset_stats.dds_type;
+	}
+done:
+	kmem_free(zc, sizeof (zfs_cmd_t));
+	return (rc);
+}
+
+/*
+ * Translate a /dev/zvol path, plus an optional final component 'name',
+ * into the ZFS dataset name it stands for.
+ *
+ * Returns NULL when 'path' is ZVOL_DIR itself, when 'name' is "." or
+ * "..", or when the part of 'path' after ZVOL_DIR does not begin with
+ * "/dsk" or "/rdsk".  Otherwise returns a kmem_zalloc()ed string that
+ * callers in this file release with kmem_free(s, strlen(s) + 1).
+ */
+char *
+devzvol_make_dsname(const char *path, const char *name)
+{
+	const char *rest;
+	char *buf;
+	int buflen;
+
+	if (strcmp(path, ZVOL_DIR) == 0)
+		return (NULL);
+	if (name != NULL &&
+	    (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
+		return (NULL);
+
+	/* step over the {r}dsk component and the slash that follows it */
+	rest = path + strlen(ZVOL_DIR);
+	if (strncmp(rest, "/dsk", 4) == 0) {
+		rest += strlen("/dsk");
+	} else if (strncmp(rest, "/rdsk", 5) == 0) {
+		rest += strlen("/rdsk");
+	} else {
+		return (NULL);
+	}
+	if (*rest == '/')
+		rest++;
+
+	/* "<rest>" with its terminator, and/or "<name>" plus one extra byte */
+	buflen = strlen(rest);
+	if (buflen != 0)
+		buflen++;
+	if (name != NULL)
+		buflen += strlen(name) + 1;
+	buf = kmem_zalloc(buflen, KM_SLEEP);
+
+	if (*rest != '\0') {
+		(void) strlcpy(buf, rest, buflen);
+		if (name != NULL)
+			(void) strlcat(buf, "/", buflen);
+	}
+	if (name != NULL)
+		(void) strlcat(buf, name, buflen);
+	return (buf);
+}
+
+/*
+ * Check if the zvol's sdev_node is still valid, which means make
+ * sure the zvol is still valid.  zvol minors aren't proactively
+ * destroyed when the zvol is destroyed, so we use a validator to clean
+ * these up (in other words, when such nodes are encountered during
+ * subsequent lookup() and readdir() operations) so that only valid
+ * nodes are returned.  The ordering between devname_lookup_func and
+ * devzvol_validate is a little inefficient in the case of invalid
+ * or stale nodes because devname_lookup_func calls
+ * devzvol_create_{dir, link}, then the validator says it's invalid,
+ * and then the node gets cleaned up.
+ *
+ * Returns one of:
+ *	SDEV_VTOR_SKIP		node is not in SDEV_READY state
+ *	SDEV_VTOR_VALID		node matches an existing objset
+ *	SDEV_VTOR_INVALID	no dataset name, or the objset is gone
+ *	SDEV_VTOR_STALE		node type disagrees with the objset type,
+ *				or a link's target cannot be read, cannot
+ *				be parsed, or names a different minor
+ */
+int
+devzvol_validate(struct sdev_node *dv)
+{
+	dmu_objset_type_t do_type;
+	enum vtype vtype;
+	char *dsname;
+	char *nm = dv->sdev_name;
+	int result = SDEV_VTOR_VALID;
+	int rc;
+
+	sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
+	/*
+	 * validate only READY nodes; if someone is sitting on the
+	 * directory of a dataset that just got destroyed we could
+	 * get a zombie node which we just skip.
+	 */
+	if (dv->sdev_state != SDEV_READY) {
+		sdcmn_err13(("skipping '%s'", nm));
+		return (SDEV_VTOR_SKIP);
+	}
+
+	if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
+	    (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
+		return (SDEV_VTOR_VALID);
+	dsname = devzvol_make_dsname(dv->sdev_path, NULL);
+	if (dsname == NULL)
+		return (SDEV_VTOR_INVALID);
+
+	rc = devzvol_objset_check(dsname, &do_type);
+	sdcmn_err13(("  '%s' rc %d", dsname, rc));
+	if (rc != 0) {
+		result = SDEV_VTOR_INVALID;
+		goto out;
+	}
+
+	vtype = SDEVTOV(dv)->v_type;
+	sdcmn_err13(("  v_type %d do_type %d", vtype, do_type));
+	if ((vtype == VLNK && do_type != DMU_OST_ZVOL) ||
+	    (vtype == VDIR && do_type == DMU_OST_ZVOL)) {
+		result = SDEV_VTOR_STALE;
+		goto out;
+	}
+
+	if (vtype == VLNK) {
+		char *link = NULL;
+		char *colon;
+		long val = 0;
+		minor_t ominor;
+
+		/*
+		 * The link target ends in ":<minor>".  A link that
+		 * cannot be read or parsed is reported as stale rather
+		 * than dereferencing a bad pointer on non-DEBUG kernels.
+		 */
+		if (sdev_getlink(SDEVTOV(dv), &link) != 0 || link == NULL) {
+			result = SDEV_VTOR_STALE;
+			goto out;
+		}
+		colon = strrchr(link, ':');
+		rc = (colon != NULL) ?
+		    ddi_strtol(colon + 1, NULL, 10, &val) : EINVAL;
+		kmem_free(link, strlen(link) + 1);
+
+		if (rc != 0 || val == 0 ||
+		    sdev_zvol_name2minor(dsname, &ominor) < 0 ||
+		    ominor != (minor_t)val)
+			result = SDEV_VTOR_STALE;
+	}
+out:
+	kmem_free(dsname, strlen(dsname) + 1);
+	return (result);
+}
+
+/*
+ * Creates directories as needed in response to a readdir of
+ * /dev/zvol/{r}dsk.
+ *
+ * Fetches the pool configuration list with ZFS_IOC_POOL_CONFIGS, using
+ * devzvol_gen as the cookie.  A return of 0 means a new list was
+ * handed back and replaces the cached one; EEXIST means the cached
+ * list is still current.  Each pool name in the list is then looked up
+ * under dvp.  If the list names no pools, the /dev/zfs handle is
+ * closed again so zfs can be unloaded.
+ */
+void
+devzvol_create_pool_dirs(struct vnode *dvp)
+{
+	zfs_cmd_t	*zc;
+	nvlist_t *nv = NULL;
+	nvpair_t *elem = NULL;
+	size_t size = 0;
+	int pools = 0;
+	int rc;
+
+	sdcmn_err13(("devzvol_create_pool_dirs"));
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	mutex_enter(&devzvol_mtx);
+	zc->zc_cookie = devzvol_gen;
+
+	rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
+	switch (rc) {
+		case 0:
+			/* new generation */
+			ASSERT(devzvol_gen != zc->zc_cookie);
+			devzvol_gen = zc->zc_cookie;
+			if (devzvol_zclist)
+				kmem_free((void *)(uintptr_t)devzvol_zclist,
+				    devzvol_zclist_size);
+			devzvol_zclist = zc->zc_nvlist_dst;
+			devzvol_zclist_size = size;
+			break;
+		case EEXIST:
+			/*
+			 * no change in the configuration; still need
+			 * to do lookups in case we did a lookup in
+			 * zvol/rdsk but not zvol/dsk (or vice versa)
+			 */
+			kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
+			    size);
+			break;
+		default:
+			/*
+			 * devzvol_handle_ioctl() returns ENXIO before
+			 * allocating a buffer or storing a size when
+			 * /dev/zfs cannot be opened, so only free a
+			 * buffer that was actually handed back.
+			 */
+			if (zc->zc_nvlist_dst != 0) {
+				kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst,
+				    size);
+			}
+			goto out;
+	}
+	rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
+	    devzvol_zclist_size, &nv, 0);
+	if (rc) {
+		ASSERT(rc == 0);
+		/* drop the unusable cache so the next call refetches it */
+		kmem_free((void *)(uintptr_t)devzvol_zclist,
+		    devzvol_zclist_size);
+		devzvol_gen = 0;
+		devzvol_zclist = 0;
+		devzvol_zclist_size = 0;
+		goto out;
+	}
+	/* the lookups below run without devzvol_mtx held */
+	mutex_exit(&devzvol_mtx);
+	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
+		struct vnode *vp;
+		ASSERT(dvp->v_count > 0);
+		rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
+		    NULL, kcred, NULL, 0, NULL);
+		/* should either work, or not be visible from a zone */
+		ASSERT(rc == 0 || rc == ENOENT);
+		if (rc == 0)
+			VN_RELE(vp);
+		pools++;
+	}
+	nvlist_free(nv);
+	mutex_enter(&devzvol_mtx);
+	if (pools == 0) {
+		/* clean up so zfs can be unloaded */
+		devzvol_close_zfs();
+		devzvol_isopen--;
+	}
+out:
+	mutex_exit(&devzvol_mtx);
+	kmem_free(zc, sizeof (zfs_cmd_t));
+}
+
+/*
+ * devname_lookup_func() callback used for directory nodes: fills the
+ * vattr passed in 'arg' with the default directory attributes and
+ * stamps its access, modify and change times with the current time.
+ * Always returns 0.
+ */
+/*ARGSUSED3*/
+static int
+devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
+    cred_t *cred, void *whatever, char *whichever)
+{
+	struct vattr *attrs = (struct vattr *)arg;
+	timestruc_t stamp;
+
+	sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
+	    ddv->sdev_path, nm));
+	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
+	    strlen(ZVOL_DIR)) == 0);
+
+	*attrs = *sdev_getdefault_attr(VDIR);
+	gethrestime(&stamp);
+	attrs->va_atime = attrs->va_mtime = attrs->va_ctime = stamp;
+	return (0);
+}
+
+/*
+ * devname_lookup_func() callback used for zvol nodes: makes sure a
+ * minor exists for the dataset named by ddv->sdev_path plus 'nm' and
+ * writes the relative symlink target for it into the buffer that *arg
+ * points to.  Returns 0 on success and -1 if no dataset name can be
+ * formed or no minor is available.
+ *
+ * NOTE(review): the bounded appends assume the caller's buffer holds
+ * at least MAXPATHLEN bytes, which is the bound the original strncat
+ * call used -- confirm against devname_lookup_func().
+ */
+/*ARGSUSED3*/
+static int
+devzvol_create_link(struct sdev_node *ddv, char *nm,
+    void **arg, cred_t *cred, void *whatever, char *whichever)
+{
+	minor_t minor;
+	char *pathname = (char *)*arg;
+	int rc;
+	char *dsname;
+	char *x;
+	char str[MAXNAMELEN];
+
+	sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
+	    ddv->sdev_path, nm));
+	dsname = devzvol_make_dsname(ddv->sdev_path, nm);
+	if (dsname == NULL) {
+		/* not a path that maps to a dataset */
+		return (-1);
+	}
+	rc = sdev_zvol_create_minor(dsname);
+	if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
+	    sdev_zvol_name2minor(dsname, &minor)) {
+		sdcmn_err13(("devzvol_create_link %d", rc));
+		kmem_free(dsname, strlen(dsname) + 1);
+		return (-1);
+	}
+	kmem_free(dsname, strlen(dsname) + 1);
+
+	/*
+	 * This is a valid zvol; create a symlink that points to the
+	 * minor which was created under /devices/pseudo/zfs@0
+	 */
+	*pathname = '\0';
+	/* one "../" for every '/' in the parent directory's path */
+	for (x = ddv->sdev_path; (x = strchr(x, '/')) != NULL; x++)
+		(void) strlcat(pathname, "../", MAXPATHLEN);
+	(void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
+	(void) strlcat(pathname, str, MAXPATHLEN);
+	if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
+	    strlen(ZVOL_FULL_RDEV_DIR)) == 0)
+		(void) strlcat(pathname, ",raw", MAXPATHLEN);
+	return (0);
+}
+
+/*
+ * Clean zvol sdev_nodes that are no longer valid.
+ *
+ * Entered with ddv->sdev_contents held as reader.  The lock is upgraded
+ * to writer for the duration of the scan (dropped and re-taken if the
+ * in-place upgrade fails) and downgraded back to reader before
+ * returning.
+ */
+static void
+devzvol_prunedir(struct sdev_node *ddv)
+{
+	struct sdev_node *dv;
+
+	ASSERT(RW_READ_HELD(&ddv->sdev_contents));
+
+	sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
+	ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+	if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
+		rw_exit(&ddv->sdev_contents);
+		rw_enter(&ddv->sdev_contents, RW_WRITER);
+	}
+
+	dv = SDEV_FIRST_ENTRY(ddv);
+	while (dv) {
+		sdcmn_err13(("sdev_name '%s'", dv->sdev_name));
+		/* skip stale nodes */
+		if (dv->sdev_flags & SDEV_STALE) {
+			sdcmn_err13(("  stale"));
+			dv = SDEV_NEXT_ENTRY(ddv, dv);
+			continue;
+		}
+
+		/*
+		 * VALID and SKIP nodes are kept.  Any other result,
+		 * including SDEV_VTOR_STALE which has no case here,
+		 * leaves the switch and goes on to the removal code.
+		 */
+		switch (devzvol_validate(dv)) {
+		case SDEV_VTOR_VALID:
+		case SDEV_VTOR_SKIP:
+			dv = SDEV_NEXT_ENTRY(ddv, dv);
+			continue;
+		case SDEV_VTOR_INVALID:
+			sdcmn_err7(("prunedir: destroy invalid "
+			    "node: %s\n", dv->sdev_name));
+			break;
+		}
+
+		/* a directory that cannot be emptied is left in place */
+		if ((SDEVTOV(dv)->v_type == VDIR) &&
+		    (sdev_cleandir(dv, NULL, 0) != 0)) {
+			dv = SDEV_NEXT_ENTRY(ddv, dv);
+			continue;
+		}
+		SDEV_HOLD(dv);
+		/* remove the cache node */
+		/* after a successful delete the scan restarts from the top */
+		if (sdev_cache_update(ddv, &dv, dv->sdev_name,
+		    SDEV_CACHE_DELETE) == 0)
+			dv = SDEV_FIRST_ENTRY(ddv);
+		else
+			dv = SDEV_NEXT_ENTRY(ddv, dv);
+	}
+	rw_downgrade(&ddv->sdev_contents);
+}
+
+/*
+ * Lookup vnode op for the /dev/zvol tree.
+ *
+ * Requires execute access on dvp.  Non-global parents are handed to
+ * prof_lookup().  Otherwise the dataset name for (parent, nm) is
+ * derived; if there is one and no such objset exists the result is
+ * ENOENT.  A zvol objset is looked up as a symlink, everything else as
+ * a directory.
+ */
+/*ARGSUSED*/
+static int
+devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
+    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
+    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
+{
+	struct sdev_node *parent = VTOSDEV(dvp);
+	enum vtype expected_type = VDIR;
+	dmu_objset_type_t do_type;
+	char *dsname;
+	int error;
+
+	sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
+	*vpp = NULL;
+
+	/* execute access is required to search the directory */
+	error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct);
+	if (error != 0)
+		return (error);
+
+	rw_enter(&parent->sdev_contents, RW_READER);
+	if (!SDEV_IS_GLOBAL(parent)) {
+		rw_exit(&parent->sdev_contents);
+		return (prof_lookup(dvp, nm, vpp, cred));
+	}
+	dsname = devzvol_make_dsname(parent->sdev_path, nm);
+	rw_exit(&parent->sdev_contents);
+
+	sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
+	if (dsname != NULL) {
+		error = devzvol_objset_check(dsname, &do_type);
+		if (error != 0) {
+			error = ENOENT;
+			goto out;
+		}
+		if (do_type == DMU_OST_ZVOL)
+			expected_type = VLNK;
+	}
+
+	/*
+	 * the callbacks expect:
+	 *
+	 * parent->sdev_path		   nm
+	 * /dev/zvol			   {r}dsk
+	 * /dev/zvol/{r}dsk		   <pool name>
+	 * /dev/zvol/{r}dsk/<dataset name> <last ds component>
+	 *
+	 * sdev_name is always last path component of sdev_path
+	 */
+	if (expected_type == VLNK) {
+		error = devname_lookup_func(parent, nm, vpp, cred,
+		    devzvol_create_link, SDEV_VLINK);
+	} else {
+		error = devname_lookup_func(parent, nm, vpp, cred,
+		    devzvol_create_dir, SDEV_VATTR);
+	}
+	sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
+	ASSERT(error || ((*vpp)->v_type == expected_type));
+out:
+	if (dsname != NULL)
+		kmem_free(dsname, strlen(dsname) + 1);
+	sdcmn_err13(("devzvol_lookup %d", error));
+	return (error);
+}
+
+/*
+ * Create vnode op: nothing is ever created here, but an existing node
+ * may be found.
+ *	- the node doesn't exist: EROFS
+ *	- exclusive create of an existing node: EEXIST
+ *	- existing directory requested with VWRITE: EISDIR
+ *	- otherwise the result of VOP_ACCESS() on the node for 'mode'
+ * On success *vpp carries the held vnode; on failure it is NULL.
+ */
+/*ARGSUSED2*/
+static int
+devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
+    int mode, struct vnode **vpp, struct cred *cred, int flag,
+    caller_context_t *ct, vsecattr_t *vsecp)
+{
+	struct vnode *found;
+	int error;
+
+	*vpp = NULL;
+
+	error = devzvol_lookup(dvp, nm, &found, NULL, 0, NULL, cred, ct, NULL,
+	    NULL);
+	if (error != 0)
+		return (error == ENOENT ? EROFS : error);
+
+	if (excl == EXCL) {
+		error = EEXIST;
+	} else if (found->v_type == VDIR && (mode & VWRITE)) {
+		error = EISDIR;
+	} else {
+		error = VOP_ACCESS(found, mode, 0, cred, ct);
+	}
+
+	if (error != 0) {
+		/* drop the hold taken by the lookup */
+		VN_RELE(found);
+		return (error);
+	}
+	*vpp = found;
+	return (0);
+}
+
+void sdev_iter_snapshots(struct vnode *dvp, char *name);
+
+/*
+ * Iterate over the entries that the list ioctl 'arg' returns for the
+ * dataset 'name' (ZFS_IOC_DATASET_LIST_NEXT or
+ * ZFS_IOC_SNAPSHOT_LIST_NEXT) and look each one up under dvp by its
+ * last path component.  Names containing '$' or '%' are skipped.  When
+ * listing datasets, any entry whose type is not DMU_OST_ZFS also has
+ * its snapshots walked.  The loop ends on the first non-zero ioctl
+ * return.
+ */
+void
+sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
+{
+	zfs_cmd_t	*zc;
+	int rc;
+
+	sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
+	zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+	(void) strlcpy(zc->zc_name, name, MAXPATHLEN);
+
+	while ((rc = devzvol_handle_ioctl(arg, zc, NULL)) == 0) {
+		struct vnode *vpp;
+		char *ptr;
+
+		sdcmn_err13(("  name %s", zc->zc_name));
+		if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
+			goto skip;
+		/* only the component after the last '/' is looked up */
+		ptr = strrchr(zc->zc_name, '/');
+		if (ptr == NULL)
+			goto skip;
+		ptr++;
+		rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
+		    kcred, NULL, NULL, NULL);
+		if (rc == 0) {
+			VN_RELE(vpp);
+		} else if (rc == ENOENT) {
+			goto skip;
+		} else {
+			/* EBUSY == problem with zvols's dmu holds? */
+			ASSERT(0);
+			goto skip;
+		}
+		if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
+		    zc->zc_objset_stats.dds_type != DMU_OST_ZFS)
+			sdev_iter_snapshots(dvp, zc->zc_name);
+skip:
+		/* the ioctl overwrote zc_name; restore the parent name */
+		(void) strlcpy(zc->zc_name, name, MAXPATHLEN);
+	}
+	kmem_free(zc, sizeof (zfs_cmd_t));
+}
+
+/*
+ * Walk the snapshots of the dataset 'name' by running
+ * sdev_iter_datasets() with ZFS_IOC_SNAPSHOT_LIST_NEXT.
+ */
+void
+sdev_iter_snapshots(struct vnode *dvp, char *name)
+{
+	sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
+}
+
+/*
+ * Readdir vnode op for the /dev/zvol tree.  Before handing off to
+ * devname_readdir_func() the directory is populated according to its
+ * level:
+ *	/dev/zvol		look up "dsk" and "rdsk"
+ *	/dev/zvol/{r}dsk	look up every pool
+ *	anything deeper		look up every child dataset
+ * Below /dev/zvol, nodes that are no longer valid are pruned first when
+ * the read starts at offset 0.
+ *
+ * sdev_contents is released around each populating step and re-taken
+ * as reader before devname_readdir_func() is called.
+ */
+/*ARGSUSED4*/
+static int
+devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
+    int *eofp, caller_context_t *ct_unused, int flags_unused)
+{
+	struct sdev_node *sdvp = VTOSDEV(dvp);
+	char *ptr;
+
+	sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
+	    sdvp->sdev_name));
+
+	if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
+		struct vnode *vp;
+
+		rw_exit(&sdvp->sdev_contents);
+		/* vp is only valid, and only held, when the lookup works */
+		if (devname_lookup_func(sdvp, "dsk", &vp, cred,
+		    devzvol_create_dir, SDEV_VATTR) == 0)
+			VN_RELE(vp);
+		if (devname_lookup_func(sdvp, "rdsk", &vp, cred,
+		    devzvol_create_dir, SDEV_VATTR) == 0)
+			VN_RELE(vp);
+		rw_enter(&sdvp->sdev_contents, RW_READER);
+		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+	}
+	if (uiop->uio_offset == 0)
+		devzvol_prunedir(sdvp);
+	ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
+	if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
+		rw_exit(&sdvp->sdev_contents);
+		devzvol_create_pool_dirs(dvp);
+		rw_enter(&sdvp->sdev_contents, RW_READER);
+		return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+	}
+
+	/* step past "/{r}dsk/" to reach the dataset name */
+	ptr = strchr(ptr + 1, '/') + 1;
+	rw_exit(&sdvp->sdev_contents);
+	sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
+	rw_enter(&sdvp->sdev_contents, RW_READER);
+	return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
+}
+
+/*
+ * vnode operation table for the /dev/zvol tree.  readdir, lookup and
+ * create are implemented in this file; rename, mkdir, rmdir, remove
+ * and symlink are routed to fs_nosys.
+ */
+const fs_operation_def_t devzvol_vnodeops_tbl[] = {
+	VOPNAME_READDIR,	{ .vop_readdir = devzvol_readdir },
+	VOPNAME_LOOKUP,		{ .vop_lookup = devzvol_lookup },
+	VOPNAME_CREATE,		{ .vop_create = devzvol_create },
+	VOPNAME_RENAME,		{ .error = fs_nosys },
+	VOPNAME_MKDIR,		{ .error = fs_nosys },
+	VOPNAME_RMDIR,		{ .error = fs_nosys },
+	VOPNAME_REMOVE,		{ .error = fs_nosys },
+	VOPNAME_SYMLINK,	{ .error = fs_nosys },
+	NULL,			NULL
+};
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Sep 21 08:55:28 2009 -0600
@@ -389,12 +389,13 @@
 	err = dmu_objset_from_ds(ds, osp);
 	if (err) {
 		dsl_dataset_disown(ds, tag);
-	} else if ((type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) ||
-	    (!readonly && dsl_dataset_is_snapshot(ds))) {
+	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 		dmu_objset_disown(*osp, tag);
 		return (EINVAL);
+	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+		dmu_objset_disown(*osp, tag);
+		return (EROFS);
 	}
-
 	return (err);
 }
 
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Sep 21 08:55:28 2009 -0600
@@ -945,24 +945,6 @@
 	return (might_destroy);
 }
 
-#ifdef _KERNEL
-static int
-dsl_dataset_zvol_cleanup(dsl_dataset_t *ds, const char *name)
-{
-	int error;
-	objset_t *os;
-
-	error = dmu_objset_from_ds(ds, &os);
-	if (error)
-		return (error);
-
-	if (dmu_objset_type(os) == DMU_OST_ZVOL)
-		error = zvol_remove_minor(name);
-
-	return (error);
-}
-#endif
-
 /*
  * If we're removing a clone, and these three conditions are true:
  *	1) the clone's origin has no other children
@@ -990,11 +972,6 @@
 			kmem_free(name, namelen);
 			return (error);
 		}
-		error = dsl_dataset_zvol_cleanup(origin, name);
-		if (error) {
-			kmem_free(name, namelen);
-			return (error);
-		}
 #endif
 		error = dsl_dataset_own(name, B_TRUE, tag, &origin);
 		kmem_free(name, namelen);
@@ -2324,7 +2301,7 @@
 		return (err);
 	}
 	if (tail[0] != '@') {
-		/* the name ended in a nonexistant component */
+		/* the name ended in a nonexistent component */
 		dsl_dir_close(dd, FTAG);
 		return (ENOENT);
 	}
@@ -2365,6 +2342,7 @@
 	list_t shared_snaps, origin_snaps, clone_snaps;
 	dsl_dataset_t *origin_origin, *origin_head;
 	uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
+	char *err_ds;
 };
 
 static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -2424,10 +2402,12 @@
 		/* Check that the snapshot name does not conflict */
 		VERIFY(0 == dsl_dataset_get_snapname(ds));
 		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
-		if (err == 0)
-			return (EEXIST);
+		if (err == 0) {
+			err = EEXIST;
+			goto out;
+		}
 		if (err != ENOENT)
-			return (err);
+			goto out;
 
 		/* The very first snapshot does not have a deadlist */
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
@@ -2435,7 +2415,7 @@
 
 		if (err = bplist_space(&ds->ds_deadlist,
 		    &dlused, &dlcomp, &dluncomp))
-			return (err);
+			goto out;
 		pa->used += dlused;
 		pa->comp += dlcomp;
 		pa->uncomp += dluncomp;
@@ -2493,6 +2473,9 @@
 	}
 
 	return (0);
+out:
+	pa->err_ds =  snap->ds->ds_snapname;
+	return (err);
 }
 
 static void
@@ -2707,7 +2690,7 @@
  * NULL, indicating that the clone is not a clone of a clone).
  */
 int
-dsl_dataset_promote(const char *name)
+dsl_dataset_promote(const char *name, char *conflsnap)
 {
 	dsl_dataset_t *ds;
 	dsl_dir_t *dd;
@@ -2779,6 +2762,8 @@
 		err = dsl_sync_task_do(dp, dsl_dataset_promote_check,
 		    dsl_dataset_promote_sync, ds, &pa,
 		    2 + 2 * doi.doi_physical_blks);
+		if (err && pa.err_ds && conflsnap)
+			(void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN);
 	}
 
 	snaplist_destroy(&pa.shared_snaps, B_TRUE);
@@ -3533,11 +3518,6 @@
 			dsl_dataset_rele(ds, dtag);
 			return (error);
 		}
-		error = dsl_dataset_zvol_cleanup(ds, name);
-		if (error) {
-			dsl_dataset_rele(ds, dtag);
-			return (error);
-		}
 #endif
 		if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) {
 			dsl_dataset_rele(ds, dtag);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Sep 21 08:55:28 2009 -0600
@@ -1280,6 +1280,24 @@
 	return (spa->spa_ubsync.ub_version);
 }
 
+/*
+ * Report whether any vdev of this pool sits on a zvol, checked under
+ * the SCL_STATE_ALL config lock held as reader.
+ *
+ * If there is a pool on top of zvols, there can be a situation where
+ * a second vdev_set_state ioctl can come in (grabbing the pool's config
+ * lock and then calling into the zvol's pool) before the config has
+ * synced out from a previous vdev_set_state ioctl, resulting in
+ * deadlock.
+ */
+boolean_t
+spa_uses_zvols(spa_t *spa)
+{
+	boolean_t uses_zvols;
+
+	spa_config_enter(spa, SCL_STATE_ALL, spa, RW_READER);
+	uses_zvols = vdev_uses_zvols(spa->spa_root_vdev);
+	spa_config_exit(spa, SCL_STATE_ALL, spa);
+
+	return (uses_zvols);
+}
+
 int
 spa_max_replication(spa_t *spa)
 {
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Sep 21 08:55:28 2009 -0600
@@ -193,7 +193,7 @@
 dsl_checkfunc_t dsl_dataset_snapshot_check;
 dsl_syncfunc_t dsl_dataset_snapshot_sync;
 int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive);
-int dsl_dataset_promote(const char *name);
+int dsl_dataset_promote(const char *name, char *conflsnap);
 int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head,
     boolean_t force);
 int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag,
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Sep 21 08:55:28 2009 -0600
@@ -448,6 +448,7 @@
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
+extern boolean_t spa_uses_zvols(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_alloc(spa_t *spa);
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Sep 21 08:55:28 2009 -0600
@@ -47,7 +47,8 @@
 extern boolean_t zfs_nocacheflush;
 
 extern int vdev_open(vdev_t *);
-extern void vdev_open_children(vdev_t *vd);
+extern void vdev_open_children(vdev_t *);
+extern boolean_t vdev_uses_zvols(vdev_t *);
 extern int vdev_validate(vdev_t *);
 extern void vdev_close(vdev_t *);
 extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
--- a/usr/src/uts/common/fs/zfs/sys/zvol.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/sys/zvol.h	Mon Sep 21 08:55:28 2009 -0600
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_ZVOL_H
 #define	_SYS_ZVOL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
@@ -43,10 +41,10 @@
 extern int zvol_check_volblocksize(uint64_t volblocksize);
 extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
 extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
-extern int zvol_create_minor(const char *, major_t);
+extern int zvol_create_minor(const char *);
 extern int zvol_remove_minor(const char *);
+extern void zvol_remove_minors(const char *);
 extern int zvol_set_volsize(const char *, major_t, uint64_t);
-extern int zvol_set_volblocksize(const char *, uint64_t);
 
 extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
 extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
--- a/usr/src/uts/common/fs/zfs/vdev.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Sep 21 08:55:28 2009 -0600
@@ -1007,12 +1007,35 @@
 	vd->vdev_open_thread = NULL;
 }
 
+/*
+ * Return B_TRUE if this vdev, or any vdev below it, has a path that
+ * starts with ZVOL_DIR; B_FALSE otherwise.
+ */
+boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+	int c;
+
+	if (vd->vdev_path != NULL &&
+	    strncmp(vd->vdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0)
+		return (B_TRUE);
+
+	/* otherwise the answer comes from the children, depth first */
+	for (c = 0; c < vd->vdev_children; c++) {
+		if (vdev_uses_zvols(vd->vdev_child[c]))
+			return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
 void
 vdev_open_children(vdev_t *vd)
 {
 	taskq_t *tq;
 	int children = vd->vdev_children;
 
+	/*
+	 * in order to handle pools on top of zvols, do the opens
+	 * in a single thread so that the same thread holds the
+	 * spa_namespace_lock
+	 */
+	if (vdev_uses_zvols(vd)) {
+		for (int c = 0; c < children; c++)
+			vd->vdev_child[c]->vdev_open_error =
+			    vdev_open(vd->vdev_child[c]);
+		return;
+	}
 	tq = taskq_create("vdev_open", children, minclsyspri,
 	    children, children, TASKQ_PREPOPULATE);
 
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Sep 21 08:55:28 2009 -0600
@@ -382,13 +382,8 @@
 int
 zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr)
 {
-	int error;
-	error = zfs_secpolicy_write_perms(zc->zc_name,
-	    ZFS_DELEG_PERM_ROLLBACK, cr);
-	if (error == 0)
-		error = zfs_secpolicy_write_perms(zc->zc_name,
-		    ZFS_DELEG_PERM_MOUNT, cr);
-	return (error);
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_ROLLBACK, cr));
 }
 
 int
@@ -594,16 +589,8 @@
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
-	int error;
-
-	if ((error = zfs_secpolicy_write_perms(name,
-	    ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0)
-		return (error);
-
-	error = zfs_secpolicy_write_perms(name,
-	    ZFS_DELEG_PERM_MOUNT, cr);
-
-	return (error);
+	return (zfs_secpolicy_write_perms(name,
+	    ZFS_DELEG_PERM_SNAPSHOT, cr));
 }
 
 static int
@@ -666,22 +653,6 @@
 }
 
 /*
- * Just like zfs_secpolicy_config, except that we will check for
- * mount permission on the dataset for permission to create/remove
- * the minor nodes.
- */
-static int
-zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr)
-{
-	if (secpolicy_sys_config(cr, B_FALSE) != 0) {
-		return (dsl_deleg_access(zc->zc_name,
-		    ZFS_DELEG_PERM_MOUNT, cr));
-	}
-
-	return (0);
-}
-
-/*
  * Policy for fault injection.  Requires all privileges.
  */
 /* ARGSUSED */
@@ -971,6 +942,8 @@
 	int error;
 	zfs_log_history(zc);
 	error = spa_destroy(zc->zc_name);
+	if (error == 0)
+		zvol_remove_minors(zc->zc_name);
 	return (error);
 }
 
@@ -1018,6 +991,8 @@
 
 	zfs_log_history(zc);
 	error = spa_export(zc->zc_name, NULL, force, hardforce);
+	if (error == 0)
+		zvol_remove_minors(zc->zc_name);
 	return (error);
 }
 
@@ -1272,12 +1247,16 @@
 static int
 zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
 {
+	boolean_t nslock;
 	spa_t *spa;
 	int error;
 	vdev_state_t newstate = VDEV_STATE_UNKNOWN;
 
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
+	nslock = spa_uses_zvols(spa);
+	if (nslock)
+		mutex_enter(&spa_namespace_lock);
 	switch (zc->zc_cookie) {
 	case VDEV_STATE_ONLINE:
 		error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
@@ -1298,6 +1277,8 @@
 	default:
 		error = EINVAL;
 	}
+	if (nslock)
+		mutex_exit(&spa_namespace_lock);
 	zc->zc_cookie = newstate;
 	spa_close(spa, FTAG);
 	return (error);
@@ -1544,12 +1525,16 @@
 		    NULL, &zc->zc_cookie);
 		if (error == ENOENT)
 			error = ESRCH;
-	} while (error == 0 && dataset_name_hidden(zc->zc_name));
+	} while (error == 0 && dataset_name_hidden(zc->zc_name) &&
+	    !(zc->zc_iflags & FKIOCTL));
 	dmu_objset_rele(os, FTAG);
 
-	if (error == 0)
+	/*
+	 * If it's an internal dataset (ie. with a '$' in its name),
+	 * don't try to get stats for it, otherwise we'll return ENOENT.
+	 */
+	if (error == 0 && strchr(zc->zc_name, '$') == NULL)
 		error = zfs_ioc_objset_stats(zc); /* fill in the stats */
-
 	return (error);
 }
 
@@ -1789,12 +1774,6 @@
 				goto out;
 			break;
 
-		case ZFS_PROP_VOLBLOCKSIZE:
-			if ((error = nvpair_value_uint64(elem, &intval)) != 0 ||
-			    (error = zvol_set_volblocksize(name, intval)) != 0)
-				goto out;
-			break;
-
 		case ZFS_PROP_VERSION:
 		{
 			zfsvfs_t *zfsvfs;
@@ -2140,30 +2119,6 @@
 }
 
 /*
- * inputs:
- * zc_name		name of volume
- *
- * outputs:		none
- */
-static int
-zfs_ioc_create_minor(zfs_cmd_t *zc)
-{
-	return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip)));
-}
-
-/*
- * inputs:
- * zc_name		name of volume
- *
- * outputs:		none
- */
-static int
-zfs_ioc_remove_minor(zfs_cmd_t *zc)
-{
-	return (zvol_remove_minor(zc->zc_name));
-}
-
-/*
  * Search the vfs list for a specified resource.  Returns a pointer to it
  * or NULL if no suitable entry is found. The caller of this routine
  * is responsible for releasing the returned vfs pointer.
@@ -2494,7 +2449,8 @@
  * zc_cookie	recursive flag
  * zc_nvlist_src[_size] property list
  *
- * outputs:	none
+ * outputs:
+ * zc_value	short snapname (i.e. part after the '@')
  */
 static int
 zfs_ioc_snapshot(zfs_cmd_t *zc)
@@ -2600,13 +2556,17 @@
 static int
 zfs_ioc_destroy(zfs_cmd_t *zc)
 {
+	int err;
 	if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) {
-		int err = zfs_unmount_snap(zc->zc_name, NULL);
+		err = zfs_unmount_snap(zc->zc_name, NULL);
 		if (err)
 			return (err);
 	}
 
-	return (dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy));
+	err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy);
+	if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+		zvol_remove_minor(zc->zc_name);
+	return (err);
 }
 
 /*
@@ -2722,6 +2682,8 @@
 		if (err)
 			return (err);
 	}
+	if (zc->zc_objset_type == DMU_OST_ZVOL)
+		(void) zvol_remove_minor(zc->zc_name);
 	return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive));
 }
 
@@ -3052,7 +3014,8 @@
  * zc_name	name of filesystem
  * zc_value	name of origin snapshot
  *
- * outputs:	none
+ * outputs:
+ * zc_string	name of conflicting snapshot, if there is one
  */
 static int
 zfs_ioc_promote(zfs_cmd_t *zc)
@@ -3068,7 +3031,7 @@
 		*cp = '\0';
 	(void) dmu_objset_find(zc->zc_value,
 	    zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS);
-	return (dsl_dataset_promote(zc->zc_name));
+	return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
 }
 
 /*
@@ -3583,10 +3546,6 @@
 	{ zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE,
 	    B_FALSE },
 	{ zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE },
-	{ zfs_ioc_create_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
-	    B_FALSE },
-	{ zfs_ioc_remove_minor,	zfs_secpolicy_minor, DATASET_NAME, B_FALSE,
-	    B_FALSE },
 	{ zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE },
 	{ zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE,
 	    B_TRUE},
@@ -3679,7 +3638,7 @@
 
 	error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
 
-	if (error == 0)
+	if ((error == 0) && !(flag & FKIOCTL))
 		error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr);
 
 	/*
--- a/usr/src/uts/common/fs/zfs/zvol.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Mon Sep 21 08:55:28 2009 -0600
@@ -32,7 +32,7 @@
  * /dev/zvol/dsk/<pool_name>/<dataset_name>
  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
  *
- * These links are created by the ZFS-specific devfsadm link generator.
+ * These links are created by the /dev filesystem (sdev_zvolops.c).
  * Volumes are persistent through reboot.  No user command needs to be
  * run before opening and using a device.
  */
@@ -110,7 +110,6 @@
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
 	objset_t	*zv_objset;	/* objset handle */
-	boolean_t 	zv_issnap;	/* is a snapshot (read-only) */
 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
 	uint32_t	zv_total_opens;	/* total open count */
 	zilog_t		*zv_zilog;	/* ZIL handle */
@@ -131,6 +130,7 @@
  */
 int zvol_maxphys = DMU_MAX_ACCESS/2;
 
+static int zvol_remove_zv(zvol_state_t *);
 extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
 static int zvol_dumpify(zvol_state_t *zv);
@@ -138,14 +138,14 @@
 static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
 
 static void
-zvol_size_changed(zvol_state_t *zv, major_t maj)
+zvol_size_changed(uint64_t volsize, major_t maj, minor_t min)
 {
-	dev_t dev = makedevice(maj, zv->zv_minor);
+	dev_t dev = makedevice(maj, min);
 
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
-	    "Size", zv->zv_volsize) == DDI_SUCCESS);
+	    "Size", volsize) == DDI_SUCCESS);
 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
-	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
+	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);
 
 	/* Notify specfs to invalidate the cached size */
 	spec_size_invalidate(dev, VBLK);
@@ -179,17 +179,6 @@
 	return (0);
 }
 
-static void
-zvol_readonly_changed_cb(void *arg, uint64_t newval)
-{
-	zvol_state_t *zv = arg;
-
-	if (newval)
-		zv->zv_flags |= ZVOL_RDONLY;
-	else
-		zv->zv_flags &= ~ZVOL_RDONLY;
-}
-
 int
 zvol_get_stats(objset_t *os, nvlist_t *nv)
 {
@@ -197,7 +186,6 @@
 	dmu_object_info_t doi;
 	uint64_t val;
 
-
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
 	if (error)
 		return (error);
@@ -421,20 +409,29 @@
 	zvol_replay_err,	/* TX_ACL */
 };
 
+int
+zvol_name2minor(const char *name, minor_t *minor)
+{
+	zvol_state_t *zv;	/* NULL when 'name' has no minor */
+
+	mutex_enter(&zvol_state_lock);
+	zv = zvol_minor_lookup(name);	/* find the zvol state by name */
+	if (minor && zv)		/* 'minor' may be NULL */
+		*minor = zv->zv_minor;
+	mutex_exit(&zvol_state_lock);
+	return (zv ? 0 : -1);		/* 0: found, -1: no such minor */
+}
+
 /*
  * Create a minor node (plus a whole lot more) for the specified volume.
  */
 int
-zvol_create_minor(const char *name, major_t maj)
+zvol_create_minor(const char *name)
 {
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t doi;
-	uint64_t volsize;
 	minor_t minor = 0;
-	struct pathname linkpath;
-	vnode_t *vp = NULL;
-	char *devpath;
 	char chrbuf[30], blkbuf[30];
 	int error;
 
@@ -453,51 +450,7 @@
 		return (error);
 	}
 
-	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
-
-	if (error) {
-		dmu_objset_disown(os, zvol_tag);
-		mutex_exit(&zvol_state_lock);
-		return (error);
-	}
-
-	/*
-	 * If there's an existing /dev/zvol symlink, try to use the
-	 * same minor number we used last time.
-	 */
-	devpath = kmem_asprintf("%s%s", ZVOL_FULL_DEV_DIR, name);
-	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
-	strfree(devpath);
-
-	if (error == 0 && vp->v_type != VLNK)
-		error = EINVAL;
-
-	if (error == 0) {
-		pn_alloc(&linkpath);
-		error = pn_getsymlink(vp, &linkpath, kcred);
-		if (error == 0) {
-			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
-			if (ms != NULL) {
-				ms += strlen(ZVOL_PSEUDO_DEV);
-				minor = stoi(&ms);
-			}
-		}
-		pn_free(&linkpath);
-	}
-
-	if (vp != NULL)
-		VN_RELE(vp);
-
-	/*
-	 * If we found a minor but it's already in use, we must pick a new one.
-	 */
-	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
-		minor = 0;
-
-	if (minor == 0)
-		minor = zvol_minor_alloc();
-
-	if (minor == 0) {
+	if ((minor = zvol_minor_alloc()) == 0) {
 		dmu_objset_disown(os, zvol_tag);
 		mutex_exit(&zvol_state_lock);
 		return (ENXIO);
@@ -508,11 +461,10 @@
 		mutex_exit(&zvol_state_lock);
 		return (EAGAIN);
 	}
-
 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
 	    (char *)name);
 
-	(void) sprintf(chrbuf, "%uc,raw", minor);
+	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);
 
 	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
@@ -522,7 +474,7 @@
 		return (EAGAIN);
 	}
 
-	(void) sprintf(blkbuf, "%uc", minor);
+	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);
 
 	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
@@ -535,13 +487,12 @@
 
 	zv = ddi_get_soft_state(zvol_state, minor);
 
-	(void) strcpy(zv->zv_name, name);
+	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
 	zv->zv_min_bs = DEV_BSHIFT;
 	zv->zv_minor = minor;
-	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
-	zv->zv_issnap = dmu_objset_is_snapshot(os);
-	zv->zv_zilog = zil_open(os, zvol_get_data);
+	if (dmu_objset_is_snapshot(os))
+		zv->zv_flags |= ZVOL_RDONLY;
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
@@ -553,11 +504,8 @@
 	zv->zv_volblocksize = doi.doi_data_block_size;
 
 	zil_replay(os, zv, zvol_replay_vector);
-	zvol_size_changed(zv, maj);
-
-	/* XXX this should handle the possible i/o error */
-	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
-	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+	dmu_objset_disown(os, zvol_tag);
+	zv->zv_objset = NULL;
 
 	zvol_minors++;
 
@@ -569,47 +517,88 @@
 /*
  * Remove minor node for the specified volume.
  */
-int
-zvol_remove_minor(const char *name)
+static int
+zvol_remove_zv(zvol_state_t *zv)
 {
-	zvol_state_t *zv;
-	char namebuf[30];
-
-	mutex_enter(&zvol_state_lock);
+	char nmbuf[20];		/* minor node name: "<minor>[,raw]" */
 
-	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		mutex_exit(&zvol_state_lock);
-		return (ENXIO);
-	}
-
-	if (zv->zv_total_opens != 0) {
-		mutex_exit(&zvol_state_lock);
+	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	if (zv->zv_total_opens != 0)	/* refuse while still open */
 		return (EBUSY);
-	}
 
-	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
-	ddi_remove_minor_node(zfs_dip, namebuf);
-
-	(void) sprintf(namebuf, "%uc", zv->zv_minor);
-	ddi_remove_minor_node(zfs_dip, namebuf);
+	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, nmbuf);	/* character (raw) node */
 
-	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
-	    "readonly", zvol_readonly_changed_cb, zv) == 0);
+	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor);
+	ddi_remove_minor_node(zfs_dip, nmbuf);	/* block node */
 
-	zil_close(zv->zv_zilog);
-	zv->zv_zilog = NULL;
-	dmu_objset_disown(zv->zv_objset, zvol_tag);
-	zv->zv_objset = NULL;
 	avl_destroy(&zv->zv_znode.z_range_avl);
 	mutex_destroy(&zv->zv_znode.z_range_lock);
 
 	ddi_soft_state_free(zvol_state, zv->zv_minor);
 
 	zvol_minors--;
+	return (0);
+}
 
+int
+zvol_remove_minor(const char *name)
+{
+	zvol_state_t *zv;
+	int rc;			/* result of zvol_remove_zv() */
+
+	mutex_enter(&zvol_state_lock);
+	if ((zv = zvol_minor_lookup(name)) == NULL) {
+		mutex_exit(&zvol_state_lock);
+		return (ENXIO);		/* no minor for this name */
+	}
+	rc = zvol_remove_zv(zv);	/* EBUSY if the zvol is open */
 	mutex_exit(&zvol_state_lock);
+	return (rc);
+}
 
-	return (0);
+int
+zvol_first_open(zvol_state_t *zv)
+{
+	objset_t *os;
+	uint64_t volsize;	/* "size" entry read from ZVOL_ZAP_OBJ */
+	int error;
+	uint64_t readonly;	/* value of the "readonly" property */
+
+	/* lie and say we're read-only */
+	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
+	    zvol_tag, &os);
+	if (error)
+		return (error);
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error) {
+		ASSERT(error == 0);	/* NOTE(review): always false here */
+		dmu_objset_disown(os, zvol_tag);	/* undo the own above */
+		return (error);
+	}
+	zv->zv_objset = os;		/* kept until zvol_last_close() */
+	zv->zv_volsize = volsize;
+	zv->zv_zilog = zil_open(os, zvol_get_data);
+	zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip),
+	    zv->zv_minor);
+
+	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
+	    NULL) == 0);
+	if (readonly || dmu_objset_is_snapshot(os))
+		zv->zv_flags |= ZVOL_RDONLY;
+	else
+		zv->zv_flags &= ~ZVOL_RDONLY;
+	return (error);			/* error is 0 here */
+}
+
+void
+zvol_last_close(zvol_state_t *zv)
+{
+	zil_close(zv->zv_zilog);	/* opened in zvol_first_open() */
+	zv->zv_zilog = NULL;
+	dmu_objset_disown(zv->zv_objset, zvol_tag);	/* drop ownership */
+	zv->zv_objset = NULL;
 }
 
 int
@@ -652,14 +641,14 @@
 }
 
 int
-zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize)
+zvol_update_volsize(objset_t *os, uint64_t volsize)
 {
 	dmu_tx_t *tx;
 	int error;
 
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 
-	tx = dmu_tx_create(zv->zv_objset);
+	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
@@ -667,83 +656,101 @@
 		return (error);
 	}
 
-	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
+	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
 	    &volsize, tx);
 	dmu_tx_commit(tx);
 
 	if (error == 0)
-		error = dmu_free_long_range(zv->zv_objset,
+		error = dmu_free_long_range(os,
 		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
+	return (error);
+}
+
+void
+zvol_remove_minors(const char *name)
+{
+	zvol_state_t *zv;
+	char *namebuf;
+	size_t buflen;		/* name + '/' + terminating NUL */
+	minor_t minor;
 
-	/*
-	 * If we are using a faked-up state (zv_minor == 0) then don't
-	 * try to update the in-core zvol state.
-	 */
-	if (error == 0 && zv->zv_minor) {
-		zv->zv_volsize = volsize;
-		zvol_size_changed(zv, maj);
+	buflen = strlen(name) + 2;
+	namebuf = kmem_zalloc(buflen, KM_SLEEP);
+	(void) snprintf(namebuf, buflen, "%s/", name);	/* "<name>/" */
+	mutex_enter(&zvol_state_lock);
+	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
+		zv = ddi_get_soft_state(zvol_state, minor);
+		if (zv == NULL)
+			continue;
+		if (strncmp(namebuf, zv->zv_name, buflen - 1) == 0)
+			(void) zvol_remove_zv(zv); /* skips open zvols */
 	}
-	return (error);
+	kmem_free(namebuf, buflen);
+
+	mutex_exit(&zvol_state_lock);
 }
 
 int
 zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
 {
-	zvol_state_t *zv;
+	zvol_state_t *zv = NULL;
+	objset_t *os;
 	int error;
 	dmu_object_info_t doi;
 	uint64_t old_volsize = 0ULL;
-	zvol_state_t state = { 0 };
+	uint64_t readonly;
 
 	mutex_enter(&zvol_state_lock);
+	zv = zvol_minor_lookup(name);
+	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
+		mutex_exit(&zvol_state_lock);
+		return (error);
+	}
 
-	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		/*
-		 * If we are doing a "zfs clone -o volsize=", then the
-		 * minor node won't exist yet.
-		 */
-		error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, FTAG,
-		    &state.zv_objset);
-		if (error != 0)
-			goto out;
-		zv = &state;
-	}
-	old_volsize = zv->zv_volsize;
-
-	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
+	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
 	    (error = zvol_check_volsize(volsize,
 	    doi.doi_data_block_size)) != 0)
 		goto out;
 
-	if (zv->zv_flags & ZVOL_RDONLY || zv->zv_issnap) {
+	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
+	    NULL) == 0);
+	if (readonly) {
 		error = EROFS;
 		goto out;
 	}
 
-	error = zvol_update_volsize(zv, maj, volsize);
-
+	error = zvol_update_volsize(os, volsize);
 	/*
 	 * Reinitialize the dump area to the new size. If we
-	 * failed to resize the dump area then restore the it back to
-	 * it's original size.
+	 * failed to resize the dump area then restore it back to
+	 * its original size.
 	 */
-	if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) {
-		if ((error = zvol_dumpify(zv)) != 0 ||
-		    (error = dumpvp_resize()) != 0) {
-			(void) zvol_update_volsize(zv, maj, old_volsize);
-			error = zvol_dumpify(zv);
+	if (zv && error == 0) {
+		if (zv->zv_flags & ZVOL_DUMPIFIED) {
+			old_volsize = zv->zv_volsize;
+			zv->zv_volsize = volsize;
+			if ((error = zvol_dumpify(zv)) != 0 ||
+			    (error = dumpvp_resize()) != 0) {
+				(void) zvol_update_volsize(os, old_volsize);
+				zv->zv_volsize = old_volsize;
+				error = zvol_dumpify(zv);
+			}
+		}
+		if (error == 0) {
+			zv->zv_volsize = volsize;
+			zvol_size_changed(volsize, maj, zv->zv_minor);
 		}
 	}
 
 	/*
 	 * Generate a LUN expansion event.
 	 */
-	if (error == 0) {
+	if (zv && error == 0) {
 		sysevent_id_t eid;
 		nvlist_t *attr;
 		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 
-		(void) snprintf(physpath, MAXPATHLEN, "%s%uc", ZVOL_PSEUDO_DEV,
+		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
 		    zv->zv_minor);
 
 		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
@@ -757,68 +764,20 @@
 	}
 
 out:
-	if (state.zv_objset)
-		dmu_objset_disown(state.zv_objset, FTAG);
+	dmu_objset_rele(os, FTAG);
 
 	mutex_exit(&zvol_state_lock);
 
 	return (error);
 }
 
-int
-zvol_set_volblocksize(const char *name, uint64_t volblocksize)
-{
-	zvol_state_t *zv;
-	dmu_tx_t *tx;
-	int error;
-	boolean_t needlock;
-
-	/*
-	 * The lock may already be held if we are being called from
-	 * zvol_dump_init().
-	 */
-	needlock = !MUTEX_HELD(&zvol_state_lock);
-	if (needlock)
-		mutex_enter(&zvol_state_lock);
-
-	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		if (needlock)
-			mutex_exit(&zvol_state_lock);
-		return (ENXIO);
-	}
-	if (zv->zv_flags & ZVOL_RDONLY || zv->zv_issnap) {
-		if (needlock)
-			mutex_exit(&zvol_state_lock);
-		return (EROFS);
-	}
-
-	tx = dmu_tx_create(zv->zv_objset);
-	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
-	error = dmu_tx_assign(tx, TXG_WAIT);
-	if (error) {
-		dmu_tx_abort(tx);
-	} else {
-		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
-		    volblocksize, 0, tx);
-		if (error == ENOTSUP)
-			error = EBUSY;
-		dmu_tx_commit(tx);
-		if (error == 0)
-			zv->zv_volblocksize = volblocksize;
-	}
-
-	if (needlock)
-		mutex_exit(&zvol_state_lock);
-
-	return (error);
-}
-
 /*ARGSUSED*/
 int
 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
 {
 	minor_t minor = getminor(*devp);
 	zvol_state_t *zv;
+	int err = 0;
 
 	if (minor == 0)			/* This is the control device */
 		return (0);
@@ -831,21 +790,24 @@
 		return (ENXIO);
 	}
 
-	ASSERT(zv->zv_objset != NULL);
-
-	if ((flag & FWRITE) &&
-	    (zv->zv_flags & ZVOL_RDONLY || zv->zv_issnap)) {
+	if (zv->zv_total_opens == 0)
+		err = zvol_first_open(zv);
+	if (err) {
 		mutex_exit(&zvol_state_lock);
-		return (EROFS);
+		return (err);
+	}
+	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+		err = EROFS;
+		goto out;
 	}
 	if (zv->zv_flags & ZVOL_EXCL) {
-		mutex_exit(&zvol_state_lock);
-		return (EBUSY);
+		err = EBUSY;
+		goto out;
 	}
 	if (flag & FEXCL) {
 		if (zv->zv_total_opens != 0) {
-			mutex_exit(&zvol_state_lock);
-			return (EBUSY);
+			err = EBUSY;
+			goto out;
 		}
 		zv->zv_flags |= ZVOL_EXCL;
 	}
@@ -854,10 +816,14 @@
 		zv->zv_open_count[otyp]++;
 		zv->zv_total_opens++;
 	}
-
 	mutex_exit(&zvol_state_lock);
 
-	return (0);
+	return (err);
+out:
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
+	mutex_exit(&zvol_state_lock);
+	return (err);
 }
 
 /*ARGSUSED*/
@@ -866,6 +832,7 @@
 {
 	minor_t minor = getminor(dev);
 	zvol_state_t *zv;
+	int error = 0;
 
 	if (minor == 0)		/* This is the control device */
 		return (0);
@@ -896,9 +863,11 @@
 	zv->zv_open_count[otyp]--;
 	zv->zv_total_opens--;
 
-	mutex_exit(&zvol_state_lock);
+	if (zv->zv_total_opens == 0)
+		zvol_last_close(zv);
 
-	return (0);
+	mutex_exit(&zvol_state_lock);
+	return (error);
 }
 
 static void
@@ -1162,8 +1131,7 @@
 		return (0);
 	}
 
-	if (!(bp->b_flags & B_READ) &&
-	    (zv->zv_flags & ZVOL_RDONLY || zv->zv_issnap)) {
+	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
 		bioerror(bp, EROFS);
 		biodone(bp);
 		return (0);
@@ -1603,26 +1571,6 @@
 	ddi_soft_state_fini(&zvol_state);
 }
 
-static boolean_t
-zvol_is_swap(zvol_state_t *zv)
-{
-	vnode_t *vp;
-	boolean_t ret = B_FALSE;
-	char *devpath;
-	int error;
-
-	devpath = kmem_asprintf("%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name);
-	error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp);
-	strfree(devpath);
-
-	ret = !error && IS_SWAPVP(common_specvp(vp));
-
-	if (vp != NULL)
-		VN_RELE(vp);
-
-	return (ret);
-}
-
 static int
 zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 {
@@ -1632,9 +1580,14 @@
 	nvlist_t *nv = NULL;
 
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
+	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
+	    DMU_OBJECT_END);
+	/* wait for dmu_free_long_range to actually free the blocks */
+	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
@@ -1674,17 +1627,13 @@
 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
 		    &vbs, tx);
+		error = error ? error : dmu_object_set_blocksize(
+		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
+		if (error == 0)
+			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
 	}
 	dmu_tx_commit(tx);
 
-	/* Truncate the file */
-	if (!error)
-		error = dmu_free_long_range(zv->zv_objset,
-		    ZVOL_OBJ, 0, DMU_OBJECT_END);
-
-	if (error)
-		return (error);
-
 	/*
 	 * We only need update the zvol's property if we are initializing
 	 * the dump area for the first time.
@@ -1699,9 +1648,6 @@
 		VERIFY(nvlist_add_uint64(nv,
 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 		    ZIO_CHECKSUM_OFF) == 0);
-		VERIFY(nvlist_add_uint64(nv,
-		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
-		    SPA_MAXBLOCKSIZE) == 0);
 
 		error = zfs_set_prop_nvlist(zv->zv_name, nv);
 		nvlist_free(nv);
@@ -1723,15 +1669,9 @@
 	dmu_tx_t *tx;
 	objset_t *os = zv->zv_objset;
 
-	if (zv->zv_flags & ZVOL_RDONLY || zv->zv_issnap)
+	if (zv->zv_flags & ZVOL_RDONLY)
 		return (EROFS);
 
-	/*
-	 * We do not support swap devices acting as dump devices.
-	 */
-	if (zvol_is_swap(zv))
-		return (ENOTSUP);
-
 	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
 	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
 		boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;
@@ -1816,14 +1756,23 @@
 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
 	(void) nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
-	(void) nvlist_add_uint64(nv,
-	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
 	(void) zfs_set_prop_nvlist(zv->zv_name, nv);
 	nvlist_free(nv);
 
 	zvol_free_extents(zv);
 	zv->zv_flags &= ~ZVOL_DUMPIFIED;
 	(void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END);
+	/* wait for dmu_free_long_range to actually free the blocks */
+	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		return (error);
+	}
+	dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx);
+	dmu_tx_commit(tx);
 
 	return (0);
 }
--- a/usr/src/uts/common/os/dumpsubr.c	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/os/dumpsubr.c	Mon Sep 21 08:55:28 2009 -0600
@@ -217,6 +217,10 @@
 				error = ENOTSUP;
 			else if (vfs_devismounted(vattr.va_rdev))
 				error = EBUSY;
+			if (strcmp(ddi_driver_name(VTOS(cvp)->s_dip),
+			    ZFS_DRIVER) == 0 &&
+			    IS_SWAPVP(common_specvp(cvp)))
+					error = EBUSY;
 		} else {
 			if (vn_matchopval(cvp, VOPNAME_DUMP, fs_nosys) ||
 			    !IS_SWAPVP(cvp))
@@ -270,15 +274,16 @@
 				dumpbuf_resize();
 			}
 			/*
-			 * If we are working with a zvol then call into
-			 * it to dumpify itself.
+			 * If we are working with a zvol then dumpify it
+			 * if it's not being used as swap.
 			 */
 			if (strcmp(dki.dki_dname, ZVOL_DRIVER) == 0) {
-				if ((error = VOP_IOCTL(cdev_vp,
+				if (IS_SWAPVP(common_specvp(cvp)))
+					error = EBUSY;
+				else if ((error = VOP_IOCTL(cdev_vp,
 				    DKIOCDUMPINIT, NULL, FKIOCTL, kcred,
-				    NULL, NULL)) != 0) {
+				    NULL, NULL)) != 0)
 					dumpfini();
-				}
 			}
 
 			(void) VOP_CLOSE(cdev_vp, FREAD | FWRITE, 1, 0,
--- a/usr/src/uts/common/sys/fs/sdev_impl.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/sys/fs/sdev_impl.h	Mon Sep 21 08:55:28 2009 -0600
@@ -229,6 +229,7 @@
 #define	SDEV_VTOR		0x0040	/* validate sdev_nodes during search */
 #define	SDEV_ATTR_INVALID	0x0080	/* invalid node attributes, */
 					/* need update */
+#define	SDEV_SUBDIR		0x0100	/* match all subdirs under here */
 
 /* sdev_lookup_flags */
 #define	SDEV_LOOKUP	0x0001	/* node creation in progress */
@@ -249,7 +250,6 @@
 	(dv->sdev_flags & SDEV_DYNAMIC)
 #define	SDEV_IS_NO_NCACHE(dv)	\
 	(dv->sdev_flags & SDEV_NO_NCACHE)
-
 #define	SDEV_IS_LOOKUP(dv)	\
 	(dv->sdev_lookup_flags & SDEV_LOOKUP)
 #define	SDEV_IS_READDIR(dv)	\
@@ -296,8 +296,6 @@
 /*
  * flags used by devname_lookup_func callbacks
  */
-#define	SDEV_PATH	0x1	/* callback returning /devices physical path */
-#define	SDEV_VNODE	0x2	/* callback returning backing store vnode */
 #define	SDEV_VATTR	0x4	/* callback returning node vattr */
 #define	SDEV_VLINK	0x8	/* callback returning /dev link */
 
@@ -316,11 +314,10 @@
  */
 extern int devname_setattr_func(struct vnode *, struct vattr *, int,
     struct cred *, int (*)(struct sdev_node *, struct vattr *, int), int);
-
 /*
  * devname_inactive_func()
  */
-extern void devname_inactive_func(struct vnode *, struct cred *cred,
+extern void devname_inactive_func(struct vnode *, struct cred *,
     void (*)(struct vnode *));
 
 /*
@@ -399,51 +396,6 @@
 extern struct vnodeops *devvt_getvnodeops(void);
 
 /*
- * directory name rule
- */
-struct devname_nsmap {
-	struct devname_nsmap	*prev;	/* previous entry */
-	struct devname_nsmap	*next;	/* next entry */
-	char	*dir_name;	/* /dev subdir name, e.g. /dev/disk */
-	char	*dir_module;	/* devname module impl the operations */
-	char	*dir_map;	/* dev naming rules, e.g. /etc/dev/disks */
-	char    *dir_newmodule; /* to be reloaded  */
-	char    *dir_newmap;    /* to be reloaded */
-	int	dir_invalid;    /* map entry obsolete */
-	int	dir_maploaded;	/* map contents read */
-	krwlock_t dir_lock;	/* protects the data structure */
-};
-
-/*
- * name-property pairs to be looked up
- */
-typedef struct devname_lkp_arg {
-	char *devname_dir;	/* the directory to look */
-	char *devname_name;	/* the device name to be looked up */
-	char *devname_map;	/* the directory device naming map */
-	int reserved;
-} devname_lkp_arg_t;
-
-/*
- * directory name-value populating results
- */
-typedef struct devname_rdr_result {
-	uint32_t	ns_mapcount;
-} devname_rdr_result_t;
-
-/*
- * sdev_nsrdr work
- */
-typedef struct sdev_nsrdr_work {
-	char *dir_name;
-	char *dir_map;
-	struct sdev_node *dir_dv;
-	devname_rdr_result_t **result;
-	struct sdev_nsrdr_work *next;
-} sdev_nsrdr_work_t;
-
-
-/*
  * boot states - warning, the ordering here is significant
  *
  * the difference between "system available" and "boot complete"
@@ -528,6 +480,8 @@
 extern void sdev_filldir_dynamic(struct sdev_node *);
 extern int sdev_mknode(struct sdev_node *, char *, struct sdev_node **,
     struct vattr *, struct vnode *, void *, struct cred *, sdev_node_state_t);
+extern int sdev_getlink(struct vnode *linkvp, char **link);
+
 extern int sdev_nodeinit(struct sdev_node *, char *, struct sdev_node **,
     vattr_t *);
 extern int sdev_nodeready(struct sdev_node *, vattr_t *, vnode_t *, void *,
@@ -547,6 +501,7 @@
 extern int sdev_rnmnode(struct sdev_node *, struct sdev_node *,
     struct sdev_node *, struct sdev_node **, char *, struct cred *);
 extern size_t add_dir_entry(dirent64_t *, char *, size_t, ino_t, offset_t);
+extern struct vattr *sdev_getdefault_attr(enum vtype type);
 extern int sdev_to_vp(struct sdev_node *, struct vnode **);
 extern ino_t sdev_mkino(struct sdev_node *);
 extern int devname_backstore_lookup(struct sdev_node *, char *,
@@ -560,9 +515,17 @@
 extern int devnet_validate(struct sdev_node *dv);
 extern int devipnet_validate(struct sdev_node *dv);
 extern int devvt_validate(struct sdev_node *dv);
+extern int devzvol_validate(struct sdev_node *dv);
 extern void *sdev_get_vtor(struct sdev_node *dv);
 
 /*
+ * devinfo helpers
+ */
+extern int sdev_modctl_readdir(const char *, char ***, int *, int *, int);
+extern void sdev_modctl_readdir_free(char **, int, int);
+extern int sdev_modctl_devexists(const char *);
+
+/*
  * ncache handlers
  */
 
@@ -575,13 +538,6 @@
 extern void sdev_modctl_dump_files(void);
 
 /*
- * devinfo helpers
- */
-extern int sdev_modctl_readdir(const char *, char ***, int *, int *, int);
-extern void sdev_modctl_readdir_free(char **, int, int);
-extern int sdev_modctl_devexists(const char *);
-
-/*
  * globals
  */
 extern kmutex_t sdev_lock;
@@ -593,6 +549,8 @@
 extern struct vnodeops		*devipnet_vnodeops;
 extern struct vnodeops		*devvt_vnodeops;
 extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */
+extern struct vnodeops		*devzvol_vnodeops;
+
 extern const fs_operation_def_t	sdev_vnodeops_tbl[];
 extern const fs_operation_def_t	devpts_vnodeops_tbl[];
 extern const fs_operation_def_t	devnet_vnodeops_tbl[];
@@ -600,6 +558,7 @@
 extern const fs_operation_def_t	devvt_vnodeops_tbl[];
 extern const fs_operation_def_t	devsys_vnodeops_tbl[];
 extern const fs_operation_def_t	devpseudo_vnodeops_tbl[];
+extern const fs_operation_def_t	devzvol_vnodeops_tbl[];
 
 extern sdev_nc_list_t	*sdev_ncache;
 extern int		sdev_reconfig_boot;
@@ -628,6 +587,7 @@
 #define	SDEV_DEBUG_MODCTL	0x400	/* trace modctl activity */
 #define	SDEV_DEBUG_FLK		0x800	/* trace failed lookups */
 #define	SDEV_DEBUG_NET		0x1000	/* /dev/net tracing */
+#define	SDEV_DEBUG_ZVOL		0x2000	/* /dev/zvol tracing */
 
 #define	sdcmn_err(args)  if (sdev_debug & SDEV_DEBUG) printf args
 #define	sdcmn_err2(args) if (sdev_debug & SDEV_DEBUG_VOPS) printf args
@@ -641,6 +601,7 @@
 #define	sdcmn_err10(args) if (sdev_debug & SDEV_DEBUG_PROFILE) printf args
 #define	sdcmn_err11(args) if (sdev_debug & SDEV_DEBUG_MODCTL) printf args
 #define	sdcmn_err12(args) if (sdev_debug & SDEV_DEBUG_NET) printf args
+#define	sdcmn_err13(args) if (sdev_debug & SDEV_DEBUG_ZVOL) printf args
 #define	impossible(args) printf args
 #else
 #define	sdcmn_err(args)		/* does nothing */
@@ -655,6 +616,7 @@
 #define	sdcmn_err10(args)	/* does nothing */
 #define	sdcmn_err11(args)	/* does nothing */
 #define	sdcmn_err12(args)	/* does nothing */
+#define	sdcmn_err13(args) 	/* does nothing */
 #define	impossible(args)	/* does nothing */
 #endif
 
--- a/usr/src/uts/common/sys/fs/zfs.h	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/common/sys/fs/zfs.h	Mon Sep 21 08:55:28 2009 -0600
@@ -545,19 +545,13 @@
 #define	ZFS_DRIVER	"zfs"
 #define	ZFS_DEV		"/dev/zfs"
 
-/*
- * zvol paths.  Irritatingly, the devfsadm interfaces want all these
- * paths without the /dev prefix, but for some things, we want the
- * /dev prefix.  Below are the names without /dev.
- */
-#define	ZVOL_DEV_DIR	"zvol/dsk"
-#define	ZVOL_RDEV_DIR	"zvol/rdsk"
-
-/*
- * And here are the things we need with /dev, etc. in front of them.
- */
+/* top-level directory for zvol links under /dev */
+#define	ZVOL_DIR		"/dev/zvol"
+/* physical path prefix (minor number appended), used for LUN expansion */
 #define	ZVOL_PSEUDO_DEV		"/devices/pseudo/zfs@0:"
-#define	ZVOL_FULL_DEV_DIR	"/dev/" ZVOL_DEV_DIR "/"
+/* dsk and rdsk link directories, used for dump and swap */
+#define	ZVOL_FULL_DEV_DIR	ZVOL_DIR "/dsk/"
+#define	ZVOL_FULL_RDEV_DIR	ZVOL_DIR "/rdsk/"
 
 #define	ZVOL_PROP_NAME		"name"
 
@@ -590,8 +584,6 @@
 	ZFS_IOC_DATASET_LIST_NEXT,
 	ZFS_IOC_SNAPSHOT_LIST_NEXT,
 	ZFS_IOC_SET_PROP,
-	ZFS_IOC_CREATE_MINOR,
-	ZFS_IOC_REMOVE_MINOR,
 	ZFS_IOC_CREATE,
 	ZFS_IOC_DESTROY,
 	ZFS_IOC_ROLLBACK,
--- a/usr/src/uts/intel/dev/Makefile	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/intel/dev/Makefile	Mon Sep 21 08:55:28 2009 -0600
@@ -20,7 +20,7 @@
 #
 # uts/intel/dev/Makefile
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 #	This makefile drives the production of the dev file system
@@ -60,6 +60,7 @@
 MODSTUBS_DIR	 = $(OBJS_DIR)
 CFLAGS		+= $(CCVERBOSE)
 LDFLAGS		+= -dy -Nfs/devfs -Nmisc/dls
+INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
 
 #
 #	Default build targets.
--- a/usr/src/uts/sparc/dev/Makefile	Mon Sep 21 11:09:02 2009 +0200
+++ b/usr/src/uts/sparc/dev/Makefile	Mon Sep 21 08:55:28 2009 -0600
@@ -18,7 +18,7 @@
 #
 # CDDL HEADER END
 #
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # uts/sparc/dev/Makefile
@@ -61,6 +61,7 @@
 # CLEANFILES	+= $(MODSTUBS_O)
 CFLAGS		+= $(CCVERBOSE)
 LDFLAGS		+= -dy -Nfs/devfs -Nmisc/dls
+INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
 
 #
 #	Default build targets.