# HG changeset patch
# User Robert Harris <Robert.Harris@Sun.COM>
# Date 1256861437 25200
# Node ID 951a65b3846b42c01d4346bb7e73ccd269dae918
# Parent  11fc80bc5cb9224d5f6d6501dd6e4671a95f1725
PSARC/2009/566 Provide minor private interface modifications to support mntfs
6813502 mntfs is not fork-safe

diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/cmd/truss/codes.c
--- a/usr/src/cmd/truss/codes.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/cmd/truss/codes.c	Thu Oct 29 17:10:37 2009 -0700
@@ -1405,8 +1405,22 @@
 		NULL},
 
 	/* mntio ioctls - ('m' << 8) */
+	{ (uint_t)MNTIOC_NMNTS,		"MNTIOC_NMNTS",
+		NULL},
+	{ (uint_t)MNTIOC_GETDEVLIST,	"MNTIOC_GETDEVLIST",
+		NULL},
+	{ (uint_t)MNTIOC_SETTAG,	"MNTIOC_SETTAG",
+		"struct mnttagdesc"},
+	{ (uint_t)MNTIOC_CLRTAG,	"MNTIOC_CLRTAG",
+		"struct mnttagdesc"},
+	{ (uint_t)MNTIOC_SHOWHIDDEN,	"MNTIOC_SHOWHIDDEN",
+		NULL},
 	{ (uint_t)MNTIOC_GETMNTENT,	"MNTIOC_GETMNTENT",
+		"struct mnttab"},
+	{ (uint_t)MNTIOC_GETEXTMNTENT,	"MNTIOC_GETEXTMNTENT",
 		"struct extmnttab"},
+	{ (uint_t)MNTIOC_GETMNTANY,	"MNTIOC_GETMNTANY",
+		"struct mnttab"},
 
 	/* devinfo ioctls - ('df' << 8) - devinfo_impl.h */
 	{ (uint_t)DINFOUSRLD,		"DINFOUSRLD",
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/lib/brand/solaris10/s10_brand/common/s10_brand.c
--- a/usr/src/lib/brand/solaris10/s10_brand/common/s10_brand.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/lib/brand/solaris10/s10_brand/common/s10_brand.c	Thu Oct 29 17:10:37 2009 -0700
@@ -51,6 +51,9 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/ucontext.h>
+#include <sys/mntio.h>
+#include <sys/mnttab.h>
+#include <atomic.h>
 
 #include <s10_brand.h>
 #include <s10_misc.h>
@@ -344,6 +347,149 @@
 }
 #endif /* __sparc && !__sparcv9 */
 
+/* Free the thread-local storage provided by mntfs_get_mntentbuf() */
+static void
+mntfs_free_mntentbuf(void *arg)
+{
+	struct mntentbuf *embufp = arg;
+
+	if (embufp == NULL)
+		return;
+	if (embufp->mbuf_emp)
+		free(embufp->mbuf_emp);
+	if (embufp->mbuf_buf)
+		free(embufp->mbuf_buf);
+	bzero(embufp, sizeof (struct mntentbuf));
+	free(embufp);
+}
+
+/* Provide the thread-local storage required by mntfs_ioctl() */
+static struct mntentbuf *
+mntfs_get_mntentbuf(size_t size)
+{
+	static mutex_t keylock;
+	static thread_key_t key;
+	static int once_per_keyname = 0;
+	void *tsd = NULL;
+	struct mntentbuf *embufp;
+
+	/* Create the key. */
+	if (!once_per_keyname) {
+		(void) mutex_lock(&keylock);
+		if (!once_per_keyname) {
+			if (thr_keycreate(&key, mntfs_free_mntentbuf)) {
+				(void) mutex_unlock(&keylock);
+				return (NULL);
+			} else {
+				once_per_keyname++;
+			}
+		}
+		(void) mutex_unlock(&keylock);
+	}
+
+	/*
+	 * The thread-specific datum for this key is the address of a struct
+	 * mntentbuf. If this is the first time here then we allocate the struct
+	 * and its contents, and associate its address with the thread; if there
+	 * are any problems then we abort.
+	 */
+	if (thr_getspecific(key, &tsd))
+		return (NULL);
+	if (tsd == NULL) {
+		if (!(embufp = calloc(1, sizeof (struct mntentbuf))) ||
+		    !(embufp->mbuf_emp = malloc(sizeof (struct extmnttab))) ||
+		    thr_setspecific(key, embufp)) {
+			mntfs_free_mntentbuf(embufp);
+			return (NULL);
+		}
+	} else {
+		embufp = tsd;
+	}
+
+	/* Return the buffer, resizing it if necessary. */
+	if (size > embufp->mbuf_bufsize) {
+		if (embufp->mbuf_buf)
+			free(embufp->mbuf_buf);
+		if ((embufp->mbuf_buf = malloc(size)) == NULL) {
+			embufp->mbuf_bufsize = 0;
+			return (NULL);
+		} else {
+			embufp->mbuf_bufsize = size;
+		}
+	}
+	return (embufp);
+}
+
+/*
+ * The MNTIOC_GETMNTENT command in this release differs from that in Solaris 10.
+ * Previously, the command would copy a pointer to a struct extmnttab to an
+ * address provided as an argument. The pointer would be somewhere within a
+ * mapping already present within the user's address space. In addition, the
+ * text to which the struct's members pointed would also be within a
+ * pre-existing mapping. Now, the user is required to allocate memory for both
+ * the struct and the text buffer, and to pass the address of each within a
+ * struct mntentbuf. In order to conceal these details from a Solaris 10 client
+ * we allocate some thread-local storage in which to create the necessary data
+ * structures; this is static, thread-safe memory that will be cleaned up
+ * without the caller's intervention.
+ *
+ * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY are new in this release; they should
+ * not work for older clients.
+ */
+int
+mntfs_ioctl(sysret_t *rval, int fdes, int cmd, intptr_t arg)
+{
+	int err;
+	struct stat statbuf;
+	struct mntentbuf *embufp;
+	static size_t bufsize = MNT_LINE_MAX;
+
+	if ((err = __systemcall(rval, SYS_fstat + 1024, fdes, &statbuf)) != 0)
+		return (err);
+	if (strcmp(statbuf.st_fstype, MNTTYPE_MNTFS) != 0)
+		return (__systemcall(rval, SYS_ioctl + 1024, fdes, cmd, arg));
+
+	if (cmd == MNTIOC_GETEXTMNTENT || cmd == MNTIOC_GETMNTANY)
+		return (EINVAL);
+
+	if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
+		return (ENOMEM);
+
+	/*
+	 * MNTIOC_GETEXTMNTENT advances the file pointer once it has
+	 * successfully copied out the result to the address provided. We
+	 * therefore need to check the user-supplied address now since the
+	 * one we'll be providing is guaranteed to work.
+	 */
+	if (s10_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
+		return (EFAULT);
+
+	/*
+	 * Keep retrying for as long as we fail for want of a large enough
+	 * buffer.
+	 */
+	for (;;) {
+		if ((err = __systemcall(rval, SYS_ioctl + 1024, fdes,
+		    MNTIOC_GETEXTMNTENT, embufp)) != 0)
+			return (err);
+
+		if (rval->sys_rval1 == MNTFS_TOOLONG) {
+			/* The buffer wasn't large enough. */
+			(void) atomic_swap_ulong((unsigned long *)&bufsize,
+			    2 * embufp->mbuf_bufsize);
+			if ((embufp = mntfs_get_mntentbuf(bufsize)) == NULL)
+				return (ENOMEM);
+		} else {
+			break;
+		}
+	}
+
+	if (s10_uucopy(&embufp->mbuf_emp, (void *)arg, sizeof (void *)) != 0)
+		return (EFAULT);
+
+	return (0);
+}
+
 /*
  * Assign the structure member value from the s (source) structure to the
  * d (dest) structure.
@@ -732,6 +878,12 @@
 		/*FALLTHRU*/
 	case CT_TSET:
 		return (ctfs_ioctl(rval, fdes, cmd, arg));
+	case MNTIOC_GETMNTENT:
+		/*FALLTHRU*/
+	case MNTIOC_GETEXTMNTENT:
+		/*FALLTHRU*/
+	case MNTIOC_GETMNTANY:
+		return (mntfs_ioctl(rval, fdes, cmd, arg));
 	}
 
 	if ((cmd & 0xff00) == ZFS_IOC)
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/lib/libc/port/gen/getmntent.c
--- a/usr/src/lib/libc/port/gen/getmntent.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/lib/libc/port/gen/getmntent.c	Thu Oct 29 17:10:37 2009 -0700
@@ -20,15 +20,13 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*	Copyright (c) 1988 AT&T	*/
 /*	  All Rights Reserved  	*/
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include "lint.h"
 #include <mtlib.h>
 #include <stdio.h>
@@ -45,9 +43,10 @@
 #include <libc.h>
 #include <unistd.h>
 #include "tsd.h"
+#include <atomic.h>
+#include <strings.h>
 
 static int getmntent_compat(FILE *fp, struct mnttab *mp);
-static int convert_mntent(struct extmnttab *, struct extmnttab *, int);
 
 #define	GETTOK_R(xx, ll, tmp)\
 	if ((mp->xx = (char *)strtok_r(ll, sepstr, tmp)) == NULL)\
@@ -89,9 +88,6 @@
 {
 	thread_data_t *thread_data;
 
-	if (size < MNT_LINE_MAX)
-		size = MNT_LINE_MAX;
-
 	thread_data = tsdalloc(_T_GETMNTENT,
 	    sizeof (thread_data_t), destroy_thread_data);
 	if (thread_data == NULL)
@@ -108,8 +104,8 @@
 	return (thread_data->buf);
 }
 
-int
-getmntany(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp)
+static int
+getmntany_compat(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp)
 {
 	int	ret, bstat;
 	mode_t	bmode;
@@ -130,7 +126,7 @@
 		bstat = 0;
 	}
 
-	while ((ret = getmntent(fp, mgetp)) == 0 &&
+	while ((ret = getmntent_compat(fp, mgetp)) == 0 &&
 	    ((bstat == 0 && DIFF(mnt_special)) ||
 	    (bstat == 1 && SDIFF(mnt_special, bmode, brdev)) ||
 	    DIFF(mnt_mountp) ||
@@ -143,23 +139,149 @@
 }
 
 int
-getmntent(FILE *fp, struct mnttab *mp)
+getmntany(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp)
 {
-	int	ret;
-	struct	extmnttab *emp;
+	struct mntentbuf embuf;
+	char *copyp, *bufp;
+	int ret;
+
+
+	/*
+	 * We collect all of the text strings pointed to by members of the
+	 * user's preferences struct into a single buffer. At the same time
+	 * we populate the members of the results struct to point to the
+	 * corresponding words. We then ask the kernel to figure out the
+	 * rest; if this is a non-mntfs file then we hand over to
+	 * getmntany_compat().
+	 */
+	if ((copyp = bufp = getmntbuf(MNT_LINE_MAX)) == NULL) {
+		errno = ENOMEM;
+		return (-1);
+	}
+	bzero(mgetp, sizeof (struct mnttab));
+	if (mrefp->mnt_special) {
+		mgetp->mnt_special = copyp;
+		copyp += snprintf(mgetp->mnt_special, MNT_LINE_MAX, "%s",
+		    mrefp->mnt_special) + 1;
+	}
+	if (mrefp->mnt_mountp) {
+		mgetp->mnt_mountp = copyp;
+		copyp += snprintf(mgetp->mnt_mountp,
+		    bufp + MNT_LINE_MAX - copyp, "%s", mrefp->mnt_mountp) + 1;
+	}
+	if (mrefp->mnt_fstype) {
+		mgetp->mnt_fstype = copyp;
+		copyp += snprintf(mgetp->mnt_fstype,
+		    bufp + MNT_LINE_MAX - copyp, "%s", mrefp->mnt_fstype) + 1;
+	}
+	if (mrefp->mnt_mntopts) {
+		mgetp->mnt_mntopts = copyp;
+		copyp += snprintf(mgetp->mnt_mntopts,
+		    bufp + MNT_LINE_MAX - copyp, "%s", mrefp->mnt_mntopts) + 1;
+	}
+	if (mrefp->mnt_time) {
+		mgetp->mnt_time = copyp;
+		(void) snprintf(mgetp->mnt_time, bufp + MNT_LINE_MAX - copyp,
+		    "%s", mrefp->mnt_time);
+	}
+
+	embuf.mbuf_emp = (struct extmnttab *)mgetp;
+	embuf.mbuf_bufsize = MNT_LINE_MAX;
+	embuf.mbuf_buf = bufp;
 
-	ret = ioctl(fileno(fp), MNTIOC_GETMNTENT, &emp);
+	switch (ret = ioctl(fileno(fp), MNTIOC_GETMNTANY, &embuf)) {
+	case 0:
+		/* Success. */
+		return (0);
+	case MNTFS_EOF:
+		return (-1);
+	case MNTFS_TOOLONG:
+		return (MNT_TOOLONG);
+	default:
+		/* A failure of some kind. */
+		if (errno == ENOTTY)
+			return (getmntany_compat(fp, mgetp, mrefp));
+		else
+			return (ret);
+	}
+}
+
+/*
+ * Common code for getmntent() and getextmntent().
+ *
+ * These functions serve to populate a structure supplied by the user. Common
+ * to both struct mnttab and struct extmnttab is a set of pointers to the
+ * individual text fields that form an entry in /etc/mnttab. We arrange for the
+ * text itself to be stored in some thread-local storage, and for the kernel to
+ * populate both this buffer and the structure directly.
+ *
+ * If getmntent() passes a file that isn't provided by mntfs then we assume that
+ * it is a simple text file and give it to getmntent_compat() to parse. For
+ * getextmntent() we give up; it requires major and minor numbers that only the
+ * kernel can provide.
+ */
+static int
+getmntent_common(FILE *fp, struct extmnttab *emp, int command)
+{
+	struct mntentbuf embuf;
+	static size_t bufsize = MNT_LINE_MAX;
+	int ret;
+
+	embuf.mbuf_emp = emp;
+	embuf.mbuf_bufsize = bufsize;
+	if ((embuf.mbuf_buf = getmntbuf(embuf.mbuf_bufsize)) == NULL) {
+		errno = ENOMEM;
+		return (-1);
+	}
+
+	while ((ret = ioctl(fileno(fp), command, &embuf)) == MNTFS_TOOLONG) {
+		/* The buffer wasn't large enough. */
+		(void) atomic_swap_ulong((unsigned long *)&bufsize,
+		    2 * embuf.mbuf_bufsize);
+		embuf.mbuf_bufsize = bufsize;
+		if ((embuf.mbuf_buf = getmntbuf(embuf.mbuf_bufsize)) == NULL) {
+			errno = ENOMEM;
+			return (-1);
+		}
+	}
 
 	switch (ret) {
-		case 0:
-			return (convert_mntent(emp, (struct extmnttab *)mp, 0));
-		case 1:
-			return (-1);
-		default:
-			return (getmntent_compat(fp, mp));
+	case 0:
+		/*
+		 * We were successful, but we may have to enforce getmntent()'s
+		 * documented limit on the line length.
+		 */
+		if (command == MNTIOC_GETMNTENT &&
+		    (emp->mnt_time + strlen(emp->mnt_time) + 1 -
+		    emp->mnt_special > MNT_LINE_MAX))
+			return (MNT_TOOLONG);
+		else
+			return (0);
+	case MNTFS_EOF:
+		/* EOF. */
+		return (-1);
+	default:
+		/* A non-mntfs file. */
+		if (command == MNTIOC_GETMNTENT)
+			return (getmntent_compat(fp, (struct mnttab *)emp));
+		else
+			return (ret);
 	}
 }
 
+int
+getmntent(FILE *fp, struct mnttab *mp)
+{
+	return (getmntent_common(fp, (struct extmnttab *)mp, MNTIOC_GETMNTENT));
+}
+
+/*ARGSUSED*/
+int
+getextmntent(FILE *fp, struct extmnttab *emp, size_t len)
+{
+	return (getmntent_common(fp, emp, MNTIOC_GETEXTMNTENT));
+}
+
 char *
 mntopt(char **p)
 {
@@ -207,25 +329,6 @@
 	return (NULL);
 }
 
-/*ARGSUSED*/
-int
-getextmntent(FILE *fp, struct extmnttab *mp, size_t len)
-{
-	int	ret;
-	struct	extmnttab *emp;
-
-	ret = ioctl(fileno(fp), MNTIOC_GETMNTENT, &emp);
-
-	switch (ret) {
-		case 0:
-			return (convert_mntent(emp, mp, 1));
-		case 1:
-			return (-1);
-		default:
-			return (ret);
-	}
-}
-
 void
 resetmnttab(FILE *fp)
 {
@@ -233,41 +336,6 @@
 }
 
 /*
- * This is a horrible function, necessary to support this broken interface.
- * Some callers of get(ext)mntent assume that the memory is valid even after the
- * file is closed.  Since we switched to a direct ioctl() interface, this is no
- * longer true.  In order to support these apps, we have to put the data into a
- * thread specific buffer.
- */
-static int
-convert_mntent(struct extmnttab *src, struct extmnttab *dst, int isext)
-{
-	size_t len;
-	char *buf;
-
-	len = src->mnt_time - src->mnt_special + strlen(src->mnt_time) + 1;
-
-	buf = getmntbuf(len);
-	if (buf == NULL) {
-		errno = ENOMEM;
-		return (-1);
-	}
-
-	memcpy(buf, src->mnt_special, len);
-	dst->mnt_special = buf;
-	dst->mnt_mountp = buf + (src->mnt_mountp - src->mnt_special);
-	dst->mnt_fstype = buf + (src->mnt_fstype - src->mnt_special);
-	dst->mnt_mntopts = buf + (src->mnt_mntopts - src->mnt_special);
-	dst->mnt_time = buf + (src->mnt_time - src->mnt_special);
-	if (isext) {
-		dst->mnt_major = src->mnt_major;
-		dst->mnt_minor = src->mnt_minor;
-	}
-
-	return (0);
-}
-
-/*
  * Compatibility for non-mntfs files.  For backwards compatibility, we continue
  * to have to support this broken interface.  Note that getextmntent() has
  * always failed when using a file other than /etc/mnttab, because it relies on
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/fs/mntfs/mntvnops.c
--- a/usr/src/uts/common/fs/mntfs/mntvnops.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/fs/mntfs/mntvnops.c	Thu Oct 29 17:10:37 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -38,6 +38,9 @@
 #include <fs/fs_subr.h>
 #include <sys/vmsystm.h>
 #include <vm/seg_vn.h>
+#include <sys/time.h>
+#include <sys/ksynch.h>
+#include <sys/sdt.h>
 
 #define	MNTROOTINO	2
 
@@ -49,25 +52,51 @@
 /*
  * Design of kernel mnttab accounting.
  *
- * To support whitespace in mount names, we implement an ioctl
- * (MNTIOC_GETMNTENT) which allows a programmatic interface to the data in
- * /etc/mnttab.  The libc functions getmntent() and getextmntent() are built
- * atop this interface.
+ * mntfs provides two methods of reading the in-kernel mnttab, i.e. the state of
+ * the mounted resources: the read-only file /etc/mnttab, and a collection of
+ * ioctl() commands. Most of these interfaces are public and are described in
+ * mnttab(4). Three private ioctl() commands, MNTIOC_GETMNTENT,
+ * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY, provide for the getmntent(3C)
+ * family of functions, allowing them to support white space in mount names.
  *
- * To minimize the amount of memory used in the kernel, we keep all the
- * necessary information in the user's address space.  Large server
- * configurations can have /etc/mnttab files in excess of 64k.
+ * A significant feature of mntfs is that it provides a file descriptor with a
+ * snapshot once it begins to consume mnttab data. Thus, as the process
+ * continues to consume data, its view of the in-kernel mnttab does not change
+ * even if resources are mounted or unmounted. The intent is to ensure that
+ * processes are guaranteed to read self-consistent data even as the system
+ * changes.
+ *
+ * The snapshot is implemented by a "database", unique to each zone, that
+ * comprises a linked list of mntelem_ts. The database is identified by
+ * zone_mntfs_db and is protected by zone_mntfs_db_lock. Each element contains
+ * the text entry in /etc/mnttab for a mounted resource, i.e. a vfs_t, and is
+ * marked with its time of "birth", i.e. creation. An element is "killed", and
+ * marked with its time of death, when it is found to be out of date, e.g. when
+ * the corresponding resource has been unmounted.
  *
- * To support both vanilla read() calls as well as ioctl() calls, we have two
- * different snapshots of the kernel data structures, mnt_read and mnt_ioctl.
- * These snapshots include the base location in user memory, the number of
- * mounts in the snapshot, and any metadata associated with it.  The metadata is
- * used only to support the ioctl() interface, and is a series of extmnttab
- * structures.  When the user issues an ioctl(), we simply copyout a pointer to
- * that structure, and the rest is handled in userland.
- */
-
-/*
+ * When a process performs the first read() or ioctl() for a file descriptor for
+ * /etc/mnttab, the database is updated by a call to mntfs_snapshot() to ensure
+ * that an element exists for each currently mounted resource. Following this,
+ * the current time is written into a snapshot structure, a mntsnap_t, embedded
+ * in the descriptor's mntnode_t.
+ *
+ * mntfs is able to enumerate the /etc/mnttab entries corresponding to a
+ * particular file descriptor by searching the database for entries that were
+ * born before the appropriate snapshot and that either are still alive or died
+ * after the snapshot was created. Consumers use the iterator function
+ * mntfs_get_next_elem() to identify the next suitable element in the database.
+ *
+ * Each snapshot has a hold on its corresponding database elements, effected by
+ * a per-element reference count. At last close(), a snapshot is destroyed in
+ * mntfs_freesnap() by releasing all of its holds; an element is destroyed if
+ * its reference count becomes zero. Therefore the database never exists unless
+ * there is at least one active consumer of /etc/mnttab.
+ *
+ * getmntent(3C) et al. "do not open, close or rewind the file." This implies
+ * that getmntent() and read() must be able to operate without interaction on
+ * the same file descriptor; this is accomplished by the use of a separate
+ * mntsnap_t for each of read() and ioctl().
+ *
  * NOTE: The following variable enables the generation of the "dev=xxx"
  * in the option string for a mounted file system.  Really this should
  * be gotten rid of altogether, but for the sake of backwards compatibility
@@ -80,6 +109,16 @@
  */
 int mntfs_enabledev = 1;	/* enable old "dev=xxx" option */
 
+extern void vfs_mono_time(timespec_t *);
+enum { MNTFS_FIRST, MNTFS_SECOND, MNTFS_NEITHER };
+
+/*
+ * Determine whether a field within a line from /etc/mnttab contains actual
+ * content or simply the marker string "-". This never applies to the time,
+ * therefore the delimiter must be a tab.
+ */
+#define	MNTFS_REAL_FIELD(x)	(*(x) != '-' || *((x) + 1) != '\t')
+
 static int
 mntfs_devsize(struct vfs *vfsp)
 {
@@ -98,6 +137,22 @@
 	return (snprintf(buf, MAX_MNTOPT_STR, "dev=%x", odev));
 }
 
+/* Identify which, if either, of two supplied timespec structs is newer. */
+static int
+mntfs_newest(timespec_t *a, timespec_t *b)
+{
+	if (a->tv_sec == b->tv_sec &&
+	    a->tv_nsec == b->tv_nsec) {
+		return (MNTFS_NEITHER);
+	} else if (b->tv_sec > a->tv_sec ||
+	    (b->tv_sec == a->tv_sec &&
+	    b->tv_nsec > a->tv_nsec)) {
+		return (MNTFS_SECOND);
+	} else {
+		return (MNTFS_FIRST);
+	}
+}
+
 static int
 mntfs_optsize(struct vfs *vfsp)
 {
@@ -185,18 +240,80 @@
 	return (buf - origbuf);
 }
 
+void
+mntfs_populate_text(vfs_t *vfsp, zone_t *zonep, mntelem_t *elemp)
+{
+	struct extmnttab *tabp = &elemp->mnte_tab;
+	const char *resource, *mntpt;
+	char *cp = elemp->mnte_text;
+	mntpt = refstr_value(vfsp->vfs_mntpt);
+	resource = refstr_value(vfsp->vfs_resource);
+
+	tabp->mnt_special = 0;
+	if (resource != NULL && resource[0] != '\0') {
+		if (resource[0] != '/') {
+			cp += snprintf(cp, MAXPATHLEN, "%s\t", resource);
+		} else if (!ZONE_PATH_VISIBLE(resource, zonep)) {
+			/*
+			 * Use the mount point as the resource.
+			 */
+			cp += snprintf(cp, MAXPATHLEN, "%s\t",
+			    ZONE_PATH_TRANSLATE(mntpt, zonep));
+		} else {
+			cp += snprintf(cp, MAXPATHLEN, "%s\t",
+			    ZONE_PATH_TRANSLATE(resource, zonep));
+		}
+	} else {
+		cp += snprintf(cp, MAXPATHLEN, "-\t");
+	}
+
+	tabp->mnt_mountp = (char *)(cp - elemp->mnte_text);
+	if (mntpt != NULL && mntpt[0] != '\0') {
+		/*
+		 * We know the mount point is visible from within the zone,
+		 * otherwise it wouldn't be on the zone's vfs list.
+		 */
+		cp += snprintf(cp, MAXPATHLEN, "%s\t",
+		    ZONE_PATH_TRANSLATE(mntpt, zonep));
+	} else {
+		cp += snprintf(cp, MAXPATHLEN, "-\t");
+	}
+
+	tabp->mnt_fstype = (char *)(cp - elemp->mnte_text);
+	cp += snprintf(cp, MAXPATHLEN, "%s\t",
+	    vfssw[vfsp->vfs_fstype].vsw_name);
+
+	tabp->mnt_mntopts = (char *)(cp - elemp->mnte_text);
+	cp += mntfs_optprint(vfsp, cp);
+	*cp++ = '\t';
+
+	tabp->mnt_time = (char *)(cp - elemp->mnte_text);
+	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
+	*cp++ = '\n'; /* over-write snprintf's trailing null-byte */
+
+	tabp->mnt_major = getmajor(vfsp->vfs_dev);
+	tabp->mnt_minor = getminor(vfsp->vfs_dev);
+
+	elemp->mnte_text_size = cp - elemp->mnte_text;
+	elemp->mnte_vfs_ctime = vfsp->vfs_hrctime;
+	elemp->mnte_hidden = vfsp->vfs_flag & VFS_NOMNTTAB;
+}
+
+/* Determine the length of the /etc/mnttab entry for this vfs_t. */
 static size_t
-mntfs_vfs_len(vfs_t *vfsp, zone_t *zone)
+mntfs_text_len(vfs_t *vfsp, zone_t *zone)
 {
 	size_t size = 0;
 	const char *resource, *mntpt;
+	size_t mntsize;
 
 	mntpt = refstr_value(vfsp->vfs_mntpt);
 	if (mntpt != NULL && mntpt[0] != '\0') {
-		size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
+		mntsize = strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
 	} else {
-		size += strlen("-") + 1;
+		mntsize = 2;	/* "-\t" */
 	}
+	size += mntsize;
 
 	resource = refstr_value(vfsp->vfs_resource);
 	if (resource != NULL && resource[0] != '\0') {
@@ -206,12 +323,12 @@
 			/*
 			 * Same as the zone's view of the mount point.
 			 */
-			size += strlen(ZONE_PATH_TRANSLATE(mntpt, zone)) + 1;
+			size += mntsize;
 		} else {
 			size += strlen(ZONE_PATH_TRANSLATE(resource, zone)) + 1;
 		}
 	} else {
-		size += strlen("-") + 1;
+		size += 2;	/* "-\t" */
 	}
 	size += strlen(vfssw[vfsp->vfs_fstype].vsw_name) + 1;
 	size += mntfs_optsize(vfsp);
@@ -219,421 +336,451 @@
 	return (size);
 }
 
+/* Destroy the resources associated with a snapshot element. */
 static void
-mntfs_zonerootvfs(zone_t *zone, vfs_t *rootvfsp)
+mntfs_destroy_elem(mntelem_t *elemp)
 {
-	/*
-	 * Basically copy over the real vfs_t on which the root vnode is
-	 * located, changing its mountpoint and resource to match those of
-	 * the zone's rootpath.
-	 */
-	*rootvfsp = *zone->zone_rootvp->v_vfsp;
-	rootvfsp->vfs_mntpt = refstr_alloc(zone->zone_rootpath);
-	rootvfsp->vfs_resource = rootvfsp->vfs_mntpt;
+	kmem_free(elemp->mnte_text, elemp->mnte_text_size);
+	kmem_free(elemp, sizeof (mntelem_t));
 }
 
-static size_t
-mntfs_zone_len(uint_t *nent_ptr, zone_t *zone, int showhidden)
+/*
+ * Return 1 if the given snapshot is in the range of the given element; return
+ * 0 otherwise.
+ */
+static int
+mntfs_elem_in_range(mntsnap_t *snapp, mntelem_t *elemp)
 {
-	struct vfs *zonelist;
-	struct vfs *vfsp;
-	size_t size = 0;
-	uint_t cnt = 0;
-
-	ASSERT(zone->zone_rootpath != NULL);
+	timespec_t	*stimep = &snapp->mnts_time;
+	timespec_t	*btimep = &elemp->mnte_birth;
+	timespec_t	*dtimep = &elemp->mnte_death;
 
 	/*
-	 * If the zone has a root entry, it will be the first in the list.  If
-	 * it doesn't, we conjure one up.
+	 * If a snapshot is in range of an element then the snapshot must have
+	 * been created after the birth of the element, and either the element
+	 * is still alive or it died after the snapshot was created.
 	 */
-	vfsp = zonelist = zone->zone_vfslist;
-	if (zonelist == NULL ||
-	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
-		vfs_t tvfs;
-		/*
-		 * The root of the zone is not a mount point.  The vfs we want
-		 * to report is that of the zone's root vnode.
-		 */
-		ASSERT(zone != global_zone);
-		mntfs_zonerootvfs(zone, &tvfs);
-		size += mntfs_vfs_len(&tvfs, zone);
-		refstr_rele(tvfs.vfs_mntpt);
-		cnt++;
-	}
-	if (zonelist == NULL)
-		goto out;
-	do {
-		/*
-		 * Skip mounts that should not show up in mnttab
-		 */
-		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
-			vfsp = vfsp->vfs_zone_next;
-			continue;
-		}
-		cnt++;
-		size += mntfs_vfs_len(vfsp, zone);
-		vfsp = vfsp->vfs_zone_next;
-	} while (vfsp != zonelist);
-out:
-	*nent_ptr = cnt;
-	return (size);
+	if (mntfs_newest(btimep, stimep) == MNTFS_SECOND &&
+	    (MNTFS_ELEM_IS_ALIVE(elemp) ||
+	    mntfs_newest(stimep, dtimep) == MNTFS_SECOND))
+		return (1);
+	else
+		return (0);
 }
 
-static size_t
-mntfs_global_len(uint_t *nent_ptr, int showhidden)
+/*
+ * Return the next valid database element, after the one provided, for a given
+ * snapshot; return NULL if none exists. The caller must hold the zone's
+ * database lock as a reader before calling this function.
+ */
+static mntelem_t *
+mntfs_get_next_elem(mntsnap_t *snapp, mntelem_t *elemp)
 {
-	struct vfs *vfsp;
-	size_t size = 0;
-	uint_t cnt = 0;
+	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
 
-	vfsp = rootvfs;
 	do {
-		/*
-		 * Skip mounts that should not show up in mnttab
-		 */
-		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
-			vfsp = vfsp->vfs_next;
-			continue;
-		}
-		cnt++;
-		size += mntfs_vfs_len(vfsp, global_zone);
-		vfsp = vfsp->vfs_next;
-	} while (vfsp != rootvfs);
-	*nent_ptr = cnt;
-	return (size);
+		elemp = elemp->mnte_next;
+	} while (elemp &&
+	    (!mntfs_elem_in_range(snapp, elemp) ||
+	    (!show_hidden && elemp->mnte_hidden)));
+	return (elemp);
 }
 
+/*
+ * This function frees the resources associated with a mntsnap_t. It walks
+ * through the database, decrementing the reference count of any element that
+ * satisfies the snapshot. If the reference count of an element becomes zero
+ * then it is removed from the database.
+ */
 static void
-mntfs_vfs_generate(vfs_t *vfsp, zone_t *zone, struct extmnttab *tab,
-    char **basep, int forread)
+mntfs_freesnap(mntnode_t *mnp, mntsnap_t *snapp)
 {
-	const char *resource, *mntpt;
-	char *cp = *basep;
-
-	mntpt = refstr_value(vfsp->vfs_mntpt);
-	resource = refstr_value(vfsp->vfs_resource);
+	zone_t *zonep = MTOD(mnp)->mnt_zone;
+	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
+	mntelem_t **elempp = &zonep->zone_mntfs_db;
+	mntelem_t *elemp;
+	int show_hidden = snapp->mnts_flags & MNTS_SHOWHIDDEN;
+	size_t number_decremented = 0;
 
-	if (tab)
-		tab->mnt_special = cp;
-	if (resource != NULL && resource[0] != '\0') {
-		if (resource[0] != '/') {
-			cp += snprintf(cp, MAXPATHLEN, "%s", resource);
-		} else if (!ZONE_PATH_VISIBLE(resource, zone)) {
-			/*
-			 * Use the mount point as the resource.
-			 */
-			cp += snprintf(cp, MAXPATHLEN, "%s",
-			    ZONE_PATH_TRANSLATE(mntpt, zone));
-		} else {
-			cp += snprintf(cp, MAXPATHLEN, "%s",
-			    ZONE_PATH_TRANSLATE(resource, zone));
-		}
-	} else {
-		cp += snprintf(cp, MAXPATHLEN, "-");
-	}
-	*cp++ = forread ? '\t' : '\0';
+	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
 
-	if (tab)
-		tab->mnt_mountp = cp;
-	if (mntpt != NULL && mntpt[0] != '\0') {
-		/*
-		 * We know the mount point is visible from within the zone,
-		 * otherwise it wouldn't be on the zone's vfs list.
-		 */
-		cp += snprintf(cp, MAXPATHLEN, "%s",
-		    ZONE_PATH_TRANSLATE(mntpt, zone));
-	} else {
-		cp += snprintf(cp, MAXPATHLEN, "-");
-	}
-	*cp++ = forread ? '\t' : '\0';
+	/* Ignore an uninitialised snapshot. */
+	if (snapp->mnts_nmnts == 0)
+		return;
 
-	if (tab)
-		tab->mnt_fstype = cp;
-	cp += snprintf(cp, MAXPATHLEN, "%s",
-	    vfssw[vfsp->vfs_fstype].vsw_name);
-	*cp++ = forread ? '\t' : '\0';
-
-	if (tab)
-		tab->mnt_mntopts = cp;
-	cp += mntfs_optprint(vfsp, cp);
-	*cp++ = forread ? '\t' : '\0';
+	/* Drop the holds on any matching database elements. */
+	rw_enter(dblockp, RW_WRITER);
+	while ((elemp = *elempp) != NULL) {
+		if (mntfs_elem_in_range(snapp, elemp) &&
+		    (!elemp->mnte_hidden || show_hidden) &&
+		    ++number_decremented && --elemp->mnte_refcnt == 0) {
+			if ((*elempp = elemp->mnte_next) != NULL)
+				(*elempp)->mnte_prev = elemp->mnte_prev;
+			mntfs_destroy_elem(elemp);
+		} else {
+			elempp = &elemp->mnte_next;
+		}
+	}
+	rw_exit(dblockp);
+	ASSERT(number_decremented == snapp->mnts_nmnts);
 
-	if (tab)
-		tab->mnt_time = cp;
-	cp += snprintf(cp, MAX_MNTOPT_STR, "%ld", vfsp->vfs_mtime);
-	*cp++ = forread ? '\n' : '\0';
-
-	if (tab) {
-		tab->mnt_major = getmajor(vfsp->vfs_dev);
-		tab->mnt_minor = getminor(vfsp->vfs_dev);
-	}
-
-	*basep = cp;
+	/* Clear the snapshot data. */
+	bzero(snapp, sizeof (mntsnap_t));
 }
 
+/* Insert the new database element newp after the existing element prevp. */
 static void
-mntfs_zone_generate(zone_t *zone, int showhidden, struct extmnttab *tab,
-    char *basep, int forread)
+mntfs_insert_after(mntelem_t *newp, mntelem_t *prevp)
 {
-	vfs_t *zonelist;
-	vfs_t *vfsp;
-	char *cp = basep;
+	newp->mnte_prev = prevp;
+	newp->mnte_next = prevp->mnte_next;
+	prevp->mnte_next = newp;
+	if (newp->mnte_next != NULL)
+		newp->mnte_next->mnte_prev = newp;
+}
+
+/* Create and return a copy of a given database element. */
+static mntelem_t *
+mntfs_copy(mntelem_t *origp)
+{
+	mntelem_t *copyp;
 
-	/*
-	 * If the zone has a root entry, it will be the first in the list.  If
-	 * it doesn't, we conjure one up.
-	 */
-	vfsp = zonelist = zone->zone_vfslist;
-	if (zonelist == NULL ||
-	    strcmp(refstr_value(vfsp->vfs_mntpt), zone->zone_rootpath) != 0) {
-		vfs_t tvfs;
-		/*
-		 * The root of the zone is not a mount point.  The vfs we want
-		 * to report is that of the zone's root vnode.
-		 */
-		ASSERT(zone != global_zone);
-		mntfs_zonerootvfs(zone, &tvfs);
-		mntfs_vfs_generate(&tvfs, zone, tab, &cp, forread);
-		refstr_rele(tvfs.vfs_mntpt);
-		if (tab)
-			tab++;
-	}
-	if (zonelist == NULL)
-		return;
-	do {
-		/*
-		 * Skip mounts that should not show up in mnttab
-		 */
-		if (!showhidden && (vfsp->vfs_flag & VFS_NOMNTTAB)) {
-			vfsp = vfsp->vfs_zone_next;
-			continue;
-		}
-		mntfs_vfs_generate(vfsp, zone, tab, &cp, forread);
-		if (tab)
-			tab++;
-		vfsp = vfsp->vfs_zone_next;
-	} while (vfsp != zonelist);
+	copyp = kmem_zalloc(sizeof (mntelem_t), KM_SLEEP);
+	copyp->mnte_vfs_ctime = origp->mnte_vfs_ctime;
+	copyp->mnte_text_size = origp->mnte_text_size;
+	copyp->mnte_text = kmem_alloc(copyp->mnte_text_size, KM_SLEEP);
+	bcopy(origp->mnte_text, copyp->mnte_text, copyp->mnte_text_size);
+	copyp->mnte_tab = origp->mnte_tab;
+	copyp->mnte_hidden = origp->mnte_hidden;
+
+	return (copyp);
+}
+
+/*
+ * Compare two database elements and determine whether or not the vfs_t payload
+ * data of each are the same. Return 1 if so and 0 otherwise.
+ */
+static int
+mntfs_is_same_element(mntelem_t *a, mntelem_t *b)
+{
+	if (a->mnte_hidden == b->mnte_hidden &&
+	    a->mnte_text_size == b->mnte_text_size &&
+	    bcmp(a->mnte_text, b->mnte_text, a->mnte_text_size) == 0 &&
+	    bcmp(&a->mnte_tab, &b->mnte_tab, sizeof (struct extmnttab)) == 0)
+		return (1);
+	else
+		return (0);
 }
 
+/*
+ * mntfs_snapshot() updates the database, creating it if necessary, so that it
+ * accurately reflects the state of the in-kernel mnttab. It also increments
+ * the reference count on all database elements that correspond to currently-
+ * mounted resources. Finally, it initialises the appropriate snapshot
+ * structure.
+ *
+ * Each vfs_t is given a high-resolution time stamp, for the benefit of mntfs,
+ * when it is inserted into the in-kernel mnttab. This time stamp is copied into
+ * the corresponding database element when it is created, allowing the element
+ * and the vfs_t to be identified as a pair. It is possible that some file
+ * systems may make unadvertised changes to, for example, a resource's mount
+ * options. Therefore, in order to determine whether a database element is an
+ * up-to-date representation of a given vfs_t, it is compared with a temporary
+ * element generated for this purpose. Although less efficient, this is safer
+ * than implementing an mtime for a vfs_t.
+ *
+ * Some mounted resources are marked as "hidden" with a VFS_NOMNTTAB flag. These
+ * are considered invisible unless the user has already set the MNT_SHOWHIDDEN
+ * flag in the vnode using the MNTIOC_SHOWHIDDEN ioctl.
+ */
 static void
-mntfs_global_generate(int showhidden, struct extmnttab *tab, char *basep,
-    int forread)
-{
-	vfs_t *vfsp;
-	char *cp = basep;
-
-	vfsp = rootvfs;
-	do {
-		/*
-		 * Skip mounts that should not show up in mnttab
-		 */
-		if (!showhidden && vfsp->vfs_flag & VFS_NOMNTTAB) {
-			vfsp = vfsp->vfs_next;
-			continue;
-		}
-		mntfs_vfs_generate(vfsp, global_zone, tab, &cp, forread);
-		if (tab)
-			tab++;
-		vfsp = vfsp->vfs_next;
-	} while (vfsp != rootvfs);
-}
-
-static char *
-mntfs_mapin(char *base, size_t size)
-{
-	size_t rlen = roundup(size, PAGESIZE);
-	struct as *as = curproc->p_as;
-	char *addr = NULL;
-
-	as_rangelock(as);
-	map_addr(&addr, rlen, 0, 1, 0);
-	if (addr == NULL || as_map(as, addr, rlen, segvn_create, zfod_argsp)) {
-		as_rangeunlock(as);
-		return (NULL);
-	}
-	as_rangeunlock(as);
-	if (copyout(base, addr, size)) {
-		(void) as_unmap(as, addr, rlen);
-		return (NULL);
-	}
-	return (addr);
-}
-
-static void
-mntfs_freesnap(mntsnap_t *snap)
+mntfs_snapshot(mntnode_t *mnp, mntsnap_t *snapp)
 {
-	if (snap->mnts_text != NULL)
-		(void) as_unmap(curproc->p_as, snap->mnts_text,
-		    roundup(snap->mnts_textsize, PAGESIZE));
-	snap->mnts_textsize = snap->mnts_count = 0;
-	if (snap->mnts_metadata != NULL)
-		(void) as_unmap(curproc->p_as, snap->mnts_metadata,
-		    roundup(snap->mnts_metasize, PAGESIZE));
-	snap->mnts_metasize = 0;
-}
+	zone_t		*zonep = MTOD(mnp)->mnt_zone;
+	int		is_global_zone = (zonep == global_zone);
+	int		show_hidden = mnp->mnt_flags & MNT_SHOWHIDDEN;
+	vfs_t		*vfsp, *firstvfsp, *lastvfsp;
+	vfs_t		dummyvfs;
+	vfs_t		*dummyvfsp = NULL;
+	krwlock_t	*dblockp = &zonep->zone_mntfs_db_lock;
+	mntelem_t	**headpp = &zonep->zone_mntfs_db;
+	mntelem_t	*elemp;
+	mntelem_t	*prevp = NULL;
+	int		order;
+	mntelem_t	*tempelemp;
+	mntelem_t	*newp;
+	mntelem_t	*firstp = NULL;
+	size_t		nmnts = 0;
+	size_t		text_size = 0;
+	int		insert_before;
+	timespec_t	last_mtime;
+	size_t		entry_length, new_entry_length;
 
-#ifdef _SYSCALL32_IMPL
+
+	ASSERT(RW_WRITE_HELD(&mnp->mnt_contents));
+	vfs_list_read_lock();
+	vfs_mnttab_modtime(&last_mtime);
 
-typedef struct extmnttab32 {
-	uint32_t	mnt_special;
-	uint32_t	mnt_mountp;
-	uint32_t	mnt_fstype;
-	uint32_t	mnt_mntopts;
-	uint32_t	mnt_time;
-	uint_t		mnt_major;
-	uint_t		mnt_minor;
-} extmnttab32_t;
-
-#endif
+	/*
+	 * If this snapshot already exists then we must have been asked to
+	 * rewind the file, i.e. discard the snapshot and create a new one in
+	 * its place. In this case we first see if the in-kernel mnttab has
+	 * advertised a change; if not then we simply reinitialise the metadata.
+	 */
+	if (snapp->mnts_nmnts) {
+		if (mntfs_newest(&last_mtime, &snapp->mnts_last_mtime) ==
+		    MNTFS_NEITHER) {
+			/*
+			 * An unchanged mtime is no guarantee that the
+			 * in-kernel mnttab is unchanged; for example, a
+			 * concurrent remount may be between calls to
+			 * vfs_setmntopt_nolock() and vfs_mnttab_modtimeupd().
+			 * It follows that the database may have changed, and
+			 * in particular that some elements in this snapshot
+			 * may have been killed by another call to
+			 * mntfs_snapshot(). It is therefore not merely
+			 * unnecessary to update the snapshot's time but in
+			 * fact dangerous; it needs to be left alone.
+			 */
+			snapp->mnts_next = snapp->mnts_first;
+			snapp->mnts_flags &= ~MNTS_REWIND;
+			snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
+			vfs_list_unlock();
+			return;
+		} else {
+			mntfs_freesnap(mnp, snapp);
+		}
+	}
 
-/*
- * Snapshot the latest version of the kernel mounted resource information
- *
- * There are two types of snapshots: one destined for reading, and one destined
- * for ioctl().  The difference is that the ioctl() interface is delimited by
- * NULLs, while the read() interface is delimited by tabs and newlines.
- */
-/* ARGSUSED */
-static int
-mntfs_snapshot(mntnode_t *mnp, int forread, int datamodel)
-{
-	size_t size;
-	timespec_t lastmodt;
-	mntdata_t *mntdata = MTOD(mnp);
-	zone_t *zone = mntdata->mnt_zone;
-	boolean_t global_view = (MTOD(mnp)->mnt_zone == global_zone);
-	boolean_t showhidden = ((mnp->mnt_flags & MNT_SHOWHIDDEN) != 0);
-	struct extmnttab *metadata_baseaddr;
-	char *text_baseaddr;
-	int i;
-	mntsnap_t *snap;
+	/*
+	 * Create a temporary database element. For each vfs_t, the temporary
+	 * element will be populated with the corresponding text. If the vfs_t
+	 * does not have a corresponding element within the database, or if
+	 * there is such an element but it is stale, a copy of the temporary
+	 * element is inserted into the database at the appropriate location.
+	 */
+	tempelemp = kmem_alloc(sizeof (mntelem_t), KM_SLEEP);
+	entry_length = MNT_LINE_MAX;
+	tempelemp->mnte_text = kmem_alloc(entry_length, KM_SLEEP);
 
-	if (forread)
-		snap = &mnp->mnt_read;
-	else
-		snap = &mnp->mnt_ioctl;
-
-	vfs_list_read_lock();
-	/*
-	 * Check if the mnttab info has changed since the last snapshot
-	 */
-	vfs_mnttab_modtime(&lastmodt);
-	if (snap->mnts_count &&
-	    lastmodt.tv_sec == snap->mnts_time.tv_sec &&
-	    lastmodt.tv_nsec == snap->mnts_time.tv_nsec) {
-		vfs_list_unlock();
-		return (0);
+	/* Find the first and last vfs_t for the given zone. */
+	if (is_global_zone) {
+		firstvfsp = rootvfs;
+		lastvfsp = firstvfsp->vfs_prev;
+	} else {
+		firstvfsp = zonep->zone_vfslist;
+		/*
+		 * If there isn't already a vfs_t for root then we create a
+		 * dummy which will be used as the head of the list (which will
+		 * therefore no longer be circular).
+		 */
+		if (firstvfsp == NULL ||
+		    strcmp(refstr_value(firstvfsp->vfs_mntpt),
+		    zonep->zone_rootpath) != 0) {
+			/*
+			 * The zone's vfs_ts will have mount points relative to
+			 * the zone's root path. The vfs_t for the zone's
+			 * root file system would therefore have a mount point
+			 * equal to the zone's root path. Since the zone's root
+			 * path isn't a mount point, we copy the vfs_t of the
+			 * zone's root vnode, and provide it with a fake mount
+			 * point and resource.
+			 *
+			 * Note that by cloning another vfs_t we also acquire
+			 * its high-resolution ctime. This might appear to
+			 * violate the requirement that the ctimes in the list
+			 * of vfs_ts are unique and monotonically increasing;
+			 * this is not the case. The dummy vfs_t appears in only
+			 * a non-global zone's vfs_t list, where the cloned
+			 * vfs_t would not ordinarily be visible; the ctimes are
+			 * therefore unique. The zone's root path must be
+			 * available before the zone boots, and so its root
+			 * vnode's vfs_t's ctime must be lower than those of any
+			 * resources subsequently mounted by the zone. The
+			 * ctimes are therefore monotonically increasing.
+			 */
+			dummyvfs = *zonep->zone_rootvp->v_vfsp;
+			dummyvfs.vfs_mntpt = refstr_alloc(zonep->zone_rootpath);
+			dummyvfs.vfs_resource = dummyvfs.vfs_mntpt;
+			dummyvfsp = &dummyvfs;
+			if (firstvfsp == NULL) {
+				lastvfsp = dummyvfsp;
+			} else {
+				lastvfsp = firstvfsp->vfs_zone_prev;
+				dummyvfsp->vfs_zone_next = firstvfsp;
+			}
+			firstvfsp = dummyvfsp;
+		} else {
+			lastvfsp = firstvfsp->vfs_zone_prev;
+		}
 	}
 
+	/*
+	 * Now walk through all the vfs_ts for this zone. For each one, find the
+	 * corresponding database element, creating it first if necessary, and
+	 * increment its reference count.
+	 */
+	rw_enter(dblockp, RW_WRITER);
+	elemp = zonep->zone_mntfs_db;
+	/* CSTYLED */
+	for (vfsp = firstvfsp;;
+	    vfsp = is_global_zone ? vfsp->vfs_next : vfsp->vfs_zone_next) {
+		DTRACE_PROBE1(new__vfs, vfs_t *, vfsp);
+		/* Consider only visible entries. */
+		if ((vfsp->vfs_flag & VFS_NOMNTTAB) == 0 || show_hidden) {
+			/*
+			 * Walk through the existing database looking for either
+			 * an element that matches the current vfs_t, or for the
+			 * correct place in which to insert a new element.
+			 */
+			insert_before = 0;
+			for (; elemp; prevp = elemp, elemp = elemp->mnte_next) {
+				DTRACE_PROBE1(considering__elem, mntelem_t *,
+				    elemp);
 
-	if (snap->mnts_count != 0)
-		mntfs_freesnap(snap);
-	if (global_view)
-		size = mntfs_global_len(&snap->mnts_count, showhidden);
-	else
-		size = mntfs_zone_len(&snap->mnts_count, zone, showhidden);
-	ASSERT(size != 0);
+				/* Compare the vfs_t with the element. */
+				order = mntfs_newest(&elemp->mnte_vfs_ctime,
+				    &vfsp->vfs_hrctime);
+
+				/*
+				 * If we encounter a database element newer than
+				 * this vfs_t then we've stepped over a gap
+				 * where the element for this vfs_t must be
+				 * inserted.
+				 */
+				if (order == MNTFS_FIRST) {
+					insert_before = 1;
+					break;
+				}
+
+				/* Dead elements no longer interest us. */
+				if (MNTFS_ELEM_IS_DEAD(elemp))
+					continue;
 
-	if (!forread)
-		metadata_baseaddr = kmem_alloc(
-		    snap->mnts_count * sizeof (struct extmnttab), KM_SLEEP);
-	else
-		metadata_baseaddr = NULL;
+				/*
+				 * If the time stamps are the same then the
+				 * element is a potential match for the vfs_t,
+				 * although it may later prove to be stale.
+				 */
+				if (order == MNTFS_NEITHER)
+					break;
+
+				/*
+				 * This element must be older than the vfs_t.
+				 * It must, therefore, correspond to a vfs_t
+				 * that has been unmounted. Since the element is
+				 * still alive, we kill it if it is visible.
+				 */
+				if (!elemp->mnte_hidden || show_hidden)
+					vfs_mono_time(&elemp->mnte_death);
+			}
+			DTRACE_PROBE2(possible__match, vfs_t *, vfsp,
+			    mntelem_t *, elemp);
 
-	text_baseaddr = kmem_alloc(size, KM_SLEEP);
+			/* Create a new database element if required. */
+			new_entry_length = mntfs_text_len(vfsp, zonep);
+			if (new_entry_length > entry_length) {
+				kmem_free(tempelemp->mnte_text, entry_length);
+				tempelemp->mnte_text =
+				    kmem_alloc(new_entry_length, KM_SLEEP);
+				entry_length = new_entry_length;
+			}
+			mntfs_populate_text(vfsp, zonep, tempelemp);
+			ASSERT(tempelemp->mnte_text_size == new_entry_length);
+			if (elemp == NULL) {
+				/*
+				 * We ran off the end of the database. Insert a
+				 * new element at the end.
+				 */
+				newp = mntfs_copy(tempelemp);
+				vfs_mono_time(&newp->mnte_birth);
+				if (prevp) {
+					mntfs_insert_after(newp, prevp);
+				} else {
+					newp->mnte_next = NULL;
+					newp->mnte_prev = NULL;
+					ASSERT(*headpp == NULL);
+					*headpp = newp;
+				}
+				elemp = newp;
+			} else if (insert_before) {
+				/*
+				 * Insert a new element before the current one.
+				 */
+				newp = mntfs_copy(tempelemp);
+				vfs_mono_time(&newp->mnte_birth);
+				if (prevp) {
+					mntfs_insert_after(newp, prevp);
+				} else {
+					newp->mnte_next = elemp;
+					newp->mnte_prev = NULL;
+					elemp->mnte_prev = newp;
+					ASSERT(*headpp == elemp);
+					*headpp = newp;
+				}
+				elemp = newp;
+			} else if (!mntfs_is_same_element(elemp, tempelemp)) {
+				/*
+				 * The element corresponds to the vfs_t, but the
+				 * vfs_t has changed; it must have been
+				 * remounted. Kill the old element and insert a
+				 * new one after it.
+				 */
+				vfs_mono_time(&elemp->mnte_death);
+				newp = mntfs_copy(tempelemp);
+				vfs_mono_time(&newp->mnte_birth);
+				mntfs_insert_after(newp, elemp);
+				elemp = newp;
+			}
 
-	if (global_view)
-		mntfs_global_generate(showhidden, metadata_baseaddr,
-		    text_baseaddr, forread);
-	else
-		mntfs_zone_generate(zone, showhidden,
-		    metadata_baseaddr, text_baseaddr, forread);
+			/* We've found the corresponding element. Hold it. */
+			DTRACE_PROBE1(incrementing, mntelem_t *, elemp);
+			elemp->mnte_refcnt++;
 
-	vfs_mnttab_modtime(&snap->mnts_time);
-	vfs_list_unlock();
+			/*
+			 * Update the parameters used to initialise the
+			 * snapshot.
+			 */
+			nmnts++;
+			text_size += elemp->mnte_text_size;
+			if (!firstp)
+				firstp = elemp;
 
-	snap->mnts_text = mntfs_mapin(text_baseaddr, size);
-	snap->mnts_textsize = size;
-	kmem_free(text_baseaddr, size);
+			prevp = elemp;
+			elemp = elemp->mnte_next;
+		}
+
+		if (vfsp == lastvfsp)
+			break;
+	}
 
 	/*
-	 * The pointers in the metadata refer to addreesses in the range
-	 * [base_addr, base_addr + size].  Now that we have mapped the text into
-	 * the user's address space, we have to convert these addresses into the
-	 * new (user) range.  We also handle the conversion for 32-bit and
-	 * 32-bit applications here.
+	 * Any remaining visible database elements that are still alive must be
+	 * killed now, because their corresponding vfs_ts must have been
+	 * unmounted.
 	 */
-	if (!forread) {
-		struct extmnttab *tab;
-#ifdef _SYSCALL32_IMPL
-		struct extmnttab32 *tab32;
-
-		if (datamodel == DATAMODEL_ILP32) {
-			tab = (struct extmnttab *)metadata_baseaddr;
-			tab32 = (struct extmnttab32 *)metadata_baseaddr;
-
-			for (i = 0; i < snap->mnts_count; i++) {
-				tab32[i].mnt_special =
-				    (uintptr_t)snap->mnts_text +
-				    (tab[i].mnt_special - text_baseaddr);
-				tab32[i].mnt_mountp =
-				    (uintptr_t)snap->mnts_text +
-				    (tab[i].mnt_mountp - text_baseaddr);
-				tab32[i].mnt_fstype =
-				    (uintptr_t)snap->mnts_text +
-				    (tab[i].mnt_fstype - text_baseaddr);
-				tab32[i].mnt_mntopts =
-				    (uintptr_t)snap->mnts_text +
-				    (tab[i].mnt_mntopts - text_baseaddr);
-				tab32[i].mnt_time = (uintptr_t)snap->mnts_text +
-				    (tab[i].mnt_time - text_baseaddr);
-				tab32[i].mnt_major = tab[i].mnt_major;
-				tab32[i].mnt_minor = tab[i].mnt_minor;
-			}
-
-			snap->mnts_metasize =
-			    snap->mnts_count * sizeof (struct extmnttab32);
-			snap->mnts_metadata = mntfs_mapin(
-			    (char *)metadata_baseaddr,
-			    snap->mnts_metasize);
-
-		} else {
-#endif
-			tab = (struct extmnttab *)metadata_baseaddr;
-			for (i = 0; i < snap->mnts_count; i++) {
-				tab[i].mnt_special = snap->mnts_text +
-				    (tab[i].mnt_special - text_baseaddr);
-				tab[i].mnt_mountp = snap->mnts_text +
-				    (tab[i].mnt_mountp - text_baseaddr);
-				tab[i].mnt_fstype = snap->mnts_text +
-				    (tab[i].mnt_fstype - text_baseaddr);
-				tab[i].mnt_mntopts = snap->mnts_text +
-				    (tab[i].mnt_mntopts - text_baseaddr);
-				tab[i].mnt_time = snap->mnts_text +
-				    (tab[i].mnt_time - text_baseaddr);
-			}
-
-			snap->mnts_metasize =
-			    snap->mnts_count * sizeof (struct extmnttab);
-			snap->mnts_metadata = mntfs_mapin(
-			    (char *)metadata_baseaddr, snap->mnts_metasize);
-#ifdef _SYSCALL32_IMPL
-		}
-#endif
-
-		kmem_free(metadata_baseaddr,
-		    snap->mnts_count * sizeof (struct extmnttab));
+	for (; elemp; elemp = elemp->mnte_next) {
+		if (MNTFS_ELEM_IS_ALIVE(elemp) &&
+		    (!elemp->mnte_hidden || show_hidden))
+			vfs_mono_time(&elemp->mnte_death);
 	}
 
-	mntdata->mnt_size = size;
+	/* Initialise the snapshot. */
+	vfs_mono_time(&snapp->mnts_time);
+	snapp->mnts_last_mtime = last_mtime;
+	snapp->mnts_first = snapp->mnts_next = firstp;
+	snapp->mnts_flags = show_hidden ? MNTS_SHOWHIDDEN : 0;
+	snapp->mnts_nmnts = nmnts;
+	snapp->mnts_text_size = MTOD(mnp)->mnt_size = text_size;
+	snapp->mnts_foffset = snapp->mnts_ieoffset = 0;
 
-	if (snap->mnts_text == NULL ||
-	    (!forread && snap->mnts_metadata == NULL)) {
-		mntfs_freesnap(snap);
-		return (ENOMEM);
-	}
-	vfs_mnttab_readop();
-	return (0);
+	/* Clean up. */
+	rw_exit(dblockp);
+	vfs_list_unlock();
+	if (dummyvfsp != NULL)
+		refstr_rele(dummyvfsp->vfs_mntpt);
+	kmem_free(tempelemp->mnte_text, entry_length);
+	kmem_free(tempelemp, sizeof (mntelem_t));
 }
 
 /*
@@ -665,7 +812,6 @@
 	*lenp = len;
 }
 
-
 /* ARGSUSED */
 static int
 mntopen(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
@@ -704,8 +850,10 @@
 	if (count > 1)
 		return (0);
 	if (vp->v_count == 1) {
-		mntfs_freesnap(&mnp->mnt_read);
-		mntfs_freesnap(&mnp->mnt_ioctl);
+		rw_enter(&mnp->mnt_contents, RW_WRITER);
+		mntfs_freesnap(mnp, &mnp->mnt_read);
+		mntfs_freesnap(mnp, &mnp->mnt_ioctl);
+		rw_exit(&mnp->mnt_contents);
 		atomic_add_32(&MTOD(mnp)->mnt_nopen, -1);
 	}
 	return (0);
@@ -715,43 +863,27 @@
 static int
 mntread(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cred, caller_context_t *ct)
 {
-	int error = 0;
+	mntnode_t *mnp = VTOM(vp);
+	zone_t *zonep = MTOD(mnp)->mnt_zone;
+	mntsnap_t *snapp = &mnp->mnt_read;
 	off_t off = uio->uio_offset;
 	size_t len = uio->uio_resid;
-	mntnode_t *mnp = VTOM(vp);
-	char *buf;
-	mntsnap_t *snap;
-	int datamodel;
+	char *bufferp;
+	size_t available, copylen;
+	size_t written = 0;
+	mntelem_t *elemp;
+	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
+	int error = 0;
+	off_t	ieoffset;
 
-	rw_enter(&mnp->mnt_contents, RW_READER);
-	snap = &mnp->mnt_read;
-	if (off == (off_t)0 || snap->mnts_count == 0) {
-		/*
-		 * It is assumed that any kernel callers wishing
-		 * to read mnttab will be using extmnttab entries
-		 * and not extmnttab32 entries, whether or not
-		 * the kernel is LP64 or ILP32.  Thus, force the
-		 * datamodel that mntfs_snapshot uses to be
-		 * DATAMODEL_LP64.
-		 */
-		if (uio->uio_segflg == UIO_SYSSPACE)
-			datamodel = DATAMODEL_LP64;
-		else
-			datamodel = get_udatamodel();
-		if (!rw_tryupgrade(&mnp->mnt_contents)) {
-			rw_exit(&mnp->mnt_contents);
-			rw_enter(&mnp->mnt_contents, RW_WRITER);
-		}
-		if ((error = mntfs_snapshot(mnp, 1, datamodel)) != 0) {
-			rw_exit(&mnp->mnt_contents);
-			return (error);
-		}
-		rw_downgrade(&mnp->mnt_contents);
-	}
-	if ((size_t)(off + len) > snap->mnts_textsize)
-		len = snap->mnts_textsize - off;
+	rw_enter(&mnp->mnt_contents, RW_WRITER);
+	if (snapp->mnts_nmnts == 0 || (off == (off_t)0))
+		mntfs_snapshot(mnp, snapp);
 
-	if (off < 0 || len > snap->mnts_textsize) {
+	if ((size_t)(off + len) > snapp->mnts_text_size)
+		len = snapp->mnts_text_size - off;
+
+	if (off < 0 || len > snapp->mnts_text_size) {
 		rw_exit(&mnp->mnt_contents);
 		return (EFAULT);
 	}
@@ -762,23 +894,82 @@
 	}
 
 	/*
-	 * The mnttab image is stored in the user's address space,
-	 * so we have to copy it into the kernel from userland,
-	 * then copy it back out to the specified address.
+	 * For the file offset provided, locate the corresponding database
+	 * element and calculate the corresponding offset within its text. If
+	 * the file offset is the same as that reached during the last read(2)
+	 * then use the saved element and intra-element offset.
 	 */
-	buf = kmem_alloc(len, KM_SLEEP);
-	if (copyin(snap->mnts_text + off, buf, len))
-		error = EFAULT;
-	else {
-		error = uiomove(buf, len, UIO_READ, uio);
+	rw_enter(dblockp, RW_READER);
+	if (off == 0 || (off == snapp->mnts_foffset)) {
+		elemp = snapp->mnts_next;
+		ieoffset = snapp->mnts_ieoffset;
+	} else {
+		off_t total_off;
+		/*
+		 * Find the element corresponding to the requested file offset
+		 * by walking through the database and summing the text sizes
+		 * of the individual elements. If the requested file offset is
+		 * greater than that reached on the last visit then we can start
+		 * at the last seen element; otherwise, we have to start at the
+		 * beginning.
+		 */
+		if (off > snapp->mnts_foffset) {
+			elemp = snapp->mnts_next;
+			total_off = snapp->mnts_foffset - snapp->mnts_ieoffset;
+		} else {
+			elemp = snapp->mnts_first;
+			total_off = 0;
+		}
+		while (off > total_off + elemp->mnte_text_size) {
+			total_off += elemp->mnte_text_size;
+			elemp = mntfs_get_next_elem(snapp, elemp);
+			ASSERT(elemp != NULL);
+		}
+		/* Calculate the intra-element offset. */
+		if (off > total_off)
+			ieoffset = off - total_off;
+		else
+			ieoffset = 0;
 	}
-	kmem_free(buf, len);
+
+	/*
+	 * Create a buffer and populate it with the text from successive
+	 * database elements until it is full.
+	 */
+	bufferp = kmem_alloc(len, KM_SLEEP);
+	while (written < len) {
+		available = elemp->mnte_text_size - ieoffset;
+		copylen = MIN(len - written, available);
+		bcopy(elemp->mnte_text + ieoffset, bufferp + written, copylen);
+		written += copylen;
+		if (copylen == available) {
+			elemp = mntfs_get_next_elem(snapp, elemp);
+			ASSERT(elemp != NULL || written == len);
+			ieoffset = 0;
+		} else {
+			ieoffset += copylen;
+		}
+	}
+	rw_exit(dblockp);
+
+	/*
+	 * Write the populated buffer, update the snapshot's state if
+	 * successful and then advertise our read.
+	 */
+	error = uiomove(bufferp, len, UIO_READ, uio);
+	if (error == 0) {
+		snapp->mnts_next = elemp;
+		snapp->mnts_foffset = off + len;
+		snapp->mnts_ieoffset = ieoffset;
+	}
 	vfs_mnttab_readop();
 	rw_exit(&mnp->mnt_contents);
+
+	/* Clean up. */
+	kmem_free(bufferp, len);
 	return (error);
 }
 
-
 static int
 mntgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
 	caller_context_t *ct)
@@ -791,7 +982,7 @@
 	mntsnap_t *snap;
 
 	rw_enter(&mnp->mnt_contents, RW_READER);
-	snap = mnp->mnt_read.mnts_count ? &mnp->mnt_read : &mnp->mnt_ioctl;
+	snap = mnp->mnt_read.mnts_nmnts ? &mnp->mnt_read : &mnp->mnt_ioctl;
 	/*
 	 * Return all the attributes.  Should be refined
 	 * so that it returns only those asked for.
@@ -801,8 +992,10 @@
 	/*
 	 * Attributes are same as underlying file with modifications
 	 */
-	if (error = VOP_GETATTR(rvp, vap, flags, cr, ct))
+	if (error = VOP_GETATTR(rvp, vap, flags, cr, ct)) {
+		rw_exit(&mnp->mnt_contents);
 		return (error);
+	}
 
 	/*
 	 * We always look like a regular file
@@ -825,7 +1018,7 @@
 	 * If we haven't taken a snapshot yet, set the
 	 * size to the size of the latest snapshot.
 	 */
-	vap->va_size = snap->mnts_textsize ? snap->mnts_textsize :
+	vap->va_size = snap->mnts_text_size ? snap->mnts_text_size :
 	    mntdata->mnt_size;
 	rw_exit(&mnp->mnt_contents);
 	/*
@@ -915,16 +1108,24 @@
 	mntfreenode(mnp);
 }
 
+/*
+ * lseek(2) is supported only to rewind the file. Rewinding has a special
+ * meaning for /etc/mnttab: it forces mntfs to refresh the snapshot at the next
+ * read() or ioctl().
+ *
+ * The generic lseek() code will have already changed the file offset. Therefore
+ * mntread() can detect a rewind simply by looking for a zero offset. For the
+ * benefit of mntioctl() we advertise a rewind with a specific flag.
+ */
 /* ARGSUSED */
 static int
-mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp,
-	caller_context_t *ct)
+mntseek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
 {
 	mntnode_t *mnp = VTOM(vp);
 
 	if (*noffp == 0) {
 		rw_enter(&mnp->mnt_contents, RW_WRITER);
-		VTOM(vp)->mnt_offset = 0;
+		mnp->mnt_ioctl.mnts_flags |= MNTS_REWIND;
 		rw_exit(&mnp->mnt_contents);
 	}
 
@@ -942,14 +1143,14 @@
 	caller_context_t *ct)
 {
 	mntnode_t *mnp = VTOM(vp);
-	mntsnap_t *snap;
+	mntsnap_t *snapp;
 
 	rw_enter(&mnp->mnt_contents, RW_READER);
-	snap = &mnp->mnt_read;
-	if (mnp->mnt_ioctl.mnts_time.tv_sec > snap->mnts_time.tv_sec ||
-	    (mnp->mnt_ioctl.mnts_time.tv_sec == snap->mnts_time.tv_sec &&
-	    mnp->mnt_ioctl.mnts_time.tv_nsec > snap->mnts_time.tv_nsec))
-		snap = &mnp->mnt_ioctl;
+	if (mntfs_newest(&mnp->mnt_ioctl.mnts_last_mtime,
+	    &mnp->mnt_read.mnts_last_mtime) == MNTFS_FIRST)
+		snapp = &mnp->mnt_ioctl;
+	else
+		snapp = &mnp->mnt_read;
 
 	*revp = 0;
 	*phpp = (pollhead_t *)NULL;
@@ -960,7 +1161,7 @@
 		*revp |= POLLRDNORM;
 
 	if (ev & POLLRDBAND) {
-		vfs_mnttab_poll(&snap->mnts_time, phpp);
+		vfs_mnttab_poll(&snapp->mnts_last_mtime, phpp);
 		if (*phpp == (pollhead_t *)NULL)
 			*revp |= POLLRDBAND;
 	}
@@ -978,92 +1179,293 @@
 	*revp = POLLERR;
 	return (0);
 }
+
+/*
+ * mntfs_same_word() returns 1 if two words are the same in the context of
+ * MNTIOC_GETMNTANY and 0 otherwise.
+ *
+ * worda is a memory address that lies somewhere in the buffer bufa; it cannot
+ * be NULL since this is used to indicate to getmntany(3C) that the user does
+ * not wish to match a particular field. The text to which worda points is
+ * supplied by the user; if it is not null-terminated then it cannot match.
+ *
+ * Buffer bufb contains a line from /etc/mnttab, in which the fields are
+ * delimited by tab or new-line characters. offb is the offset of the second
+ * word within this buffer.
+ *
+ * mntfs_same_word() returns 1 if the words are the same and 0 otherwise.
+ */
+int
+mntfs_same_word(char *worda, char *bufa, size_t sizea, off_t offb, char *bufb,
+    size_t sizeb)
+{
+	char *wordb = bufb + offb;
+	int bytes_remaining;
+
+	ASSERT(worda != NULL);
+
+	bytes_remaining = MIN(((bufa + sizea) - worda),
+	    ((bufb + sizeb) - wordb));
+	while (bytes_remaining && *worda == *wordb) {
+		worda++;
+		wordb++;
+		bytes_remaining--;
+	}
+	if (bytes_remaining &&
+	    *worda == '\0' && (*wordb == '\t' || *wordb == '\n'))
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * mntfs_special_info_string() returns which, if either, of VBLK or VCHR
+ * corresponds to a supplied path. If the path is a special device then the
+ * function optionally sets the major and minor numbers.
+ */
+vtype_t
+mntfs_special_info_string(char *path, uint_t *major, uint_t *minor, cred_t *cr)
+{
+	vattr_t vattr;
+	vnode_t *vp;
+	vtype_t type;
+	int error;
+
+	if (path == NULL || *path != '/' ||
+	    lookupnameat(path + 1, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp, rootdir))
+		return (0);
+
+	vattr.va_mask = AT_TYPE | AT_RDEV;
+	error = VOP_GETATTR(vp, &vattr, ATTR_REAL, cr, NULL);
+	VN_RELE(vp);
+
+	if (error == 0 && ((type = vattr.va_type) == VBLK || type == VCHR)) {
+		if (major && minor) {
+			*major = getmajor(vattr.va_rdev);
+			*minor = getminor(vattr.va_rdev);
+		}
+		return (type);
+	} else {
+		return (0);
+	}
+}
+
+/*
+ * mntfs_special_info_element() extracts the name of the mounted resource
+ * for a given element and copies it into a null-terminated string, which it
+ * then passes to mntfs_special_info_string().
+ */
+vtype_t
+mntfs_special_info_element(mntelem_t *elemp, cred_t *cr)
+{
+	char *newpath;
+	vtype_t type;
+
+	newpath = kmem_alloc(elemp->mnte_text_size, KM_SLEEP);
+	bcopy(elemp->mnte_text, newpath, (off_t)(elemp->mnte_tab.mnt_mountp));
+	*(newpath + (off_t)elemp->mnte_tab.mnt_mountp - 1) = '\0';
+	type = mntfs_special_info_string(newpath, NULL, NULL, cr);
+	kmem_free(newpath, elemp->mnte_text_size);
+
+	return (type);
+}
+
+/*
+ * Convert an address that points to a byte within a user buffer into an
+ * address that points to the corresponding offset within a kernel buffer. If
+ * the user address is NULL then make no conversion. If the address does not
+ * lie within the buffer then reset it to NULL.
+ */
+char *
+mntfs_import_addr(char *uaddr, char *ubufp, char *kbufp, size_t bufsize)
+{
+	if (uaddr < ubufp || uaddr >= ubufp + bufsize)
+		return (NULL);
+	else
+		return (kbufp + (uaddr - ubufp));
+}
+
+/*
+ * These 32-bit versions are to support STRUCT_DECL(9F) etc. in
+ * mntfs_copyout_elem() and mntioctl().
+ */
+#ifdef _SYSCALL32_IMPL
+typedef struct extmnttab32 {
+	uint32_t	mnt_special;
+	uint32_t	mnt_mountp;
+	uint32_t	mnt_fstype;
+	uint32_t	mnt_mntopts;
+	uint32_t	mnt_time;
+	uint_t		mnt_major;
+	uint_t		mnt_minor;
+} extmnttab32_t;
+
+typedef struct mnttab32 {
+	uint32_t	mnt_special;
+	uint32_t	mnt_mountp;
+	uint32_t	mnt_fstype;
+	uint32_t	mnt_mntopts;
+	uint32_t	mnt_time;
+} mnttab32_t;
+
+struct mntentbuf32 {
+	uint32_t	mbuf_emp;
+	uint_t		mbuf_bufsize;
+	uint32_t	mbuf_buf;
+};
+#endif
+
+/*
+ * mntfs_copyout_elem() is common code for the MNTIOC_GETMNTENT,
+ * MNTIOC_GETEXTMNTENT and MNTIOC_GETMNTANY ioctls. Having identified the
+ * database element desired by the user, this function copies out the text and
+ * the pointers to the relevant userland addresses. It returns 0 on success
+ * and non-zero otherwise.
+ */
+int
+mntfs_copyout_elem(mntelem_t *elemp, struct extmnttab *uemp,
+    char *ubufp, int cmd, int datamodel)
+{
+		STRUCT_DECL(extmnttab, ktab);
+		char *dbbufp = elemp->mnte_text;
+		size_t dbbufsize = elemp->mnte_text_size;
+		struct extmnttab *dbtabp = &elemp->mnte_tab;
+		size_t ssize;
+		char *kbufp;
+		int error = 0;
+
+
+		/*
+		 * We create a struct extmnttab within the kernel of the size
+		 * determined by the user's data model. We then populate its
+		 * fields by combining the start address of the text buffer
+		 * supplied by the user, ubufp, with the offsets stored for
+		 * this database element within dbtabp, a pointer to a struct
+		 * extmnttab.
+		 *
+		 * Note that if the corresponding field is "-" this signifies
+		 * no real content, and we set the address to NULL. This does
+		 * not apply to mnt_time.
+		 */
+		STRUCT_INIT(ktab, datamodel);
+		STRUCT_FSETP(ktab, mnt_special,
+		    MNTFS_REAL_FIELD(dbbufp) ? ubufp : NULL);
+		STRUCT_FSETP(ktab, mnt_mountp,
+		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mountp) ?
+		    ubufp + (off_t)dbtabp->mnt_mountp : NULL);
+		STRUCT_FSETP(ktab, mnt_fstype,
+		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_fstype) ?
+		    ubufp + (off_t)dbtabp->mnt_fstype : NULL);
+		STRUCT_FSETP(ktab, mnt_mntopts,
+		    MNTFS_REAL_FIELD(dbbufp + (off_t)dbtabp->mnt_mntopts) ?
+		    ubufp + (off_t)dbtabp->mnt_mntopts : NULL);
+		STRUCT_FSETP(ktab, mnt_time,
+		    ubufp + (off_t)dbtabp->mnt_time);
+		if (cmd == MNTIOC_GETEXTMNTENT) {
+			STRUCT_FSETP(ktab, mnt_major, dbtabp->mnt_major);
+			STRUCT_FSETP(ktab, mnt_minor, dbtabp->mnt_minor);
+			ssize = SIZEOF_STRUCT(extmnttab, datamodel);
+		} else {
+			ssize = SIZEOF_STRUCT(mnttab, datamodel);
+		}
+		if (copyout(STRUCT_BUF(ktab), uemp, ssize))
+			return (EFAULT);
+
+		/*
+		 * We create a text buffer in the kernel into which we copy the
+		 * /etc/mnttab entry for this element. We change the tab and
+		 * new-line delimiters to null bytes before copying out the
+		 * buffer.
+		 */
+		kbufp = kmem_alloc(dbbufsize, KM_SLEEP);
+		bcopy(elemp->mnte_text, kbufp, dbbufsize);
+		*(kbufp + (off_t)dbtabp->mnt_mountp - 1) =
+		    *(kbufp + (off_t)dbtabp->mnt_fstype - 1) =
+		    *(kbufp + (off_t)dbtabp->mnt_mntopts - 1) =
+		    *(kbufp + (off_t)dbtabp->mnt_time - 1) =
+		    *(kbufp + dbbufsize - 1) = '\0';
+		if (copyout(kbufp, ubufp, dbbufsize))
+			error = EFAULT;
+
+		kmem_free(kbufp, dbbufsize);
+		return (error);
+}
+
 /* ARGSUSED */
 static int
-mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag,
-	cred_t *cr, int *rvalp, caller_context_t *ct)
+mntioctl(struct vnode *vp, int cmd, intptr_t arg, int flag, cred_t *cr,
+    int *rvalp, caller_context_t *ct)
 {
 	uint_t *up = (uint_t *)arg;
 	mntnode_t *mnp = VTOM(vp);
-	mntsnap_t *snap;
-	int error;
+	mntsnap_t *snapp = &mnp->mnt_ioctl;
+	int error = 0;
+	zone_t *zonep = MTOD(mnp)->mnt_zone;
+	krwlock_t *dblockp = &zonep->zone_mntfs_db_lock;
+	model_t datamodel = flag & DATAMODEL_MASK;
 
-	error = 0;
-	rw_enter(&mnp->mnt_contents, RW_READER);
-	snap = &mnp->mnt_ioctl;
 	switch (cmd) {
 
-	case MNTIOC_NMNTS: {		/* get no. of mounted resources */
-		if (snap->mnts_count == 0) {
+	case MNTIOC_NMNTS:  		/* get no. of mounted resources */
+	{
+		rw_enter(&mnp->mnt_contents, RW_READER);
+		if (snapp->mnts_nmnts == 0 ||
+		    (snapp->mnts_flags & MNTS_REWIND)) {
 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
 				rw_exit(&mnp->mnt_contents);
 				rw_enter(&mnp->mnt_contents, RW_WRITER);
 			}
-			if ((error =
-			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
-			    != 0) {
-				rw_exit(&mnp->mnt_contents);
-				return (error);
-			}
-			rw_downgrade(&mnp->mnt_contents);
+			if (snapp->mnts_nmnts == 0 ||
+			    (snapp->mnts_flags & MNTS_REWIND))
+				mntfs_snapshot(mnp, snapp);
 		}
-		if (suword32(up, snap->mnts_count) != 0)
+		rw_exit(&mnp->mnt_contents);
+
+		if (suword32(up, snapp->mnts_nmnts) != 0)
 			error = EFAULT;
 		break;
 	}
 
-	case MNTIOC_GETDEVLIST: {	/* get mounted device major/minor nos */
+	case MNTIOC_GETDEVLIST:  	/* get mounted device major/minor nos */
+	{
+		size_t len;
 		uint_t *devlist;
-		int i;
-		size_t len;
+		mntelem_t *elemp;
+		int i = 0;
 
-		if (snap->mnts_count == 0) {
+		rw_enter(&mnp->mnt_contents, RW_READER);
+		if (snapp->mnts_nmnts == 0 ||
+		    (snapp->mnts_flags & MNTS_REWIND)) {
 			if (!rw_tryupgrade(&mnp->mnt_contents)) {
 				rw_exit(&mnp->mnt_contents);
 				rw_enter(&mnp->mnt_contents, RW_WRITER);
 			}
-			if ((error =
-			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
-			    != 0) {
-				rw_exit(&mnp->mnt_contents);
-				return (error);
-			}
+			if (snapp->mnts_nmnts == 0 ||
+			    (snapp->mnts_flags & MNTS_REWIND))
+				mntfs_snapshot(mnp, snapp);
 			rw_downgrade(&mnp->mnt_contents);
 		}
 
-		len = 2 * snap->mnts_count * sizeof (uint_t);
+		/* Create a local buffer to hold the device numbers. */
+		len = 2 * snapp->mnts_nmnts * sizeof (uint_t);
 		devlist = kmem_alloc(len, KM_SLEEP);
-		for (i = 0; i < snap->mnts_count; i++) {
-
-#ifdef _SYSCALL32_IMPL
-			if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
-				struct extmnttab32 tab;
-
-				if ((error = xcopyin(snap->mnts_text +
-				    i * sizeof (struct extmnttab32), &tab,
-				    sizeof (tab))) != 0)
-					break;
 
-				devlist[i*2] = tab.mnt_major;
-				devlist[i*2+1] = tab.mnt_minor;
-			} else {
-#endif
-				struct extmnttab tab;
+		/*
+		 * Walk the database elements for this snapshot and add their
+		 * major and minor numbers.
+		 */
+		rw_enter(dblockp, RW_READER);
+		for (elemp = snapp->mnts_first; elemp;
+		    elemp = mntfs_get_next_elem(snapp, elemp)) {
+				devlist[2 * i] = elemp->mnte_tab.mnt_major;
+				devlist[2 * i + 1] = elemp->mnte_tab.mnt_minor;
+				i++;
+		}
+		rw_exit(dblockp);
+		ASSERT(i == snapp->mnts_nmnts);
+		rw_exit(&mnp->mnt_contents);
 
-				if ((error = xcopyin(snap->mnts_text +
-				    i * sizeof (struct extmnttab), &tab,
-				    sizeof (tab))) != 0)
-					break;
-
-				devlist[i*2] = tab.mnt_major;
-				devlist[i*2+1] = tab.mnt_minor;
-#ifdef _SYSCALL32_IMPL
-			}
-#endif
-		}
-
-		if (error == 0)
-			error = xcopyout(devlist, up, len);
+		error = xcopyout(devlist, up, len);
 		kmem_free(devlist, len);
 		break;
 	}
@@ -1128,54 +1530,251 @@
 		break;
 	}
 
-	case MNTIOC_GETMNTENT:
+	case MNTIOC_GETMNTANY:
 	{
-		size_t idx;
-		uintptr_t addr;
+		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
+		STRUCT_DECL(extmnttab, ktab);	/* Our copy of user's emp */
+		struct extmnttab *uemp;		/* uaddr of user's emp */
+		char *ubufp;			/* uaddr of user's text buf */
+		size_t ubufsize;		/* size of the above */
+		struct extmnttab preftab;	/* our version of user's emp */
+		char *prefbuf;			/* our copy of user's text */
+		mntelem_t *elemp;		/* a database element */
+		struct extmnttab *dbtabp;	/* element's extmnttab */
+		char *dbbufp;			/* element's text buf */
+		size_t dbbufsize;		/* size of the above */
+		vtype_t type;			/* type, if any, of special */
 
-		if (!rw_tryupgrade(&mnp->mnt_contents)) {
-			rw_exit(&mnp->mnt_contents);
-			rw_enter(&mnp->mnt_contents, RW_WRITER);
+
+		/*
+		 * embuf is a struct mntentbuf within the kernel. We copy into
+		 * it the struct mntentbuf supplied by the user.
+		 */
+		STRUCT_INIT(embuf, datamodel);
+		if (copyin((void *) arg, STRUCT_BUF(embuf),
+		    STRUCT_SIZE(embuf))) {
+			error = EFAULT;
+			break;
 		}
-		idx = mnp->mnt_offset;
-		if (snap->mnts_count == 0 || idx == 0) {
-			if ((error =
-			    mntfs_snapshot(mnp, 0, flag & DATAMODEL_MASK))
-			    != 0) {
-				rw_exit(&mnp->mnt_contents);
-				return (error);
-			}
+		uemp = STRUCT_FGETP(embuf, mbuf_emp);
+		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
+		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
+
+		/*
+		 * Check that the text buffer offered by the user is the
+		 * agreed size.
+		 */
+		if (ubufsize != MNT_LINE_MAX) {
+			error = EINVAL;
+			break;
 		}
-		/*
-		 * If the next index is beyond the end of the current mnttab,
-		 * return EOF
-		 */
-		if (idx >= snap->mnts_count) {
-			*rvalp = 1;
-			rw_exit(&mnp->mnt_contents);
-			return (0);
+
+		/* Copy the user-supplied entry into a local buffer. */
+		prefbuf = kmem_alloc(MNT_LINE_MAX, KM_SLEEP);
+		if (copyin(ubufp, prefbuf, MNT_LINE_MAX)) {
+			kmem_free(prefbuf, MNT_LINE_MAX);
+			error = EFAULT;
+			break;
+		}
+
+		/* Ensure that any string within it is null-terminated. */
+		*(prefbuf + MNT_LINE_MAX - 1) = 0;
+
+		/* Copy in the user-supplied mpref */
+		STRUCT_INIT(ktab, datamodel);
+		if (copyin(uemp, STRUCT_BUF(ktab),
+		    SIZEOF_STRUCT(mnttab, datamodel))) {
+			kmem_free(prefbuf, MNT_LINE_MAX);
+			error = EFAULT;
+			break;
 		}
 
-#ifdef _SYSCALL32_IMPL
-		if ((flag & DATAMODEL_MASK) == DATAMODEL_ILP32) {
-			addr = (uintptr_t)(snap->mnts_metadata + idx *
-			    sizeof (struct extmnttab32));
-			error = suword32((void *)arg, addr);
-		} else {
-#endif
-			addr = (uintptr_t)(snap->mnts_metadata + idx *
-			    sizeof (struct extmnttab));
-			error = sulword((void *)arg, addr);
-#ifdef _SYSCALL32_IMPL
-		}
-#endif
+		/*
+		 * Copy the members of the user's pref struct into a local
+		 * struct. The pointers need to be offset and verified to
+		 * ensure that they lie within the bounds of the buffer.
+		 */
+		preftab.mnt_special = mntfs_import_addr(STRUCT_FGETP(ktab,
+		    mnt_special), ubufp, prefbuf, MNT_LINE_MAX);
+		preftab.mnt_mountp = mntfs_import_addr(STRUCT_FGETP(ktab,
+		    mnt_mountp), ubufp, prefbuf, MNT_LINE_MAX);
+		preftab.mnt_fstype = mntfs_import_addr(STRUCT_FGETP(ktab,
+		    mnt_fstype), ubufp, prefbuf, MNT_LINE_MAX);
+		preftab.mnt_mntopts = mntfs_import_addr(STRUCT_FGETP(ktab,
+		    mnt_mntopts), ubufp, prefbuf, MNT_LINE_MAX);
+		preftab.mnt_time = mntfs_import_addr(STRUCT_FGETP(ktab,
+		    mnt_time), ubufp, prefbuf, MNT_LINE_MAX);
+
+		/*
+		 * If the user specifies a mounted resource that is a special
+		 * device then we capture its mode and major and minor numbers;
+		 * c.f. the block comment below.
+		 */
+		type = mntfs_special_info_string(preftab.mnt_special,
+		    &preftab.mnt_major, &preftab.mnt_minor, cr);
+
+		rw_enter(&mnp->mnt_contents, RW_WRITER);
+		if (snapp->mnts_nmnts == 0 ||
+		    (snapp->mnts_flags & MNTS_REWIND))
+			mntfs_snapshot(mnp, snapp);
 
-		if (error != 0) {
-			rw_exit(&mnp->mnt_contents);
-			return (error);
+		/*
+		 * This is the core functionality that implements getmntany().
+		 * We walk through the mntfs database until we find an element
+		 * matching the user's preferences that are contained in
+		 * preftab. Typically, this means checking that the text
+		 * matches. However, the mounted resource is special: if the
+		 * user is looking for a special device then we must find a
+		 * database element with the same major and minor numbers and
+		 * the same type, i.e. VBLK or VCHR. The type is not recorded
+		 * in the element because it cannot be inferred from the vfs_t.
+		 * We therefore check the type of suitable candidates via
+		 * mntfs_special_info_element(); since this calls into the
+		 * underlying file system we make sure to drop the database lock
+		 * first.
+		 */
+		elemp = snapp->mnts_next;
+		rw_enter(dblockp, RW_READER);
+		for (;;) {
+			for (; elemp; elemp = mntfs_get_next_elem(snapp,
+			    elemp)) {
+				dbtabp = &elemp->mnte_tab;
+				dbbufp = elemp->mnte_text;
+				dbbufsize = elemp->mnte_text_size;
+
+				if (((type &&
+				    dbtabp->mnt_major == preftab.mnt_major &&
+				    dbtabp->mnt_minor == preftab.mnt_minor &&
+				    MNTFS_REAL_FIELD(dbbufp)) ||
+				    (!type && (!preftab.mnt_special ||
+				    mntfs_same_word(preftab.mnt_special,
+				    prefbuf, MNT_LINE_MAX, (off_t)0, dbbufp,
+				    dbbufsize)))) &&
+
+				    (!preftab.mnt_mountp || mntfs_same_word(
+				    preftab.mnt_mountp, prefbuf, MNT_LINE_MAX,
+				    (off_t)dbtabp->mnt_mountp, dbbufp,
+				    dbbufsize)) &&
+
+				    (!preftab.mnt_fstype || mntfs_same_word(
+				    preftab.mnt_fstype, prefbuf, MNT_LINE_MAX,
+				    (off_t)dbtabp->mnt_fstype, dbbufp,
+				    dbbufsize)) &&
+
+				    (!preftab.mnt_mntopts || mntfs_same_word(
+				    preftab.mnt_mntopts, prefbuf, MNT_LINE_MAX,
+				    (off_t)dbtabp->mnt_mntopts, dbbufp,
+				    dbbufsize)) &&
+
+				    (!preftab.mnt_time || mntfs_same_word(
+				    preftab.mnt_time, prefbuf, MNT_LINE_MAX,
+				    (off_t)dbtabp->mnt_time, dbbufp,
+				    dbbufsize)))
+					break;
+			}
+			rw_exit(dblockp);
+
+			if (elemp == NULL || type == 0 ||
+			    type == mntfs_special_info_element(elemp, cr))
+				break;
+
+			rw_enter(dblockp, RW_READER);
+			elemp = mntfs_get_next_elem(snapp, elemp);
 		}
 
-		mnp->mnt_offset++;
+		kmem_free(prefbuf, MNT_LINE_MAX);
+
+		/* If we failed to find a match then return EOF. */
+		if (elemp == NULL) {
+			rw_exit(&mnp->mnt_contents);
+			*rvalp = MNTFS_EOF;
+			break;
+		}
+
+		/*
+		 * Check that the text buffer offered by the user will be large
+		 * enough to accommodate the text for this entry.
+		 */
+		if (elemp->mnte_text_size > MNT_LINE_MAX) {
+			rw_exit(&mnp->mnt_contents);
+			*rvalp = MNTFS_TOOLONG;
+			break;
+		}
+
+		/*
+		 * Populate the user's struct mnttab and text buffer using the
+		 * element's contents.
+		 */
+		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
+			error = EFAULT;
+		} else {
+			rw_enter(dblockp, RW_READER);
+			elemp = mntfs_get_next_elem(snapp, elemp);
+			rw_exit(dblockp);
+			snapp->mnts_next = elemp;
+		}
+		rw_exit(&mnp->mnt_contents);
+		break;
+	}
+
+	case MNTIOC_GETMNTENT:
+	case MNTIOC_GETEXTMNTENT:
+	{
+		STRUCT_DECL(mntentbuf, embuf);	/* Our copy of user's embuf */
+		struct extmnttab *uemp;		/* uaddr of user's emp */
+		char *ubufp;			/* uaddr of user's text buf */
+		size_t ubufsize;		/* size of the above */
+		mntelem_t *elemp;		/* a database element */
+
+
+		rw_enter(&mnp->mnt_contents, RW_WRITER);
+		if (snapp->mnts_nmnts == 0 ||
+		    (snapp->mnts_flags & MNTS_REWIND))
+			mntfs_snapshot(mnp, snapp);
+		if ((elemp = snapp->mnts_next) == NULL) {
+			rw_exit(&mnp->mnt_contents);
+			*rvalp = MNTFS_EOF;
+			break;
+		}
+
+		/*
+		 * embuf is a struct mntentbuf within the kernel. We copy into
+		 * it the struct mntentbuf supplied by the user.
+		 */
+		STRUCT_INIT(embuf, datamodel);
+		if (copyin((void *) arg, STRUCT_BUF(embuf),
+		    STRUCT_SIZE(embuf))) {
+			rw_exit(&mnp->mnt_contents);
+			error = EFAULT;
+			break;
+		}
+		uemp = STRUCT_FGETP(embuf, mbuf_emp);
+		ubufp = STRUCT_FGETP(embuf, mbuf_buf);
+		ubufsize = STRUCT_FGET(embuf, mbuf_bufsize);
+
+		/*
+		 * Check that the text buffer offered by the user will be large
+		 * enough to accommodate the text for this entry.
+		 */
+		if (elemp->mnte_text_size > ubufsize) {
+			rw_exit(&mnp->mnt_contents);
+			*rvalp = MNTFS_TOOLONG;
+			break;
+		}
+
+		/*
+		 * Populate the user's struct mnttab and text buffer using the
+		 * element's contents.
+		 */
+		if (mntfs_copyout_elem(elemp, uemp, ubufp, cmd, datamodel)) {
+			error = EFAULT;
+		} else {
+			rw_enter(dblockp, RW_READER);
+			elemp = mntfs_get_next_elem(snapp, elemp);
+			rw_exit(dblockp);
+			snapp->mnts_next = elemp;
+		}
+		rw_exit(&mnp->mnt_contents);
 		break;
 	}
 
@@ -1184,7 +1783,6 @@
 		break;
 	}
 
-	rw_exit(&mnp->mnt_contents);
 	return (error);
 }
 
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/fs/vfs.c
--- a/usr/src/uts/common/fs/vfs.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/fs/vfs.c	Thu Oct 29 17:10:37 2009 -0700
@@ -2964,6 +2964,28 @@
 	}
 }
 
+/* Store in *ts a unique and monotonically-increasing timestamp. */
+void
+vfs_mono_time(timespec_t *ts)
+{
+	static volatile hrtime_t hrt;		/* The saved time. */
+	hrtime_t	newhrt, oldhrt;		/* For effecting the CAS. */
+	timespec_t	newts;
+
+	gethrestime(&newts);
+	newhrt = ts2hrt(&newts);
+	do {
+		oldhrt = hrt;
+		/*
+		 * Never publish a time that is not later than the saved one:
+		 * if the clock has not advanced, fabricate the next value.
+		 */
+		if (newhrt <= oldhrt)
+			newhrt = oldhrt + 1;
+	} while (cas64((uint64_t *)&hrt, oldhrt, newhrt) != oldhrt);
+	hrt2ts(newhrt, ts);
+}
+
 /*
  * Update the mnttab modification time and wake up any waiters for
  * mnttab changes
@@ -2971,21 +2993,11 @@
 void
 vfs_mnttab_modtimeupd()
 {
-	hrtime_t oldhrt, newhrt;
-
 	ASSERT(RW_WRITE_HELD(&vfslist));
-	oldhrt = ts2hrt(&vfs_mnttab_mtime);
-	gethrestime(&vfs_mnttab_mtime);
-	newhrt = ts2hrt(&vfs_mnttab_mtime);
-	if (oldhrt == (hrtime_t)0)
+	vfs_mono_time(&vfs_mnttab_mtime);
+	/* If this is our first visit then let this be the creation time. */
+	if (vfs_mnttab_ctime.tv_sec == 0 && vfs_mnttab_ctime.tv_nsec == 0)
 		vfs_mnttab_ctime = vfs_mnttab_mtime;
-	/*
-	 * Attempt to provide unique mtime (like uniqtime but not).
-	 */
-	if (newhrt == oldhrt) {
-		newhrt++;
-		hrt2ts(newhrt, &vfs_mnttab_mtime);
-	}
 	pollwakeup(&vfs_pollhd, (short)POLLRDBAND);
 	vfs_mnttab_writeop();
 }
@@ -3492,6 +3504,18 @@
 	zone_t *zone;
 
 	/*
+	 * Typically, the vfs_t will have been created on behalf of the file
+	 * system in vfs_init, where it will have been provided with a
+	 * vfs_impl_t. This, however, might be lacking if the vfs_t was created
+	 * by an unbundled file system. We therefore check for such an example
+	 * before stamping the vfs_t with its creation time for the benefit of
+	 * mntfs.
+	 */
+	if (vfsp->vfs_implp == NULL)
+		vfsimpl_setup(vfsp);
+	vfs_mono_time(&vfsp->vfs_hrctime);
+
+	/*
 	 * The zone that owns the mount is the one that performed the mount.
 	 * Note that this isn't necessarily the same as the zone mounted into.
 	 * The corresponding zone_rele() will be done when the vfs_t is
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/os/zone.c
--- a/usr/src/uts/common/os/zone.c	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/os/zone.c	Thu Oct 29 17:10:37 2009 -0700
@@ -647,11 +647,11 @@
 		zsd_apply_all_zones(zsd_apply_create, key);
 	}
 	/*
-	* It is safe for consumers to use the key now, make it
-	* globally visible. Specifically zone_getspecific() will
-	* always successfully return the zone specific data associated
-	* with the key.
-	*/
+	 * It is safe for consumers to use the key now, make it
+	 * globally visible. Specifically zone_getspecific() will
+	 * always successfully return the zone specific data associated
+	 * with the key.
+	 */
 	*keyp = key;
 
 }
@@ -1937,6 +1937,11 @@
 	rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
 	label_hold(l_admin_low);
 
+	/*
+	 * Initialise the lock for the database structure used by mntfs.
+	 */
+	rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
+
 	mutex_enter(&zonehash_lock);
 	zone_uniqid(&zone0);
 	ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
@@ -2042,6 +2047,7 @@
 	mutex_destroy(&zone->zone_lock);
 	cv_destroy(&zone->zone_cv);
 	rw_destroy(&zone->zone_mlps.mlpl_rwlock);
+	rw_destroy(&zone->zone_mntfs_db_lock);
 	kmem_free(zone, sizeof (zone_t));
 }
 
@@ -3813,6 +3819,7 @@
 	list_create(&zone->zone_dl_list, sizeof (zone_dl_t),
 	    offsetof(zone_dl_t, zdl_linkage));
 	rw_init(&zone->zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL);
+	rw_init(&zone->zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
 
 	if (flags & ZCF_NET_EXCL) {
 		zone->zone_flags |= ZF_NET_EXCL;
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/sys/fs/mntdata.h
--- a/usr/src/uts/common/sys/fs/mntdata.h	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/sys/fs/mntdata.h	Thu Oct 29 17:10:37 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -28,28 +28,46 @@
 
 #include <sys/vnode.h>
 #include <sys/poll.h>
+#include <sys/mnttab.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+typedef struct mntelem {
+	/* Metadata. */
+	struct mntelem 	*mnte_next;
+	struct mntelem 	*mnte_prev;
+	timespec_t	mnte_birth;
+	timespec_t	mnte_death;	/* Non-zero iff element is dead. */
+	timespec_t	mnte_vfs_ctime;
+	int		mnte_refcnt;
+	/* Payload. */
+	int		mnte_hidden;
+	char		*mnte_text;	/* Text for this entry. */
+	size_t		mnte_text_size;	/* Size of mnte_text. */
+	struct extmnttab mnte_tab;
+} mntelem_t;
+
 typedef struct mntsnap {
-	char	*mnts_text;	/* base address of text in user addr space */
-	size_t	mnts_textsize;	/* size of mapped text */
-	char	*mnts_metadata;	/* base address of metadata in user space */
-	size_t	mnts_metasize;	/* size of mapped metadata */
-	uint_t	mnts_count;	/* number of mounts in snapshot */
-	timespec_t mnts_time;	/* time when snapshot was taken */
+	timespec_t mnts_time;		/* Time of this snapshot. */
+	timespec_t mnts_last_mtime;	/* mnttab modification time. */
+	mntelem_t *mnts_first;		/* First element in this snapshot. */
+	mntelem_t *mnts_next;		/* Next element to use. */
+	int mnts_flags;			/* flags; see below. */
+	size_t mnts_nmnts;		/* # of elements in this snapshot. */
+	size_t mnts_text_size;		/* Text size for this snapshot. */
+	size_t mnts_foffset;		/* File offset of last read(). */
+	size_t mnts_ieoffset;		/* Offset of last read() in element. */
 } mntsnap_t;
 
 typedef struct mntnode {
-	uint_t mnt_flags;	/* flags */
+	krwlock_t mnt_contents;	/* protects mnt_read and mnt_ioctl */
+	uint_t mnt_flags;	/* flags; see below */
 	vnode_t *mnt_mountvp;	/* vnode mounted on */
 	vnode_t *mnt_vnode;	/* vnode for this mntnode */
 	mntsnap_t mnt_read;	/* Data for read() */
 	mntsnap_t mnt_ioctl;	/* Data for ioctl() */
-	uint_t mnt_offset;	/* offset within ioctl() snapshot */
-	krwlock_t mnt_contents;	/* protects mnt_read, mnt_ioctl, mnt_offset */
 } mntnode_t;
 
 struct zone;
@@ -68,12 +86,21 @@
 #define	MTOV(pnp)	((pnp)->mnt_vnode)
 #define	MTOD(pnp)	((struct mntdata *)MTOV(pnp)->v_vfsp->vfs_data)
 
+#define	MNTFS_ELEM_IS_DEAD(x)	((x)->mnte_death.tv_sec || \
+				(x)->mnte_death.tv_nsec)
+#define	MNTFS_ELEM_IS_ALIVE(x)	(!MNTFS_ELEM_IS_DEAD(x))
+
 #if defined(_KERNEL)
 
 /*
- * Values for mnt_flags.
+ * Values for a mntsnap_t's mnts_flags.
  */
-#define	MNT_SHOWHIDDEN	0x1	/* Hack to show all mounts, even MS_NOMNTTAB */
+#define	MNTS_SHOWHIDDEN	0x1	/* This snapshot contains hidden mounts. */
+#define	MNTS_REWIND	0x2	/* This snapshot must be refreshed. */
+/*
+ * Values for a mntnode_t's mnt_flags.
+ */
+#define	MNT_SHOWHIDDEN	0x1	/* Include MS_NOMNTTAB mounts in snapshots. */
 
 extern	struct vnodeops	*mntvnodeops;
 extern	void mntfs_getmntopts(struct vfs *, char **, size_t *);
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/sys/mntio.h
--- a/usr/src/uts/common/sys/mntio.h	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/sys/mntio.h	Thu Oct 29 17:10:37 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_MNTIO_H
 #define	_SYS_MNTIO_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #ifdef	__cplusplus
 extern "C" {
 #endif
@@ -43,6 +40,15 @@
 #define	MNTIOC_CLRTAG		(MNTIOC|4)	/* Clear a tag from a fs */
 #define	MNTIOC_SHOWHIDDEN	(MNTIOC|6)	/* private */
 #define	MNTIOC_GETMNTENT	(MNTIOC|7)	/* private */
+#define	MNTIOC_GETEXTMNTENT	(MNTIOC|8)	/* private */
+#define	MNTIOC_GETMNTANY	(MNTIOC|9)	/* private */
+
+/*
+ * Private mntfs return codes
+ */
+#define	MNTFS_EOF	1
+#define	MNTFS_TOOLONG	2
+
 
 #define	MAX_MNTOPT_TAG	64	/* Maximum size for a mounted file system tag */
 
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/sys/mnttab.h
--- a/usr/src/uts/common/sys/mnttab.h	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/sys/mnttab.h	Thu Oct 29 17:10:37 2009 -0700
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -24,15 +23,13 @@
 
 
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_MNTTAB_H
 #define	_SYS_MNTTAB_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/types.h>
 
 #ifdef	__cplusplus
@@ -53,6 +50,14 @@
 
 #define	putmntent(fd, mp)	(-1)
 
+/*
+ * The fields in struct extmnttab should match those in struct mnttab until new
+ * fields are encountered. This allows hasmntopt(), getmntent_common() and
+ * mntioctl() to cast one type to the other safely.
+ *
+ * The fields in struct mnttab, struct extmnttab and struct mntentbuf must all
+ * match those in the corresponding 32-bit versions defined in mntvnops.c.
+ */
 struct mnttab {
 	char	*mnt_special;
 	char	*mnt_mountp;
@@ -61,11 +66,6 @@
 	char	*mnt_time;
 };
 
-/*
- * NOTE: fields in extmnttab should match struct mnttab till new fields
- * are encountered, this allows hasmntopt to work properly when its arg is
- * a pointer to an extmnttab struct cast to a mnttab struct pointer.
- */
 struct extmnttab {
 	char	*mnt_special;
 	char	*mnt_mountp;
@@ -76,6 +76,12 @@
 	uint_t	mnt_minor;
 };
 
+struct mntentbuf {
+	struct	extmnttab *mbuf_emp;	/* caller's entry structure */
+	size_t 	mbuf_bufsize;		/* size of mbuf_buf, in bytes */
+	char	*mbuf_buf;		/* caller's text buffer */
+};
+
 #if !defined(_KERNEL)
 #ifdef __STDC__
 extern void	resetmnttab(FILE *);
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/sys/vfs.h
--- a/usr/src/uts/common/sys/vfs.h	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/sys/vfs.h	Thu Oct 29 17:10:37 2009 -0700
@@ -41,6 +41,7 @@
 #include <sys/statvfs.h>
 #include <sys/refstr.h>
 #include <sys/avl.h>
+#include <sys/time.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -187,6 +188,8 @@
 	vsk_anchor_t	*vi_vskap;		/* anchor for vopstats' kstat */
 	vopstats_t	*vi_fstypevsp;		/* ptr to per-fstype vopstats */
 	vopstats_t	vi_vopstats;		/* per-mount vnode op stats */
+
+	timespec_t	vi_hrctime; 		/* High-res creation time */
 } vfs_impl_t;
 
 
@@ -258,6 +261,7 @@
 #define	vfs_vskap	vfs_implp->vi_vskap
 #define	vfs_fstypevsp	vfs_implp->vi_fstypevsp
 #define	vfs_vopstats	vfs_implp->vi_vopstats
+#define	vfs_hrctime	vfs_implp->vi_hrctime
 
 /*
  * VFS flags.
diff -r 11fc80bc5cb9 -r 951a65b3846b usr/src/uts/common/sys/zone.h
--- a/usr/src/uts/common/sys/zone.h	Thu Oct 29 18:18:00 2009 -0400
+++ b/usr/src/uts/common/sys/zone.h	Thu Oct 29 17:10:37 2009 -0700
@@ -36,6 +36,7 @@
 #include <sys/cred.h>
 #include <sys/netstack.h>
 #include <sys/uadmin.h>
+#include <sys/ksynch.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -435,6 +436,11 @@
 	 * Solaris Auditing per-zone audit context
 	 */
 	struct au_kcontext	*zone_audit_kctxt;
+	/*
+	 * For private use by mntfs.
+	 */
+	struct mntelem	*zone_mntfs_db;
+	krwlock_t	zone_mntfs_db_lock;
 } zone_t;
 
 /*