changeset 9981:b4907297e740

6775100 stat() performance on files on zfs should be improved 6827779 rrwlock is overly protective of its counters
author Tim Haley <Tim.Haley@Sun.COM>
date Sat, 27 Jun 2009 19:22:00 -0600
parents 13d7f3eec672
children 4174e5b637ba
files usr/src/uts/common/fs/zfs/rrwlock.c usr/src/uts/common/fs/zfs/sys/zfs_acl.h usr/src/uts/common/fs/zfs/sys/zfs_znode.h usr/src/uts/common/fs/zfs/zfs_acl.c usr/src/uts/common/fs/zfs/zfs_vnops.c usr/src/uts/common/fs/zfs/zfs_znode.c
diffstat 6 files changed, 200 insertions(+), 58 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/uts/common/fs/zfs/rrwlock.c	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/rrwlock.c	Sat Jun 27 19:22:00 2009 -0600
@@ -19,12 +19,10 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/refcount.h>
 #include <sys/rrwlock.h>
 
@@ -118,7 +116,7 @@
 	rrw_node_t *prev = NULL;
 
 	if (refcount_count(&rrl->rr_linked_rcount) == 0)
-		return (NULL);
+		return (B_FALSE);
 
 	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
 		if (rn->rn_rrl == rrl) {
@@ -159,6 +157,14 @@
 rrw_enter_read(rrwlock_t *rrl, void *tag)
 {
 	mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+	if (!rrl->rr_writer && !rrl->rr_writer_wanted) {
+		rrl->rr_anon_rcount.rc_count++;
+		mutex_exit(&rrl->rr_lock);
+		return;
+	}
+	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
 	ASSERT(rrl->rr_writer != curthread);
 	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
 
@@ -208,19 +214,28 @@
 rrw_exit(rrwlock_t *rrl, void *tag)
 {
 	mutex_enter(&rrl->rr_lock);
+#if !defined(DEBUG) && defined(_KERNEL)
+	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+		rrl->rr_anon_rcount.rc_count--;
+		if (rrl->rr_anon_rcount.rc_count == 0)
+			cv_broadcast(&rrl->rr_cv);
+		mutex_exit(&rrl->rr_lock);
+		return;
+	}
+	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
 	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
 	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
 	    rrl->rr_writer != NULL);
 
 	if (rrl->rr_writer == NULL) {
-		if (rrn_find_and_remove(rrl)) {
-			if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0)
-				cv_broadcast(&rrl->rr_cv);
-
-		} else {
-			if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0)
-				cv_broadcast(&rrl->rr_cv);
-		}
+		int64_t count;
+		if (rrn_find_and_remove(rrl))
+			count = refcount_remove(&rrl->rr_linked_rcount, tag);
+		else
+			count = refcount_remove(&rrl->rr_anon_rcount, tag);
+		if (count == 0)
+			cv_broadcast(&rrl->rr_cv);
 	} else {
 		ASSERT(rrl->rr_writer == curthread);
 		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h	Sat Jun 27 19:22:00 2009 -0600
@@ -203,6 +203,7 @@
 void zfs_ace_byteswap(void *, size_t, boolean_t);
 extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr);
 extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *);
+int zfs_fastaccesschk_execute(struct znode *, cred_t *);
 extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *);
 extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *);
 extern int zfs_acl_access(struct znode *, int, cred_t *);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Sat Jun 27 19:22:00 2009 -0600
@@ -77,6 +77,7 @@
 #define	ZFS_ACL_DEFAULTED	0x20		/* ACL should be defaulted */
 #define	ZFS_ACL_AUTO_INHERIT	0x40		/* ACL should be inherited */
 #define	ZFS_BONUS_SCANSTAMP	0x80		/* Scanstamp in bonus area */
+#define	ZFS_NO_EXECS_DENIED	0x100		/* exec was given to everyone */
 
 /*
  * Is ID ephemeral?
@@ -200,6 +201,7 @@
 	uint64_t	z_gen;		/* generation (same as zp_gen) */
 	uint32_t	z_sync_cnt;	/* synchronous open count */
 	kmutex_t	z_acl_lock;	/* acl data lock */
+	zfs_acl_t	*z_acl_cached;	/* cached acl */
 	list_node_t	z_link_node;	/* all znodes in fs link */
 	/*
 	 * These are dmu managed fields.
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c	Sat Jun 27 19:22:00 2009 -0600
@@ -781,6 +781,7 @@
 	uint64_t	who;
 	uint16_t	iflags, type;
 	uint32_t	access_mask;
+	boolean_t	an_exec_denied = B_FALSE;
 
 	mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
 
@@ -905,8 +906,26 @@
 					}
 				}
 			}
+		} else {
+			/*
+			 * Only care if this IDENTIFIER_GROUP or
+			 * USER ACE denies execute access to someone,
+			 * mode is not affected
+			 */
+			if ((access_mask & ACE_EXECUTE) && type == DENY)
+				an_exec_denied = B_TRUE;
 		}
 	}
+
+	if (!an_exec_denied && !(seen & (S_IXUSR | S_IXGRP | S_IXOTH)) ||
+	    !(mode & (S_IXUSR | S_IXGRP | S_IXOTH)))
+		an_exec_denied = B_TRUE;
+
+	if (an_exec_denied)
+		zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED;
+	else
+		zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED;
+
 	return (mode);
 }
 
@@ -960,8 +979,14 @@
 
 	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
 
+	if (zp->z_acl_cached) {
+		*aclpp = zp->z_acl_cached;
+		return (0);
+	}
+
 	if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
 		*aclpp = zfs_acl_node_read_internal(zp, will_modify);
+		zp->z_acl_cached = *aclpp;
 		return (0);
 	}
 
@@ -994,7 +1019,7 @@
 		return (error);
 	}
 
-	*aclpp = aclp;
+	zp->z_acl_cached = *aclpp = aclp;
 	return (0);
 }
 
@@ -1019,6 +1044,11 @@
 
 	dmu_buf_will_dirty(zp->z_dbuf, tx);
 
+	if (zp->z_acl_cached != aclp && zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
 	zphys->zp_mode = zfs_mode_compute(zp, aclp);
 
 	/*
@@ -1606,6 +1636,7 @@
 	if (error == 0) {
 		(*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS;
 		zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp);
+		zp->z_acl_cached = *aclp;
 	}
 	mutex_exit(&zp->z_acl_lock);
 	mutex_exit(&zp->z_lock);
@@ -1869,7 +1900,6 @@
 			mutex_exit(&dzp->z_acl_lock);
 			acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
 			    vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
-			zfs_acl_free(paclp);
 		} else {
 			acl_ids->z_aclp =
 			    zfs_acl_alloc(zfs_acl_version_zp(dzp));
@@ -1998,8 +2028,6 @@
 
 	mutex_exit(&zp->z_acl_lock);
 
-	zfs_acl_free(aclp);
-
 	return (0);
 }
 
@@ -2095,11 +2123,6 @@
 		aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS);
 	}
 top:
-	if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) {
-		zfs_acl_free(aclp);
-		return (error);
-	}
-
 	mutex_enter(&zp->z_lock);
 	mutex_enter(&zp->z_acl_lock);
 
@@ -2154,7 +2177,7 @@
 
 	if (fuidp)
 		zfs_fuid_info_free(fuidp);
-	zfs_acl_free(aclp);
+	zp->z_acl_cached = aclp;
 	dmu_tx_commit(tx);
 done:
 	mutex_exit(&zp->z_acl_lock);
@@ -2301,7 +2324,6 @@
 					checkit = B_TRUE;
 				break;
 			} else {
-				zfs_acl_free(aclp);
 				mutex_exit(&zp->z_acl_lock);
 				return (EIO);
 			}
@@ -2334,7 +2356,6 @@
 	}
 
 	mutex_exit(&zp->z_acl_lock);
-	zfs_acl_free(aclp);
 
 	/* Put the found 'denies' back on the working mode */
 	if (deny_mask) {
@@ -2420,6 +2441,72 @@
 	    check_privs, B_FALSE, cr));
 }
 
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+	boolean_t owner = B_FALSE;
+	boolean_t groupmbr = B_FALSE;
+	boolean_t is_attr;
+	uid_t fowner;
+	uid_t gowner;
+	uid_t uid = crgetuid(cr);
+	int error;
+
+	if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED)
+		return (EACCES);
+
+	is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) &&
+	    (ZTOV(zdp)->v_type == VDIR));
+	if (is_attr)
+		goto slow;
+
+	mutex_enter(&zdp->z_acl_lock);
+
+	if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) {
+		mutex_exit(&zdp->z_acl_lock);
+		return (0);
+	}
+
+	if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 ||
+	    FUID_INDEX(zdp->z_phys->zp_gid) != 0) {
+		mutex_exit(&zdp->z_acl_lock);
+		goto slow;
+	}
+
+	fowner = (uid_t)zdp->z_phys->zp_uid;
+	gowner = (uid_t)zdp->z_phys->zp_gid;
+
+	if (uid == fowner) {
+		owner = B_TRUE;
+		if (zdp->z_phys->zp_mode & S_IXUSR) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		}
+	}
+	if (groupmember(gowner, cr)) {
+		groupmbr = B_TRUE;
+		if (zdp->z_phys->zp_mode & S_IXGRP) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		}
+	}
+	if (!owner && !groupmbr) {
+		if (zdp->z_phys->zp_mode & S_IXOTH) {
+			mutex_exit(&zdp->z_acl_lock);
+			return (0);
+		}
+	}
+
+	mutex_exit(&zdp->z_acl_lock);
+
+slow:
+	DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+	ZFS_ENTER(zdp->z_zfsvfs);
+	error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+	ZFS_EXIT(zdp->z_zfsvfs);
+	return (error);
+}
+
 /*
  * Determine whether Access should be granted/denied, invoking least
  * priv subsytem when a deny is determined.
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Sat Jun 27 19:22:00 2009 -0600
@@ -988,6 +988,27 @@
 }
 
 /*
+ * If vnode is for a device return a specfs vnode instead.
+ */
+static int
+specvp_check(vnode_t **vpp, cred_t *cr)
+{
+	int error = 0;
+
+	if (IS_DEVVP(*vpp)) {
+		struct vnode *svp;
+
+		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
+		VN_RELE(*vpp);
+		if (svp == NULL)
+			error = ENOSYS;
+		*vpp = svp;
+	}
+	return (error);
+}
+
+
+/*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
@@ -1017,7 +1038,46 @@
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
-	int	error;
+	int	error = 0;
+
+	/* fast path */
+	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+		if (dvp->v_type != VDIR) {
+			return (ENOTDIR);
+		} else if (zdp->z_dbuf == NULL) {
+			return (EIO);
+		}
+
+		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+			error = zfs_fastaccesschk_execute(zdp, cr);
+			if (!error) {
+				*vpp = dvp;
+				VN_HOLD(*vpp);
+				return (0);
+			}
+			return (error);
+		} else {
+			vnode_t *tvp = dnlc_lookup(dvp, nm);
+
+			if (tvp) {
+				error = zfs_fastaccesschk_execute(zdp, cr);
+				if (error) {
+					VN_RELE(tvp);
+					return (error);
+				}
+				if (tvp == DNLC_NO_VNODE) {
+					VN_RELE(tvp);
+					return (ENOENT);
+				} else {
+					*vpp = tvp;
+					return (specvp_check(vpp, cr));
+				}
+			}
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zdp);
@@ -1082,21 +1142,8 @@
 	}
 
 	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
-	if (error == 0) {
-		/*
-		 * Convert device special files
-		 */
-		if (IS_DEVVP(*vpp)) {
-			vnode_t	*svp;
-
-			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-			VN_RELE(*vpp);
-			if (svp == NULL)
-				error = ENOSYS;
-			else
-				*vpp = svp;
-		}
-	}
+	if (error == 0)
+		error = specvp_check(vpp, cr);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
@@ -1332,19 +1379,7 @@
 			VN_RELE(ZTOV(zp));
 	} else {
 		*vpp = ZTOV(zp);
-		/*
-		 * If vnode is for a device return a specfs vnode instead.
-		 */
-		if (IS_DEVVP(*vpp)) {
-			struct vnode *svp;
-
-			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
-			VN_RELE(*vpp);
-			if (svp == NULL) {
-				error = ENOSYS;
-			}
-			*vpp = svp;
-		}
+		error = specvp_check(vpp, cr);
 	}
 
 	ZFS_EXIT(zfsvfs);
@@ -2456,6 +2491,7 @@
 top:
 	attrzp = NULL;
 
+	/* Can this be moved to before the top label? */
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		ZFS_EXIT(zfsvfs);
 		return (EROFS);
@@ -2856,11 +2892,6 @@
 	if (attrzp)
 		VN_RELE(ZTOV(attrzp));
 
-	if (aclp) {
-		zfs_acl_free(aclp);
-		aclp = NULL;
-	}
-
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Fri Jun 26 17:26:34 2009 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Sat Jun 27 19:22:00 2009 -0600
@@ -133,6 +133,7 @@
 
 	zp->z_dbuf = NULL;
 	zp->z_dirlocks = NULL;
+	zp->z_acl_cached = NULL;
 	return (0);
 }
 
@@ -1081,6 +1082,11 @@
 	list_remove(&zfsvfs->z_all_znodes, zp);
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
 	kmem_cache_free(znode_cache, zp);
 
 	VFS_RELE(zfsvfs->z_vfs);