changeset 923:78f6e60ae914

PSARC 2004/422 posix_fallocate 4517427 All filesystems need a way to ftruncate/mmap a file with disk-space reservation
author sdebnath
date Tue, 15 Nov 2005 08:00:52 -0800
parents f5c8f1a2c9e3
children 0366bf445df6
files usr/src/cmd/fs.d/ufs/fsck/pass1.c usr/src/cmd/fs.d/ufs/mkfs/mkfs.c usr/src/head/fcntl.h usr/src/lib/librt/Makefile.com usr/src/lib/librt/common/fallocate.c usr/src/lib/librt/spec/rt.spec usr/src/lib/librt/spec/versions usr/src/uts/common/fs/ufs/ufs_alloc.c usr/src/uts/common/fs/ufs/ufs_bmap.c usr/src/uts/common/fs/ufs/ufs_directio.c usr/src/uts/common/fs/ufs/ufs_extvnops.c usr/src/uts/common/fs/ufs/ufs_lockfs.c usr/src/uts/common/fs/ufs/ufs_trans.c usr/src/uts/common/fs/ufs/ufs_vfsops.c usr/src/uts/common/fs/ufs/ufs_vnops.c usr/src/uts/common/sys/fcntl.h usr/src/uts/common/sys/fs/ufs_inode.h usr/src/uts/common/sys/fs/ufs_lockfs.h usr/src/uts/common/sys/fs/ufs_trans.h usr/src/uts/common/syscall/fcntl.c
diffstat 20 files changed, 947 insertions(+), 199 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/fs.d/ufs/fsck/pass1.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/cmd/fs.d/ufs/fsck/pass1.c	Tue Nov 15 08:00:52 2005 -0800
@@ -625,6 +625,14 @@
 	daddr32_t fragno = idesc->id_blkno;
 	struct dinode *dp;
 
+	/*
+	 * If this is a fallocate'd file, block numbers may be stored
+	 * as negative. In that case negate the negative numbers.
+	 */
+	dp = ginode(idesc->id_number);
+	if (dp->di_cflags & IFALLOCATE && fragno < 0)
+		fragno = -fragno;
+
 	if ((anyout = chkrange(fragno, idesc->id_numfrags)) != 0) {
 		/*
 		 * Note that blkerror() exits when preening.
--- a/usr/src/cmd/fs.d/ufs/mkfs/mkfs.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/cmd/fs.d/ufs/mkfs/mkfs.c	Tue Nov 15 08:00:52 2005 -0800
@@ -207,6 +207,11 @@
 #define	RC_KEYWORD	1
 #define	RC_POSITIONAL	2
 
+/*
+ * Block address that marks a hole; it is never negated like fallocate'd ones.
+ */
+#define	UFS_HOLE	(-1)
+
 #ifndef	STANDALONE
 #include	<stdio.h>
 #include	<sys/mnttab.h>
@@ -4399,8 +4404,15 @@
 		frags   = dbtofsb(&sblock, dp->di_blocks);
 
 		checkdirect((ino_t)i, &frags, &dp->di_db[0], NDADDR+NIADDR);
-		for (j = 0; j < NIADDR && frags; ++j)
-			checkindirect((ino_t)i, &frags, dp->di_ib[j], j);
+		for (j = 0; j < NIADDR && frags; ++j) {
+			/* Negate the block if it's an fallocate'd block */
+			if (dp->di_ib[j] < 0 && dp->di_ib[j] != UFS_HOLE)
+				checkindirect((ino_t)i, &frags,
+				    -(dp->di_ib[j]), j);
+			else
+				checkindirect((ino_t)i, &frags,
+				    dp->di_ib[j], j);
+		}
 	}
 }
 
--- a/usr/src/head/fcntl.h	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/head/fcntl.h	Tue Nov 15 08:00:52 2005 -0800
@@ -77,6 +77,9 @@
 #ifdef __PRAGMA_REDEFINE_EXTNAME
 #pragma redefine_extname	open	open64
 #pragma redefine_extname	creat	creat64
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+#pragma redefine_extname	posix_fallocate posix_fallocate64
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) ||  ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 #pragma redefine_extname	openat	openat64
@@ -85,6 +88,9 @@
 #else
 #define	open			open64
 #define	creat			creat64
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+#define	posix_fallocate		posix_fallocate64
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) ||  ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 #define	openat			openat64
@@ -97,6 +103,9 @@
 #ifdef __PRAGMA_REDEFINE_EXTNAME
 #pragma	redefine_extname	open64	open
 #pragma	redefine_extname	creat64	creat
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+#pragma redefine_extname	posix_fallocate64 posix_fallocate
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) ||  ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 #pragma	redefine_extname	openat64	openat
@@ -105,6 +114,9 @@
 #else
 #define	open64				open
 #define	creat64				creat
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+#define	posix_fallocate64		posix_fallocate
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) ||  ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 #define	openat64			openat
@@ -118,6 +130,9 @@
 extern int fcntl(int, int, ...);
 extern int open(const char *, int, ...);
 extern int creat(const char *, mode_t);
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+extern int posix_fallocate(int fd, off_t offset, off_t len);
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 extern int openat(int, const char *, int, ...);
@@ -132,6 +147,9 @@
 	    !defined(__PRAGMA_REDEFINE_EXTNAME))
 extern int open64(const char *, int, ...);
 extern int creat64(const char *, mode_t);
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+extern int posix_fallocate64(int fd, off64_t offset, off64_t len);
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 extern int openat64(int, const char *, int, ...);
@@ -144,6 +162,9 @@
 extern int fcntl();
 extern int open();
 extern int creat();
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+extern int posix_fallocate();
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 extern int openat();
@@ -159,6 +180,9 @@
 	    !defined(__PRAGMA_REDEFINE_EXTNAME))
 extern int open64();
 extern int creat64();
+#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX)
+extern int posix_fallocate64();
+#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */
 #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \
 	defined(_ATFILE_SOURCE)
 extern int openat64();
--- a/usr/src/lib/librt/Makefile.com	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/lib/librt/Makefile.com	Tue Nov 15 08:00:52 2005 -0800
@@ -20,7 +20,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 # ident	"%Z%%M%	%I%	%E% SMI"
@@ -33,6 +33,7 @@
 OBJECTS=	\
 	aio.o		\
 	clock_timer.o	\
+	fallocate.o	\
 	fdatasync.o	\
 	mqueue.o	\
 	pos4.o		\
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/lib/librt/common/fallocate.c	Tue Nov 15 08:00:52 2005 -0800
@@ -0,0 +1,72 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include "pos4.h"
+
+#include <stdio.h>
+
+/* Pre-allocate file space via fcntl(F_ALLOCSP); returns 0 or an error number. */
+int
+posix_fallocate(int fd, off_t offset, off_t len)
+{
+	struct flock lck;
+
+	lck.l_whence = 0;	/* l_start is an absolute file offset */
+	lck.l_start = offset;
+	lck.l_len = len;
+	lck.l_type = F_WRLCK;
+
+	/* POSIX wants the error number returned, not -1 with errno set. */
+	if (fcntl(fd, F_ALLOCSP, &lck) == -1)
+		return (errno);
+	return (0);
+}
+
+#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64)
+
+/* Large-file twin of posix_fallocate(); returns 0 or an error number. */
+int
+posix_fallocate64(int fd, off64_t offset, off64_t len)
+{
+	struct flock64 lck;
+
+	lck.l_whence = 0;	/* l_start is an absolute file offset */
+	lck.l_start = offset;
+	lck.l_len = len;
+	lck.l_type = F_WRLCK;
+
+	/* POSIX wants the error number returned, not -1 with errno set. */
+	if (fcntl(fd, F_ALLOCSP64, &lck) == -1)
+		return (errno);
+	return (0);
+}
+
+#endif
--- a/usr/src/lib/librt/spec/rt.spec	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/lib/librt/spec/rt.spec	Tue Nov 15 08:00:52 2005 -0800
@@ -1,6 +1,3 @@
-#
-# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
 #
 # CDDL HEADER START
 #
@@ -22,6 +19,10 @@
 #
 # CDDL HEADER END
 #
+#
+# Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
 # ident	"%Z%%M%	%I%	%E% SMI"
 #
 # lib/librt/spec/rt.spec
@@ -76,6 +77,13 @@
 errno		EAGAIN EINTR ENOSYS
 end
 
+function	posix_fallocate
+include		<fcntl.h>
+declaration	int posix_fallocate(int fd, off_t offset, off_t len)
+version		SUNW_1.5
+errno		EBADF EFBIG EINTR EINVAL EIO ENODEV ENOSPC ESPIPE
+end
+
 function	fdatasync
 include		<unistd.h>
 declaration	int fdatasync(int fildes)
@@ -157,6 +165,12 @@
 version		SUNW_1.3
 end
 
+function	posix_fallocate64 extends librt/spec/rt.spec posix_fallocate
+declaration	int posix_fallocate64(int fd, off64_t offset, off64_t len)
+arch		i386 sparc
+version		SUNW_1.5
+end
+
 function	mq_close
 include		<mqueue.h>
 declaration	int mq_close(mqd_t mqdes)
--- a/usr/src/lib/librt/spec/versions	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/lib/librt/spec/versions	Tue Nov 15 08:00:52 2005 -0800
@@ -26,6 +26,7 @@
 #
 
 sparc {
+	SUNW_1.5:	{SUNW_1.4};
 	SUNW_1.4:	{SUNW_1.3};
 	SUNW_1.3:	{SUNW_1.2};
 	SUNW_1.2:	{SUNW_1.1};
@@ -34,6 +35,7 @@
 	SUNWprivate_1.1;
 }
 i386 {
+	SUNW_1.5:	{SUNW_1.4};
 	SUNW_1.4:	{SUNW_1.3};
 	SUNW_1.3:	{SUNW_1.2};
 	SUNW_1.2:	{SUNW_1.1};
@@ -42,6 +44,7 @@
 	SUNWprivate_1.1;
 }
 sparcv9 {
+	SUNW_1.5:	{SUNW_1.4};
 	SUNW_1.4:	{SUNW_1.3};
 	SUNW_1.3:	{SUNW_1.2};
 	SUNW_1.2:	{SUNW_1.1};
@@ -50,6 +53,7 @@
 	SUNWprivate_1.1;
 }
 amd64 {
+	SUNW_1.5:	{SUNW_1.4};
 	SUNW_1.4:	{SUNW_1.3};
 	SUNW_1.3:	{SUNW_1.2};
 	SUNW_1.2:	{SUNW_1.1};
--- a/usr/src/uts/common/fs/ufs/ufs_alloc.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_alloc.c	Tue Nov 15 08:00:52 2005 -0800
@@ -40,6 +40,7 @@
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
+#include <sys/condvar_impl.h>
 #include <sys/types.h>
 #include <sys/t_lock.h>
 #include <sys/debug.h>
@@ -116,9 +117,9 @@
 	ufsvfsp = ip->i_ufsvfs;
 	fs = ufsvfsp->vfs_fs;
 	if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
-		err = ufs_fault(ITOV(ip),
-	    "alloc: bad size, dev = 0x%lx, bsize = %d, size = %d, fs = %s\n",
-	    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
+		err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
+		    " bsize = %d, size = %d, fs = %s\n",
+		    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
 		return (err);
 	}
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
@@ -194,9 +195,9 @@
 	if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
 	    (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
 		err = ufs_fault(ITOV(ip),
-	"realloccg: bad size, dev=0x%lx, bsize=%d, osize=%d, nsize=%d, fs=%s\n",
-		    ip->i_dev, fs->fs_bsize, osize, nsize,
-		    fs->fs_fsmnt);
+		    "realloccg: bad size, dev=0x%lx, bsize=%d, "
+		    "osize=%d, nsize=%d, fs=%s\n",
+		    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
 		return (err);
 	}
 	if (freespace(fs, ufsvfsp) <= 0 &&
@@ -204,8 +205,8 @@
 		goto nospace;
 	if (bprev == 0) {
 		err = ufs_fault(ITOV(ip),
-	"realloccg: bad bprev, dev = 0x%lx, bsize = %d, bprev = %ld, fs = %s\n",
-		    ip->i_dev, fs->fs_bsize, bprev,
+		    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
+		    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
 		    fs->fs_fsmnt);
 		return (err);
 	}
@@ -403,9 +404,9 @@
 
 		if (ip->i_size) {
 			cmn_err(CE_WARN,
-			"%s: free inode %d had size 0x%llx, run fsck(1M)%s",
-			fs->fs_fsmnt, (int)ino, ip->i_size,
-			(TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
+			    "%s: free inode %d had size 0x%llx, run fsck(1M)%s",
+			    fs->fs_fsmnt, (int)ino, ip->i_size,
+			    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
 		}
 		/*
 		 * Clear any garbage left behind.
@@ -581,16 +582,27 @@
 	 * next block is requested contiguously, otherwise it is
 	 * requested rotationally delayed by fs_rotdelay milliseconds.
 	 */
-	nextblk = bap[indx - 1] + fs->fs_frag;
-	if (indx > fs->fs_maxcontig &&
-	    bap[indx - fs->fs_maxcontig] + blkstofrags(fs, fs->fs_maxcontig)
-	    != nextblk)
+
+	nextblk = bap[indx - 1];
+	/*
+	 * Provision for fallocate to return positive
+	 * blk preference based on last allocation
+	 */
+	if (nextblk < 0 && nextblk != UFS_HOLE) {
+		nextblk = (-bap[indx - 1]) + fs->fs_frag;
+	} else {
+		nextblk = bap[indx - 1] + fs->fs_frag;
+	}
+
+	if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
+	    blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
 		return (nextblk);
+	}
 	if (fs->fs_rotdelay != 0)
 		/*
 		 * Here we convert ms of delay to frags as:
 		 * (frags) = (ms) * (rev/sec) * (sect/rev) /
-		 *	((sect/frag) * (ms/sec))
+		 * 	((sect/frag) * (ms/sec))
 		 * then round up to the next block.
 		 */
 		nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
@@ -621,16 +633,25 @@
 	short *blks;
 	daddr_t blkno, cylno, rpos;
 
+	/*
+	 * fallocate'd files will have negative block address.
+	 * So negate it again to get original block address.
+	 */
+	if (bno < 0 && bno % fs->fs_bsize == 0 && bno != UFS_HOLE) {
+		bno = -bno;
+	}
+
 	if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
 		(void) ufs_fault(ITOV(ip),
-		"free: bad size, dev = 0x%lx, bsize = %d, size = %d, fs = %s\n",
-		    ip->i_dev, fs->fs_bsize, (int)size, fs->fs_fsmnt);
+		    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
+		    "fs = %s\n", ip->i_dev, fs->fs_bsize,
+		    (int)size, fs->fs_fsmnt);
 		return;
 	}
 	cg = dtog(fs, bno);
 	ASSERT(!ufs_badblock(ip, bno));
 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
-		    (int)fs->fs_cgsize);
+	    (int)fs->fs_cgsize);
 
 	cgp = bp->b_un.b_cg;
 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
@@ -770,7 +791,7 @@
 	}
 	cg = (int)itog(fs, ino);
 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
-		    (int)fs->fs_cgsize);
+	    (int)fs->fs_cgsize);
 
 	cgp = bp->b_un.b_cg;
 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
@@ -785,9 +806,9 @@
 		mutex_exit(&ufsvfsp->vfs_lock);
 		brelse(bp);
 		(void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
-				    "mode: (imode) %o, (omode) %o, ino:%d, "
-				    "fs:%s",
-				    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
+		    "mode: (imode) %o, (omode) %o, ino:%d, "
+		    "fs:%s",
+		    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
 		return;
 	}
 	clrbit(iused, inot);
@@ -889,7 +910,7 @@
 	}
 
 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
-		    (int)fs->fs_cgsize);
+	    (int)fs->fs_cgsize);
 	cgp = bp->b_un.b_cg;
 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 		brelse(bp);
@@ -963,7 +984,7 @@
 	if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
 		return (0);
 	bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
-		    (int)fs->fs_cgsize);
+	    (int)fs->fs_cgsize);
 
 	cgp = bp->b_un.b_cg;
 	if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
@@ -1162,8 +1183,8 @@
 
 		if (fs_postbl(ufsvfsp, pos)[i] == -1) {
 			(void) ufs_fault(ufsvfsp->vfs_root,
-	    "alloccgblk: cyl groups corrupted, pos = %d, i = %d, fs = %s\n",
-				    pos, i, fs->fs_fsmnt);
+			    "alloccgblk: cyl groups corrupted, pos = %d, "
+			    "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
 			return (0);
 		}
 
@@ -1196,8 +1217,8 @@
 			i += delta;
 		}
 		(void) ufs_fault(ufsvfsp->vfs_root,
-	"alloccgblk: can't find blk in cyl, pos:%d, i:%d, fs:%s bno: %x\n",
-		    pos, i, fs->fs_fsmnt, (int)bno);
+		    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
+		    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
 		return (0);
 	}
 norot:
@@ -1290,8 +1311,8 @@
 		if (loc == 0) {
 			mutex_exit(&ufsvfsp->vfs_lock);
 			(void) ufs_fault(ITOV(ip),
-		    "ialloccg: map corrupted, cg = %d, irotor = %d, fs = %s\n",
-				    cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
+			    "ialloccg: map corrupted, cg = %d, irotor = %d, "
+			    "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
 			return (0);
 		}
 	}
@@ -1409,9 +1430,10 @@
 			 * metadata into userdata (harpy).  If so, ignore.
 			 */
 			if (!TRANS_ISCANCEL(ufsvfsp,
-				ldbtob(fsbtodb(fs, (cfrag+bno))),
-				allocsiz * fs->fs_fsize))
+			    ldbtob(fsbtodb(fs, (cfrag+bno))),
+			    allocsiz * fs->fs_fsize))
 				return (bno);
+
 			/*
 			 * keep looking -- this block is being converted
 			 */
@@ -1445,6 +1467,348 @@
 #define	TRIPLE	2		/* triple indirect block ptr */
 
 /*
+ * Acquire a write lock, retrying on EBUSY/EINVAL; other errors are returned.
+ */
+static int
+allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
+{
+	int err = 0;
+
+lockagain:
+	do {	/* NOTE(review): ufs_fiolfss() presumably refreshes lf's state */
+		err = ufs_fiolfss(vp, lf);
+		if (err)
+			return (err);
+	} while (!LOCKFS_IS_ULOCK(lf));
+
+	lf->lf_lock = LOCKFS_WLOCK;
+	lf->lf_flags = 0;
+	lf->lf_comment = NULL;
+	err = ufs__fiolfs(vp, lf, 1, 0);
+
+	if (err == EBUSY || err == EINVAL)	/* start over from the top */
+		goto lockagain;
+
+	return (err);
+}
+
+/*
+ * Drop the file system lock taken by allocsp_wlockfs(): switch lf back to
+ * LOCKFS_ULOCK and hand it to ufs__fiolfs(), whose result is returned.
+ */
+static int
+allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
+{
+	lf->lf_flags = 0;
+	lf->lf_lock = LOCKFS_ULOCK;
+
+	/* Same trailing arguments as the locking call. */
+	return (ufs__fiolfs(vp, lf, 1, 0));
+}
+
+struct allocsp_undo {
+	daddr_t offset;			/* file offset given to bmap_write() */
+	daddr_t blk;			/* block bmap_write() reported back */
+	struct allocsp_undo *next;	/* entry pushed before this one */
+};
+
+/*
+ * ufs_allocsp() pre-allocates blocks for regular file vp as described by lp;
+ * blocks are marked allocated but not initialized. Data block addresses held
+ * in indirect blocks are stored negated to imply special handling, and the
+ * IFALLOCATE cflag is set on success. Returns 0 or an errno value; blocks
+ * are released again if bmap_write() fails. NOTE(review): both loops run
+ * while i < len, i.e. l_len is used as an end offset -- confirm intent.
+ */
+int
+ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
+{
+	struct lockfs lf;
+	int berr = 0, err = 0, resv, issync;	/* loops may never run */
+	off_t start, istart, len; /* istart, special for idb */
+	struct inode *ip;
+	struct fs *fs;
+	struct ufsvfs *ufsvfsp;
+	u_offset_t resid, i;
+	daddr32_t db_undo[NDADDR];	/* old direct blocks */
+	struct allocsp_undo *ib_undo = NULL;	/* ib undo */
+	struct allocsp_undo *undo = NULL;
+	u_offset_t osz;			/* old file size */
+	int chunkblks = 0;		/* # of blocks in 1 allocation */
+	int cnt = 0;
+	daddr_t allocblk;
+	daddr_t totblks = 0;
+	struct ulockfs	*ulp;
+
+	ASSERT(vp->v_type == VREG);
+
+	ip = VTOI(vp);
+	fs = ip->i_fs;
+	if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
+		err = EIO;
+		goto out_allocsp;
+	}
+
+	istart = start = blkroundup(fs, (lp->l_start));
+	len = blkroundup(fs, (lp->l_len));
+	chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
+	ulp = &ufsvfsp->vfs_ulockfs;
+
+	if (lp->l_start < 0 || lp->l_len <= 0)
+		return (EINVAL);
+
+	/* Quickly check to make sure we have space before we proceed */
+	if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
+		if (TRANS_ISTRANS(ufsvfsp)) {
+			ufs_delete_drain_wait(ufsvfsp, 1);
+			if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
+				return (ENOSPC);
+		} else
+			return (ENOSPC);
+	}
+
+	/*
+	 * We will keep i_rwlock locked as WRITER through out the function
+	 * since we don't want anyone else reading or writing to the inode
+	 * while we are in the middle of fallocating the file.
+	 */
+	rw_enter(&ip->i_rwlock, RW_WRITER);
+
+	/* Back up the direct block list, used for undo later if necessary */
+	rw_enter(&ip->i_contents, RW_READER);
+	for (i = 0; i < NDADDR; i++)
+		db_undo[i] = ip->i_db[i];
+	osz = ip->i_size;
+	rw_exit(&ip->i_contents);
+
+	/* Allocate any direct blocks now before we write lock the fs */
+	if (lblkno(fs, start) < NDADDR) {
+		ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
+		    &resv, &resid);
+		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
+
+		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
+		rw_enter(&ip->i_contents, RW_WRITER);
+
+		for (i = start; (i < len) && (lblkno(fs, i) < NDADDR);
+		    i += fs->fs_bsize) {
+			berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
+			    &allocblk, cr);
+			/* Yikes error, quit */
+			if (berr) {
+				TRANS_INODE(ufsvfsp, ip);
+				rw_exit(&ip->i_contents);
+				rw_exit(&ufsvfsp->vfs_dqrwlock);
+				TRANS_END_CSYNC(ufsvfsp, err, issync,
+				    TOP_ALLOCSP, resv);
+				goto exit;
+			}
+
+			if (allocblk) {
+				totblks++;
+				ip->i_size += fs->fs_bsize;
+			}
+		}
+
+		TRANS_INODE(ufsvfsp, ip);
+		rw_exit(&ip->i_contents);
+		rw_exit(&ufsvfsp->vfs_dqrwlock);
+		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
+
+		istart =  i;	/* start offset for indirect allocation */
+	}
+
+	/* Write lock the file system */
+	if (err = allocsp_wlockfs(vp, &lf))
+		goto exit;
+
+	/* Break the transactions into vfs_iotransz units */
+	ufs_trans_trunc_resv(ip, ip->i_size +
+	    blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
+	TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
+
+	rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
+	rw_enter(&ip->i_contents, RW_WRITER);
+
+	/* Now go about fallocating necessary indirect blocks */
+	for (i = istart; i < len; i += fs->fs_bsize) {
+		berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
+		    &allocblk, cr);
+		if (berr) {
+			TRANS_INODE(ufsvfsp, ip);
+			rw_exit(&ip->i_contents);
+			rw_exit(&ufsvfsp->vfs_dqrwlock);
+			TRANS_END_CSYNC(ufsvfsp, err, issync,
+			    TOP_ALLOCSP, resv);
+			err = allocsp_unlockfs(vp, &lf);
+			goto exit;
+		}
+
+		/* Update the blk counter only if new block was added */
+		if (allocblk) {
+			/* Save undo information */
+			undo = kmem_alloc(sizeof (struct allocsp_undo),
+			    KM_SLEEP);
+			undo->offset = i;
+			undo->blk = allocblk;
+			undo->next = ib_undo;
+			ib_undo = undo;
+			totblks++;
+			ip->i_size += fs->fs_bsize;
+		}
+		cnt++;
+
+		/* Being a good UFS citizen, let others get a share */
+		if (cnt == chunkblks) {
+			/*
+			 * If there are waiters or the fs is hard locked,
+			 * error locked, or read-only error locked,
+			 * quit with EIO
+			 */
+			if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
+			    ULOCKFS_IS_ROELOCK(ulp)) {
+				ip->i_cflags |= IFALLOCATE;
+				TRANS_INODE(ufsvfsp, ip);
+				rw_exit(&ip->i_contents);
+				rw_exit(&ufsvfsp->vfs_dqrwlock);
+
+				TRANS_END_CSYNC(ufsvfsp, err, issync,
+				    TOP_ALLOCSP, resv);
+				rw_exit(&ip->i_rwlock);
+				return (EIO);
+			}
+
+			TRANS_INODE(ufsvfsp, ip);
+			rw_exit(&ip->i_contents);
+			rw_exit(&ufsvfsp->vfs_dqrwlock);
+
+			/* End the current transaction */
+			TRANS_END_CSYNC(ufsvfsp, err, issync,
+			    TOP_ALLOCSP, resv);
+
+			if (CV_HAS_WAITERS(&ulp->ul_cv)) {
+				/* Release the write lock */
+				if (err = allocsp_unlockfs(vp, &lf))
+					goto exit;
+
+				/* Wake up others waiting to do operations */
+				mutex_enter(&ulp->ul_lock);
+				cv_broadcast(&ulp->ul_cv);
+				mutex_exit(&ulp->ul_lock);
+
+				/* Grab the write lock again */
+				if (err = allocsp_wlockfs(vp, &lf))
+					goto exit;
+			} /* end of CV_HAS_WAITERS(&ulp->ul_cv) */
+
+			/* Reserve more space in log for this file */
+			ufs_trans_trunc_resv(ip,
+			    ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
+			    &resv, &resid);
+			TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
+
+			rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
+			rw_enter(&ip->i_contents, RW_WRITER);
+
+			cnt = 0;	/* reset cnt b/c of new transaction */
+		}
+	}
+
+	if (!err && !berr)
+		ip->i_cflags |= IFALLOCATE;
+
+	/* Release locks, end log transaction and unlock fs */
+	TRANS_INODE(ufsvfsp, ip);
+	rw_exit(&ip->i_contents);
+	rw_exit(&ufsvfsp->vfs_dqrwlock);
+
+	TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
+	err = allocsp_unlockfs(vp, &lf);
+
+	/*
+	 * @ exit label, we should no longer be holding the fs write lock, and
+	 * all logging transactions should have been ended. We still hold
+	 * ip->i_rwlock.
+	 */
+exit:
+	/*
+	 * File has grown larger than 2GB. Set flag
+	 * in superblock to indicate this, if it
+	 * is not already set.
+	 */
+	if ((ip->i_size > MAXOFF32_T) &&
+		!(fs->fs_flags & FSLARGEFILES)) {
+		ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
+		mutex_enter(&ufsvfsp->vfs_lock);
+		fs->fs_flags |= FSLARGEFILES;
+		ufs_sbwrite(ufsvfsp);
+		mutex_exit(&ufsvfsp->vfs_lock);
+	}
+
+	/*
+	 * Since we couldn't allocate completely, we will undo the allocations.
+	 */
+	if (berr) {
+		ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
+		TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);
+
+		rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
+		rw_enter(&ip->i_contents, RW_WRITER);
+
+		/* Direct blocks */
+		for (i = 0; i < NDADDR; i++) {
+			/*
+			 * Only free the block if they are not same, and
+			 * the old one isn't zero (the fragment was
+			 * re-allocated).
+			 */
+			if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
+				free(ip, ip->i_db[i], fs->fs_bsize, 0);
+				ip->i_db[i] = 0;
+			}
+		}
+
+		/* Undo the indirect blocks */
+		while (ib_undo != NULL) {
+			undo = ib_undo;
+			err = bmap_set_bn(vp, undo->offset, 0);
+			if (err)
+				cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
+				    "undo allocation of block %ld",
+				    undo->offset);
+			free(ip, undo->blk, fs->fs_bsize, I_IBLK);
+			ib_undo = undo->next;
+			kmem_free(undo, sizeof (struct allocsp_undo));
+		}
+
+		ip->i_size = osz;
+		TRANS_INODE(ufsvfsp, ip);
+
+		rw_exit(&ip->i_contents);
+		rw_exit(&ufsvfsp->vfs_dqrwlock);
+
+		TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
+
+		rw_exit(&ip->i_rwlock);
+		return (berr);
+	}
+
+	/*
+	 * Don't forget to free the undo chain :)
+	 */
+	while (ib_undo != NULL) {
+		undo = ib_undo;
+		ib_undo = undo->next;
+		kmem_free(undo, sizeof (struct allocsp_undo));
+	}
+
+	rw_exit(&ip->i_rwlock);
+
+out_allocsp:
+	return (err);
+}
+
+/*
  * Free storage space associated with the specified inode.  The portion
  * to be freed is specified by lp->l_start and lp->l_len (already
  * normalized to a "whence" of 0).
@@ -1556,8 +1920,8 @@
 		 * find the largest contiguous range in this cg
 		 */
 		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
-			(daddr_t)fsbtodb(fs, cgtod(fs, cg)),
-			(int)fs->fs_cgsize);
+		    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
+		    (int)fs->fs_cgsize);
 		cgp = bp->b_un.b_cg;
 		if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
 			brelse(bp);
--- a/usr/src/uts/common/fs/ufs/ufs_bmap.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_bmap.c	Tue Nov 15 08:00:52 2005 -0800
@@ -60,9 +60,9 @@
 #include <sys/errno.h>
 #include <sys/sysmacros.h>
 #include <sys/vfs.h>
-#include <sys/cmn_err.h>
 #include <sys/debug.h>
 #include <sys/kmem.h>
+#include <sys/cmn_err.h>
 
 /*
  * This structure is used to track blocks as we allocate them, so that
@@ -165,14 +165,14 @@
 #define	VERYLARGEFILESIZE	0x7FE00000
 
 /*
- * bmap{rd,wr} define the structure of file system storage by mapping
+ * bmap{read,write} define the structure of file system storage by mapping
  * a logical offset in a file to a physical block number on the device.
  * It should be called with a locked inode when allocation is to be
- * done (bmapwr).  Note this strangeness: bmapwr is always called from
+ * done (bmap_write).  Note this strangeness: bmap_write is always called from
  * getpage(), not putpage(), since getpage() is where all the allocation
  * is done.
  *
- * S_READ, S_OTHER -> bmaprd; S_WRITE -> bmapwr.
+ * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write.
  *
  * NOTICE: the block number returned is the disk block number, not the
  * file system block number.  All the worries about block offsets and
@@ -296,24 +296,21 @@
 }
 
 /*
- * See bmaprd for general notes.
+ * See bmap_read for general notes.
  *
  * The block must be at least size bytes and will be extended or
- * allocated as needed.  If alloc_only is set, bmap will not create
- * any in-core pages that correspond to the new disk allocation.
- * Otherwise, the in-core pages will be created and initialized as
- * needed.
+ * allocated as needed.  If alloc_type is of type BI_ALLOC_ONLY, then bmap
+ * will not create any in-core pages that correspond to the new disk allocation.
+ * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr
+ * and security is maintained b/c upon reading a negative block number pages
+ * are zeroed. For all other allocation types (BI_NORMAL) the in-core pages will
+ * be created and initialized as needed.
  *
  * Returns 0 on success, or a non-zero errno if an error occurs.
  */
-
 int
-bmap_write(
-	struct inode	*ip,
-	u_offset_t	off,
-	int		size,
-	int		alloc_only,
-	struct cred	*cr)
+bmap_write(struct inode	*ip, u_offset_t	off, int size,
+    enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr)
 {
 	struct	fs *fs;
 	struct	buf *bp;
@@ -340,6 +337,9 @@
 
 	ASSERT(RW_WRITE_HELD(&ip->i_contents));
 
+	if (allocblk)
+		*allocblk = 0;
+
 	ufsvfsp = ip->i_ufsvfs;
 	fs = ufsvfsp->vfs_bufp->b_un.b_fs;
 	lbn = (daddr_t)lblkno(fs, off);
@@ -360,7 +360,7 @@
 	issync = ((ip->i_flag & ISYNC) != 0);
 
 	if (isdirquota || issync) {
-		alloc_only = 0;		/* make sure */
+		alloc_type = BI_NORMAL;	/* make sure */
 	}
 
 	/*
@@ -422,6 +422,7 @@
 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 			TRANS_INODE(ufsvfsp, ip);
 			ip->i_flag |= IUPD | ICHG | IATTCHG;
+
 			/* Caller is responsible for updating i_seq */
 			/*
 			 * Don't check metaflag here, directories won't do this
@@ -465,7 +466,7 @@
 					}
 				}
 				/*
-				 * need to allocate a block or frag
+				 * need to re-allocate a block or frag
 				 */
 				ob = nb;
 				pref = blkpref(ip, lbn, (int)lbn,
@@ -474,6 +475,8 @@
 						(int)nsize, &nb, cr);
 				if (err)
 					return (err);
+				if (allocblk)
+					*allocblk = nb;
 				ASSERT(!ufs_badblock(ip, nb));
 
 			} else {
@@ -501,6 +504,8 @@
 				err = alloc(ip, pref, (int)nsize, &nb, cr);
 				if (err)
 					return (err);
+				if (allocblk)
+					*allocblk = nb;
 				ASSERT(!ufs_badblock(ip, nb));
 				ob = nb;
 			}
@@ -513,7 +518,13 @@
 				/*
 				 * mmap S_WRITE faults always enter here
 				 */
-				if (!alloc_only || P2ROUNDUP_TYPED(size,
+				/*
+				 * We zero it if it's also BI_FALLOCATE, but
+				 * only for direct blocks!
+				 */
+				if (alloc_type == BI_NORMAL ||
+				    alloc_type == BI_FALLOCATE ||
+				    P2ROUNDUP_TYPED(size,
 				    PAGESIZE, u_offset_t) < nsize) {
 					/* fbzero doesn't cause a pagefault */
 					fbzero(ITOV(ip),
@@ -548,6 +559,7 @@
 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 			TRANS_INODE(ufsvfsp, ip);
 			ip->i_flag |= IUPD | ICHG | IATTCHG;
+
 			/* Caller is responsible for updating i_seq */
 
 			/*
@@ -708,6 +720,7 @@
 		shft -= nindirshift;		/* sh /= nindir */
 		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
 		nb = bap[i];
+
 		if (nb == 0) {
 			/*
 			 * Check to see if doing this will make the
@@ -750,8 +763,10 @@
 			}
 
 			ASSERT(!ufs_badblock(ip, nb));
+			ASSERT(alloced_blocks <= NIADDR);
 
-			ASSERT(alloced_blocks <= NIADDR);
+			if (allocblk)
+				*allocblk = nb;
 
 			undo_table[alloced_blocks].this_block = nb;
 			undo_table[alloced_blocks].block_size = bsize;
@@ -787,7 +802,8 @@
 					return (err);
 				}
 				brelse(nbp);
-			} else if (!alloc_only || P2ROUNDUP_TYPED(size,
+			} else if (alloc_type == BI_NORMAL ||
+			    P2ROUNDUP_TYPED(size,
 			    PAGESIZE, u_offset_t) < bsize) {
 				TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0);
 				fbzero(ITOV(ip),
@@ -846,12 +862,24 @@
 			}
 			bap = bp->b_un.b_daddr;
 			bap[i] = nb;
+
+			/*
+			 * The magic explained: j will be equal to NIADDR
+			 * when we are at the lowest level, this is where the
+			 * array entries point directly to data blocks. Since
+			 * we will be 'fallocate'ing we will go ahead and negate
+			 * the addresses.
+			 */
+			if (alloc_type == BI_FALLOCATE && j == NIADDR)
+				bap[i] = -bap[i];
+
 			TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB);
 			added_sectors += btodb(bsize);
 			ip->i_blocks += btodb(bsize);
 			ASSERT((unsigned)ip->i_blocks <= INT_MAX);
 			TRANS_INODE(ufsvfsp, ip);
 			ip->i_flag |= IUPD | ICHG | IATTCHG;
+
 			/* Caller is responsible for updating i_seq */
 
 			undo_table[alloced_blocks-1].owner =
@@ -910,8 +938,8 @@
 	 */
 	if (dblks <= NDADDR)
 		return (mblks < dblks);
+	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 
-	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
 	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
 	nindirblks = nindiroffset + 1;
 
@@ -965,6 +993,7 @@
 	bn = *sbp;
 	if (bn == 0)
 		return (0);
+
 	diff = fs->fs_frag;
 	if (*lenp) {
 		n = MIN(n, lblkno(fs, *lenp));
@@ -1286,3 +1315,113 @@
 	}
 	return (error);
 }
+
+/*
+ * Store block number bn in the block-map slot that covers byte offset off
+ * of the file behind vp: a direct slot (i_db[]) for the first NDADDR
+ * logical blocks, otherwise the matching entry of the lowest-level
+ * indirect block, which is then handed to bdrwrite().
+ *
+ * The caller must hold ip->i_contents and is responsible for calling the
+ * TRANS* functions.
+ *
+ * Returns 0 on success, EFBIG if off lies outside the range the block map
+ * can address, the buffer error if an indirect block cannot be read, or
+ * the result of ufs_fault() if an indirect block on the path is missing.
+ */
+int
+bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn)
+{
+	daddr_t lbn;
+	struct inode *ip;
+	ufsvfs_t *ufsvfsp;
+	struct	fs *fs;
+	struct	buf *bp;
+	int	i, j;
+	int	shft;			/* we maintain sh = 1 << shft */
+	int err;
+	daddr_t	nb, tbn;
+	daddr32_t *bap;
+	int	nindirshift, nindiroffset;
+
+	ip = VTOI(vp);
+	ufsvfsp = ip->i_ufsvfs;
+	fs = ufsvfsp->vfs_fs;
+	lbn = (daddr_t)lblkno(fs, off);
+
+	ASSERT(RW_LOCK_HELD(&ip->i_contents));
+
+	if (lbn < 0)
+		return (EFBIG);
+
+	/*
+	 * Take care of direct block assignment
+	 */
+	if (lbn < NDADDR) {
+		ip->i_db[lbn] = bn;
+		return (0);
+	}
+
+	nindirshift = ip->i_ufsvfs->vfs_nindirshift;
+	nindiroffset = ip->i_ufsvfs->vfs_nindiroffset;
+	/*
+	 * Determine how many levels of indirection.
+	 */
+	shft = 0;				/* sh = 1 */
+	tbn = lbn - NDADDR;
+	for (j = NIADDR; j > 0; j--) {
+		longlong_t	sh;
+
+		shft += nindirshift;		/* sh *= nindir */
+		sh = 1LL << shft;
+		if (tbn < sh)
+			break;
+		tbn -= sh;
+	}
+	if (j == 0)
+		return (EFBIG);
+
+	/*
+	 * Fetch the first indirect block.  Without it there is no slot
+	 * to update, so report the fault instead of reading block 0.
+	 */
+	nb = ip->i_ib[NIADDR - j];
+	if (nb == 0)
+		return (ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE"));
+
+	/*
+	 * Fetch through the indirect blocks.
+	 */
+	for (; j <= NIADDR; j++) {
+		bp = UFS_BREAD(ufsvfsp,
+				ip->i_dev, fsbtodb(fs, nb), fs->fs_bsize);
+		if (bp->b_flags & B_ERROR) {
+			err = geterror(bp);
+			brelse(bp);
+			return (err);
+		}
+		bap = bp->b_un.b_daddr;
+
+		ASSERT(!ufs_indir_badblock(ip, bap));
+
+		shft -= nindirshift;		/* sh / nindir */
+		i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */
+
+		if (j == NIADDR) {
+			bap[i] = bn;
+			bdrwrite(bp);
+			return (0);
+		}
+
+		/*
+		 * Step down one level: the next block to read is the one
+		 * this entry points at, not the block we just read.
+		 */
+		nb = bap[i];
+		brelse(bp);
+		if (nb == 0)
+			return (ufs_fault(ITOV(ip),
+			    "ufs_set_bn: nb == UFS_HOLE"));
+	}
+	return (0);
+}
--- a/usr/src/uts/common/fs/ufs/ufs_directio.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_directio.c	Tue Nov 15 08:00:52 2005 -0800
@@ -515,15 +515,16 @@
 		n = (int)MIN(fs->fs_bsize - on, resid);
 		if ((uoff + n) > ip->i_size) {
 			error = bmap_write(ip, uoff, (int)(on + n),
-				    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
-			    cr);
+			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
+			    NULL, cr);
 			/* Caller is responsible for updating i_seq if needed */
 			if (error)
 				break;
 			ip->i_size = uoff + n;
 			ip->i_flag |= IATTCHG;
 		} else if (n == MAXBSIZE) {
-			error = bmap_write(ip, uoff, (int)(on + n), 1, cr);
+			error = bmap_write(ip, uoff, (int)(on + n),
+			    BI_ALLOC_ONLY, NULL, cr);
 			/* Caller is responsible for updating i_seq if needed */
 		} else {
 			if (has_holes < 0)
@@ -535,7 +536,8 @@
 				offset = uoff & (offset_t)fs->fs_bmask;
 				blk_size = (int)blksize(fs, ip,
 				    (daddr_t)lblkno(fs, offset));
-				error = bmap_write(ip, uoff, blk_size, 0, cr);
+				error = bmap_write(ip, uoff, blk_size,
+				    BI_NORMAL, NULL, cr);
 				/*
 				 * Caller is responsible for updating
 				 * i_seq if needed
--- a/usr/src/uts/common/fs/ufs/ufs_extvnops.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_extvnops.c	Tue Nov 15 08:00:52 2005 -0800
@@ -321,8 +321,8 @@
 			DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n",
 			    ip->i_size, uoff + nbytes));
 
-			error = bmap_write(ip, uoff, (offsetn + nbytes), 1,
-			    credp);
+			error = bmap_write(ip, uoff, (offsetn + nbytes),
+			    BI_ALLOC_ONLY, NULL, credp);
 			if (ip->i_flag & (ICHG|IUPD))
 				ip->i_seq++;
 			if (error) {
@@ -456,7 +456,7 @@
 				 * We have to allocate blocks for the hole.
 				 */
 				error = bmap_write(ip, uoff, (offsetn + nbytes),
-				    1, credp);
+				    BI_ALLOC_ONLY, NULL, credp);
 				if (ip->i_flag & (ICHG|IUPD))
 					ip->i_seq++;
 				if (error) {
--- a/usr/src/uts/common/fs/ufs/ufs_lockfs.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_lockfs.c	Tue Nov 15 08:00:52 2005 -0800
@@ -99,8 +99,11 @@
 typedef struct _ulockfs_info {
 	struct _ulockfs_info *next;
 	struct ulockfs *ulp;
+	uint_t flags;
 } ulockfs_info_t;
 
+#define	ULOCK_INFO_FALLOCATE	0x00000001	/* fallocate thread */
+
 /*
  * Check in TSD that whether we are already doing any VOP on this filesystem
  */
@@ -238,6 +241,11 @@
 ufs_quiesce(struct ulockfs *ulp)
 {
 	int error = 0;
+	ulockfs_info_t *head;
+	ulockfs_info_t *info;
+
+	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
+	SEARCH_ULOCKFSP(head, ulp, info);
 
 	/*
 	 * Set a softlock to suspend future ufs_vnops so that
@@ -247,16 +255,27 @@
 	ASSERT(ufs_quiesce_pend);
 
 	/* check if there is any outstanding ufs vnodeops calls */
-	while (ulp->ul_vnops_cnt)
+	while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) {
 		/*
 		 * use timed version of cv_wait_sig() to make sure we don't
 		 * miss a wake up call from ufs_pageio() when it doesn't use
 		 * ul_lock.
+		 *
+		 * when a fallocate thread comes in, the only way it returns
+		 * from this function is if there are no other vnode operations
+		 * going on (remember fallocate threads are tracked using
+		 * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread
+		 * hasn't already grabbed the fs write lock.
 		 */
+		if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
+			if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp))
+				goto out;
+		}
 		if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) {
 			error = EINTR;
 			goto out;
 		}
+	}
 
 out:
 	/*
@@ -266,6 +285,7 @@
 
 	return (error);
 }
+
 /*
  * ufs_flush_inode
  */
@@ -843,6 +863,8 @@
 	int		 errlck		= NO_ERRLCK;
 	int		 poll_events	= POLLPRI;
 	extern struct pollhead ufs_pollhd;
+	ulockfs_info_t *head;
+	ulockfs_info_t *info;
 
 	/* check valid lock type */
 	if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK)
@@ -855,6 +877,9 @@
 	ufsvfsp = (struct ufsvfs *)vfsp->vfs_data;
 	ulp = &ufsvfsp->vfs_ulockfs;
 
+	head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key);
+	SEARCH_ULOCKFSP(head, ulp, info);
+
 	/*
 	 * Suspend both the reclaim thread and the delete thread.
 	 * This must be done outside the lockfs locking protocol.
@@ -950,12 +975,31 @@
 	LOCKFS_SET_BUSY(&ulp->ul_lockfs);
 
 	/*
+	 * We need to unset FWLOCK status before we call ufs_quiesce
+	 * so that the thread doesn't get suspended. We do this only if
+	 * this (fallocate) thread requested an unlock operation.
+	 */
+	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
+		if (!ULOCKFS_IS_WLOCK(ulp))
+			ULOCKFS_CLR_FWLOCK(ulp);
+	}
+
+	/*
 	 * Quiesce (wait for outstanding accesses to finish)
 	 */
 	if (error = ufs_quiesce(ulp))
 		goto errout;
 
 	/*
+	 * If the fallocate thread requested a write fs lock operation
+	 * then we set fwlock status in the ulp.
+	 */
+	if (info && (info->flags & ULOCK_INFO_FALLOCATE)) {
+		if (ULOCKFS_IS_WLOCK(ulp))
+			ULOCKFS_SET_FWLOCK(ulp);
+	}
+
+	/*
 	 * can't wlock or (ro)elock fs with accounting or local swap file
 	 */
 	if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
@@ -1195,7 +1239,14 @@
 				return (EINTR);
 		}
 	}
-	atomic_add_long(&ulp->ul_vnops_cnt, 1);
+
+	if (mask & ULOCKFS_FWLOCK) {
+		atomic_add_long(&ulp->ul_falloc_cnt, 1);
+		ULOCKFS_SET_FALLOC(ulp);
+	} else {
+		atomic_add_long(&ulp->ul_vnops_cnt, 1);
+	}
+
 	return (0);
 }
 
@@ -1260,9 +1311,14 @@
 	 * First time VOP call
 	 */
 	mutex_enter(&ulp->ul_lock);
-	if (ULOCKFS_IS_JUSTULOCK(ulp))
-		atomic_add_long(&ulp->ul_vnops_cnt, 1);
-	else {
+	if (ULOCKFS_IS_JUSTULOCK(ulp)) {
+		if (mask & ULOCKFS_FWLOCK) {
+			atomic_add_long(&ulp->ul_falloc_cnt, 1);
+			ULOCKFS_SET_FALLOC(ulp);
+		} else {
+			atomic_add_long(&ulp->ul_vnops_cnt, 1);
+		}
+	} else {
 		if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) {
 			mutex_exit(&ulp->ul_lock);
 			if (ulockfs_info_free == NULL)
@@ -1275,9 +1331,13 @@
 
 	if (ulockfs_info_free != NULL) {
 		ulockfs_info_free->ulp = ulp;
+		if (mask & ULOCKFS_FWLOCK)
+			ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE;
 	} else {
 		ulockfs_info_temp->ulp = ulp;
 		ulockfs_info_temp->next = ulockfs_info;
+		if (mask & ULOCKFS_FWLOCK)
+			ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE;
 		ASSERT(ufs_lockfs_key != 0);
 		(void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp);
 	}
@@ -1339,7 +1399,20 @@
 
 	mutex_enter(&ulp->ul_lock);
 
-	if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
+	/* fallocate thread */
+	if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) {
+		if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1))
+			ULOCKFS_CLR_FALLOC(ulp);
+	} else  { /* normal thread */
+		if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1))
+			cv_broadcast(&ulp->ul_cv);
+	}
+
+	/* Clear the thread's fallocate state */
+	if (info->flags & ULOCK_INFO_FALLOCATE)
+		info->flags &= ~ULOCK_INFO_FALLOCATE;
+
+	if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt)
 		cv_broadcast(&ulp->ul_cv);
 
 	mutex_exit(&ulp->ul_lock);
--- a/usr/src/uts/common/fs/ufs/ufs_trans.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_trans.c	Tue Nov 15 08:00:52 2005 -0800
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2003 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -740,7 +740,7 @@
  * trunc request.  If the amount of log space is too large, then
  * calculate the the size that the requests needs to be split into.
  */
-static void
+void
 ufs_trans_trunc_resv(
 	struct inode *ip,
 	u_offset_t length,
--- a/usr/src/uts/common/fs/ufs/ufs_vfsops.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_vfsops.c	Tue Nov 15 08:00:52 2005 -0800
@@ -1444,7 +1444,7 @@
 	/*
 	 * if the file system is busy; return EBUSY
 	 */
-	if (ulp->ul_vnops_cnt || ULOCKFS_IS_SLOCK(ulp)) {
+	if (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt || ULOCKFS_IS_SLOCK(ulp)) {
 		error = EBUSY;
 		goto out;
 	}
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c	Tue Nov 15 08:00:52 2005 -0800
@@ -908,7 +908,7 @@
 			 * is done here before we up the file size.
 			 */
 			error = bmap_write(ip, uoff, (int)(on + n),
-							mapon == 0, cr);
+			    mapon == 0, NULL, cr);
 			/*
 			 * bmap_write never drops i_contents so if
 			 * the flags are set it changed the file.
@@ -946,7 +946,8 @@
 			 * needed blocks are allocated first.
 			 */
 			iblocks = ip->i_blocks;
-			error = bmap_write(ip, uoff, (int)(on + n), 1, cr);
+			error = bmap_write(ip, uoff, (int)(on + n),
+			    BI_ALLOC_ONLY, NULL, cr);
 			/*
 			 * bmap_write never drops i_contents so if
 			 * the flags are set it changed the file.
@@ -4228,31 +4229,33 @@
 
 /* ARGSUSED */
 static int
-ufs_space(
-	struct vnode *vp,
-	int cmd,
-	struct flock64 *bfp,
-	int flag,
-	offset_t offset,
-	cred_t *cr,
-	caller_context_t *ct)
+ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag,
+	offset_t offset, cred_t *cr, caller_context_t *ct)
 {
-	struct ufsvfs *ufsvfsp	= VTOI(vp)->i_ufsvfs;
+	struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs;
 	struct ulockfs *ulp;
 	int error;
 
-	error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SPACE_MASK);
-	if (error)
-		return (error);
-
-
-	if (cmd != F_FREESP)
-		error =  EINVAL;
-	else if ((error = convoff(vp, bfp, 0, offset)) == 0)
-		error = ufs_freesp(vp, bfp, flag, cr);
-
-	if (ulp)
-		ufs_lockfs_end(ulp);
+	if ((error = convoff(vp, bfp, 0, offset)) == 0) {
+		if (cmd == F_FREESP) {
+			error = ufs_lockfs_begin(ufsvfsp, &ulp,
+			    ULOCKFS_SPACE_MASK);
+			if (error)
+				return (error);
+			error = ufs_freesp(vp, bfp, flag, cr);
+		} else if (cmd == F_ALLOCSP) {
+			error = ufs_lockfs_begin(ufsvfsp, &ulp,
+			    ULOCKFS_FALLOCATE_MASK);
+			if (error)
+				return (error);
+			error = ufs_allocsp(vp, bfp, cr);
+		} else
+			return (EINVAL); /* Command not handled here */
+
+		if (ulp)
+			ufs_lockfs_end(ulp);
+
+	}
 	return (error);
 }
 
@@ -4455,7 +4458,8 @@
 		offset = uoff & (offset_t)fs->fs_bmask;
 		while (offset < uoff + len) {
 			blk_size = (int)blksize(fs, ip, lblkno(fs, offset));
-			err = bmap_write(ip, offset, blk_size, 0, cr);
+			err = bmap_write(ip, offset, blk_size,
+			    BI_NORMAL, NULL, cr);
 			if (ip->i_flag & (ICHG|IUPD))
 				ip->i_seq++;
 			if (err)
@@ -4657,7 +4661,7 @@
 	page_t		*pp;
 	daddr_t		bn;
 	size_t		io_len;
-	int		crpage;
+	int		crpage = 0;
 	int		err;
 	int		contig;
 	int		bsize = ip->i_fs->fs_bsize;
@@ -4672,7 +4676,16 @@
 		contig = 0;
 		if (err = bmap_read(ip, off, &bn, &contig))
 			return (err);
+
 		crpage = (bn == UFS_HOLE);
+
+		/*
+		 * If it's also a fallocated block that hasn't been written to
+		 * yet, we will treat it just like a UFS_HOLE and create
+		 * a zero page for it
+		 */
+		if (ISFALLOCBLK(ip, bn))
+			crpage = 1;
 	}
 
 	if (crpage) {
@@ -4684,6 +4697,7 @@
 
 		if (rw != S_CREATE)
 			pagezero(pp, 0, PAGESIZE);
+
 		io_len = PAGESIZE;
 	} else {
 		u_offset_t	io_off;
@@ -4777,6 +4791,7 @@
 	struct buf	*bp;
 	daddr_t		bn;
 	size_t		io_len;
+	int		err;
 	int		contig;
 	int		xlen;
 	int		bsize = ip->i_fs->fs_bsize;
@@ -4799,7 +4814,12 @@
 		return (0);
 
 	contig = 0;
-	if (bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UFS_HOLE)
+	err = bmap_read(ip, io_off, &bn, &contig);
+	/*
+	 * If it's a UFS_HOLE or a fallocated block, do not perform
+	 * any read-aheads since there probably is nothing to read ahead
+	 */
+	if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn))
 		return (0);
 
 	/*
@@ -5202,6 +5222,18 @@
 	}
 
 	/*
+	 * If it is a fallocate'd block, reverse the negativity since
+	 * we are now writing to it
+	 */
+	if (ISFALLOCBLK(ip, bn)) {
+		err = bmap_set_bn(vp, off, dbtofsb(fs, -bn));
+		if (err)
+			goto out;
+
+		bn = -bn;
+	}
+
+	/*
 	 * Take the length (of contiguous bytes) passed back from bmap()
 	 * and _try_ and get a set of pages covering that extent.
 	 */
--- a/usr/src/uts/common/sys/fcntl.h	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/sys/fcntl.h	Tue Nov 15 08:00:52 2005 -0800
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -122,7 +122,6 @@
 #define	F_CHKFL		8	/* Unused */
 #define	F_DUP2FD	9	/* Duplicate fildes at third arg */
 
-#define	F_ALLOCSP	10	/* Reserved */
 #define	F_ISSTREAM	13	/* Is the file desc. a stream ? */
 #define	F_PRIV		15	/* Turn on private access to file */
 #define	F_NPRIV		16	/* Turn off private access to file */
@@ -153,6 +152,7 @@
 /* "Native" application compilation environment */
 #define	F_SETLK		6	/* Set file lock */
 #define	F_SETLKW	7	/* Set file lock and wait */
+#define	F_ALLOCSP	10	/* Allocate file space */
 #define	F_FREESP	11	/* Free file space */
 #define	F_GETLK		14	/* Get file lock */
 #define	F_SETLK_NBMAND	42	/* private */
@@ -160,6 +160,7 @@
 /* ILP32 large file application compilation environment version */
 #define	F_SETLK		34	/* Set file lock */
 #define	F_SETLKW	35	/* Set file lock and wait */
+#define	F_ALLOCSP	28	/* Allocate file space */
 #define	F_FREESP	27	/* Free file space */
 #define	F_GETLK		33	/* Get file lock */
 #define	F_SETLK_NBMAND	44	/* private */
@@ -176,12 +177,14 @@
  */
 #define	F_SETLK64	34	/* Set file lock */
 #define	F_SETLKW64	35	/* Set file lock and wait */
+#define	F_ALLOCSP64	28	/* Allocate file space */
 #define	F_FREESP64	27	/* Free file space */
 #define	F_GETLK64	33	/* Get file lock */
 #define	F_SETLK64_NBMAND	44	/* private */
 #else
 #define	F_SETLK64	6	/* Set file lock */
 #define	F_SETLKW64	7	/* Set file lock and wait */
+#define	F_ALLOCSP64	10	/* Allocate file space */
 #define	F_FREESP64	11	/* Free file space */
 #define	F_GETLK64	14	/* Get file lock */
 #define	F_SETLK64_NBMAND	42	/* private */
--- a/usr/src/uts/common/sys/fs/ufs_inode.h	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/sys/fs/ufs_inode.h	Tue Nov 15 08:00:52 2005 -0800
@@ -368,7 +368,8 @@
 #define	IQUIET		0x20000		/* No file system full messages */
 
 /* cflags */
-#define	IXATTR		0x0001		/* Extended attribute */
+#define	IXATTR		0x0001		/* extended attribute */
+#define	IFALLOCATE	0x0002		/* fallocate'd file */
 
 /* modes */
 #define	IFMT		0170000		/* type of file */
@@ -562,7 +563,7 @@
  * and make sure any in-core pages are initialized.
  */
 #define	BMAPALLOC(ip, off, size, cr) \
-	bmap_write((ip), (u_offset_t)(off), (size), 0, cr)
+	bmap_write((ip), (u_offset_t)(off), (size), BI_NORMAL, NULL, cr)
 
 #define	ESAME	(-1)		/* trying to rename linked files (special) */
 
@@ -579,6 +580,16 @@
 enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME };
 
 /*
+ * block initialization type for bmap_write
+ *
+ * BI_NORMAL - allocate and zero fill pages in memory
+ * BI_ALLOC_ONLY - only allocate the block, do not zero out pages in mem
+ * BI_FALLOCATE - allocate only, do not zero out pages, and store as negative
+ *                block number in inode block list
+ */
+enum bi_type { BI_NORMAL, BI_ALLOC_ONLY, BI_FALLOCATE };
+
+/*
  * This overlays the fid structure (see vfs.h)
  *
  * LP64 note: we use int32_t instead of ino_t since UFS does not use
@@ -796,6 +807,10 @@
 /* inohsz is guaranteed to be a power of 2 */
 #define	INOHASH(ino)	(((int)ino) & (inohsz - 1))
 
+#define	ISFALLOCBLK(ip, bn)	\
+	(((bn) < 0) && ((bn) % (ip)->i_fs->fs_bsize == 0) && \
+	((ip)->i_cflags & IFALLOCATE && (bn) != UFS_HOLE))
+
 union ihead {
 	union	ihead	*ih_head[2];
 	struct	inode	*ih_chain[2];
@@ -857,6 +872,7 @@
 extern	int	alloc(struct inode *, daddr_t, int, daddr_t *, cred_t *);
 extern	int	realloccg(struct inode *, daddr_t, daddr_t, int, int,
     daddr_t *, cred_t *);
+extern	int	ufs_allocsp(struct vnode *, struct flock64 *, cred_t *);
 extern	int	ufs_freesp(struct vnode *, struct flock64 *, int, cred_t *);
 extern	ino_t	dirpref(inode_t *);
 extern	daddr_t	blkpref(struct inode *, daddr_t, int, daddr32_t *);
@@ -866,9 +882,11 @@
 	offset_t, enum uio_seg, int *, cred_t *);
 
 extern	int	bmap_read(struct inode *, u_offset_t, daddr_t *, int *);
-extern	int	bmap_write(struct inode *, u_offset_t, int, int, struct cred *);
+extern	int	bmap_write(struct inode *, u_offset_t, int, enum bi_type,
+    daddr_t *, struct cred *);
 extern	int	bmap_has_holes(struct inode *);
 extern	int	bmap_find(struct inode *, boolean_t, u_offset_t *);
+extern	int	bmap_set_bn(struct vnode *, u_offset_t, daddr32_t);
 
 extern	void	ufs_vfs_add(struct ufsvfs *);
 extern	void	ufs_vfs_remove(struct ufsvfs *);
--- a/usr/src/uts/common/sys/fs/ufs_lockfs.h	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/sys/fs/ufs_lockfs.h	Tue Nov 15 08:00:52 2005 -0800
@@ -79,14 +79,18 @@
 #define	ULOCKFS_BUSY	0x00000001	/* ul_fs_lock is being set */
 #define	ULOCKFS_NOIACC	0x00000004	/* don't keep access times */
 #define	ULOCKFS_NOIDEL	0x00000008	/* don't free deleted files */
+#define	ULOCKFS_FALLOC	0x00000010	/* fallocate threads exist */
 
 #define	ULOCKFS_IS_BUSY(LF)	((LF)->ul_flag & ULOCKFS_BUSY)
 #define	ULOCKFS_IS_NOIACC(LF)	((LF)->ul_flag & ULOCKFS_NOIACC)
 #define	ULOCKFS_IS_NOIDEL(LF)	((LF)->ul_flag & ULOCKFS_NOIDEL)
+#define	ULOCKFS_IS_FALLOC(LF)	((LF)->ul_flag & ULOCKFS_FALLOC)
 
 #define	ULOCKFS_CLR_BUSY(LF)	((LF)->ul_flag &= ~ULOCKFS_BUSY)
+#define	ULOCKFS_SET_BUSY(LF)	((LF)->ul_flag |= ULOCKFS_BUSY)
 
-#define	ULOCKFS_SET_BUSY(LF)	((LF)->ul_flag |= ULOCKFS_BUSY)
+#define	ULOCKFS_CLR_FALLOC(LF)	((LF)->ul_flag &= ~ULOCKFS_FALLOC)
+#define	ULOCKFS_SET_FALLOC(LF)	((LF)->ul_flag |= ULOCKFS_FALLOC)
 
 /*
  * ul_fs_mod
@@ -100,15 +104,20 @@
  *
  * softlock will temporarily block most ufs_vnodeops.
  * it is used so that a waiting lockfs command will not be starved
+ *
+ * fwlock will block other fallocate threads wanting to obtain a write lock
+ * on the file system.
  */
-#define	ULOCKFS_ULOCK    ((1 << LOCKFS_ULOCK))	/* unlock */
-#define	ULOCKFS_WLOCK    ((1 << LOCKFS_WLOCK))	/* write  lock */
-#define	ULOCKFS_NLOCK    ((1 << LOCKFS_NLOCK))	/* name   lock */
-#define	ULOCKFS_DLOCK    ((1 << LOCKFS_DLOCK))	/* delete lock */
-#define	ULOCKFS_HLOCK    ((1 << LOCKFS_HLOCK))	/* hard   lock */
-#define	ULOCKFS_ELOCK    ((1 << LOCKFS_ELOCK))	/* error  lock */
-#define	ULOCKFS_ROELOCK  ((1 << LOCKFS_ROELOCK)) /* error lock (read-only) */
-#define	ULOCKFS_SLOCK    0x80000000		/* soft   lock */
+#define	ULOCKFS_ULOCK	((1 << LOCKFS_ULOCK))	/* unlock */
+#define	ULOCKFS_WLOCK	((1 << LOCKFS_WLOCK))	/* write  lock */
+#define	ULOCKFS_NLOCK	((1 << LOCKFS_NLOCK))	/* name   lock */
+#define	ULOCKFS_DLOCK	((1 << LOCKFS_DLOCK))	/* delete lock */
+#define	ULOCKFS_HLOCK	((1 << LOCKFS_HLOCK))	/* hard   lock */
+#define	ULOCKFS_ELOCK	((1 << LOCKFS_ELOCK))	/* error  lock */
+#define	ULOCKFS_ROELOCK	((1 << LOCKFS_ROELOCK)) /* error lock (read-only) */
+/* Maximum number of LOCKFS locks defined in sys/lockfs.h is 6 */
+#define	ULOCKFS_FWLOCK	(1 << (LOCKFS_MAXLOCK + 1)) /* fallocate write lock */
+#define	ULOCKFS_SLOCK	0x80000000		/* soft   lock */
 
 #define	ULOCKFS_IS_WLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_WLOCK)
 #define	ULOCKFS_IS_HLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_HLOCK)
@@ -118,12 +127,16 @@
 #define	ULOCKFS_IS_NLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_NLOCK)
 #define	ULOCKFS_IS_DLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_DLOCK)
 #define	ULOCKFS_IS_SLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_SLOCK)
+#define	ULOCKFS_IS_FWLOCK(LF)	((LF)->ul_fs_lock & ULOCKFS_FWLOCK)
 #define	ULOCKFS_IS_JUSTULOCK(LF) \
 	(((LF)->ul_fs_lock & (ULOCKFS_SLOCK | ULOCKFS_ULOCK)) == ULOCKFS_ULOCK)
 
 #define	ULOCKFS_SET_SLOCK(LF)	((LF)->ul_fs_lock |= ULOCKFS_SLOCK)
 #define	ULOCKFS_CLR_SLOCK(LF)	((LF)->ul_fs_lock &= ~ULOCKFS_SLOCK)
 
+#define	ULOCKFS_SET_FWLOCK(LF)	((LF)->ul_fs_lock |= ULOCKFS_FWLOCK)
+#define	ULOCKFS_CLR_FWLOCK(LF)	((LF)->ul_fs_lock &= ~ULOCKFS_FWLOCK)
+
 #define	ULOCKFS_READ_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | ULOCKFS_SLOCK)
 #define	ULOCKFS_WRITE_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | \
 			ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK)
@@ -161,6 +174,9 @@
 #define	ULOCKFS_FRLOCK_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | ULOCKFS_SLOCK)
 #define	ULOCKFS_SPACE_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | \
 				ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK)
+#define	ULOCKFS_FALLOCATE_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | \
+				ULOCKFS_ROELOCK | ULOCKFS_SLOCK | \
+				ULOCKFS_WLOCK | ULOCKFS_FWLOCK)
 #define	ULOCKFS_QUOTA_MASK	(ULOCKFS_HLOCK | ULOCKFS_ELOCK | \
 				ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK)
 /* GETPAGE breaks up into two masks */
@@ -188,6 +204,7 @@
 	kcondvar_t 	ul_cv;
 	kthread_id_t	ul_sbowner;	/* thread than can write superblock */
 	struct lockfs	ul_lockfs;	/* ioctl lock struct */
+	ulong_t		ul_falloc_cnt;	/* # of on-going fallocate ops */
 };
 
 extern ulong_t ufs_quiesce_pend;
--- a/usr/src/uts/common/sys/fs/ufs_trans.h	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/sys/fs/ufs_trans.h	Tue Nov 15 08:00:52 2005 -0800
@@ -20,7 +20,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -99,7 +99,8 @@
 	TOP_SETSECATTR,		/* 31 */
 	TOP_QUOTA,		/* 32 */
 	TOP_ITRUNC,		/* 33 */
-	TOP_MAX			/* 34 */
+	TOP_ALLOCSP,		/* 34 */
+	TOP_MAX			/* 35 TOP_MAX MUST be the last entry */
 } top_t;
 
 struct inode;
@@ -509,6 +510,8 @@
 				int *, int *);
 extern int		ufs_trans_check(dev_t);
 extern void		ufs_trans_redev(dev_t odev, dev_t ndev);
+extern void		ufs_trans_trunc_resv(struct inode *, u_offset_t, int *,
+				u_offset_t *);
 
 /*
  * transaction prototypes
--- a/usr/src/uts/common/syscall/fcntl.c	Tue Nov 15 03:49:15 2005 -0800
+++ b/usr/src/uts/common/syscall/fcntl.c	Tue Nov 15 08:00:52 2005 -0800
@@ -21,7 +21,7 @@
  */
 /* ONC_PLUS EXTRACT START */
 /*
- * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -56,6 +56,8 @@
 #include <sys/rctl.h>
 #include <sys/nbmlock.h>
 
+#include <sys/cmn_err.h>
+
 /* ONC_PLUS EXTRACT START */
 static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t);
 static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *);
@@ -402,17 +404,27 @@
 
 	case F_ALLOCSP:
 	case F_FREESP:
+	case F_ALLOCSP64:
+	case F_FREESP64:
 		if ((flag & FWRITE) == 0) {
 			error = EBADF;
 			break;
 		}
+
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			break;
 		}
 
+		if (datamodel != DATAMODEL_ILP32 &&
+		    (cmd == F_ALLOCSP64 || cmd == F_FREESP64)) {
+			error = EINVAL;
+			break;
+		}
+
 #if defined(_ILP32) || defined(_SYSCALL32_IMPL)
-		if (datamodel == DATAMODEL_ILP32) {
+		if (datamodel == DATAMODEL_ILP32 &&
+		    (cmd == F_ALLOCSP || cmd == F_FREESP)) {
 			struct flock32 sbf32;
 			/*
 			 * For compatibility we overlay an SVR3 flock on an SVR4
@@ -434,15 +446,47 @@
 #endif /* _ILP32 || _SYSCALL32_IMPL */
 
 #if defined(_LP64)
-		if (datamodel == DATAMODEL_LP64) {
+		if (datamodel == DATAMODEL_LP64 &&
+		    (cmd == F_ALLOCSP || cmd == F_FREESP)) {
 			if (copyin((void *)arg, &bf, sizeof (bf))) {
 				error = EFAULT;
 				break;
 			}
 		}
-#endif
+#endif /* defined(_LP64) */
 
-		if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0)
+#if !defined(_LP64) || defined(_SYSCALL32_IMPL)
+		if (datamodel == DATAMODEL_ILP32 &&
+		    (cmd == F_ALLOCSP64 || cmd == F_FREESP64)) {
+			if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) {
+				error = EFAULT;
+				break;
+			} else {
+				/*
+				 * Note that the size of flock64 is different in
+				 * the ILP32 and LP64 models, due to the l_pad
+				 * field. We do not want to assume that the
+				 * flock64 structure is laid out the same in
+				 * ILP32 and LP64 environments, so we will
+				 * copy in the ILP32 version of flock64
+				 * explicitly and copy it to the native
+				 * flock64 structure.
+				 */
+				bf.l_type = (short)bf64_32.l_type;
+				bf.l_whence = (short)bf64_32.l_whence;
+				bf.l_start = bf64_32.l_start;
+				bf.l_len = bf64_32.l_len;
+				bf.l_sysid = (int)bf64_32.l_sysid;
+				bf.l_pid = (pid_t)bf64_32.l_pid;
+			}
+		}
+#endif /* !defined(_LP64) || defined(_SYSCALL32_IMPL) */
+
+		if (cmd == F_ALLOCSP || cmd == F_FREESP)
+			error = flock_check(vp, &bf, offset, maxoffset);
+		else if (cmd == F_ALLOCSP64 || cmd == F_FREESP64)
+			error = flock_check(vp, &bf, offset, MAXOFFSET_T);
+		if (error)
 			break;
 
 		if (vp->v_type == VREG && bf.l_len == 0 &&
@@ -476,7 +520,14 @@
 				break;
 			}
 		}
+
+		if (cmd == F_ALLOCSP64)
+			cmd = F_ALLOCSP;
+		else if (cmd == F_FREESP64)
+			cmd = F_FREESP;
+
 		error = VOP_SPACE(vp, cmd, &bf, flag, offset, fp->f_cred, NULL);
+
 		break;
 
 #if !defined(_LP64) || defined(_SYSCALL32_IMPL)
@@ -519,6 +570,7 @@
 			error = EFAULT;
 			break;
 		}
+
 		bf.l_type = (short)bf64_32.l_type;
 		bf.l_whence = (short)bf64_32.l_whence;
 		bf.l_start = bf64_32.l_start;
@@ -563,79 +615,7 @@
 		}
 		break;
 /* ONC_PLUS EXTRACT END */
-
-	case F_FREESP64:
-		if (datamodel != DATAMODEL_ILP32) {
-			error = EINVAL;
-			break;
-		}
-		cmd = F_FREESP;
-		if ((flag & FWRITE) == 0)
-			error = EBADF;
-		else if (vp->v_type != VREG)
-			error = EINVAL;
-		else if (copyin((void *)arg, &bf64_32, sizeof (bf64_32)))
-			error = EFAULT;
-		else {
-			/*
-			 * Note that the size of flock64 is different in
-			 * the ILP32 and LP64 models, due to the l_pad field.
-			 * We do not want to assume that the flock64 structure
-			 * is laid out the same in ILP32 and LP64
-			 * environments, so we will copy in the ILP32
-			 * version of flock64 explicitly and copy it to
-			 * the native flock64 structure.
-			 */
-			bf.l_type = (short)bf64_32.l_type;
-			bf.l_whence = (short)bf64_32.l_whence;
-			bf.l_start = bf64_32.l_start;
-			bf.l_len = bf64_32.l_len;
-			bf.l_sysid = (int)bf64_32.l_sysid;
-			bf.l_pid = (pid_t)bf64_32.l_pid;
-
-			if ((error = flock_check(vp, &bf, offset,
-			    MAXOFFSET_T)) != 0)
-				break;
-
-			if (vp->v_type == VREG && bf.l_len == 0 &&
-			    bf.l_start > OFFSET_MAX(fp)) {
-				error = EFBIG;
-				break;
-			}
-			/*
-			 * Make sure that there are no conflicting non-blocking
-			 * mandatory locks in the region being manipulated. If
-			 * there are such locks then return EACCES.
-			 */
-			if ((error = flock_get_start(vp, &bf, offset,
-			    &start)) != 0)
-				break;
-			if (nbl_need_check(vp)) {
-				u_offset_t	begin;
-				ssize_t		length;
-
-				nbl_start_crit(vp, RW_READER);
-				in_crit = 1;
-				vattr.va_mask = AT_SIZE;
-				if ((error = VOP_GETATTR(vp, &vattr, 0,
-				    CRED())) != 0)
-					break;
-				begin = start > vattr.va_size ?
-					vattr.va_size : start;
-				length = vattr.va_size > start ?
-						vattr.va_size - start :
-						start - vattr.va_size;
-				if (nbl_conflict(vp, NBL_WRITE, begin,
-				    length, 0)) {
-					error = EACCES;
-					break;
-				}
-			}
-			error = VOP_SPACE(vp, cmd, &bf, flag, offset,
-			    fp->f_cred, NULL);
-		}
-		break;
-#endif /*  !_LP64 || _SYSCALL32_IMPL */
+#endif /* !defined(_LP64) || defined(_SYSCALL32_IMPL) */
 
 /* ONC_PLUS EXTRACT START */
 	case F_SHARE: