Mercurial > illumos > illumos-gate
changeset 923:78f6e60ae914
PSARC 2004/422 posix_fallocate
4517427 All filesystems need a way to ftruncate/mmap a file with disk-space reservation
line wrap: on
line diff
--- a/usr/src/cmd/fs.d/ufs/fsck/pass1.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/cmd/fs.d/ufs/fsck/pass1.c Tue Nov 15 08:00:52 2005 -0800 @@ -625,6 +625,14 @@ daddr32_t fragno = idesc->id_blkno; struct dinode *dp; + /* + * If this is a fallocate'd file, block numbers may be stored + * as negative. In that case negate the negative numbers. + */ + dp = ginode(idesc->id_number); + if (dp->di_cflags & IFALLOCATE && fragno < 0) + fragno = -fragno; + if ((anyout = chkrange(fragno, idesc->id_numfrags)) != 0) { /* * Note that blkerror() exits when preening.
--- a/usr/src/cmd/fs.d/ufs/mkfs/mkfs.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/cmd/fs.d/ufs/mkfs/mkfs.c Tue Nov 15 08:00:52 2005 -0800 @@ -207,6 +207,11 @@ #define RC_KEYWORD 1 #define RC_POSITIONAL 2 +/* + * ufs hole + */ +#define UFS_HOLE -1 + #ifndef STANDALONE #include <stdio.h> #include <sys/mnttab.h> @@ -4399,8 +4404,15 @@ frags = dbtofsb(&sblock, dp->di_blocks); checkdirect((ino_t)i, &frags, &dp->di_db[0], NDADDR+NIADDR); - for (j = 0; j < NIADDR && frags; ++j) - checkindirect((ino_t)i, &frags, dp->di_ib[j], j); + for (j = 0; j < NIADDR && frags; ++j) { + /* Negate the block if its an fallocate'd block */ + if (dp->di_ib[j] < 0 && dp->di_ib[j] != UFS_HOLE) + checkindirect((ino_t)i, &frags, + -(dp->di_ib[j]), j); + else + checkindirect((ino_t)i, &frags, + dp->di_ib[j], j); + } } }
--- a/usr/src/head/fcntl.h Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/head/fcntl.h Tue Nov 15 08:00:52 2005 -0800 @@ -77,6 +77,9 @@ #ifdef __PRAGMA_REDEFINE_EXTNAME #pragma redefine_extname open open64 #pragma redefine_extname creat creat64 +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +#pragma redefine_extname posix_fallocate posix_fallocate64 +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) #pragma redefine_extname openat openat64 @@ -85,6 +88,9 @@ #else #define open open64 #define creat creat64 +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +#define posix_fallocate posix_fallocate64 +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) #define openat openat64 @@ -97,6 +103,9 @@ #ifdef __PRAGMA_REDEFINE_EXTNAME #pragma redefine_extname open64 open #pragma redefine_extname creat64 creat +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +#pragma redefine_extname posix_fallocate64 posix_fallocate +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) #pragma redefine_extname openat64 openat @@ -105,6 +114,9 @@ #else #define open64 open #define creat64 creat +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +#define posix_fallocate64 posix_fallocate +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... 
*/ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) #define openat64 openat @@ -118,6 +130,9 @@ extern int fcntl(int, int, ...); extern int open(const char *, int, ...); extern int creat(const char *, mode_t); +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +extern int posix_fallocate(int fd, off_t offset, off_t len); +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) extern int openat(int, const char *, int, ...); @@ -132,6 +147,9 @@ !defined(__PRAGMA_REDEFINE_EXTNAME)) extern int open64(const char *, int, ...); extern int creat64(const char *, mode_t); +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +extern int posix_fallocate64(int fd, off64_t offset, off64_t len); +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) extern int openat64(int, const char *, int, ...); @@ -144,6 +162,9 @@ extern int fcntl(); extern int open(); extern int creat(); +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +extern int posix_fallocate(); +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) extern int openat(); @@ -159,6 +180,9 @@ !defined(__PRAGMA_REDEFINE_EXTNAME)) extern int open64(); extern int creat64(); +#if defined(__EXTENSIONS__) || defined(_XPG6) || !defined(__XOPEN_OR_POSIX) +extern int posix_fallocate64(); +#endif /* defined(__EXTENSIONS__) || defined(_XPG6) || ... */ #if defined(__EXTENSIONS__) || !defined(__XOPEN_OR_POSIX) || \ defined(_ATFILE_SOURCE) extern int openat64();
--- a/usr/src/lib/librt/Makefile.com Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/lib/librt/Makefile.com Tue Nov 15 08:00:52 2005 -0800 @@ -20,7 +20,7 @@ # CDDL HEADER END # # -# Copyright 2004 Sun Microsystems, Inc. All rights reserved. +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -33,6 +33,7 @@ OBJECTS= \ aio.o \ clock_timer.o \ + fallocate.o \ fdatasync.o \ mqueue.o \ pos4.o \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/usr/src/lib/librt/common/fallocate.c Tue Nov 15 08:00:52 2005 -0800 @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include "pos4.h" + +#include <stdio.h> + +/* + * Ask the file system, through fcntl(F_ALLOCSP), to allocate storage for the + * byte range [offset, offset + len) of fd. Returns 0 on success; on failure + * returns the error number itself (EINVAL for a negative offset or a + * non-positive len, otherwise errno from fcntl), as POSIX requires, not -1. + */ +int +posix_fallocate(int fd, off_t offset, off_t len) +{ + struct flock lck; + + if (offset < 0 || len <= 0) + return (EINVAL); + + lck.l_whence = 0; + lck.l_start = offset; + lck.l_len = len; + lck.l_type = F_WRLCK; + + if (fcntl(fd, F_ALLOCSP, &lck) == -1) { + return (errno); + } + + return (0); +} + +#if defined(_LARGEFILE64_SOURCE) && !defined(_LP64) + +/* + * Large-file flavor for 32-bit callers: same contract as posix_fallocate(), + * but with 64-bit offsets and fcntl(F_ALLOCSP64). + */ +int +posix_fallocate64(int fd, off64_t offset, off64_t len) +{ + struct flock64 lck; + + if (offset < 0 || len <= 0) + return (EINVAL); + + lck.l_whence = 0; + lck.l_start = offset; + lck.l_len = len; + lck.l_type = F_WRLCK; + + if (fcntl(fd, F_ALLOCSP64, &lck) == -1) { + return (errno); + } + + return (0); +} + +#endif
--- a/usr/src/lib/librt/spec/rt.spec Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/lib/librt/spec/rt.spec Tue Nov 15 08:00:52 2005 -0800 @@ -1,6 +1,3 @@ -# -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. # # CDDL HEADER START # @@ -22,6 +19,10 @@ # # CDDL HEADER END # +# +# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# # ident "%Z%%M% %I% %E% SMI" # # lib/librt/spec/rt.spec @@ -76,6 +77,13 @@ errno EAGAIN EINTR ENOSYS end +function posix_fallocate +include <fcntl.h> +declaration int posix_fallocate(int fd, off_t offset, off_t len) +version SUNW_1.5 +errno EBADF EFBIG EINTR EINVAL EIO ENODEV ENOSPC ESPIPE +end + function fdatasync include <unistd.h> declaration int fdatasync(int fildes) @@ -157,6 +165,12 @@ version SUNW_1.3 end +function posix_fallocate64 extends librt/spec/rt.spec posix_fallocate +declaration int posix_fallocate64(int fd, off64_t offset, off64_t len) +arch i386 sparc +version SUNW_1.5 +end + function mq_close include <mqueue.h> declaration int mq_close(mqd_t mqdes)
--- a/usr/src/lib/librt/spec/versions Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/lib/librt/spec/versions Tue Nov 15 08:00:52 2005 -0800 @@ -26,6 +26,7 @@ # sparc { + SUNW_1.5: {SUNW_1.4}; SUNW_1.4: {SUNW_1.3}; SUNW_1.3: {SUNW_1.2}; SUNW_1.2: {SUNW_1.1}; @@ -34,6 +35,7 @@ SUNWprivate_1.1; } i386 { + SUNW_1.5: {SUNW_1.4}; SUNW_1.4: {SUNW_1.3}; SUNW_1.3: {SUNW_1.2}; SUNW_1.2: {SUNW_1.1}; @@ -42,6 +44,7 @@ SUNWprivate_1.1; } sparcv9 { + SUNW_1.5: {SUNW_1.4}; SUNW_1.4: {SUNW_1.3}; SUNW_1.3: {SUNW_1.2}; SUNW_1.2: {SUNW_1.1}; @@ -50,6 +53,7 @@ SUNWprivate_1.1; } amd64 { + SUNW_1.5: {SUNW_1.4}; SUNW_1.4: {SUNW_1.3}; SUNW_1.3: {SUNW_1.2}; SUNW_1.2: {SUNW_1.1};
--- a/usr/src/uts/common/fs/ufs/ufs_alloc.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_alloc.c Tue Nov 15 08:00:52 2005 -0800 @@ -40,6 +40,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/condvar_impl.h> #include <sys/types.h> #include <sys/t_lock.h> #include <sys/debug.h> @@ -116,9 +117,9 @@ ufsvfsp = ip->i_ufsvfs; fs = ufsvfsp->vfs_fs; if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) { - err = ufs_fault(ITOV(ip), - "alloc: bad size, dev = 0x%lx, bsize = %d, size = %d, fs = %s\n", - ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); + err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx," + " bsize = %d, size = %d, fs = %s\n", + ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt); return (err); } if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) @@ -194,9 +195,9 @@ if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { err = ufs_fault(ITOV(ip), - "realloccg: bad size, dev=0x%lx, bsize=%d, osize=%d, nsize=%d, fs=%s\n", - ip->i_dev, fs->fs_bsize, osize, nsize, - fs->fs_fsmnt); + "realloccg: bad size, dev=0x%lx, bsize=%d, " + "osize=%d, nsize=%d, fs=%s\n", + ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt); return (err); } if (freespace(fs, ufsvfsp) <= 0 && @@ -204,8 +205,8 @@ goto nospace; if (bprev == 0) { err = ufs_fault(ITOV(ip), - "realloccg: bad bprev, dev = 0x%lx, bsize = %d, bprev = %ld, fs = %s\n", - ip->i_dev, fs->fs_bsize, bprev, + "realloccg: bad bprev, dev = 0x%lx, bsize = %d," + " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev, fs->fs_fsmnt); return (err); } @@ -403,9 +404,9 @@ if (ip->i_size) { cmn_err(CE_WARN, - "%s: free inode %d had size 0x%llx, run fsck(1M)%s", - fs->fs_fsmnt, (int)ino, ip->i_size, - (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); + "%s: free inode %d had size 0x%llx, run fsck(1M)%s", + fs->fs_fsmnt, (int)ino, ip->i_size, + (TRANS_ISTRANS(ufsvfsp) ? " -o f" : "")); } /* * Clear any garbage left behind. 
@@ -581,16 +582,27 @@ * next block is requested contiguously, otherwise it is * requested rotationally delayed by fs_rotdelay milliseconds. */ - nextblk = bap[indx - 1] + fs->fs_frag; - if (indx > fs->fs_maxcontig && - bap[indx - fs->fs_maxcontig] + blkstofrags(fs, fs->fs_maxcontig) - != nextblk) + + nextblk = bap[indx - 1]; + /* + * Provision for fallocate to return positive + * blk preference based on last allocation + */ + if (nextblk < 0 && nextblk != UFS_HOLE) { + nextblk = (-bap[indx - 1]) + fs->fs_frag; + } else { + nextblk = bap[indx - 1] + fs->fs_frag; + } + + if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] + + blkstofrags(fs, fs->fs_maxcontig) != nextblk) { return (nextblk); + } if (fs->fs_rotdelay != 0) /* * Here we convert ms of delay to frags as: * (frags) = (ms) * (rev/sec) * (sect/rev) / - * ((sect/frag) * (ms/sec)) + * ((sect/frag) * (ms/sec)) * then round up to the next block. */ nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / @@ -621,16 +633,25 @@ short *blks; daddr_t blkno, cylno, rpos; + /* + * fallocate'd files will have negative block address. + * So negate it again to get original block address. 
+ */ + if (bno < 0 && bno % fs->fs_bsize == 0 && bno != UFS_HOLE) { + bno = -bno; + } + if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) { (void) ufs_fault(ITOV(ip), - "free: bad size, dev = 0x%lx, bsize = %d, size = %d, fs = %s\n", - ip->i_dev, fs->fs_bsize, (int)size, fs->fs_fsmnt); + "free: bad size, dev = 0x%lx, bsize = %d, size = %d, " + "fs = %s\n", ip->i_dev, fs->fs_bsize, + (int)size, fs->fs_fsmnt); return; } cg = dtog(fs, bno); ASSERT(!ufs_badblock(ip, bno)); bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize); + (int)fs->fs_cgsize); cgp = bp->b_un.b_cg; if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { @@ -770,7 +791,7 @@ } cg = (int)itog(fs, ino); bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize); + (int)fs->fs_cgsize); cgp = bp->b_un.b_cg; if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { @@ -785,9 +806,9 @@ mutex_exit(&ufsvfsp->vfs_lock); brelse(bp); (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, " - "mode: (imode) %o, (omode) %o, ino:%d, " - "fs:%s", - ip->i_mode, mode, (int)ino, fs->fs_fsmnt); + "mode: (imode) %o, (omode) %o, ino:%d, " + "fs:%s", + ip->i_mode, mode, (int)ino, fs->fs_fsmnt); return; } clrbit(iused, inot); @@ -889,7 +910,7 @@ } bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize); + (int)fs->fs_cgsize); cgp = bp->b_un.b_cg; if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { brelse(bp); @@ -963,7 +984,7 @@ if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize); + (int)fs->fs_cgsize); cgp = bp->b_un.b_cg; if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) || @@ -1162,8 +1183,8 @@ if (fs_postbl(ufsvfsp, pos)[i] == -1) { (void) ufs_fault(ufsvfsp->vfs_root, - "alloccgblk: cyl groups corrupted, pos = %d, i = %d, fs = %s\n", - pos, i, fs->fs_fsmnt); + "alloccgblk: cyl groups 
corrupted, pos = %d, " + "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); return (0); } @@ -1196,8 +1217,8 @@ i += delta; } (void) ufs_fault(ufsvfsp->vfs_root, - "alloccgblk: can't find blk in cyl, pos:%d, i:%d, fs:%s bno: %x\n", - pos, i, fs->fs_fsmnt, (int)bno); + "alloccgblk: can't find blk in cyl, pos:%d, i:%d, " + "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno); return (0); } norot: @@ -1290,8 +1311,8 @@ if (loc == 0) { mutex_exit(&ufsvfsp->vfs_lock); (void) ufs_fault(ITOV(ip), - "ialloccg: map corrupted, cg = %d, irotor = %d, fs = %s\n", - cg, (int)cgp->cg_irotor, fs->fs_fsmnt); + "ialloccg: map corrupted, cg = %d, irotor = %d, " + "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt); return (0); } } @@ -1409,9 +1430,10 @@ * metadata into userdata (harpy). If so, ignore. */ if (!TRANS_ISCANCEL(ufsvfsp, - ldbtob(fsbtodb(fs, (cfrag+bno))), - allocsiz * fs->fs_fsize)) + ldbtob(fsbtodb(fs, (cfrag+bno))), + allocsiz * fs->fs_fsize)) return (bno); + /* * keep looking -- this block is being converted */ @@ -1445,6 +1467,352 @@ #define TRIPLE 2 /* triple indirect block ptr */ /* + * Acquire a write lock, and keep trying till we get it + */ +static int +allocsp_wlockfs(struct vnode *vp, struct lockfs *lf) +{ + int err = 0; + +lockagain: + do { + err = ufs_fiolfss(vp, lf); + if (err) + return (err); + } while (!LOCKFS_IS_ULOCK(lf)); + + lf->lf_lock = LOCKFS_WLOCK; + lf->lf_flags = 0; + lf->lf_comment = NULL; + err = ufs__fiolfs(vp, lf, 1, 0); + + if (err == EBUSY || err == EINVAL) + goto lockagain; + + return (err); +} + +/* + * Release the write lock + */ +static int +allocsp_unlockfs(struct vnode *vp, struct lockfs *lf) +{ + int err = 0; + + lf->lf_lock = LOCKFS_ULOCK; + lf->lf_flags = 0; + err = ufs__fiolfs(vp, lf, 1, 0); + return (err); +} + +struct allocsp_undo { + daddr_t offset; + daddr_t blk; + struct allocsp_undo *next; +}; + +/* + * ufs_allocsp() can be used to pre-allocate blocks for a file on a given + * file system.
The blocks are not initialized and are only marked as allocated. + * These addresses are then stored as negative block numbers in the inode to + * imply special handling. UFS has been modified where necessary to understand + * this new notion. Successfully fallocated files will have IFALLOCATE cflag + * set in the inode. + * + * Returns 0 on success or an errno value. If a block allocation fails, the + * blocks allocated by this call are released and the old file size restored. + */ +int +ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr) +{ + struct lockfs lf; + int berr = 0, err = 0, resv, issync; + off_t start, istart, len; /* istart, special for idb */ + struct inode *ip; + struct fs *fs; + struct ufsvfs *ufsvfsp; + u_offset_t resid, i; + daddr32_t db_undo[NDADDR]; /* old direct blocks */ + struct allocsp_undo *ib_undo = NULL; /* ib undo */ + struct allocsp_undo *undo = NULL; + u_offset_t osz; /* old file size */ + int chunkblks = 0; /* # of blocks in 1 allocation */ + int cnt = 0; + daddr_t allocblk; + daddr_t totblks = 0; + struct ulockfs *ulp; + + ASSERT(vp->v_type == VREG); + + ip = VTOI(vp); + fs = ip->i_fs; + if ((ufsvfsp = ip->i_ufsvfs) == NULL) { + err = EIO; + goto out_allocsp; + } + + istart = start = blkroundup(fs, (lp->l_start)); + len = blkroundup(fs, (lp->l_len)); + chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize; + ulp = &ufsvfsp->vfs_ulockfs; + + if (lp->l_start < 0 || lp->l_len <= 0) + return (EINVAL); + + /* Quickly check to make sure we have space before we proceed */ + if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) { + if (TRANS_ISTRANS(ufsvfsp)) { + ufs_delete_drain_wait(ufsvfsp, 1); + if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) + return (ENOSPC); + } else + return (ENOSPC); + } + + /* + * We will keep i_rwlock locked as WRITER through out the function + * since we don't want anyone else reading or writing to the inode + * while we are in the middle of fallocating the file.
+ */ + rw_enter(&ip->i_rwlock, RW_WRITER); + + /* Back up the direct block list, used for undo later if necessary */ + rw_enter(&ip->i_contents, RW_READER); + for (i = 0; i < NDADDR; i++) + db_undo[i] = ip->i_db[i]; + osz = ip->i_size; + rw_exit(&ip->i_contents); + + /* Allocate any direct blocks now before we write lock the fs */ + if (lblkno(fs, start) < NDADDR) { + ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize), + &resv, &resid); + TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); + + rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); + rw_enter(&ip->i_contents, RW_WRITER); + + for (i = start; (i < start + len) && (lblkno(fs, i) < NDADDR); + i += fs->fs_bsize) { + berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, + &allocblk, cr); + /* Yikes error, quit */ + if (berr) { + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + TRANS_END_CSYNC(ufsvfsp, err, issync, + TOP_ALLOCSP, resv); + goto exit; + } + + if (allocblk) { + totblks++; + ip->i_size += fs->fs_bsize; + } + } + + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); + + istart = i; /* start offset for indirect allocation */ + } + + /* Write lock the file system */ + if (err = allocsp_wlockfs(vp, &lf)) + goto exit; + + /* Break the transactions into vfs_iotransz units */ + ufs_trans_trunc_resv(ip, ip->i_size + + blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid); + TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); + + rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); + rw_enter(&ip->i_contents, RW_WRITER); + + /* Now go about fallocating necessary indirect blocks */ + for (i = istart; i < start + len; i += fs->fs_bsize) { + berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE, + &allocblk, cr); + if (berr) { + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + TRANS_END_CSYNC(ufsvfsp, err, issync, + TOP_ALLOCSP, resv); + err =
allocsp_unlockfs(vp, &lf); + goto exit; + } + + /* Update the blk counter only if new block was added */ + if (allocblk) { + /* Save undo information */ + undo = kmem_alloc(sizeof (struct allocsp_undo), + KM_SLEEP); + undo->offset = i; + undo->blk = allocblk; + undo->next = ib_undo; + ib_undo = undo; + totblks++; + ip->i_size += fs->fs_bsize; + } + cnt++; + + /* Being a good UFS citizen, let others get a share */ + if (cnt == chunkblks) { + /* + * If there are waiters or the fs is hard locked, + * error locked, or read-only error locked, + * quit with EIO + */ + if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || + ULOCKFS_IS_ROELOCK(ulp)) { + ip->i_cflags |= IFALLOCATE; + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + + TRANS_END_CSYNC(ufsvfsp, err, issync, + TOP_ALLOCSP, resv); + err = EIO; + goto out_freeundo; + } + + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + + /* End the current transaction */ + TRANS_END_CSYNC(ufsvfsp, err, issync, + TOP_ALLOCSP, resv); + + if (CV_HAS_WAITERS(&ulp->ul_cv)) { + /* Release the write lock */ + if (err = allocsp_unlockfs(vp, &lf)) + goto exit; + + /* Wake up others waiting to do operations */ + mutex_enter(&ulp->ul_lock); + cv_broadcast(&ulp->ul_cv); + mutex_exit(&ulp->ul_lock); + + /* Grab the write lock again */ + if (err = allocsp_wlockfs(vp, &lf)) + goto exit; + } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */ + + /* Reserve more space in log for this file */ + ufs_trans_trunc_resv(ip, + ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz), + &resv, &resid); + TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); + + rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); + rw_enter(&ip->i_contents, RW_WRITER); + + cnt = 0; /* reset cnt b/c of new transaction */ + } + } + + if (!err && !berr) + ip->i_cflags |= IFALLOCATE; + + /* Release locks, end log transaction and unlock fs */ + TRANS_INODE(ufsvfsp, ip); + rw_exit(&ip->i_contents); +
rw_exit(&ufsvfsp->vfs_dqrwlock); + + TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); + err = allocsp_unlockfs(vp, &lf); + + /* + * @ exit label, we should no longer be holding the fs write lock, and + * all logging transactions should have been ended. We still hold + * ip->i_rwlock. + */ +exit: + /* + * File has grown larger than 2GB. Set flag + * in superblock to indicate this, if it + * is not already set. + */ + if ((ip->i_size > MAXOFF32_T) && + !(fs->fs_flags & FSLARGEFILES)) { + ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES); + mutex_enter(&ufsvfsp->vfs_lock); + fs->fs_flags |= FSLARGEFILES; + ufs_sbwrite(ufsvfsp); + mutex_exit(&ufsvfsp->vfs_lock); + } + + /* + * Since we couldn't allocate completely, we will undo the allocations. + */ + if (berr) { + ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid); + TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv); + + rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER); + rw_enter(&ip->i_contents, RW_WRITER); + + /* Direct blocks */ + for (i = 0; i < NDADDR; i++) { + /* + * Only free the block if it differs from the saved + * one and the saved one was zero, i.e. it was newly + * allocated here (re-allocated fragments are kept).
+ */ + if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) { + free(ip, ip->i_db[i], fs->fs_bsize, 0); + ip->i_db[i] = 0; + } + } + + /* Undo the indirect blocks */ + while (ib_undo != NULL) { + undo = ib_undo; + err = bmap_set_bn(vp, undo->offset, 0); + if (err) + cmn_err(CE_PANIC, "ufs_allocsp(): failed to " + "undo allocation of block %ld", + undo->offset); + free(ip, undo->blk, fs->fs_bsize, I_IBLK); + ib_undo = undo->next; + kmem_free(undo, sizeof (struct allocsp_undo)); + } + + ip->i_size = osz; + TRANS_INODE(ufsvfsp, ip); + + rw_exit(&ip->i_contents); + rw_exit(&ufsvfsp->vfs_dqrwlock); + + TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv); + + rw_exit(&ip->i_rwlock); + return (berr); + } + +out_freeundo: + /* + * Don't forget to free the undo chain :) + */ + while (ib_undo != NULL) { + undo = ib_undo; + ib_undo = undo->next; + kmem_free(undo, sizeof (struct allocsp_undo)); + } + + rw_exit(&ip->i_rwlock); + +out_allocsp: + return (err); +} + +/* * Free storage space associated with the specified inode. The portion * to be freed is specified by lp->l_start and lp->l_len (already * normalized to a "whence" of 0). @@ -1556,8 +1924,8 @@ * find the largest contiguous range in this cg */ bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, - (daddr_t)fsbtodb(fs, cgtod(fs, cg)), - (int)fs->fs_cgsize); + (daddr_t)fsbtodb(fs, cgtod(fs, cg)), + (int)fs->fs_cgsize); cgp = bp->b_un.b_cg; if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) { brelse(bp);
--- a/usr/src/uts/common/fs/ufs/ufs_bmap.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_bmap.c Tue Nov 15 08:00:52 2005 -0800 @@ -60,9 +60,9 @@ #include <sys/errno.h> #include <sys/sysmacros.h> #include <sys/vfs.h> -#include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/kmem.h> +#include <sys/cmn_err.h> /* * This structure is used to track blocks as we allocate them, so that @@ -165,14 +165,14 @@ #define VERYLARGEFILESIZE 0x7FE00000 /* - * bmap{rd,wr} define the structure of file system storage by mapping + * bmap{read,write} define the structure of file system storage by mapping * a logical offset in a file to a physical block number on the device. * It should be called with a locked inode when allocation is to be - * done (bmapwr). Note this strangeness: bmapwr is always called from + * done (bmap_write). Note this strangeness: bmap_write is always called from * getpage(), not putpage(), since getpage() is where all the allocation * is done. * - * S_READ, S_OTHER -> bmaprd; S_WRITE -> bmapwr. + * S_READ, S_OTHER -> bmap_read; S_WRITE -> bmap_write. * * NOTICE: the block number returned is the disk block number, not the * file system block number. All the worries about block offsets and @@ -296,24 +296,21 @@ } /* - * See bmaprd for general notes. + * See bmap_read for general notes. * * The block must be at least size bytes and will be extended or - * allocated as needed. If alloc_only is set, bmap will not create - * any in-core pages that correspond to the new disk allocation. - * Otherwise, the in-core pages will be created and initialized as - * needed. + * allocated as needed. If alloc_type is of type BI_ALLOC_ONLY, then bmap + * will not create any in-core pages that correspond to the new disk allocation. + * If alloc_type is of BI_FALLOCATE, blocks will be stored as (-1) * block addr + * and security is maintained b/c upon reading a negative block number pages + * are zeroed. 
For all other allocation types (BI_NORMAL) the in-core pages will + * be created and initialized as needed. * * Returns 0 on success, or a non-zero errno if an error occurs. */ - int -bmap_write( - struct inode *ip, - u_offset_t off, - int size, - int alloc_only, - struct cred *cr) +bmap_write(struct inode *ip, u_offset_t off, int size, + enum bi_type alloc_type, daddr_t *allocblk, struct cred *cr) { struct fs *fs; struct buf *bp; @@ -340,6 +337,9 @@ ASSERT(RW_WRITE_HELD(&ip->i_contents)); + if (allocblk) + *allocblk = 0; + ufsvfsp = ip->i_ufsvfs; fs = ufsvfsp->vfs_bufp->b_un.b_fs; lbn = (daddr_t)lblkno(fs, off); @@ -360,7 +360,7 @@ issync = ((ip->i_flag & ISYNC) != 0); if (isdirquota || issync) { - alloc_only = 0; /* make sure */ + alloc_type = BI_NORMAL; /* make sure */ } /* @@ -422,6 +422,7 @@ ASSERT((unsigned)ip->i_blocks <= INT_MAX); TRANS_INODE(ufsvfsp, ip); ip->i_flag |= IUPD | ICHG | IATTCHG; + /* Caller is responsible for updating i_seq */ /* * Don't check metaflag here, directories won't do this @@ -465,7 +466,7 @@ } } /* - * need to allocate a block or frag + * need to re-allocate a block or frag */ ob = nb; pref = blkpref(ip, lbn, (int)lbn, @@ -474,6 +475,8 @@ (int)nsize, &nb, cr); if (err) return (err); + if (allocblk) + *allocblk = nb; ASSERT(!ufs_badblock(ip, nb)); } else { @@ -501,6 +504,8 @@ err = alloc(ip, pref, (int)nsize, &nb, cr); if (err) return (err); + if (allocblk) + *allocblk = nb; ASSERT(!ufs_badblock(ip, nb)); ob = nb; } @@ -513,7 +518,13 @@ /* * mmap S_WRITE faults always enter here */ - if (!alloc_only || P2ROUNDUP_TYPED(size, + /* + * We zero it if its also BI_FALLOCATE, but + * only for direct blocks! 
+ */ + if (alloc_type == BI_NORMAL || + alloc_type == BI_FALLOCATE || + P2ROUNDUP_TYPED(size, PAGESIZE, u_offset_t) < nsize) { /* fbzero doesn't cause a pagefault */ fbzero(ITOV(ip), @@ -548,6 +559,7 @@ ASSERT((unsigned)ip->i_blocks <= INT_MAX); TRANS_INODE(ufsvfsp, ip); ip->i_flag |= IUPD | ICHG | IATTCHG; + /* Caller is responsible for updating i_seq */ /* @@ -708,6 +720,7 @@ shft -= nindirshift; /* sh /= nindir */ i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */ nb = bap[i]; + if (nb == 0) { /* * Check to see if doing this will make the @@ -750,8 +763,10 @@ } ASSERT(!ufs_badblock(ip, nb)); + ASSERT(alloced_blocks <= NIADDR); - ASSERT(alloced_blocks <= NIADDR); + if (allocblk) + *allocblk = nb; undo_table[alloced_blocks].this_block = nb; undo_table[alloced_blocks].block_size = bsize; @@ -787,7 +802,8 @@ return (err); } brelse(nbp); - } else if (!alloc_only || P2ROUNDUP_TYPED(size, + } else if (alloc_type == BI_NORMAL || + P2ROUNDUP_TYPED(size, PAGESIZE, u_offset_t) < bsize) { TRANS_MATA_ALLOC(ufsvfsp, ip, nb, bsize, 0); fbzero(ITOV(ip), @@ -846,12 +862,24 @@ } bap = bp->b_un.b_daddr; bap[i] = nb; + + /* + * The magic explained: j will be equal to NIADDR + * when we are at the lowest level, this is where the + * array entries point directly to data blocks. Since + * we will be 'fallocate'ing we will go ahead and negate + * the addresses. 
+ */ + if (alloc_type == BI_FALLOCATE && j == NIADDR) + bap[i] = -bap[i]; + TRANS_BUF_ITEM_128(ufsvfsp, bap[i], bap, bp, DT_AB); added_sectors += btodb(bsize); ip->i_blocks += btodb(bsize); ASSERT((unsigned)ip->i_blocks <= INT_MAX); TRANS_INODE(ufsvfsp, ip); ip->i_flag |= IUPD | ICHG | IATTCHG; + /* Caller is responsible for updating i_seq */ undo_table[alloced_blocks-1].owner = @@ -910,8 +938,8 @@ */ if (dblks <= NDADDR) return (mblks < dblks); + nindirshift = ip->i_ufsvfs->vfs_nindirshift; - nindirshift = ip->i_ufsvfs->vfs_nindirshift; nindiroffset = ip->i_ufsvfs->vfs_nindiroffset; nindirblks = nindiroffset + 1; @@ -965,6 +993,7 @@ bn = *sbp; if (bn == 0) return (0); + diff = fs->fs_frag; if (*lenp) { n = MIN(n, lblkno(fs, *lenp)); @@ -1286,3 +1315,98 @@ } return (error); } + +/* + * Set a particular offset in the inode list to be a certain block. + * User is responsible for calling TRANS* functions + * + * Returns 0 on success, EFBIG if off lies outside the range the inode can + * map, or the error from a missing or unreadable indirect block. + */ +int +bmap_set_bn(struct vnode *vp, u_offset_t off, daddr32_t bn) +{ + daddr_t lbn; + struct inode *ip; + ufsvfs_t *ufsvfsp; + struct fs *fs; + struct buf *bp; + int i, j; + int shft; /* we maintain sh = 1 << shft */ + int err; + daddr_t ob, nb, tbn; + daddr32_t *bap; + int nindirshift, nindiroffset; + + ip = VTOI(vp); + ufsvfsp = ip->i_ufsvfs; + fs = ufsvfsp->vfs_fs; + lbn = (daddr_t)lblkno(fs, off); + + ASSERT(RW_LOCK_HELD(&ip->i_contents)); + + if (lbn < 0) + return (EFBIG); + + /* + * Take care of direct block assignment + */ + if (lbn < NDADDR) { + ip->i_db[lbn] = bn; + return (0); + } + + nindirshift = ip->i_ufsvfs->vfs_nindirshift; + nindiroffset = ip->i_ufsvfs->vfs_nindiroffset; + /* + * Determine how many levels of indirection. + */ + shft = 0; /* sh = 1 */ + tbn = lbn - NDADDR; + for (j = NIADDR; j > 0; j--) { + longlong_t sh; + + shft += nindirshift; /* sh *= nindir */ + sh = 1LL << shft; + if (tbn < sh) + break; + tbn -= sh; + } + if (j == 0) + return (EFBIG); + + /* + * Fetch the first indirect block.
+ */ + nb = ip->i_ib[NIADDR - j]; + if (nb == 0) + return (ufs_fault(ITOV(ip), "ufs_set_bn: nb == UFS_HOLE")); + + /* + * Fetch through the indirect blocks. + */ + for (; j <= NIADDR; j++) { + ob = nb; + bp = UFS_BREAD(ufsvfsp, + ip->i_dev, fsbtodb(fs, ob), fs->fs_bsize); + if (bp->b_flags & B_ERROR) { + err = geterror(bp); + brelse(bp); + return (err); + } + bap = bp->b_un.b_daddr; + + ASSERT(!ufs_indir_badblock(ip, bap)); + + shft -= nindirshift; /* sh / nindir */ + i = (tbn >> shft) & nindiroffset; /* (tbn / sh) % nindir */ + + if (j == NIADDR) { + bap[i] = bn; + bdrwrite(bp); + return (0); + } + brelse(bp); + } + return (0); +}
--- a/usr/src/uts/common/fs/ufs/ufs_directio.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_directio.c Tue Nov 15 08:00:52 2005 -0800 @@ -515,15 +515,16 @@ n = (int)MIN(fs->fs_bsize - on, resid); if ((uoff + n) > ip->i_size) { error = bmap_write(ip, uoff, (int)(on + n), - (int)(uoff & (offset_t)MAXBOFFSET) == 0, - cr); + (int)(uoff & (offset_t)MAXBOFFSET) == 0, + NULL, cr); /* Caller is responsible for updating i_seq if needed */ if (error) break; ip->i_size = uoff + n; ip->i_flag |= IATTCHG; } else if (n == MAXBSIZE) { - error = bmap_write(ip, uoff, (int)(on + n), 1, cr); + error = bmap_write(ip, uoff, (int)(on + n), + BI_ALLOC_ONLY, NULL, cr); /* Caller is responsible for updating i_seq if needed */ } else { if (has_holes < 0) @@ -535,7 +536,8 @@ offset = uoff & (offset_t)fs->fs_bmask; blk_size = (int)blksize(fs, ip, (daddr_t)lblkno(fs, offset)); - error = bmap_write(ip, uoff, blk_size, 0, cr); + error = bmap_write(ip, uoff, blk_size, + BI_NORMAL, NULL, cr); /* * Caller is responsible for updating * i_seq if needed
--- a/usr/src/uts/common/fs/ufs/ufs_extvnops.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_extvnops.c Tue Nov 15 08:00:52 2005 -0800 @@ -321,8 +321,8 @@ DEBUGF((CE_CONT, "?ufs_alloc_data: grow %llx -> %llx\n", ip->i_size, uoff + nbytes)); - error = bmap_write(ip, uoff, (offsetn + nbytes), 1, - credp); + error = bmap_write(ip, uoff, (offsetn + nbytes), + BI_ALLOC_ONLY, NULL, credp); if (ip->i_flag & (ICHG|IUPD)) ip->i_seq++; if (error) { @@ -456,7 +456,7 @@ * We have to allocate blocks for the hole. */ error = bmap_write(ip, uoff, (offsetn + nbytes), - 1, credp); + BI_ALLOC_ONLY, NULL, credp); if (ip->i_flag & (ICHG|IUPD)) ip->i_seq++; if (error) {
--- a/usr/src/uts/common/fs/ufs/ufs_lockfs.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_lockfs.c Tue Nov 15 08:00:52 2005 -0800 @@ -99,8 +99,11 @@ typedef struct _ulockfs_info { struct _ulockfs_info *next; struct ulockfs *ulp; + uint_t flags; } ulockfs_info_t; +#define ULOCK_INFO_FALLOCATE 0x00000001 /* fallocate thread */ + /* * Check in TSD that whether we are already doing any VOP on this filesystem */ @@ -238,6 +241,11 @@ ufs_quiesce(struct ulockfs *ulp) { int error = 0; + ulockfs_info_t *head; + ulockfs_info_t *info; + + head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); + SEARCH_ULOCKFSP(head, ulp, info); /* * Set a softlock to suspend future ufs_vnops so that @@ -247,16 +255,27 @@ ASSERT(ufs_quiesce_pend); /* check if there is any outstanding ufs vnodeops calls */ - while (ulp->ul_vnops_cnt) + while (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt) { /* * use timed version of cv_wait_sig() to make sure we don't * miss a wake up call from ufs_pageio() when it doesn't use * ul_lock. + * + * when a fallocate thread comes in, the only way it returns + * from this function is if there are no other vnode operations + * going on (remember fallocate threads are tracked using + * ul_falloc_cnt not ul_vnops_cnt), and another fallocate thread + * hasn't already grabbed the fs write lock. 
*/ + if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { + if (!ulp->ul_vnops_cnt && !ULOCKFS_IS_FWLOCK(ulp)) + goto out; + } if (!cv_timedwait_sig(&ulp->ul_cv, &ulp->ul_lock, lbolt + hz)) { error = EINTR; goto out; } + } out: /* @@ -266,6 +285,7 @@ return (error); } + /* * ufs_flush_inode */ @@ -843,6 +863,8 @@ int errlck = NO_ERRLCK; int poll_events = POLLPRI; extern struct pollhead ufs_pollhd; + ulockfs_info_t *head; + ulockfs_info_t *info; /* check valid lock type */ if (!lockfsp || lockfsp->lf_lock > LOCKFS_MAXLOCK) @@ -855,6 +877,9 @@ ufsvfsp = (struct ufsvfs *)vfsp->vfs_data; ulp = &ufsvfsp->vfs_ulockfs; + head = (ulockfs_info_t *)tsd_get(ufs_lockfs_key); + SEARCH_ULOCKFSP(head, ulp, info); + /* * Suspend both the reclaim thread and the delete thread. * This must be done outside the lockfs locking protocol. @@ -950,12 +975,31 @@ LOCKFS_SET_BUSY(&ulp->ul_lockfs); /* + * We need to unset FWLOCK status before we call ufs_quiesce + * so that the thread doesnt get suspended. We do this only if + * this (fallocate) thread requested an unlock operation. + */ + if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { + if (!ULOCKFS_IS_WLOCK(ulp)) + ULOCKFS_CLR_FWLOCK(ulp); + } + + /* * Quiesce (wait for outstanding accesses to finish) */ if (error = ufs_quiesce(ulp)) goto errout; /* + * If the fallocate thread requested a write fs lock operation + * then we set fwlock status in the ulp. 
+ */ + if (info && (info->flags & ULOCK_INFO_FALLOCATE)) { + if (ULOCKFS_IS_WLOCK(ulp)) + ULOCKFS_SET_FWLOCK(ulp); + } + + /* * can't wlock or (ro)elock fs with accounting or local swap file */ if ((ULOCKFS_IS_WLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) || @@ -1195,7 +1239,14 @@ return (EINTR); } } - atomic_add_long(&ulp->ul_vnops_cnt, 1); + + if (mask & ULOCKFS_FWLOCK) { + atomic_add_long(&ulp->ul_falloc_cnt, 1); + ULOCKFS_SET_FALLOC(ulp); + } else { + atomic_add_long(&ulp->ul_vnops_cnt, 1); + } + return (0); } @@ -1260,9 +1311,14 @@ * First time VOP call */ mutex_enter(&ulp->ul_lock); - if (ULOCKFS_IS_JUSTULOCK(ulp)) - atomic_add_long(&ulp->ul_vnops_cnt, 1); - else { + if (ULOCKFS_IS_JUSTULOCK(ulp)) { + if (mask & ULOCKFS_FWLOCK) { + atomic_add_long(&ulp->ul_falloc_cnt, 1); + ULOCKFS_SET_FALLOC(ulp); + } else { + atomic_add_long(&ulp->ul_vnops_cnt, 1); + } + } else { if (error = ufs_check_lockfs(ufsvfsp, ulp, mask)) { mutex_exit(&ulp->ul_lock); if (ulockfs_info_free == NULL) @@ -1275,9 +1331,13 @@ if (ulockfs_info_free != NULL) { ulockfs_info_free->ulp = ulp; + if (mask & ULOCKFS_FWLOCK) + ulockfs_info_free->flags |= ULOCK_INFO_FALLOCATE; } else { ulockfs_info_temp->ulp = ulp; ulockfs_info_temp->next = ulockfs_info; + if (mask & ULOCKFS_FWLOCK) + ulockfs_info_temp->flags |= ULOCK_INFO_FALLOCATE; ASSERT(ufs_lockfs_key != 0); (void) tsd_set(ufs_lockfs_key, (void *)ulockfs_info_temp); } @@ -1339,7 +1399,20 @@ mutex_enter(&ulp->ul_lock); - if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) + /* fallocate thread */ + if (ULOCKFS_IS_FALLOC(ulp) && info->flags & ULOCK_INFO_FALLOCATE) { + if (!atomic_add_long_nv(&ulp->ul_falloc_cnt, -1)) + ULOCKFS_CLR_FALLOC(ulp); + } else { /* normal thread */ + if (!atomic_add_long_nv(&ulp->ul_vnops_cnt, -1)) + cv_broadcast(&ulp->ul_cv); + } + + /* Clear the thread's fallocate state */ + if (info->flags & ULOCK_INFO_FALLOCATE) + info->flags &= ~ULOCK_INFO_FALLOCATE; + + if (ulp->ul_vnops_cnt == 0 && ulp->ul_falloc_cnt) cv_broadcast(&ulp->ul_cv); 
mutex_exit(&ulp->ul_lock);
--- a/usr/src/uts/common/fs/ufs/ufs_trans.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_trans.c Tue Nov 15 08:00:52 2005 -0800 @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -740,7 +740,7 @@ * trunc request. If the amount of log space is too large, then * calculate the the size that the requests needs to be split into. */ -static void +void ufs_trans_trunc_resv( struct inode *ip, u_offset_t length,
--- a/usr/src/uts/common/fs/ufs/ufs_vfsops.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_vfsops.c Tue Nov 15 08:00:52 2005 -0800 @@ -1444,7 +1444,7 @@ /* * if the file system is busy; return EBUSY */ - if (ulp->ul_vnops_cnt || ULOCKFS_IS_SLOCK(ulp)) { + if (ulp->ul_vnops_cnt || ulp->ul_falloc_cnt || ULOCKFS_IS_SLOCK(ulp)) { error = EBUSY; goto out; }
--- a/usr/src/uts/common/fs/ufs/ufs_vnops.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/fs/ufs/ufs_vnops.c Tue Nov 15 08:00:52 2005 -0800 @@ -908,7 +908,7 @@ * is done here before we up the file size. */ error = bmap_write(ip, uoff, (int)(on + n), - mapon == 0, cr); + mapon == 0, NULL, cr); /* * bmap_write never drops i_contents so if * the flags are set it changed the file. @@ -946,7 +946,8 @@ * needed blocks are allocated first. */ iblocks = ip->i_blocks; - error = bmap_write(ip, uoff, (int)(on + n), 1, cr); + error = bmap_write(ip, uoff, (int)(on + n), + BI_ALLOC_ONLY, NULL, cr); /* * bmap_write never drops i_contents so if * the flags are set it changed the file. @@ -4228,31 +4229,33 @@ /* ARGSUSED */ static int -ufs_space( - struct vnode *vp, - int cmd, - struct flock64 *bfp, - int flag, - offset_t offset, - cred_t *cr, - caller_context_t *ct) +ufs_space(struct vnode *vp, int cmd, struct flock64 *bfp, int flag, + offset_t offset, cred_t *cr, caller_context_t *ct) { - struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; + struct ufsvfs *ufsvfsp = VTOI(vp)->i_ufsvfs; struct ulockfs *ulp; int error; - error = ufs_lockfs_begin(ufsvfsp, &ulp, ULOCKFS_SPACE_MASK); - if (error) - return (error); - - - if (cmd != F_FREESP) - error = EINVAL; - else if ((error = convoff(vp, bfp, 0, offset)) == 0) - error = ufs_freesp(vp, bfp, flag, cr); - - if (ulp) - ufs_lockfs_end(ulp); + if ((error = convoff(vp, bfp, 0, offset)) == 0) { + if (cmd == F_FREESP) { + error = ufs_lockfs_begin(ufsvfsp, &ulp, + ULOCKFS_SPACE_MASK); + if (error) + return (error); + error = ufs_freesp(vp, bfp, flag, cr); + } else if (cmd == F_ALLOCSP) { + error = ufs_lockfs_begin(ufsvfsp, &ulp, + ULOCKFS_FALLOCATE_MASK); + if (error) + return (error); + error = ufs_allocsp(vp, bfp, cr); + } else + return (EINVAL); /* Command not handled here */ + + if (ulp) + ufs_lockfs_end(ulp); + + } return (error); } @@ -4455,7 +4458,8 @@ offset = uoff & (offset_t)fs->fs_bmask; while (offset < uoff + len) { blk_size = 
(int)blksize(fs, ip, lblkno(fs, offset)); - err = bmap_write(ip, offset, blk_size, 0, cr); + err = bmap_write(ip, offset, blk_size, + BI_NORMAL, NULL, cr); if (ip->i_flag & (ICHG|IUPD)) ip->i_seq++; if (err) @@ -4657,7 +4661,7 @@ page_t *pp; daddr_t bn; size_t io_len; - int crpage; + int crpage = 0; int err; int contig; int bsize = ip->i_fs->fs_bsize; @@ -4672,7 +4676,16 @@ contig = 0; if (err = bmap_read(ip, off, &bn, &contig)) return (err); + crpage = (bn == UFS_HOLE); + + /* + * If its also a fallocated block that hasn't been written to + * yet, we will treat it just like a UFS_HOLE and create + * a zero page for it + */ + if (ISFALLOCBLK(ip, bn)) + crpage = 1; } if (crpage) { @@ -4684,6 +4697,7 @@ if (rw != S_CREATE) pagezero(pp, 0, PAGESIZE); + io_len = PAGESIZE; } else { u_offset_t io_off; @@ -4777,6 +4791,7 @@ struct buf *bp; daddr_t bn; size_t io_len; + int err; int contig; int xlen; int bsize = ip->i_fs->fs_bsize; @@ -4799,7 +4814,12 @@ return (0); contig = 0; - if (bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UFS_HOLE) + err = bmap_read(ip, io_off, &bn, &contig); + /* + * If its a UFS_HOLE or a fallocated block, do not perform + * any read ahead's since there probably is nothing to read ahead + */ + if (err || bn == UFS_HOLE || ISFALLOCBLK(ip, bn)) return (0); /* @@ -5202,6 +5222,18 @@ } /* + * If it is an fallocate'd block, reverse the negativity since + * we are now writing to it + */ + if (ISFALLOCBLK(ip, bn)) { + err = bmap_set_bn(vp, off, dbtofsb(fs, -bn)); + if (err) + goto out; + + bn = -bn; + } + + /* * Take the length (of contiguous bytes) passed back from bmap() * and _try_ and get a set of pages covering that extent. */
--- a/usr/src/uts/common/sys/fcntl.h Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/sys/fcntl.h Tue Nov 15 08:00:52 2005 -0800 @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -122,7 +122,6 @@ #define F_CHKFL 8 /* Unused */ #define F_DUP2FD 9 /* Duplicate fildes at third arg */ -#define F_ALLOCSP 10 /* Reserved */ #define F_ISSTREAM 13 /* Is the file desc. a stream ? */ #define F_PRIV 15 /* Turn on private access to file */ #define F_NPRIV 16 /* Turn off private access to file */ @@ -153,6 +152,7 @@ /* "Native" application compilation environment */ #define F_SETLK 6 /* Set file lock */ #define F_SETLKW 7 /* Set file lock and wait */ +#define F_ALLOCSP 10 /* Allocate file space */ #define F_FREESP 11 /* Free file space */ #define F_GETLK 14 /* Get file lock */ #define F_SETLK_NBMAND 42 /* private */ @@ -160,6 +160,7 @@ /* ILP32 large file application compilation environment version */ #define F_SETLK 34 /* Set file lock */ #define F_SETLKW 35 /* Set file lock and wait */ +#define F_ALLOCSP 28 /* Alllocate file space */ #define F_FREESP 27 /* Free file space */ #define F_GETLK 33 /* Get file lock */ #define F_SETLK_NBMAND 44 /* private */ @@ -176,12 +177,14 @@ */ #define F_SETLK64 34 /* Set file lock */ #define F_SETLKW64 35 /* Set file lock and wait */ +#define F_ALLOCSP64 28 /* Allocate file space */ #define F_FREESP64 27 /* Free file space */ #define F_GETLK64 33 /* Get file lock */ #define F_SETLK64_NBMAND 44 /* private */ #else #define F_SETLK64 6 /* Set file lock */ #define F_SETLKW64 7 /* Set file lock and wait */ +#define F_ALLOCSP64 10 /* Allocate file space */ #define F_FREESP64 11 /* Free file space */ #define F_GETLK64 14 /* Get file lock */ #define F_SETLK64_NBMAND 42 /* private */
--- a/usr/src/uts/common/sys/fs/ufs_inode.h Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/sys/fs/ufs_inode.h Tue Nov 15 08:00:52 2005 -0800 @@ -368,7 +368,8 @@ #define IQUIET 0x20000 /* No file system full messages */ /* cflags */ -#define IXATTR 0x0001 /* Extended attribute */ +#define IXATTR 0x0001 /* extended attribute */ +#define IFALLOCATE 0x0002 /* fallocate'd file */ /* modes */ #define IFMT 0170000 /* type of file */ @@ -562,7 +563,7 @@ * and make sure any in-core pages are initialized. */ #define BMAPALLOC(ip, off, size, cr) \ - bmap_write((ip), (u_offset_t)(off), (size), 0, cr) + bmap_write((ip), (u_offset_t)(off), (size), BI_NORMAL, NULL, cr) #define ESAME (-1) /* trying to rename linked files (special) */ @@ -579,6 +580,16 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* + * block initialization type for bmap_write + * + * BI_NORMAL - allocate and zero fill pages in memory + * BI_ALLOC_ONLY - only allocate the block, do not zero out pages in mem + * BI_FALLOCATE - allocate only, do not zero out pages, and store as negative + * block number in inode block list + */ +enum bi_type { BI_NORMAL, BI_ALLOC_ONLY, BI_FALLOCATE }; + +/* * This overlays the fid structure (see vfs.h) * * LP64 note: we use int32_t instead of ino_t since UFS does not use @@ -796,6 +807,10 @@ /* inohsz is guaranteed to be a power of 2 */ #define INOHASH(ino) (((int)ino) & (inohsz - 1)) +#define ISFALLOCBLK(ip, bn) \ + (((bn) < 0) && ((bn) % ip->i_fs->fs_bsize == 0) && \ + ((ip)->i_cflags & IFALLOCATE && (bn) != UFS_HOLE)) + union ihead { union ihead *ih_head[2]; struct inode *ih_chain[2]; @@ -857,6 +872,7 @@ extern int alloc(struct inode *, daddr_t, int, daddr_t *, cred_t *); extern int realloccg(struct inode *, daddr_t, daddr_t, int, int, daddr_t *, cred_t *); +extern int ufs_allocsp(struct vnode *, struct flock64 *, cred_t *); extern int ufs_freesp(struct vnode *, struct flock64 *, int, cred_t *); extern ino_t dirpref(inode_t *); extern daddr_t blkpref(struct inode *, 
daddr_t, int, daddr32_t *); @@ -866,9 +882,11 @@ offset_t, enum uio_seg, int *, cred_t *); extern int bmap_read(struct inode *, u_offset_t, daddr_t *, int *); -extern int bmap_write(struct inode *, u_offset_t, int, int, struct cred *); +extern int bmap_write(struct inode *, u_offset_t, int, enum bi_type, + daddr_t *, struct cred *); extern int bmap_has_holes(struct inode *); extern int bmap_find(struct inode *, boolean_t, u_offset_t *); +extern int bmap_set_bn(struct vnode *, u_offset_t, daddr32_t); extern void ufs_vfs_add(struct ufsvfs *); extern void ufs_vfs_remove(struct ufsvfs *);
--- a/usr/src/uts/common/sys/fs/ufs_lockfs.h Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/sys/fs/ufs_lockfs.h Tue Nov 15 08:00:52 2005 -0800 @@ -79,14 +79,18 @@ #define ULOCKFS_BUSY 0x00000001 /* ul_fs_lock is being set */ #define ULOCKFS_NOIACC 0x00000004 /* don't keep access times */ #define ULOCKFS_NOIDEL 0x00000008 /* don't free deleted files */ +#define ULOCKFS_FALLOC 0x00000010 /* fallocate threads exist */ #define ULOCKFS_IS_BUSY(LF) ((LF)->ul_flag & ULOCKFS_BUSY) #define ULOCKFS_IS_NOIACC(LF) ((LF)->ul_flag & ULOCKFS_NOIACC) #define ULOCKFS_IS_NOIDEL(LF) ((LF)->ul_flag & ULOCKFS_NOIDEL) +#define ULOCKFS_IS_FALLOC(LF) ((LF)->ul_flag & ULOCKFS_FALLOC) #define ULOCKFS_CLR_BUSY(LF) ((LF)->ul_flag &= ~ULOCKFS_BUSY) +#define ULOCKFS_SET_BUSY(LF) ((LF)->ul_flag |= ULOCKFS_BUSY) -#define ULOCKFS_SET_BUSY(LF) ((LF)->ul_flag |= ULOCKFS_BUSY) +#define ULOCKFS_CLR_FALLOC(LF) ((LF)->ul_flag &= ~ULOCKFS_FALLOC) +#define ULOCKFS_SET_FALLOC(LF) ((LF)->ul_flag |= ULOCKFS_FALLOC) /* * ul_fs_mod @@ -100,15 +104,20 @@ * * softlock will temporarily block most ufs_vnodeops. * it is used so that a waiting lockfs command will not be starved + * + * fwlock will block other fallocate threads wanting to obtain a write lock + * on the file system. 
*/ -#define ULOCKFS_ULOCK ((1 << LOCKFS_ULOCK)) /* unlock */ -#define ULOCKFS_WLOCK ((1 << LOCKFS_WLOCK)) /* write lock */ -#define ULOCKFS_NLOCK ((1 << LOCKFS_NLOCK)) /* name lock */ -#define ULOCKFS_DLOCK ((1 << LOCKFS_DLOCK)) /* delete lock */ -#define ULOCKFS_HLOCK ((1 << LOCKFS_HLOCK)) /* hard lock */ -#define ULOCKFS_ELOCK ((1 << LOCKFS_ELOCK)) /* error lock */ -#define ULOCKFS_ROELOCK ((1 << LOCKFS_ROELOCK)) /* error lock (read-only) */ -#define ULOCKFS_SLOCK 0x80000000 /* soft lock */ +#define ULOCKFS_ULOCK ((1 << LOCKFS_ULOCK)) /* unlock */ +#define ULOCKFS_WLOCK ((1 << LOCKFS_WLOCK)) /* write lock */ +#define ULOCKFS_NLOCK ((1 << LOCKFS_NLOCK)) /* name lock */ +#define ULOCKFS_DLOCK ((1 << LOCKFS_DLOCK)) /* delete lock */ +#define ULOCKFS_HLOCK ((1 << LOCKFS_HLOCK)) /* hard lock */ +#define ULOCKFS_ELOCK ((1 << LOCKFS_ELOCK)) /* error lock */ +#define ULOCKFS_ROELOCK ((1 << LOCKFS_ROELOCK)) /* error lock (read-only) */ +/* Maximum number of LOCKFS lockfs defined in sys/lockfs.h are 6 */ +#define ULOCKFS_FWLOCK (1 << (LOCKFS_MAXLOCK + 1)) /* fallocate write lock */ +#define ULOCKFS_SLOCK 0x80000000 /* soft lock */ #define ULOCKFS_IS_WLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_WLOCK) #define ULOCKFS_IS_HLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_HLOCK) @@ -118,12 +127,16 @@ #define ULOCKFS_IS_NLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_NLOCK) #define ULOCKFS_IS_DLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_DLOCK) #define ULOCKFS_IS_SLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_SLOCK) +#define ULOCKFS_IS_FWLOCK(LF) ((LF)->ul_fs_lock & ULOCKFS_FWLOCK) #define ULOCKFS_IS_JUSTULOCK(LF) \ (((LF)->ul_fs_lock & (ULOCKFS_SLOCK | ULOCKFS_ULOCK)) == ULOCKFS_ULOCK) #define ULOCKFS_SET_SLOCK(LF) ((LF)->ul_fs_lock |= ULOCKFS_SLOCK) #define ULOCKFS_CLR_SLOCK(LF) ((LF)->ul_fs_lock &= ~ULOCKFS_SLOCK) +#define ULOCKFS_SET_FWLOCK(LF) ((LF)->ul_fs_lock |= ULOCKFS_FWLOCK) +#define ULOCKFS_CLR_FWLOCK(LF) ((LF)->ul_fs_lock &= ~ULOCKFS_FWLOCK) + #define ULOCKFS_READ_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | 
ULOCKFS_SLOCK) #define ULOCKFS_WRITE_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | \ ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK) @@ -161,6 +174,9 @@ #define ULOCKFS_FRLOCK_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | ULOCKFS_SLOCK) #define ULOCKFS_SPACE_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | \ ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK) +#define ULOCKFS_FALLOCATE_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | \ + ULOCKFS_ROELOCK | ULOCKFS_SLOCK | \ + ULOCKFS_WLOCK | ULOCKFS_FWLOCK) #define ULOCKFS_QUOTA_MASK (ULOCKFS_HLOCK | ULOCKFS_ELOCK | \ ULOCKFS_ROELOCK | ULOCKFS_SLOCK | ULOCKFS_WLOCK) /* GETPAGE breaks up into two masks */ @@ -188,6 +204,7 @@ kcondvar_t ul_cv; kthread_id_t ul_sbowner; /* thread than can write superblock */ struct lockfs ul_lockfs; /* ioctl lock struct */ + ulong_t ul_falloc_cnt; /* # of on-going fallocate ops */ }; extern ulong_t ufs_quiesce_pend;
--- a/usr/src/uts/common/sys/fs/ufs_trans.h Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/sys/fs/ufs_trans.h Tue Nov 15 08:00:52 2005 -0800 @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -99,7 +99,8 @@ TOP_SETSECATTR, /* 31 */ TOP_QUOTA, /* 32 */ TOP_ITRUNC, /* 33 */ - TOP_MAX /* 34 */ + TOP_ALLOCSP, /* 34 */ + TOP_MAX /* 35 TOP_MAX MUST be the last entry */ } top_t; struct inode; @@ -509,6 +510,8 @@ int *, int *); extern int ufs_trans_check(dev_t); extern void ufs_trans_redev(dev_t odev, dev_t ndev); +extern void ufs_trans_trunc_resv(struct inode *, u_offset_t, int *, + u_offset_t *); /* * transaction prototypes
--- a/usr/src/uts/common/syscall/fcntl.c Tue Nov 15 03:49:15 2005 -0800 +++ b/usr/src/uts/common/syscall/fcntl.c Tue Nov 15 08:00:52 2005 -0800 @@ -21,7 +21,7 @@ */ /* ONC_PLUS EXTRACT START */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,6 +56,8 @@ #include <sys/rctl.h> #include <sys/nbmlock.h> +#include <sys/cmn_err.h> + /* ONC_PLUS EXTRACT START */ static int flock_check(vnode_t *, flock64_t *, offset_t, offset_t); static int flock_get_start(vnode_t *, flock64_t *, offset_t, u_offset_t *); @@ -402,17 +404,27 @@ case F_ALLOCSP: case F_FREESP: + case F_ALLOCSP64: + case F_FREESP64: if ((flag & FWRITE) == 0) { error = EBADF; break; } + if (vp->v_type != VREG) { error = EINVAL; break; } + if (datamodel != DATAMODEL_ILP32 && + (cmd == F_ALLOCSP64 || cmd == F_FREESP64)) { + error = EINVAL; + break; + } + #if defined(_ILP32) || defined(_SYSCALL32_IMPL) - if (datamodel == DATAMODEL_ILP32) { + if (datamodel == DATAMODEL_ILP32 && + (cmd == F_ALLOCSP || cmd == F_FREESP)) { struct flock32 sbf32; /* * For compatibility we overlay an SVR3 flock on an SVR4 @@ -434,15 +446,47 @@ #endif /* _ILP32 || _SYSCALL32_IMPL */ #if defined(_LP64) - if (datamodel == DATAMODEL_LP64) { + if (datamodel == DATAMODEL_LP64 && + (cmd == F_ALLOCSP || cmd == F_FREESP)) { if (copyin((void *)arg, &bf, sizeof (bf))) { error = EFAULT; break; } } -#endif +#endif /* defined(_LP64) */ - if ((error = flock_check(vp, &bf, offset, maxoffset)) != 0) +#if !defined(_LP64) || defined(_SYSCALL32_IMPL) + if (datamodel == DATAMODEL_ILP32 && + (cmd == F_ALLOCSP64 || cmd == F_FREESP64)) { + if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) { + error = EFAULT; + break; + } else { + /* + * Note that the size of flock64 is different in + * the ILP32 and LP64 models, due to the l_pad + * field. 
We do not want to assume that the + * flock64 structure is laid out the same in + * ILP32 and LP64 environments, so we will + * copy in the ILP32 version of flock64 + * explicitly and copy it to the native + * flock64 structure. + */ + bf.l_type = (short)bf64_32.l_type; + bf.l_whence = (short)bf64_32.l_whence; + bf.l_start = bf64_32.l_start; + bf.l_len = bf64_32.l_len; + bf.l_sysid = (int)bf64_32.l_sysid; + bf.l_pid = (pid_t)bf64_32.l_pid; + } + } +#endif /* !defined(_LP64) || defined(_SYSCALL32_IMPL) */ + + if (cmd == F_ALLOCSP || cmd == F_FREESP) + error = flock_check(vp, &bf, offset, maxoffset); + else if (cmd == F_ALLOCSP64 || cmd == F_FREESP64) + error = flock_check(vp, &bf, offset, MAXOFFSET_T); + if (error) break; if (vp->v_type == VREG && bf.l_len == 0 && @@ -476,7 +520,14 @@ break; } } + + if (cmd == F_ALLOCSP64) + cmd = F_ALLOCSP; + else if (cmd == F_FREESP64) + cmd = F_FREESP; + error = VOP_SPACE(vp, cmd, &bf, flag, offset, fp->f_cred, NULL); + break; #if !defined(_LP64) || defined(_SYSCALL32_IMPL) @@ -519,6 +570,7 @@ error = EFAULT; break; } + bf.l_type = (short)bf64_32.l_type; bf.l_whence = (short)bf64_32.l_whence; bf.l_start = bf64_32.l_start; @@ -563,79 +615,7 @@ } break; /* ONC_PLUS EXTRACT END */ - - case F_FREESP64: - if (datamodel != DATAMODEL_ILP32) { - error = EINVAL; - break; - } - cmd = F_FREESP; - if ((flag & FWRITE) == 0) - error = EBADF; - else if (vp->v_type != VREG) - error = EINVAL; - else if (copyin((void *)arg, &bf64_32, sizeof (bf64_32))) - error = EFAULT; - else { - /* - * Note that the size of flock64 is different in - * the ILP32 and LP64 models, due to the l_pad field. - * We do not want to assume that the flock64 structure - * is laid out the same in ILP32 and LP64 - * environments, so we will copy in the ILP32 - * version of flock64 explicitly and copy it to - * the native flock64 structure. 
- */ - bf.l_type = (short)bf64_32.l_type; - bf.l_whence = (short)bf64_32.l_whence; - bf.l_start = bf64_32.l_start; - bf.l_len = bf64_32.l_len; - bf.l_sysid = (int)bf64_32.l_sysid; - bf.l_pid = (pid_t)bf64_32.l_pid; - - if ((error = flock_check(vp, &bf, offset, - MAXOFFSET_T)) != 0) - break; - - if (vp->v_type == VREG && bf.l_len == 0 && - bf.l_start > OFFSET_MAX(fp)) { - error = EFBIG; - break; - } - /* - * Make sure that there are no conflicting non-blocking - * mandatory locks in the region being manipulated. If - * there are such locks then return EACCES. - */ - if ((error = flock_get_start(vp, &bf, offset, - &start)) != 0) - break; - if (nbl_need_check(vp)) { - u_offset_t begin; - ssize_t length; - - nbl_start_crit(vp, RW_READER); - in_crit = 1; - vattr.va_mask = AT_SIZE; - if ((error = VOP_GETATTR(vp, &vattr, 0, - CRED())) != 0) - break; - begin = start > vattr.va_size ? - vattr.va_size : start; - length = vattr.va_size > start ? - vattr.va_size - start : - start - vattr.va_size; - if (nbl_conflict(vp, NBL_WRITE, begin, - length, 0)) { - error = EACCES; - break; - } - } - error = VOP_SPACE(vp, cmd, &bf, flag, offset, - fp->f_cred, NULL); - } - break; -#endif /* !_LP64 || _SYSCALL32_IMPL */ +#endif /* !defined(_LP64) || defined(_SYSCALL32_IMPL) */ /* ONC_PLUS EXTRACT START */ case F_SHARE: