Mercurial > illumos > nfs4.1
view usr/src/uts/common/fs/nfs/nfs41_srv.c @ 13954:bd01f634e673
nfs: fix erratum at op_rename
mds_op_rename and rfs4_op_rename have erratum about
getting attributes.
author | Vitaliy Gusev <gusev.vitaliy@nexenta.com> |
---|---|
date | Wed, 21 Aug 2013 23:38:55 +0400 |
parents | efa4353d4edf |
children | 31878c1ccb7e |
line wrap: on
line source
/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. */ /* * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All Rights Reserved */ #include <sys/param.h> #include <sys/types.h> #include <sys/systm.h> #include <sys/cred.h> #include <sys/buf.h> #include <sys/vfs.h> #include <sys/vnode.h> #include <sys/uio.h> #include <sys/errno.h> #include <sys/sysmacros.h> #include <sys/statvfs.h> #include <sys/kmem.h> #include <sys/dirent.h> #include <sys/cmn_err.h> #include <sys/disp.h> #include <sys/debug.h> #include <sys/systeminfo.h> #include <sys/flock.h> #include <sys/pathname.h> #include <sys/nbmlock.h> #include <sys/share.h> #include <sys/atomic.h> #include <sys/policy.h> #include <sys/fem.h> #include <sys/sdt.h> #include <sys/ddi.h> #include <sys/modctl.h> #include <sys/timod.h> #include <sys/id_space.h> #include <rpc/types.h> #include <rpc/auth.h> #include <rpc/rpcsec_gss.h> #include <rpc/svc.h> #include <nfs/nfs.h> #include <nfs/export.h> #include <nfs/lm.h> #include <nfs/nfs4.h> #include <sys/strsubr.h> #include <sys/strsun.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> #include <sys/tsol/label.h> #include <sys/tsol/tndb.h> #include <nfs/nfs4_attrmap.h> #include <nfs/nfs4_srv_attr.h> #include <nfs/mds_state.h> #include <nfs/mds_odl.h> #include <nfs/nfs41_layout.h> #include <nfs/nfs41_filehandle.h> #include <nfs/ctl_mds_clnt.h> #include <nfs/spe_impl.h> #define RFS4_MAXLOCK_TRIES 4 /* Try to get the lock this many times */ static int rfs4_maxlock_tries = RFS4_MAXLOCK_TRIES; #define RFS4_LOCK_DELAY 10 /* Milliseconds */ static clock_t rfs4_lock_delay = RFS4_LOCK_DELAY; int mds_strict_seqid = 0; static void ping_cb_null_thr(mds_session_t *); /* End of Tunables */ /* * Used to bump the stateid4.seqid value and show changes in the stateid */ #define next_stateid(sp) (++(sp)->v41_bits.chgseq) /* * RFS4_MINLEN_ENTRY4: XDR-encoded size of smallest possible dirent. * This is used to return NFS4ERR_TOOSMALL when clients specify * maxcount that isn't large enough to hold the smallest possible * XDR encoded dirent. * * sizeof cookie (8 bytes) + * sizeof name_len (4 bytes) + * sizeof smallest (padded) name (4 bytes) + * sizeof bitmap4_len (12 bytes) + NOTE: we always encode len=2 bm4 * sizeof attrlist4_len (4 bytes) + * sizeof next boolean (4 bytes) * * RFS4_MINLEN_RDDIR4: XDR-encoded size of READDIR op reply containing * the smallest possible entry4 (assumes no attrs requested). * sizeof nfsstat4 (4 bytes) + * sizeof verifier4 (8 bytes) + * sizeof entry4list bool (4 bytes) + * sizeof entry4 (36 bytes) + * sizeof eof bool (4 bytes) * * RFS4_MINLEN_RDDIR_BUF: minimum length of buffer server will provide to * VOP_READDIR. Its value is the size of the maximum possible dirent * for solaris. The DIRENT64_RECLEN macro returns the size of dirent * required for a given name length. MAXNAMELEN is the maximum * filename length allowed in Solaris. The first two DIRENT64_RECLEN() * macros are to allow for . and .. entries -- just a minor tweak to try * and guarantee that buffer we give to VOP_READDIR will be large enough * to hold ., .., and the largest possible solaris dirent64. */ #define RFS4_MINLEN_ENTRY4 36 #define RFS4_MINLEN_RDDIR4 (4 + NFS4_VERIFIER_SIZE + 4 + RFS4_MINLEN_ENTRY4 + 4) #define RFS4_MINLEN_RDDIR_BUF \ (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2) + DIRENT64_RECLEN(MAXNAMELEN)) /* * It would be better to pad to 4 bytes since that's what XDR would do, * but the dirents UFS gives us are already padded to 8, so just take * what we're given. Dircount is only a hint anyway. Currently the * solaris kernel is ASCII only, so there's no point in calling the * UTF8 functions. * * dirent64: named padded to provide 8 byte struct alignment * d_ino(8) + d_off(8) + d_reclen(2) + d_name(namelen + null(1) + pad) * * cookie: uint64_t + utf8namelen: uint_t + utf8name padded to 8 bytes * */ #define DIRENT64_TO_DIRCOUNT(dp) \ (3 * BYTES_PER_XDR_UNIT + DIRENT64_NAMELEN((dp)->d_reclen)) /* * types of label comparison */ #define EQUALITY_CHECK 0 #define DOMINANCE_CHECK 1 static sysid_t lockt_sysid; /* dummy sysid for all LOCKT calls */ void rfs4_init_compound_state(struct compound_state *); static void nullfree(nfs_resop4 *); static void mds_op_inval(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_notsup(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_access(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_close(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_commit(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_create(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_delegreturn(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_getattr(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_getattr_free(nfs_resop4 *); static void mds_op_getfh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_getfh_free(nfs_resop4 *); static void mds_op_link(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_lock(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_lock_denied_free(nfs_resop4 *); static void mds_op_locku(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_lockt(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_lookup(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_lookupp(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_openattr(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *); static void mds_op_nverify(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_open(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_open_downgrade(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_putfh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_putpubfh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_putrootfh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_read(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_read_free(nfs_resop4 *); void mds_op_readdir(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_readdir_free(nfs_resop4 *); static void mds_op_readlink(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_readlink_free(nfs_resop4 *); static void mds_op_release_lockowner(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_remove(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_rename(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_renew(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_restorefh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_savefh(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_setattr(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_verify(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_write(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_exchange_id(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_exid_free(nfs_resop4 *); static void mds_op_secinfo(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_secinfonn(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); nfsstat4 do_rfs4_op_secinfo(struct compound_state *, char *, int, SECINFO4res *); static void mds_op_secinfo_free(nfs_resop4 *); static void mds_op_backchannel_ctl(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_bind_conn_to_session(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_create_clientid(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_create_session(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_destroy_session(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_sequence(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_get_devlist(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_get_devinfo(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_layout_get(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_layout_get_free(nfs_resop4 *); static void mds_op_layout_commit(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_layout_return(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static void mds_op_reclaim_complete(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); static int seq_chk_limits(nfs_argop4 *, nfs_resop4 *, compound_state_t *); static bool_t valid_first_compound_op(nfs_opnum4); static nfsstat4 verify_compound_args(compound_state_t *, COMPOUND4args *); nfsstat4 check_open_access(uint32_t, struct compound_state *, struct svc_req *); nfsstat4 rfs4_client_sysid(rfs4_client_t *, sysid_t *); static void mds_free_reply(nfs_resop4 *); vnode_t *do_rfs4_op_mknod(CREATE4args *, CREATE4res *, struct svc_req *, struct compound_state *, vattr_t *, char *); nfsstat4 rfs4_do_lock(rfs4_lo_state_t *, nfs_lock_type4, seqid4, offset4, length4, cred_t *, nfs_resop4 *); rfs4_lo_state_t *mds_findlo_state_by_owner(rfs4_lockowner_t *, rfs4_state_t *, bool_t *); bool_t in_flavor_list(int, int *, int); nfsstat4 attrmap4_to_vattrmask(attrmap4 *, struct nfs4_svgetit_arg *); nfsstat4 bitmap4_get_sysattrs(struct nfs4_svgetit_arg *); nfsstat4 do_rfs4_op_getattr(attrmap4 *, fattr4 *, struct nfs4_svgetit_arg *); nfsstat4 do_rfs4_op_lookup(char *, uint_t, struct svc_req *, struct compound_state *); rfs4_lockowner_t *mds_findlockowner_by_pid(nfs_server_instance_t *, pid_t); mds_session_t *mds_findsession_by_id(nfs_server_instance_t *, sessionid4); rfs4_openowner_t *mds_findopenowner(nfs_server_instance_t *, open_owner4 *, bool_t *); static void mds_op_nverify(nfs_argop4 *, nfs_resop4 *, struct svc_req *, compound_state_t *); extern mds_mpd_t *mds_find_mpd(nfs_server_instance_t *, id_t); extern void rfs41_lo_seqid(stateid_t *); extern void mds_clean_grants_by_fsid(rfs4_client_t *, vnode_t *); nfsstat4 create_vnode(vnode_t *, char *, vattr_t *, createmode4, timespec32_t *, cred_t *, vnode_t **, bool_t *); /* HACKERY */ nfsstat4 rfs4_get_all_state(struct compound_state *, stateid4 *, rfs4_state_t **, rfs4_deleg_state_t **, rfs4_lo_state_t **); void rfs4_ss_clid(struct compound_state *, rfs4_client_t *, struct svc_req *); void rfs4_ss_chkclid(struct compound_state *, rfs4_client_t *); int layout_match(stateid_t, stateid4, nfsstat4 *); extern stateid4 special0; extern stateid4 special1; #define ISSPECIAL(id) (stateid4_cmp(id, &special0) || \ stateid4_cmp(id, &special1)) void rfs4_cn_release(compound_state_t *); mds_layout_grant_t *rfs41_findlogrant(struct compound_state *, rfs4_file_t *, rfs4_client_t *, bool_t *); void rfs41_lo_grant_rele(mds_layout_grant_t *); mds_ever_grant_t *rfs41_findevergrant(rfs4_client_t *, vnode_t *, bool_t *); void rfs41_ever_grant_rele(mds_ever_grant_t *); static uint32_t compute_use_pnfs_flags(uint32_t); /* ARGSUSED */ static void mds_op_notsup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { DTRACE_NFSV4_1(op__notsup__start, strcut compound_state *, cs); *cs->statusp = *((nfsstat4 *)&(resop)->nfs_resop4_u) = NFS4ERR_NOTSUPP; DTRACE_NFSV4_1(op__notsup__done, struct compound_state *, cs); } /* ARGSUSED */ static void mds_op_illegal(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { DTRACE_NFSV4_1(op__illegal__start, struct compound_state *, cs); *cs->statusp = *((nfsstat4 *)&(resop)->nfs_resop4_u) = NFS4ERR_OP_ILLEGAL; DTRACE_NFSV4_1(op__illegal__done, struct compound_state *, cs); } /* ARGSUSED */ static void mds_op_inval(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { DTRACE_NFSV4_1(op__inval__start, struct compound_state *, cs); *cs->statusp = *((nfsstat4 *)&(resop)->nfs_resop4_u) = NFS4ERR_INVAL; DTRACE_NFSV4_1(op__inval__done, struct compound_state *, cs); } /*ARGSUSED*/ static void nullfree(nfs_resop4 *resop) { } static op_disp_tbl_t mds_disptab[] = { {mds_op_illegal, nullfree, DISP_OP_BAD, "BAD Op 0"}, {mds_op_illegal, nullfree, DISP_OP_BAD, "BAD Op 1"}, {mds_op_illegal, nullfree, DISP_OP_BAD, "BAD Op 2"}, {mds_op_access, nullfree, DISP_OP_MDS, "ACCESS"}, {mds_op_close, nullfree, DISP_OP_MDS, "CLOSE"}, {mds_op_commit, nullfree, DISP_OP_BOTH, "COMMIT"}, {mds_op_create, nullfree, DISP_OP_MDS, "CREATE"}, {mds_op_inval, nullfree, DISP_OP_BAD, "BAD Op 7"}, {mds_op_delegreturn, nullfree, DISP_OP_MDS, "DELEGRETURN"}, {mds_op_getattr, mds_op_getattr_free, DISP_OP_MDS, "GETATTR"}, {mds_op_getfh, mds_op_getfh_free, DISP_OP_MDS, "GETFH"}, {mds_op_link, nullfree, DISP_OP_MDS, "LINK"}, {mds_op_lock, mds_lock_denied_free, DISP_OP_MDS, "LOCK"}, {mds_op_lockt, mds_lock_denied_free, DISP_OP_MDS, "LOCKT"}, {mds_op_locku, nullfree, DISP_OP_MDS, "LOCKU"}, {mds_op_lookup, nullfree, DISP_OP_MDS, "LOOKUP"}, {mds_op_lookupp, nullfree, DISP_OP_MDS, "LOOKUPP"}, {mds_op_nverify, nullfree, DISP_OP_MDS, "NVERIFY"}, {mds_op_open, mds_free_reply, DISP_OP_MDS, "OPEN"}, {mds_op_openattr, nullfree, DISP_OP_MDS, "OPENATTR"}, {mds_op_notsup, nullfree, DISP_OP_BAD, "BAD Op 20"}, {mds_op_open_downgrade, nullfree, DISP_OP_MDS, "OPEN_DOWNGRADE"}, {mds_op_putfh, nullfree, DISP_OP_BOTH, "PUTFH"}, {mds_op_putpubfh, nullfree, DISP_OP_MDS, "PUTPUBFH"}, {mds_op_putrootfh, nullfree, DISP_OP_MDS, "PUTROOTFH"}, {mds_op_read, mds_op_read_free, DISP_OP_BOTH, "READ"}, {mds_op_readdir, mds_op_readdir_free, DISP_OP_MDS, "READDIR"}, {mds_op_readlink, mds_op_readlink_free, DISP_OP_MDS, "READLINK"}, {mds_op_remove, nullfree, DISP_OP_MDS, "REMOVE"}, {mds_op_rename, nullfree, DISP_OP_MDS, "RENAME"}, {mds_op_notsup, nullfree, DISP_OP_BAD, "BAD Op 30"}, {mds_op_restorefh, nullfree, DISP_OP_MDS, "RESTOREFH"}, {mds_op_savefh, nullfree, DISP_OP_MDS, "SAVEFH"}, {mds_op_secinfo, mds_op_secinfo_free, DISP_OP_MDS, "SECINFO"}, {mds_op_setattr, nullfree, DISP_OP_MDS, "SETATTR"}, {mds_op_notsup, nullfree, DISP_OP_BAD, "BAD Op 35"}, {mds_op_notsup, nullfree, DISP_OP_BAD, "BAD Op 36"}, {mds_op_verify, nullfree, DISP_OP_MDS, "VERIFY"}, {mds_op_write, nullfree, DISP_OP_BOTH, "WRITE"}, {mds_op_notsup, nullfree, DISP_OP_BAD, "BAD Op 39"}, {mds_op_backchannel_ctl, nullfree, DISP_OP_BOTH, "BACKCHANNEL_CTL"}, {mds_op_bind_conn_to_session, nullfree, DISP_OP_BOTH, "BIND_CONN_TO_SESS"}, {mds_op_exchange_id, mds_op_exid_free, DISP_OP_BOTH, "EXCHANGE_ID"}, {mds_op_create_session, nullfree, DISP_OP_BOTH, "CREATE_SESS"}, {mds_op_destroy_session, nullfree, DISP_OP_BOTH, "DESTROY_SESS"}, {mds_op_illegal, nullfree, DISP_OP_MDS, "FREE_STATEID"}, {mds_op_illegal, nullfree, DISP_OP_MDS, "GET_DIR_DELEG"}, {mds_op_get_devinfo, nullfree, DISP_OP_MDS, "GET_DEVINFO"}, {mds_op_get_devlist, nullfree, DISP_OP_MDS, "GET_DEVLIST"}, {mds_op_layout_commit, nullfree, DISP_OP_MDS, "LAYOUT_COMMIT"}, {mds_op_layout_get, mds_op_layout_get_free, DISP_OP_MDS, "LAYOUT_GET"}, {mds_op_layout_return, nullfree, DISP_OP_MDS, "LAYOUT_RETURN"}, {mds_op_secinfonn, nullfree, DISP_OP_BOTH, "SECINFO_NONAME"}, {mds_op_sequence, nullfree, DISP_OP_BOTH, "SEQUENCE"}, {mds_op_notsup, nullfree, DISP_OP_BOTH, "SET_SSV"}, {mds_op_notsup, nullfree, DISP_OP_MDS, "TEST_STATEID"}, {mds_op_notsup, nullfree, DISP_OP_MDS, "WANT_DELEG"}, {mds_op_notsup, nullfree, DISP_OP_BOTH, "DESTROY_CLIENTID"}, {mds_op_reclaim_complete, nullfree, DISP_OP_MDS, "RECLAIM_COMPLETE"} }; static uint_t mds_disp_cnt = sizeof (mds_disptab) / sizeof (mds_disptab[0]); #define OP_ILLEGAL_IDX (mds_disp_cnt) extern size_t strlcpy(char *dst, const char *src, size_t dstsize); #ifdef nextdp #undef nextdp #endif #define nextdp(dp) ((struct dirent64 *)((char *)(dp) + (dp)->d_reclen)) /*ARGSUSED*/ static void mds_op_readdir_free(nfs_resop4 *resop) { /* Common function used for NFSv4.0 and NFSv4.1 */ rfs4_op_readdir_free(resop); } /*ARGSUSED*/ static void mds_op_secinfo_free(nfs_resop4 *resop) { /* Common function used for NFSv4.0 and NFSv4.1 */ rfs4_op_secinfo_free(resop); } /* */ void mds_srvrfini(void) { /* some shutdown stuff for the minor verson 1 server */ } nfsstat4 rfs4_state_has_access(rfs4_state_t *, int, vnode_t *); int rfs4_verify_attr(struct nfs4_svgetit_arg *, attrmap4 *, struct nfs4_ntov_table *); /* * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether * the file is being truncated, return NFS4_OK if allowed or approriate * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on * the associated file will be done if the I/O is not consistent with any * delegation in effect on the file. Should be holding VOP_RWLOCK, either * as reader or writer as appropriate. rfs4_op_open will accquire the * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad * this routine will return NFS4ERR_BAD_STATEID. In addition, through the * deleg parameter, we will return whether a write delegation is held by * the client associated with this stateid. * If the server instance associated with the relevant client is in its * grace period, return NFS4ERR_GRACE. */ nfsstat4 mds_validate_stateid(int mode, struct compound_state *cs, vnode_t *vp, stateid4 *stateid, bool_t trunc, bool_t *deleg, bool_t do_access) { rfs4_file_t *fp; bool_t create = FALSE; rfs4_state_t *sp; rfs4_deleg_state_t *dsp; rfs4_lo_state_t *lsp; stateid_t *id = (stateid_t *)stateid; nfsstat4 stat = NFS4_OK; if (ISSPECIAL(stateid)) { fp = rfs4_findfile(cs->instp, vp, NULL, &create); if (fp == NULL) return (NFS4_OK); if (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_NONE) { rfs4_file_rele(fp); return (NFS4_OK); } if (mode == FWRITE || fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE) { rfs4_recall_deleg(fp, trunc, NULL); rfs4_file_rele(fp); return (NFS4ERR_DELAY); } rfs4_file_rele(fp); return (NFS4_OK); } stat = rfs4_get_all_state(cs, stateid, &sp, &dsp, &lsp); if (stat != NFS4_OK) return (stat); /* * Ordering of the following 'if' statements is specific * since rfs4_get_all_state() may return a value for sp and * lsp. First we check lsp, then 'fall' through to sp. */ if (lsp != NULL) { /* Is associated server instance in its grace period? */ if (rfs4_clnt_in_grace(lsp->rls_locker->rl_client)) { rfs4_lo_state_rele(lsp, FALSE); if (sp != NULL) rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_GRACE); } if (lsp->rls_lockid.v41_bits.chgseq != 0) { /* Seqid in the future? - that's bad */ if (lsp->rls_lockid.v41_bits.chgseq < id->v41_bits.chgseq) { rfs4_lo_state_rele(lsp, FALSE); if (sp != NULL) rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_BAD_STATEID); } /* Seqid in the past? - that's old */ if (lsp->rls_lockid.v41_bits.chgseq > id->v41_bits.chgseq) { rfs4_lo_state_rele(lsp, FALSE); if (sp != NULL) rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_OLD_STATEID); } } /* Ensure specified filehandle matches */ if (lsp->rls_state->rs_finfo->rf_vp != vp) { rfs4_lo_state_rele(lsp, FALSE); if (sp != NULL) rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_BAD_STATEID); } rfs4_lo_state_rele(lsp, FALSE); } /* * Stateid provided was an "open" or via the lock stateid */ if (sp != NULL) { /* * only check if the passed in stateid was an OPENID, * ie. Skip if we got here via the LOCKID. */ if (id->v41_bits.type == OPENID) { /* Is associated server instance in its grace period? */ if (rfs4_clnt_in_grace(sp->rs_owner->ro_client)) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_GRACE); } if (sp->rs_stateid.v41_bits.chgseq != 0) { /* Seqid in the future? - that's bad */ if (sp->rs_stateid.v41_bits.chgseq < id->v41_bits.chgseq) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_BAD_STATEID); } /* Seqid in the past - that's old */ if (sp->rs_stateid.v41_bits.chgseq > id->v41_bits.chgseq) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_OLD_STATEID); } } /* Ensure specified filehandle matches */ if (sp->rs_finfo->rf_vp != vp) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_BAD_STATEID); } } if (sp->rs_owner->ro_need_confirm) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_BAD_STATEID); } if (sp->rs_closed == TRUE) { rfs4_dbe_rele(sp->rs_dbe); return (NFS4ERR_OLD_STATEID); } if (do_access) stat = rfs4_state_has_access(sp, mode, vp); else stat = NFS4_OK; /* * Return whether this state has write * delegation if desired */ if (deleg && (sp->rs_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE)) *deleg = TRUE; /* * We got a valid stateid, so we update the * lease on the client. Ideally we would like * to do this after the calling op succeeds, * but for now this will be good * enough. Callers of this routine are * currently insulated from the state stuff. */ rfs4_update_lease(sp->rs_owner->ro_client); /* * If a delegation is present on this file and * this is a WRITE, then update the lastwrite * time to indicate that activity is present. */ if (sp->rs_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE && mode == FWRITE) { sp->rs_finfo->rf_dinfo->rd_time_lastwrite = nfs_sys_uptime(); } rfs4_dbe_rele(sp->rs_dbe); return (stat); } if (dsp != NULL) { /* Is associated server instance in its grace period? */ if (rfs4_clnt_in_grace(dsp->rds_client)) { rfs4_deleg_state_rele(dsp); return (NFS4ERR_GRACE); } if ((dsp->rds_delegid.v41_bits.chgseq != 0) && (dsp->rds_delegid.v41_bits.chgseq != id->v41_bits.chgseq)) { rfs4_deleg_state_rele(dsp); return (NFS4ERR_BAD_STATEID); } /* Ensure specified filehandle matches */ if (dsp->rds_finfo->rf_vp != vp) { rfs4_deleg_state_rele(dsp); return (NFS4ERR_BAD_STATEID); } /* * Return whether this state has write * delegation if desired */ if (deleg && (dsp->rds_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE)) *deleg = TRUE; rfs4_update_lease(dsp->rds_client); /* * If a delegation is present on this file and * this is a WRITE, then update the lastwrite * time to indicate that activity is present. */ if (dsp->rds_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE && mode == FWRITE) { dsp->rds_finfo->rf_dinfo->rd_time_lastwrite = nfs_sys_uptime(); } /* * XXX - what happens if this is a WRITE and the * delegation type of for READ. */ rfs4_deleg_state_rele(dsp); return (stat); } /* * If we got this far, something bad happened */ return (NFS4ERR_BAD_STATEID); } nfsstat4 mds_setattr(attrmap4 *resp, fattr4 *fattrp, struct compound_state *cs, stateid4 *stateid) { int error = 0; struct nfs4_svgetit_arg sarg; bool_t trunc; nfsstat4 status = NFS4_OK; cred_t *cr = cs->cr; vnode_t *vp = cs->vp; struct nfs4_ntov_table ntov; struct statvfs64 sb; struct vattr bva; struct flock64 bf; int in_crit = 0; uint_t saved_mask = 0; caller_context_t ct; attrvers_t avers; struct nfs4_ntov_map *nvmap; avers = nfs4_attrvers(cs); nvmap = NFS4_NTOV_MAP(avers); *resp = NFS4_EMPTY_ATTRMAP(avers); sarg.sbp = &sb; nfs4_ntov_table_init(&ntov, avers); status = do_rfs4_set_attrs(resp, fattrp, cs, &sarg, &ntov, NFS4ATTR_SETIT); if (status != NFS4_OK) { /* * failed set attrs */ goto done; } if (sarg.vap->va_mask == 0 && ! ATTR_ISSET(fattrp->attrmask, ACL) && ! ATTR_ISSET(fattrp->attrmask, LAYOUT_HINT)) { /* * no further work to be done */ goto done; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* * If we got a request to set the ACL and the MODE, only * allow changing VSUID, VSGID, and VSVTX. Attempting * to change any other bits, along with setting an ACL, * gives NFS4ERR_INVAL. */ if (ATTR_ISSET(fattrp->attrmask, ACL) && ATTR_ISSET(fattrp->attrmask, MODE)) { vattr_t va; va.va_mask = AT_MODE; error = VOP_GETATTR(vp, &va, 0, cs->cr, &ct); if (error) { status = puterrno4(error); goto done; } if ((sarg.vap->va_mode ^ va.va_mode) & ~(VSUID | VSGID | VSVTX)) { status = NFS4ERR_INVAL; goto done; } } /* Check stateid only if size has been set */ if (sarg.vap->va_mask & AT_SIZE) { trunc = (sarg.vap->va_size == 0); status = mds_validate_stateid(FWRITE, cs, cs->vp, stateid, trunc, &cs->deleg, sarg.vap->va_mask & AT_SIZE); if (status != NFS4_OK) goto done; } /* XXX start of possible race with delegations */ /* * We need to specially handle size changes because it is * possible for the client to create a file with read-only * modes, but with the file opened for writing. If the client * then tries to set the file size, e.g. ftruncate(3C), * fcntl(F_FREESP), the normal access checking done in * VOP_SETATTR would prevent the client from doing it even though * it should be allowed to do so. To get around this, we do the * access checking for ourselves and use VOP_SPACE which doesn't * do the access checking. * Also the client should not be allowed to change the file * size if there is a conflicting non-blocking mandatory lock in * the region of the change. */ if (vp->v_type == VREG && (sarg.vap->va_mask & AT_SIZE)) { u_offset_t offset; ssize_t length; /* * ufs_setattr clears AT_SIZE from vap->va_mask, but * before returning, sarg.vap->va_mask is used to * generate the setattr reply bitmap. We also clear * AT_SIZE below before calling VOP_SPACE. For both * of these cases, the va_mask needs to be saved here * and restored after calling VOP_SETATTR. */ saved_mask = sarg.vap->va_mask; /* * Check any possible conflict due to NBMAND locks. * Get into critical region before VOP_GETATTR, so the * size attribute is valid when checking conflicts. */ if (nbl_need_check(vp)) { nbl_start_crit(vp, RW_READER); in_crit = 1; } bva.va_mask = AT_UID|AT_SIZE; if (error = VOP_GETATTR(vp, &bva, 0, cr, &ct)) { status = puterrno4(error); goto done; } if (in_crit) { if (sarg.vap->va_size < bva.va_size) { offset = sarg.vap->va_size; length = bva.va_size - sarg.vap->va_size; } else { offset = bva.va_size; length = sarg.vap->va_size - bva.va_size; } if (nbl_conflict(vp, NBL_WRITE, offset, length, 0, &ct)) { status = NFS4ERR_LOCKED; goto done; } } if (crgetuid(cr) == bva.va_uid) { sarg.vap->va_mask &= ~AT_SIZE; bf.l_type = F_WRLCK; bf.l_whence = 0; bf.l_start = (off64_t)sarg.vap->va_size; bf.l_len = 0; bf.l_sysid = 0; bf.l_pid = 0; error = VOP_SPACE(vp, F_FREESP, &bf, FWRITE, (offset_t)sarg.vap->va_size, cr, &ct); } } if (!error && sarg.vap->va_mask != 0) error = VOP_SETATTR(vp, sarg.vap, sarg.flag, cr, &ct); /* restore va_mask -- ufs_setattr clears AT_SIZE */ if (saved_mask & AT_SIZE) sarg.vap->va_mask |= AT_SIZE; /* * If an ACL was being set, it has been delayed until now, * in order to set the mode (via the VOP_SETATTR() above) first. */ if (! error && ATTR_ISSET(fattrp->attrmask, ACL)) { int i; for (i = 0; i < ntov.attrcnt; i++) if (ntov.amap[i] == FATTR4_ACL) break; if (i < ntov.attrcnt) { error = (*nvmap[FATTR4_ACL].sv_getit)(NFS4ATTR_SETIT, &sarg, &ntov.na[i]); if (error == 0) { ATTR_SET(*resp, ACL); } else if (error == ENOTSUP) { (void) rfs4_verify_attr(&sarg, resp, &ntov); status = NFS4ERR_ATTRNOTSUPP; goto done; } } else { error = EINVAL; } } if (! error && ATTR_ISSET(fattrp->attrmask, LAYOUT_HINT)) { /* * Store layout hint. Layout hint will be stored * in file struct (which means it can only be set * when the file is open). If layout hint is allowed * for files not open, then it must be stored * persistently. * * status assignment placates lint. it will * be replaced with code to store the layout * hint. */ status = NFS4_OK; } if (error) { /* check if a monitor detected a delegation conflict */ if (error == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) status = NFS4ERR_DELAY; else status = puterrno4(error); /* * Set the response bitmap when setattr failed. * If VOP_SETATTR partially succeeded, test by doing a * VOP_GETATTR on the object and comparing the data * to the setattr arguments. */ (void) rfs4_verify_attr(&sarg, resp, &ntov); } else { /* * Force modified metadata out to stable storage. */ (void) VOP_FSYNC(vp, FNODSYNC, cr, &ct); /* * Set response bitmap */ nfs4_vmask_to_nmask_set(sarg.vap->va_mask, resp); } /* Return early and already have a NFSv4 error */ done: /* * Except for nfs4_vmask_to_nmask_set(), vattr --> fattr * conversion sets both readable and writeable NFS4 attrs * for AT_MTIME and AT_ATIME. The line below masks out * unrequested attrs from the setattr result bitmap. This * is placed after the done: label to catch the ATTRNOTSUP * case. */ ATTRMAP_MASK(*resp, fattrp->attrmask); if (in_crit) nbl_end_crit(vp); nfs4_ntov_table_free(&ntov, &sarg); return (status); } /* ARGSUSED */ void mds_op_secinfonn(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { SECINFO_NO_NAME4res *respnn; int dotdot; DTRACE_NFSV4_1(op__secinfo__no__name__start, struct compound_state *, cs); respnn = &resop->nfs_resop4_u.opsecinfo_no_name; /* * Current file handle (cfh) should have been set before * getting into this function. If not, return error. */ if (!rfs4_cs_has_fh(cs)) { *cs->statusp = respnn->status = NFS4ERR_NOFILEHANDLE; goto final; } dotdot = (argop->nfs_argop4_u.opsecinfo_no_name == SECINFO_STYLE4_PARENT); *cs->statusp = respnn->status = do_rfs4_op_secinfo(cs, NULL, dotdot, (SECINFO4res *)respnn); /* Cleanup FH as described at 18.45.3. */ if (respnn->status == NFS4_OK) rfs4_cs_invalidate_fh(cs); final: DTRACE_NFSV4_2(op__secinfo__no__name__done, struct compound_state *, cs, SECINFO_NO_NAME4res *, respnn); } /* ARGSUSED */ void mds_op_secinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { SECINFO4res *resp; utf8string *utfnm; uint_t len, dotdot; char *nm; SECINFO4args *args = &argop->nfs_argop4_u.opsecinfo; DTRACE_NFSV4_2(op__secinfo__start, struct compound_state *, cs, SECINFO4args *, args); resp = &resop->nfs_resop4_u.opsecinfo; /* * Current file handle (cfh) should have been set before * getting into this function. If not, return error. */ if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->vp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } /* * Verify the component name. If failed, error out, but * do not error out if the component name is a "..". * SECINFO will return its parents secinfo data for SECINFO "..". */ utfnm = &argop->nfs_argop4_u.opsecinfo.name; if (!utf8_dir_verify(utfnm)) { if (utfnm->utf8string_len != 2 || utfnm->utf8string_val[0] != '.' || utfnm->utf8string_val[1] != '.') { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } dotdot = 1; } else dotdot = 0; nm = utf8_to_str(utfnm, &len, NULL); if (nm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (len > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); goto final; } *cs->statusp = resp->status = do_rfs4_op_secinfo(cs, nm, dotdot, resp); kmem_free(nm, len); /* Cleanup FH as described at 18.45.3. */ if (resp->status == NFS4_OK) rfs4_cs_invalidate_fh(cs); final: DTRACE_NFSV4_2(op__secinfo__done, struct compound_state *, cs, SECINFO4res *, resp); } /* * verify and nverify are exactly the same, except that nverify * succeeds when some argument changed, and verify succeeds when * when none changed. */ /* ARGSUSED */ void mds_op_verify(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { VERIFY4args *args = &argop->nfs_argop4_u.opverify; VERIFY4res *resp = &resop->nfs_resop4_u.opverify; int error; struct nfs4_svgetit_arg sarg; struct statvfs64 sb; struct nfs4_ntov_table ntov; DTRACE_NFSV4_2(op__verify__start, struct compound_state *, cs, VERIFY4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } sarg.sbp = &sb; nfs4_ntov_table_init(&ntov, nfs4_attrvers(cs)); resp->status = do_rfs4_set_attrs(NULL, &args->obj_attributes, cs, &sarg, &ntov, NFS4ATTR_VERIT); if (resp->status != NFS4_OK) { /* * do_rfs4_set_attrs will try to verify systemwide attrs, * so could return -1 for "no match". */ if (resp->status == -1) resp->status = NFS4ERR_NOT_SAME; goto done; } error = rfs4_verify_attr(&sarg, NULL, &ntov); switch (error) { case 0: resp->status = NFS4_OK; break; case -1: resp->status = NFS4ERR_NOT_SAME; break; default: resp->status = puterrno4(error); break; } done: *cs->statusp = resp->status; nfs4_ntov_table_free(&ntov, &sarg); final: DTRACE_NFSV4_2(op__verify__done, struct compound_state *, cs, VERIFY4res *, resp); } /* ARGSUSED */ void mds_op_nverify(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { NVERIFY4args *args = &argop->nfs_argop4_u.opnverify; NVERIFY4res *resp = &resop->nfs_resop4_u.opnverify; int error; struct nfs4_svgetit_arg sarg; struct statvfs64 sb; struct nfs4_ntov_table ntov; DTRACE_NFSV4_2(op__nverify__start, struct compound_state *, cs, NVERIFY4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } sarg.sbp = &sb; nfs4_ntov_table_init(&ntov, nfs4_attrvers(cs)); resp->status = do_rfs4_set_attrs(NULL, &args->obj_attributes, cs, &sarg, &ntov, NFS4ATTR_VERIT); if (resp->status != NFS4_OK) { /* * do_rfs4_set_attrs will try to verify systemwide attrs, * so could return -1 for "no match". */ if (resp->status == -1) resp->status = NFS4_OK; goto done; } error = rfs4_verify_attr(&sarg, NULL, &ntov); switch (error) { case 0: resp->status = NFS4ERR_SAME; break; case -1: resp->status = NFS4_OK; break; default: resp->status = puterrno4(error); break; } done: *cs->statusp = resp->status; nfs4_ntov_table_free(&ntov, &sarg); final: DTRACE_NFSV4_2(op__nverify__done, struct compound_state *, cs, NVERIFY4res *, resp); } /* ARGSUSED */ void mds_op_access(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { ACCESS4args *args = &argop->nfs_argop4_u.opaccess; ACCESS4res *resp = &resop->nfs_resop4_u.opaccess; int error; vnode_t *vp; struct vattr va; int checkwriteperm; cred_t *cr = cs->cr; bslabel_t *clabel, *slabel; ts_label_t *tslabel; boolean_t admin_low_client; DTRACE_NFSV4_2(op__access__start, struct compound_state *, cs, ACCESS4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } ASSERT(cr != NULL); vp = cs->vp; /* * If the file system is exported read only, it is not appropriate * to check write permissions for regular files and directories. * Special files are interpreted by the client, so the underlying * permissions are sent back to the client for interpretation. */ if (rdonly4(cs->exi, cs->vp, req) && (vp->v_type == VREG || vp->v_type == VDIR)) checkwriteperm = 0; else checkwriteperm = 1; /* * XXX * We need the mode so that we can correctly determine access * permissions relative to a mandatory lock file. Access to * mandatory lock files is denied on the server, so it might * as well be reflected to the server during the open. */ va.va_mask = AT_MODE; error = VOP_GETATTR(vp, &va, 0, cr, NULL); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } resp->access = 0; resp->supported = 0; if (is_system_labeled()) { ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__opaccess__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if ((tslabel = nfs_getflabel(vp, cs->exi)) == NULL) { *cs->statusp = resp->status = puterrno4(EACCES); goto final; } slabel = label2bslabel(tslabel); DTRACE_PROBE3(tx__rfs4__log__info__opaccess__slabel, char *, "got server label(1) for vp(2)", bslabel_t *, slabel, vnode_t *, vp); admin_low_client = B_FALSE; } else admin_low_client = B_TRUE; } if (args->access & ACCESS4_READ) { error = VOP_ACCESS(vp, VREAD, 0, cr, NULL); if (!error && !MANDLOCK(vp, va.va_mode) && (!is_system_labeled() || admin_low_client || bldominates(clabel, slabel))) resp->access |= ACCESS4_READ; resp->supported |= ACCESS4_READ; } if ((args->access & ACCESS4_LOOKUP) && vp->v_type == VDIR) { error = VOP_ACCESS(vp, VEXEC, 0, cr, NULL); if (!error && (!is_system_labeled() || admin_low_client || bldominates(clabel, slabel))) resp->access |= ACCESS4_LOOKUP; resp->supported |= ACCESS4_LOOKUP; } if (checkwriteperm && (args->access & (ACCESS4_MODIFY|ACCESS4_EXTEND))) { error = VOP_ACCESS(vp, VWRITE, 0, cr, NULL); if (!error && !MANDLOCK(vp, va.va_mode) && (!is_system_labeled() || admin_low_client || blequal(clabel, slabel))) resp->access |= (args->access & (ACCESS4_MODIFY|ACCESS4_EXTEND)); resp->supported |= (ACCESS4_MODIFY|ACCESS4_EXTEND); } if (checkwriteperm && (args->access & ACCESS4_DELETE) && vp->v_type == VDIR) { error = VOP_ACCESS(vp, VWRITE, 0, cr, NULL); if (!error && (!is_system_labeled() || admin_low_client || blequal(clabel, slabel))) resp->access |= ACCESS4_DELETE; resp->supported |= ACCESS4_DELETE; } if (args->access & ACCESS4_EXECUTE && vp->v_type != VDIR) { error = VOP_ACCESS(vp, VEXEC, 0, cr, NULL); if (!error && !MANDLOCK(vp, va.va_mode) && (!is_system_labeled() || admin_low_client || bldominates(clabel, slabel))) resp->access |= ACCESS4_EXECUTE; resp->supported |= ACCESS4_EXECUTE; } if (is_system_labeled() && !admin_low_client) label_rele(tslabel); *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__access__done, struct compound_state *, cs, ACCESS4res *, resp); } /* ARGSUSED */ static void mds_op_commit(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { COMMIT4args *args = &argop->nfs_argop4_u.opcommit; COMMIT4res *resp = &resop->nfs_resop4_u.opcommit; int error; vnode_t *vp = cs->vp; cred_t *cr = cs->cr; vattr_t va; caller_context_t ct; DTRACE_NFSV4_2(op__commit__start, struct compound_state *, cs, COMMIT4args *, args); /* * XXX kludge: fake the commit if we are a data server * This will be replaced once we have nnop_commit(). */ if (rfs4_cs_has_fh(cs) && (cs->vp == NULL)) { *cs->statusp = resp->status = NFS4_OK; resp->writeverf = cs->instp->Write4verf; goto final; } if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (args->offset + args->count < args->offset) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; va.va_mask = AT_UID; error = VOP_GETATTR(vp, &va, 0, cr, &ct); /* * If we can't get the attributes, then we can't do the * right access checking. So, we'll fail the request. */ if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; goto final; } if (vp->v_type != VREG) { if (vp->v_type == VDIR) resp->status = NFS4ERR_ISDIR; else resp->status = NFS4ERR_INVAL; *cs->statusp = resp->status; goto final; } if (crgetuid(cr) != va.va_uid && (error = VOP_ACCESS(vp, VWRITE, 0, cs->cr, &ct))) { *cs->statusp = resp->status = puterrno4(error); goto final; } error = VOP_FSYNC(vp, FNODSYNC, cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } *cs->statusp = resp->status = NFS4_OK; resp->writeverf = cs->instp->Write4verf; final: DTRACE_NFSV4_2(op__commit__done, struct compound_state *, cs, COMMIT4res *, resp); } /* * rfs4_op_mknod is called from rfs4_op_create after all initial verification * was completed. It does the nfsv4 create for special files. * * nfsv4 create is used to create non-regular files. For regular files, * use nfsv4 open. */ /* ARGSUSED */ static void mds_op_create(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { CREATE4args *args = &argop->nfs_argop4_u.opcreate; CREATE4res *resp = &resop->nfs_resop4_u.opcreate; int error; struct vattr bva, iva, iva2, ava, *vap; cred_t *cr = cs->cr; vnode_t *dvp = cs->vp; vnode_t *vp = NULL; vnode_t *realvp; char *nm, *lnm; uint_t len, llen; int syncval = 0; struct nfs4_svgetit_arg sarg; struct nfs4_ntov_table ntov; struct statvfs64 sb; nfsstat4 status; caller_context_t ct; DTRACE_NFSV4_2(op__create__start, struct compound_state *, cs, CREATE4args *, args); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); resp->cinfo.atomic = FALSE; if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* * If there is an unshared filesystem mounted on this vnode, * do not allow to create an object in this directory. */ if (vn_ismntpt(dvp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* Verify that type is correct */ switch (args->type) { case NF4LNK: case NF4BLK: case NF4CHR: case NF4SOCK: case NF4FIFO: case NF4DIR: break; default: *cs->statusp = resp->status = NFS4ERR_BADTYPE; goto final; }; if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (dvp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } if (!utf8_dir_verify(&args->objname)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; goto final; } /* * Name of newly created object */ nm = utf8_to_fn(&args->objname, &len, NULL); if (nm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (len > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); goto final; } sarg.sbp = &sb; nfs4_ntov_table_init(&ntov, nfs4_attrvers(cs)); status = do_rfs4_set_attrs(&resp->attrset, &args->createattrs, cs, &sarg, &ntov, NFS4ATTR_SETIT); if (sarg.vap->va_mask == 0 && status == NFS4_OK) status = NFS4ERR_INVAL; if (status != NFS4_OK) { *cs->statusp = resp->status = status; kmem_free(nm, len); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } /* Get "before" change value */ bva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(dvp, &bva, 0, cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); kmem_free(nm, len); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.before, bva.va_ctime); vap = sarg.vap; /* * Set default initial values for attributes when not specified * in createattrs. */ if ((vap->va_mask & AT_UID) == 0) { vap->va_uid = crgetuid(cr); vap->va_mask |= AT_UID; } if ((vap->va_mask & AT_GID) == 0) { vap->va_gid = crgetgid(cr); vap->va_mask |= AT_GID; } vap->va_mask |= AT_TYPE; switch (args->type) { case NF4DIR: vap->va_type = VDIR; if ((vap->va_mask & AT_MODE) == 0) { vap->va_mode = 0700; /* default: owner rwx only */ vap->va_mask |= AT_MODE; } error = VOP_MKDIR(dvp, nm, vap, &vp, cr, &ct, 0, NULL); if (error) break; /* * Get the initial "after" sequence number, if it fails, * set to zero */ iva.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &iva, 0, cs->cr, &ct)) iva.va_seq = 0; break; case NF4LNK: vap->va_type = VLNK; if ((vap->va_mask & AT_MODE) == 0) { vap->va_mode = 0700; /* default: owner rwx only */ vap->va_mask |= AT_MODE; } /* * symlink names must be treated as data */ lnm = utf8_to_str(&args->ftype4_u.linkdata, &llen, NULL); if (lnm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; kmem_free(nm, len); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } if (llen > MAXPATHLEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); kmem_free(lnm, llen); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } error = VOP_SYMLINK(dvp, nm, vap, lnm, cr, &ct, 0); if (lnm != NULL) kmem_free(lnm, llen); if (error) break; /* * Get the initial "after" sequence number, if it fails, * set to zero */ iva.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &iva, 0, cs->cr, &ct)) iva.va_seq = 0; error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cr, &ct, 0, NULL); if (error) break; /* * va_seq is not safe over VOP calls, check it again * if it has changed zero out iva to force atomic = FALSE. */ iva2.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &iva2, 0, cs->cr, &ct) || iva2.va_seq != iva.va_seq) iva.va_seq = 0; break; default: /* * probably a special file. */ if ((vap->va_mask & AT_MODE) == 0) { vap->va_mode = 0600; /* default: owner rw only */ vap->va_mask |= AT_MODE; } syncval = FNODSYNC; /* * We know this will only generate one VOP call */ vp = do_rfs4_op_mknod(args, resp, req, cs, vap, nm); if (vp == NULL) { kmem_free(nm, len); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } /* * Get the initial "after" sequence number, if it fails, * set to zero */ iva.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &iva, 0, cs->cr, &ct)) iva.va_seq = 0; break; } kmem_free(nm, len); if (error) { *cs->statusp = resp->status = puterrno4(error); } /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(dvp, 0, cr, &ct); if (resp->status != NFS4_OK) { if (vp != NULL) VN_RELE(vp); nfs4_ntov_table_free(&ntov, &sarg); resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); goto final; } /* * Finish setup of cinfo response, "before" value already set. * Get "after" change value, if it fails, simply return the * before value. */ ava.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &ava, 0, cr, &ct)) { ava.va_ctime = bva.va_ctime; ava.va_seq = 0; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.after, ava.va_ctime); /* * True verification that object was created with correct * attrs is impossible. The attrs could have been changed * immediately after object creation. If attributes did * not verify, the only recourse for the server is to * destroy the object. Maybe if some attrs (like gid) * are set incorrectly, the object should be destroyed; * however, seems bad as a default policy. Do we really * want to destroy an object over one of the times not * verifying correctly? For these reasons, the server * currently sets bits in attrset for createattrs * that were set; however, no verification is done. * * vmask_to_nmask accounts for vattr bits set on create * [do_rfs4_set_attrs() only sets resp bits for * non-vattr/vfs bits.] * Mask off any bits set by default so as not to return * more attrset bits than were requested in createattrs */ nfs4_vmask_to_nmask(sarg.vap->va_mask, &resp->attrset, nfs4_attrvers(cs)); ATTRMAP_MASK(resp->attrset, args->createattrs.attrmask); nfs4_ntov_table_free(&ntov, &sarg); /* * Force modified metadata out to stable storage. * * if a underlying vp exists, pass it to VOP_FSYNC */ if (VOP_REALVP(vp, &realvp, &ct) == 0) (void) VOP_FSYNC(realvp, syncval, cr, &ct); else (void) VOP_FSYNC(vp, syncval, cr, &ct); error = rfs4_cs_update_fh(cs, vp); if (error) { *cs->statusp = resp->status = puterrno4(error); resp->cinfo.atomic = FALSE; goto final; } /* * The cinfo.atomic = TRUE only if we got no errors, we have * non-zero va_seq's, and it has incremented by exactly one * during the creation and it didn't change during the VOP_LOOKUP * or VOP_FSYNC. */ if (bva.va_seq && iva.va_seq && ava.va_seq && iva.va_seq == (bva.va_seq + 1) && iva.va_seq == ava.va_seq) resp->cinfo.atomic = TRUE; *cs->statusp = resp->status = NFS4_OK; final: if (vp != NULL) VN_RELE(vp); DTRACE_NFSV4_2(op__create__done, struct compound_state *, cs, CREATE4res *, resp); } /*ARGSUSED*/ static void mds_op_delegreturn(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { DELEGRETURN4args *args = &argop->nfs_argop4_u.opdelegreturn; DELEGRETURN4res *resp = &resop->nfs_resop4_u.opdelegreturn; rfs4_deleg_state_t *dsp; nfsstat4 status; DTRACE_NFSV4_2(op__delegreturn__start, struct compound_state *, cs, DELEGRETURN4args *, args); status = rfs4_get_deleg_state(cs, &args->deleg_stateid, &dsp); resp->status = *cs->statusp = status; if (status != NFS4_OK) goto final; /* Ensure specified filehandle matches */ if (cs->vp != dsp->rds_finfo->rf_vp) { resp->status = *cs->statusp = NFS4ERR_BAD_STATEID; } else rfs4_return_deleg(dsp, FALSE); rfs4_update_lease(dsp->rds_client); rfs4_deleg_state_rele(dsp); final: DTRACE_NFSV4_2(op__delegreturn__done, struct compound_state *, cs, DELEGRETURN4res *, resp); } /* ARGSUSED */ static void mds_op_getattr(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { GETATTR4args *args = &argop->nfs_argop4_u.opgetattr; GETATTR4res *resp = &resop->nfs_resop4_u.opgetattr; struct nfs4_svgetit_arg sarg; struct statvfs64 sb; nfsstat4 status; DTRACE_NFSV4_2(op__getattr__start, struct compound_state *, cs, GETATTR4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } bzero(&sarg, sizeof (struct nfs4_svgetit_arg)); sarg.sbp = &sb; sarg.cs = cs; status = attrmap4_to_vattrmask(&args->attr_request, &sarg); if (status == NFS4_OK) { status = bitmap4_get_sysattrs(&sarg); if (status == NFS4_OK) { if (vn_is_nfs_reparse(cs->vp, cs->cr)) sarg.is_referral = B_TRUE; status = do_rfs4_op_getattr(&args->attr_request, &resp->obj_attributes, &sarg); } } *cs->statusp = resp->status = status; final: DTRACE_NFSV4_2(op__getattr__done, struct compound_state *, cs, GETATTR4res *, resp); } /*ARGSUSED*/ void mds_op_getattr_free(nfs_resop4 *resop) { /* Common function for NFSv4.0 and NFSv4.1 */ rfs4_op_getattr_free(resop); } /* ARGSUSED */ static void mds_op_getfh(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { GETFH4res *resp = &resop->nfs_resop4_u.opgetfh; DTRACE_NFSV4_1(op__getfh__start, struct compound_state *, cs); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } /* check for reparse point at the share point */ if (cs->exi->exi_moved || vn_is_nfs_reparse(cs->exi->exi_vp, cs->cr)) { /* it's all bad */ cs->exi->exi_moved = 1; *cs->statusp = resp->status = NFS4ERR_MOVED; DTRACE_PROBE2(nfs41serv__func__referral__shared__moved, vnode_t *, cs->vp, char *, "mds_op_getfh"); return; } /* check for reparse point at vp */ if (vn_is_nfs_reparse(cs->vp, cs->cr) && !client_is_downrev(cs->instp, req)) { /* it's not all bad */ *cs->statusp = resp->status = NFS4ERR_MOVED; DTRACE_PROBE2(nfs41serv__func__referral__moved, vnode_t *, cs->vp, char *, "mds_op_getfh"); return; } resp->object.nfs_fh4_val = kmem_alloc(cs->fh.nfs_fh4_len, KM_SLEEP); nfs_fh4_copy(&cs->fh, &resp->object); *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__getfh__done, struct compound_state *, cs, GETFH4res *, resp); } /*ARGSUSED*/ static void mds_op_getfh_free(nfs_resop4 *resop) { /* Common function for NFSv4.0 and NFSv4.1 */ rfs4_op_getfh_free(resop); } /* * link: args: SAVED_FH: file, CURRENT_FH: target directory * res: status. If success - CURRENT_FH unchanged, return change_info */ /* ARGSUSED */ static void mds_op_link(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { LINK4args *args = &argop->nfs_argop4_u.oplink; LINK4res *resp = &resop->nfs_resop4_u.oplink; int error; vnode_t *vp; vnode_t *dvp; struct vattr bdva, idva, adva; char *nm; uint_t len; caller_context_t ct; DTRACE_NFSV4_2(op__link__start, struct compound_state *, cs, LINK4args *, args); if (!rfs4_cs_has_fh(cs) || !rfs4_cs_has_savedfh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* * SAVED_FH: source object * CURRENT_FH: target directory */ vp = cs->saved_vp; dvp = cs->vp; /* * If there is a non-shared filesystem mounted on this vnode, * do not allow to link any file in this directory. */ if (vn_ismntpt(dvp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } /* Check source object's type validity */ if (vp->v_type == VDIR) { *cs->statusp = resp->status = NFS4ERR_ISDIR; goto final; } /* Check target directory's type */ if (dvp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } if (cs->saved_exi != cs->exi) { *cs->statusp = resp->status = NFS4ERR_XDEV; goto final; } if (!utf8_dir_verify(&args->newname)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } nm = utf8_to_fn(&args->newname, &len, NULL); if (nm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (len > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); goto final; } if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; kmem_free(nm, len); goto final; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* Get "before" change value */ bdva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(dvp, &bdva, 0, cs->cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); kmem_free(nm, len); goto final; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.before, bdva.va_ctime); error = VOP_LINK(dvp, vp, nm, cs->cr, &ct, 0); kmem_free(nm, len); /* * Get the initial "after" sequence number, if it fails, set to zero */ idva.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &idva, 0, cs->cr, &ct)) idva.va_seq = 0; /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(vp, FNODSYNC, cs->cr, &ct); (void) VOP_FSYNC(dvp, 0, cs->cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } /* * Get "after" change value, if it fails, simply return the * before value. */ adva.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &adva, 0, cs->cr, &ct)) { adva.va_ctime = bdva.va_ctime; adva.va_seq = 0; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.after, adva.va_ctime); /* * The cinfo.atomic = TRUE only if we have * non-zero va_seq's, and it has incremented by exactly one * during the VOP_LINK and it didn't change during the VOP_FSYNC. */ if (bdva.va_seq && idva.va_seq && adva.va_seq && idva.va_seq == (bdva.va_seq + 1) && idva.va_seq == adva.va_seq) resp->cinfo.atomic = TRUE; else resp->cinfo.atomic = FALSE; *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__link__done, struct compound_state *, cs, LINK4res *, resp); } /* * Used by mds_op_lookup and mds_op_lookupp to do the actual work. */ /* ARGSUSED */ static nfsstat4 mds_do_lookup(char *nm, uint_t buflen, struct svc_req *req, struct compound_state *cs) { int error; int different_export = 0; vnode_t *vp, *tvp, *pre_tvp = NULL; struct exportinfo *exi = NULL, *pre_exi = NULL; nfsstat4 stat; fid_t fid; int attrdir, dotdot, walk; caller_context_t ct; nfs41_fh_fmt_t *fhp; fhp = (nfs41_fh_fmt_t *)cs->fh.nfs_fh4_val; attrdir = ((cs->vp->v_flag & V_XATTRDIR) == V_XATTRDIR) ? FH41_ATTRDIR : 0; ASSERT(FH41_GET_FLAG(fhp, FH41_ATTRDIR) == attrdir); dotdot = (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0'); /* * If dotdotting, then need to check whether it's * above the root of a filesystem, or above an * export point. */ if (dotdot) { /* * If dotdotting at the root of a filesystem, then * need to traverse back to the mounted-on filesystem * and do the dotdot lookup there. */ if (cs->vp->v_flag & VROOT) { /* * If at the system root, then can * go up no further. */ if (VN_CMP(cs->vp, rootdir)) return (puterrno4(ENOENT)); /* * Traverse back to the mounted-on filesystem */ cs->vp = untraverse(cs->vp); /* * Set the different_export flag so we remember * to pick up a new exportinfo entry for * this new filesystem. */ different_export = 1; } else { /* * If dotdotting above an export point then set * the different_export to get new export info. */ different_export = nfs_exported(cs->exi, cs->vp); } } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; error = VOP_LOOKUP(cs->vp, nm, &vp, NULL, 0, NULL, cs->cr, &ct, 0, NULL); if (error) return (puterrno4(error)); /* * If the vnode is in a pseudo filesystem, check whether it is visible. * * XXX if the vnode is a symlink and it is not visible in * a pseudo filesystem, return ENOENT (not following symlink). * V4 client can not mount such symlink. * * In the same exported filesystem, if the security flavor used * is not an explicitly shared flavor, limit the view to the visible * list entries only. This is not a WRONGSEC case because it's already * checked via PUTROOTFH/PUTPUBFH or PUTFH. */ if (!different_export && (PSEUDO(cs->exi) || ! is_exported_sec(cs->nfsflavor, cs->exi) || cs->access & CS_ACCESS_LIMITED)) { if (! nfs_visible(cs->exi, vp, &different_export)) { VN_RELE(vp); return (puterrno4(ENOENT)); } } /* * If it's a mountpoint, then traverse it. */ if (vn_ismntpt(vp)) { pre_exi = cs->exi; /* save pre-traversed exportinfo */ pre_tvp = vp; /* save pre-traversed vnode */ /* * hold pre_tvp to counteract rele by traverse. We will * need pre_tvp below if checkexport4 fails */ VN_HOLD(pre_tvp); tvp = vp; if ((error = traverse(&tvp)) != 0) { VN_RELE(vp); VN_RELE(pre_tvp); return (puterrno4(error)); } vp = tvp; different_export = 1; } else if (vp->v_vfsp != cs->vp->v_vfsp) { /* * The vfsp comparison is to handle the case where * a LOFS mount is shared. lo_lookup traverses mount points, * and NFS is unaware of local fs transistions because * v_vfsmountedhere isn't set. For this special LOFS case, * the dir and the obj returned by lookup will have different * vfs ptrs. */ different_export = 1; } if (different_export) { vnode_t *oldvp; bzero(&fid, sizeof (fid)); fid.fid_len = MAXFIDSZ; error = vop_fid_pseudo(vp, &fid); if (error) { VN_RELE(vp); if (pre_tvp) VN_RELE(pre_tvp); return (puterrno4(error)); } if (dotdot) exi = nfs_vptoexi(NULL, vp, cs->cr, &walk, NULL, TRUE); else exi = checkexport4(&vp->v_vfsp->vfs_fsid, &fid, vp); if (exi == NULL) { if (pre_tvp) { /* * If this vnode is a mounted-on vnode, * but the mounted-on file system is not * exported, send back the filehandle for * the mounted-on vnode, not the root of * the mounted-on file system. */ VN_RELE(vp); vp = pre_tvp; exi = pre_exi; } else { VN_RELE(vp); return (puterrno4(EACCES)); } } else if (pre_tvp) { /* we're done with pre_tvp now. release extra hold */ VN_RELE(pre_tvp); } cs->exi = exi; /* * Now do a checkauth4. * * Checking here since the client/principle may not have * access to the cs->exi exported file system. * * If the client has access we also need to validate * the principle since it may have been re-mapped. * * We start with a new credential as a previous call to * checkauth4(), via a PUT*FH operation, wrote over cs->cr. */ crfree(cs->cr); cs->cr = crdup(cs->basecr); oldvp = cs->vp; cs->vp = vp; stat = call_checkauth4(cs, req); cs->vp = oldvp; if (stat != NFS4_OK) { VN_RELE(vp); return (stat); } } /* * After various NFS checks, do a label check on the path * component. The label on this path should either be the * global zone's label or a zone's label. We are only * interested in the zone's label because exported files * in global zone is accessible (though read-only) to * clients. The exportability/visibility check is already * done before reaching this code. */ if (is_system_labeled()) { bslabel_t *clabel; ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__oplookup__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, vp, DOMINANCE_CHECK, cs->exi)) { error = EACCES; goto err_out; } } else { /* * We grant access to admin_low label clients * only if the client is trusted, i.e. also * running Solaris Trusted Extension. */ struct sockaddr *ca; int addr_type; void *ipaddr; tsol_tpc_t *tp; ca = (struct sockaddr *)svc_getrpccaller( req->rq_xprt)->buf; if (ca->sa_family == AF_INET) { addr_type = IPV4_VERSION; ipaddr = &((struct sockaddr_in *)ca)->sin_addr; } else if (ca->sa_family == AF_INET6) { addr_type = IPV6_VERSION; ipaddr = &((struct sockaddr_in6 *) ca)->sin6_addr; } tp = find_tpc(ipaddr, addr_type, B_FALSE); if (tp == NULL || tp->tpc_tp.tp_doi != l_admin_low->tsl_doi || tp->tpc_tp.host_type != SUN_CIPSO) { error = EACCES; goto err_out; } } } error = rfs4_cs_update_fh(cs, vp); VN_RELE(vp); err_out: if (error) return (puterrno4(error)); /* * if did lookup on attrdir and didn't lookup .., set named * attr fh flag */ if (attrdir && ! dotdot) FH41_SET_FLAG(fhp, FH41_NAMEDATTR); /* Assume false for now, open proc will set this */ cs->mandlock = FALSE; return (NFS4_OK); } /* ARGSUSED */ static void mds_op_lookup(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { LOOKUP4args *args = &argop->nfs_argop4_u.oplookup; LOOKUP4res *resp = &resop->nfs_resop4_u.oplookup; char *nm; uint_t len; DTRACE_NFSV4_2(op__lookup__start, struct compound_state *, cs, LOOKUP4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->vp->v_type == VLNK) { *cs->statusp = resp->status = NFS4ERR_SYMLINK; goto final; } if (cs->vp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } if (!utf8_dir_verify(&args->objname)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } nm = utf8_to_str(&args->objname, &len, NULL); if (nm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (len > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); goto final; } *cs->statusp = resp->status = mds_do_lookup(nm, len, req, cs); kmem_free(nm, len); final: DTRACE_NFSV4_2(op__lookup__done, struct compound_state *, cs, LOOKUP4res *, resp); } /* ARGSUSED */ static void mds_op_lookupp(nfs_argop4 *args, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { LOOKUPP4res *resp = &resop->nfs_resop4_u.oplookupp; DTRACE_NFSV4_1(op__lookupp__start, struct compound_state *, cs); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->vp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } *cs->statusp = resp->status = mds_do_lookup("..", 3, req, cs); /* * From NFSV4 Specification, LOOKUPP should not check for * NFS4ERR_WRONGSEC. Retrun NFS4_OK instead. */ if (resp->status == NFS4ERR_WRONGSEC) { *cs->statusp = resp->status = NFS4_OK; } final: DTRACE_NFSV4_2(op__lookupp__done, struct compound_state *, cs, LOOKUPP4res *, resp); } /*ARGSUSED2*/ static void mds_op_openattr(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { OPENATTR4args *args = &argop->nfs_argop4_u.opopenattr; OPENATTR4res *resp = &resop->nfs_resop4_u.opopenattr; vnode_t *avp = NULL; int lookup_flags = LOOKUP_XATTR, error; int exp_ro = 0; caller_context_t ct; DTRACE_NFSV4_2(op__openattr__start, struct compound_state *, cs, OPENATTR4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* * Make a couple of checks made by copen() * * Check to make sure underlying fs supports xattrs. This * is required because solaris filesystem implementations * (UFS/TMPFS) don't enforce the noxattr mount option * in VOP_LOOKUP(LOOKUP_XATTR). If fs doesn't support this * pathconf cmd or if fs supports cmd but doesn't claim * support for xattr, return NOTSUPP. It would be better * to use VOP_PATHCONF( _PC_XATTR_ENABLED) for this; however, * that cmd is not available to VOP_PATHCONF interface * (it's only implemented inside pathconf syscall)... * * Verify permission to put attributes on files (access * checks from copen). */ if ((cs->vp->v_vfsp->vfs_flag & VFS_XATTR) == 0) { error = ENOTSUP; goto error_out; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; if ((VOP_ACCESS(cs->vp, VREAD, 0, cs->cr, &ct) != 0) && (VOP_ACCESS(cs->vp, VWRITE, 0, cs->cr, &ct) != 0) && (VOP_ACCESS(cs->vp, VEXEC, 0, cs->cr, &ct) != 0)) { error = EACCES; goto error_out; } /* * The CREATE_XATTR_DIR VOP flag cannot be specified if * the file system is exported read-only -- regardless of * createdir flag. Otherwise the attrdir would be created * (assuming server fs isn't mounted readonly locally). If * VOP_LOOKUP returns ENOENT in this case, the error will * be translated into EROFS. ENOSYS is mapped to ENOTSUP * because specfs has no VOP_LOOKUP op, so the macro would * return ENOSYS. EINVAL is returned by all (current) * Solaris file system implementations when any of their * restrictions are violated (xattr(dir) can't have xattrdir). * Returning NOTSUPP is more appropriate in this case * because the object will never be able to have an attrdir. */ if (args->createdir && ! (exp_ro = rdonly4(cs->exi, cs->vp, req))) lookup_flags |= CREATE_XATTR_DIR; error = VOP_LOOKUP(cs->vp, "", &avp, NULL, lookup_flags, NULL, cs->cr, &ct, 0, NULL); if (error) { if (error == ENOENT && args->createdir && exp_ro) error = EROFS; else if (error == EINVAL || error == ENOSYS) error = ENOTSUP; goto error_out; } ASSERT(avp->v_flag & V_XATTRDIR); error = rfs4_cs_update_fh(cs, avp); VN_RELE(avp); if (error) goto error_out; /* * There is no requirement for an attrdir fh flag * because the attrdir has a vnode flag to distinguish * it from regular (non-xattr) directories. The * FH41_ATTRDIR flag is set for future sanity checks. */ FH41_SET_FLAG((nfs41_fh_fmt_t *)cs->fh.nfs_fh4_val, FH41_ATTRDIR); *cs->statusp = resp->status = NFS4_OK; goto final; error_out: *cs->statusp = resp->status = puterrno4(error); final: DTRACE_NFSV4_2(op__openattr__done, struct compound_state *, cs, OPENATTR4res *, resp); } static int do_io(int direction, vnode_t *vp, struct uio *uio, int ioflag, cred_t *cred, caller_context_t *ct) { int error; int i; clock_t delaytime; delaytime = MSEC_TO_TICK_ROUNDUP(rfs4_lock_delay); /* * Don't block on mandatory locks. If this routine returns * EAGAIN, the caller should return NFS4ERR_LOCKED. */ uio->uio_fmode = FNONBLOCK; for (i = 0; i < rfs4_maxlock_tries; i++) { if (direction == FREAD) { (void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, ct); error = VOP_READ(vp, uio, ioflag, cred, ct); VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, ct); } else { (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, ct); error = VOP_WRITE(vp, uio, ioflag, cred, ct); VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, ct); } if (error != EAGAIN) break; if (i < rfs4_maxlock_tries - 1) { delay(delaytime); delaytime *= 2; } } return (error); } /* ARGSUSED */ static void mds_op_read(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { READ4args *args = &argop->nfs_argop4_u.opread; READ4res *resp = &resop->nfs_resop4_u.opread; int error; nnode_t *nn = NULL; struct iovec iov; struct uio uio; bool_t *deleg = &cs->deleg; nfsstat4 stat; mblk_t *mp; int alloc_err = 0; caller_context_t ct; uint32_t nnioflags = 0; DTRACE_NFSV4_2(op__read__start, struct compound_state *, cs, READ4args, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } error = nnode_from_fh_v41(&nn, &cs->fh); if (error != 0) { *cs->statusp = resp->status = nnode_stat4(error, 1); goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if ((stat = nnop_check_stateid(nn, cs, FREAD, &args->stateid, FALSE, deleg, TRUE, &ct, NULL)) != NFS4_OK) { *cs->statusp = resp->status = stat; goto final; } error = nnop_io_prep(nn, &nnioflags, cs->cr, &ct, args->offset, args->count, NULL); if (error != 0) { *cs->statusp = resp->status = nnode_stat4(error, 1); goto out; } if (nnioflags & NNODE_IO_FLAG_PAST_EOF) { *cs->statusp = resp->status = NFS4_OK; resp->eof = TRUE; resp->data_len = 0; resp->data_val = NULL; resp->mblk = NULL; *cs->statusp = resp->status = NFS4_OK; goto out; } if (args->count == 0) { *cs->statusp = resp->status = NFS4_OK; resp->eof = FALSE; resp->data_len = 0; resp->data_val = NULL; resp->mblk = NULL; goto out; } /* * Do not allocate memory more than maximum allowed * transfer size */ if (args->count > rfs4_tsize(req)) args->count = rfs4_tsize(req); if (args->wlist) { mp = NULL; (void) rdma_get_wchunk(req, &iov, args->wlist); } else { /* * mp will contain the data to be sent out in the read reply. * It will be freed after the reply has been sent. * Let's roundup the data to a BYTES_PER_XDR_UNIT multiple, * so that the call to xdrmblk_putmblk() never fails. * If the first alloc of the requested size fails, then * decrease the size to something more reasonable and wait * for the allocation to occur. */ mp = allocb(RNDUP(args->count), BPRI_MED); if (mp == NULL) { if (args->count > MAXBSIZE) args->count = MAXBSIZE; mp = allocb_wait(RNDUP(args->count), BPRI_MED, STR_NOSIG, &alloc_err); } ASSERT(mp != NULL); ASSERT(alloc_err == 0); iov.iov_base = (caddr_t)mp->b_datap->db_base; iov.iov_len = args->count; } uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_SYSSPACE; uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = args->offset; uio.uio_resid = args->count; error = nnop_read(nn, &nnioflags, cs->cr, &ct, &uio, 0); if (error) { if (mp != NULL) freeb(mp); *cs->statusp = resp->status = nnode_stat4(error, 1); goto out; } *cs->statusp = resp->status = NFS4_OK; ASSERT(uio.uio_resid >= 0); resp->data_len = args->count - uio.uio_resid; if (mp) { resp->data_val = (char *)mp->b_datap->db_base; rfs_rndup_mblks(mp, resp->data_len, 0); } else { resp->data_val = (caddr_t)iov.iov_base; } resp->mblk = mp; resp->eof = (nnioflags & NNODE_IO_FLAG_EOF) ? TRUE : FALSE; out: nnop_io_release(nn, nnioflags, &ct); final: if (nn != NULL) nnode_rele(&nn); DTRACE_NFSV4_2(op__read__done, struct compound_state *, cs, READ4res *, resp); } /*ARGSUSED*/ static void mds_op_read_free(nfs_resop4 *resop) { /* Common function for NFSv4.0 and NFSv4.1 */ rfs4_op_read_free(resop); } /* ARGSUSED */ static void mds_op_putpubfh(nfs_argop4 *args, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { PUTPUBFH4res *resp = &resop->nfs_resop4_u.opputpubfh; int error; vnode_t *vp; struct exportinfo *exi, *sav_exi; nfs41_fh_fmt_t *fhp; fid_t exp_fid; DTRACE_NFSV4_1(op__putpubfh__start, struct compound_state *, cs); rfs4_cs_invalidate_fh(cs); if (cs->cr) crfree(cs->cr); cs->cr = crdup(cs->basecr); vp = exi_public->exi_vp; if (vp == NULL) { *cs->statusp = resp->status = NFS4ERR_SERVERFAULT; goto final; } sav_exi = cs->exi; cs->exi = exi_public; error = rfs4_cs_update_fh(cs, vp); if (error != 0) { cs->exi = sav_exi; *cs->statusp = resp->status = puterrno4(error); goto final; } if (exi_public == exi_root) { /* * No filesystem is actually shared public, so we default * to exi_root. In this case, we must check whether root * is exported. */ fhp = (nfs41_fh_fmt_t *)cs->fh.nfs_fh4_val; exp_fid.fid_len = fhp->fh.v1.export_fid.len; bcopy(fhp->fh.v1.export_fid.val, exp_fid.fid_data, exp_fid.fid_len); /* * if root filesystem is exported, the exportinfo struct that we * should use is what checkexport4 returns, because root_exi is * actually a mostly empty struct. */ exi = checkexport4(&fhp->fh.v1.export_fsid, &exp_fid, NULL); cs->exi = ((exi != NULL) ? exi : exi_public); } if ((resp->status = call_checkauth4(cs, req)) != NFS4_OK) { cs->exi = sav_exi; rfs4_cs_invalidate_fh(cs); goto final; } *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__putpubfh__done, struct compound_state *, cs, PUTPUBFH4res *, resp); } /* * XXX - issue with put*fh operations. * * let us assume that /export/home is shared via NFS and a NFS client * wishes to mount /export/home/joe. * * If /export, home, or joe have restrictive search permissions, then * the NFS Server should not return a filehandle to the client. * * This case is easy to enforce. However, the NFS Client does not know * which security flavor should be used until the pathname has been * fully resolved. In addition there is another complication for uid * mapping. If the credential being used is root, the default behaviour * will be to map it to the anonymous user. However the NFS Server can not * map it until the pathname has been fully resolved. * * XXX: JEFF: Proposed solution. * * Luckily, SECINFO uses a full pathname. So what we will * have to do in mds_op_lookup is check that flavor of * the target object matches that of the request, and if root was the * caller, check for the root= and anon= options, and if necessary, * repeat the lookup using the right cred_t. * * But that's not done yet. */ /* ARGSUSED */ static void mds_op_putfh(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { PUTFH4args *args = &argop->nfs_argop4_u.opputfh; PUTFH4res *resp = &resop->nfs_resop4_u.opputfh; nfs41_fh_fmt_t *fhp = NULL; fid_t exp_fid; int error; DTRACE_NFSV4_2(op__putfh__start, struct compound_state *, cs, PUTFH4args *, args); rfs4_cs_invalidate_fh(cs); if (cs->cr) { crfree(cs->cr); cs->cr = NULL; } /* * Check exportinfo only if it's a FH41_TYPE_NFS filehandle. * If the filehandle is otherwise incorrect, * nnode_from_fh_v41() will return an error. */ fhp = (nfs41_fh_fmt_t *)args->object.nfs_fh4_val; if (fhp->type == FH41_TYPE_NFS) { exp_fid.fid_len = fhp->fh.v1.export_fid.len; bcopy(fhp->fh.v1.export_fid.val, exp_fid.fid_data, exp_fid.fid_len); cs->exi = checkexport4(&fhp->fh.v1.export_fsid, &exp_fid, NULL); if (cs->exi == NULL) { *cs->statusp = resp->status = NFS4ERR_STALE; DTRACE_PROBE(nfss41__e__chkexp); goto final; } /* * DS doesn't have vnode on its pNFS dataset. So get * vnode only when we deal with MDS and then FH41_TYPE_NFS * filehandle. */ cs->vp = nfs41_fhtovp(&args->object, cs->exi, cs->statusp); if (cs->vp == NULL) { resp->status = *cs->statusp; goto final; } } cs->cr = crdup(cs->basecr); ASSERT(cs->cr != NULL); if (fhp->type == FH41_TYPE_NFS) { if ((resp->status = call_checkauth4(cs, req)) != NFS4_OK) { crfree(cs->cr); cs->cr = NULL; rfs4_cs_invalidate_fh(cs); *cs->statusp = resp->status; DTRACE_PROBE(nfss41__e__fail_auth); goto final; } } else if (fhp->type == FH41_TYPE_DMU_DS) cs->access = CS_ACCESS_OK; nfs_fh4_copy(&args->object, &cs->fh); *cs->statusp = resp->status = NFS4_OK; cs->deleg = FALSE; final: DTRACE_NFSV4_2(op__putfh__done, struct compound_state *, cs, PUTFH4res *, resp); } /* ARGSUSED */ static void mds_op_putrootfh(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { PUTROOTFH4res *resp = &resop->nfs_resop4_u.opputrootfh; int error; fid_t fid; struct exportinfo *exi, *sav_exi; DTRACE_NFSV4_1(op__putrootfh__start, struct compound_state *, cs); rfs4_cs_invalidate_fh(cs); if (cs->cr) crfree(cs->cr); cs->cr = crdup(cs->basecr); /* * Using rootdir, the system root vnode, * get its fid. */ bzero(&fid, sizeof (fid)); fid.fid_len = MAXFIDSZ; error = vop_fid_pseudo(rootdir, &fid); if (error != 0) { *cs->statusp = resp->status = puterrno4(error); goto final; } /* * Then use the root fsid & fid it to find out if it's exported * * If the server root isn't exported directly, then * it should at least be a pseudo export based on * one or more exports further down in the server's * file tree. */ exi = checkexport4(&rootdir->v_vfsp->vfs_fsid, &fid, NULL); if (exi == NULL || exi->exi_export.ex_flags & EX_PUBLIC) { DTRACE_PROBE(nfss41__e__chkexp); *cs->statusp = resp->status = NFS4ERR_SERVERFAULT; goto final; } /* * Now make a filehandle based on the root * export and root vnode. */ sav_exi = cs->exi; cs->exi = exi; error = rfs4_cs_update_fh(cs, rootdir); if (error != 0) { cs->exi = sav_exi; *cs->statusp = resp->status = puterrno4(error); goto final; } if ((resp->status = call_checkauth4(cs, req)) != NFS4_OK) { rfs4_cs_invalidate_fh(cs); cs->exi = sav_exi; goto final; } *cs->statusp = resp->status = NFS4_OK; cs->deleg = FALSE; final: DTRACE_NFSV4_2(op__putrootfh__done, struct compound_state *, cs, PUTROOTFH4res *, resp); } /* * readlink: args: CURRENT_FH. * res: status. If success - CURRENT_FH unchanged, return linktext. */ /* ARGSUSED */ static void mds_op_readlink(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { READLINK4res *resp = &resop->nfs_resop4_u.opreadlink; int error; vnode_t *vp; struct iovec iov; struct vattr va; struct uio uio; char *data; caller_context_t ct; DTRACE_NFSV4_1(op__readlink__start, struct compound_state *, cs); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* CURRENT_FH: directory */ vp = cs->vp; if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (vp->v_type == VDIR) { *cs->statusp = resp->status = NFS4ERR_ISDIR; goto final; } if (vp->v_type != VLNK) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; va.va_mask = AT_MODE; error = VOP_GETATTR(vp, &va, 0, cs->cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } if (MANDLOCK(vp, va.va_mode)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } data = kmem_alloc(MAXPATHLEN + 1, KM_SLEEP); iov.iov_base = data; iov.iov_len = MAXPATHLEN; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_segflg = UIO_SYSSPACE; uio.uio_extflg = UIO_COPY_CACHED; uio.uio_loffset = 0; uio.uio_resid = MAXPATHLEN; error = VOP_READLINK(vp, &uio, cs->cr, &ct); if (error) { kmem_free((caddr_t)data, (uint_t)MAXPATHLEN + 1); *cs->statusp = resp->status = puterrno4(error); goto final; } *(data + MAXPATHLEN - uio.uio_resid) = '\0'; /* * treat link name as data */ (void) str_to_utf8(data, &resp->link); kmem_free((caddr_t)data, (uint_t)MAXPATHLEN + 1); *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__readlink__done, struct compound_state *, cs, READLINK4res *, resp); } /*ARGSUSED*/ static void mds_op_readlink_free(nfs_resop4 *resop) { /* Common function used for NFSv4.0 and NFSv4.1 */ rfs4_op_readlink_free(resop); } /* ARGSUSED */ static void mds_op_reclaim_complete(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { RECLAIM_COMPLETE4args *args = &argop->nfs_argop4_u.opreclaim_complete; RECLAIM_COMPLETE4res *resp = &resop->nfs_resop4_u.opreclaim_complete; rfs4_client_t *cp; cp = cs->cp; if (cp->rc_reclaim_completed) { *cs->statusp = resp->rcr_status = NFS4ERR_COMPLETE_ALREADY; return; } if (args->rca_one_fs) { /* do what? we don't track this */ *cs->statusp = resp->rcr_status = NFS4_OK; return; } cp->rc_reclaim_completed = 1; /* did we have reclaimable state stored for this client? */ if (cp->rc_can_reclaim) atomic_add_32(&(cs->instp->reclaim_cnt), -1); *cs->statusp = resp->rcr_status = NFS4_OK; } /* * short utility function to lookup a file and recall the delegation */ static rfs4_file_t * mds_lookup_and_findfile(vnode_t *dvp, char *nm, vnode_t **vpp, int *lkup_error, struct compound_state *cs) { vnode_t *vp; rfs4_file_t *fp = NULL; bool_t fcreate = FALSE; int error; if (vpp) *vpp = NULL; if ((error = VOP_LOOKUP(dvp, nm, &vp, NULL, 0, NULL, cs->cr, NULL, 0, NULL)) == 0) { if (vp->v_type == VREG) fp = rfs4_findfile(cs->instp, vp, NULL, &fcreate); if (vpp) *vpp = vp; else VN_RELE(vp); } if (lkup_error) *lkup_error = error; return (fp); } int do_ctl_mds_remove(vnode_t *vp, mds_layout_t *layout, compound_state_t *cs) { fid_t fid; nfs41_fid_t nfs41_fid; int error = 0; /* * Use the file layout to determine which data servers to * send DS_REMOVEs to. If the layout is not cached in the * rfs4_file_t either this means that we do not have a layout * or it needs to be read in from disk. Right now, we do not * attempt to read the layout in from disk, but future phases * of REMOVE handling will take this into consideration. * * Known Problems with this implementation of REMOVE: * 1. Not attempting to read a layout from disk could mean * that if an on-disk layout did exist, storage on the data * servers will not be freed. * * 2. The server populates the layout stored in the rfs4_file_t * when it receives a LAYOUTGET. If the file has been written * (perhaps in a past server instance), but no clients have * issued new LAYOUTGETs, we will not have a cached layout and * we will not free space on the data servers. * * 3. If any of the DS_REMOVE calls to the data servers fail * the errors are ignored and will not be retried. This may * cause leaked space on the the data server. */ bzero(&fid, sizeof (fid)); fid.fid_len = MAXFIDSZ; error = vop_fid_pseudo(vp, &fid); if (error) { DTRACE_NFSV4_1(nfss__e__vop_fid_pseudo_failed, int, error); return (error); } else { nfs41_fid.len = fid.fid_len; bcopy(fid.fid_data, nfs41_fid.val, nfs41_fid.len); } error = ctl_mds_clnt_remove_file(cs->instp, cs->exi->exi_fsid, &nfs41_fid, layout); return (error); } /* * remove: args: CURRENT_FH: directory; name. * res: status. If success - CURRENT_FH unchanged, return change_info * for directory. */ /* ARGSUSED */ static void mds_op_remove(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { REMOVE4args *args = &argop->nfs_argop4_u.opremove; REMOVE4res *resp = &resop->nfs_resop4_u.opremove; int error; vnode_t *dvp, *vp; struct vattr bdva, idva, adva; char *nm; uint_t len; rfs4_file_t *fp; int in_crit = 0; bslabel_t *clabel; caller_context_t ct; DTRACE_NFSV4_2(op__remove__start, struct compound_state *, cs, REMOVE4args *, args); /* CURRENT_FH: directory */ dvp = cs->vp; if (dvp == NULL) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } /* * If there is an unshared filesystem mounted on this vnode, * Do not allow to remove anything in this directory. */ if (vn_ismntpt(dvp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (dvp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } if (!utf8_dir_verify(&args->target)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } /* * Lookup the file so that we can check if it's a directory */ nm = utf8_to_fn(&args->target, &len, NULL); if (nm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (len > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(nm, len); goto final; } if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; kmem_free(nm, len); goto final; } /* * Lookup the file to determine type and while we are see if * there is a file struct around and check for delegation. * We don't need to acquire va_seq before this lookup, if * it causes an update, cinfo.before will not match, which will * trigger a cache flush even if atomic is TRUE. */ fp = mds_lookup_and_findfile(dvp, nm, &vp, &error, cs); if (vp != NULL) { if (rfs4_check_delegated(FWRITE, vp, TRUE, TRUE, TRUE, NULL)) { VN_RELE(vp); rfs4_file_rele(fp); *cs->statusp = resp->status = NFS4ERR_DELAY; kmem_free(nm, len); goto final; } } else { /* Didn't find anything to remove */ *cs->statusp = resp->status = error; kmem_free(nm, len); goto final; } if (nbl_need_check(vp)) { nbl_start_crit(vp, RW_READER); in_crit = 1; if (nbl_conflict(vp, NBL_REMOVE, 0, 0, 0, &ct)) { *cs->statusp = resp->status = NFS4ERR_FILE_OPEN; kmem_free(nm, len); nbl_end_crit(vp); VN_RELE(vp); if (fp) { rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); } goto final; } } /* check label before allowing removal */ if (is_system_labeled()) { ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__opremove__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, vp, EQUALITY_CHECK, cs->exi)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; kmem_free(nm, len); if (in_crit) nbl_end_crit(vp); VN_RELE(vp); if (fp) { rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); } goto final; } } } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* Get dir "before" change value */ bdva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(dvp, &bdva, 0, cs->cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); kmem_free(nm, len); if (in_crit) nbl_end_crit(vp); VN_RELE(vp); if (fp) { rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); } goto final; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.before, bdva.va_ctime); /* Actually do the REMOVE operation */ if (vp->v_type == VDIR) { /* * Can't remove a directory that has a mounted-on filesystem. */ if (vn_ismntpt(vp)) { error = EACCES; } else { /* * System V defines rmdir to return EEXIST, * not * ENOTEMPTY, if the directory is not * empty. A System V NFS server needs to map * NFS4ERR_EXIST to NFS4ERR_NOTEMPTY to * transmit over the wire. */ if ((error = VOP_RMDIR(dvp, nm, rootdir, cs->cr, &ct, 0)) == EEXIST) error = ENOTEMPTY; if (error == 0) nnode_vnode_invalidate(vp); } } else if ((error = VOP_REMOVE(dvp, nm, cs->cr, &ct, 0)) == 0) { struct vattr va; /* * This is va_seq safe because we are not * manipulating dvp. */ va.va_mask = AT_NLINK; if (!VOP_GETATTR(vp, &va, 0, cs->cr, &ct) && va.va_nlink == 0) { if (in_crit) { nbl_end_crit(vp); in_crit = 0; } /* Need to clear nnode from cache */ nnode_vnode_invalidate(vp); if (pnfs_enabled(cs->exi)) { mds_layout_t *l; /* Remove the layout */ l = pnfs_delete_mds_layout(vp); /* * Remove objects on data servers. * Ignore errors for now.. */ if (l) { do_ctl_mds_remove(vp, l, cs); mds_layout_put(l); } } /* Remove state on file remove */ if (fp) rfs4_close_all_state(fp); } } if (in_crit) nbl_end_crit(vp); VN_RELE(vp); if (fp) { rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); fp = NULL; } kmem_free(nm, len); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } /* * Get the initial "after" sequence number, if it fails, set to zero */ idva.va_mask = AT_SEQ; if (VOP_GETATTR(dvp, &idva, 0, cs->cr, &ct)) idva.va_seq = 0; /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(dvp, 0, cs->cr, &ct); /* * Get "after" change value, if it fails, simply return the * before value. */ adva.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &adva, 0, cs->cr, &ct)) { adva.va_ctime = bdva.va_ctime; adva.va_seq = 0; } NFS4_SET_FATTR4_CHANGE(resp->cinfo.after, adva.va_ctime); /* * The cinfo.atomic = TRUE only if we have * non-zero va_seq's, and it has incremented by exactly one * during the VOP_REMOVE/RMDIR and it didn't change during * the VOP_FSYNC. */ if (bdva.va_seq && idva.va_seq && adva.va_seq && idva.va_seq == (bdva.va_seq + 1) && idva.va_seq == adva.va_seq) resp->cinfo.atomic = TRUE; else resp->cinfo.atomic = FALSE; *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__remove__done, struct compound_state *, cs, REMOVE4res *, resp); } /* * rename: args: SAVED_FH: from directory, CURRENT_FH: target directory, * oldname and newname. * res: status. If success - CURRENT_FH unchanged, return change_info * for both from and target directories. */ /* ARGSUSED */ static void mds_op_rename(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { RENAME4args *args = &argop->nfs_argop4_u.oprename; RENAME4res *resp = &resop->nfs_resop4_u.oprename; int error; vnode_t *odvp; vnode_t *ndvp; vnode_t *srcvp, *targvp; struct vattr obdva, oidva, oadva; struct vattr nbdva, nidva, nadva; char *onm, *nnm; uint_t olen, nlen; rfs4_file_t *fp, *sfp; int in_crit_src, in_crit_targ; int fp_rele_grant_hold, sfp_rele_grant_hold; bslabel_t *clabel; caller_context_t ct; DTRACE_NFSV4_2(op__rename__start, struct compound_state *, cs, RENAME4args *, args); fp = sfp = NULL; srcvp = targvp = NULL; in_crit_src = in_crit_targ = 0; fp_rele_grant_hold = sfp_rele_grant_hold = 0; if (!rfs4_cs_has_fh(cs) || !rfs4_cs_has_savedfh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* CURRENT_FH: target directory, SAVED_FH: from directory */ ndvp = cs->vp; odvp = cs->saved_vp; if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } /* * If there is an unshared filesystem mounted on this vnode, * do not allow to rename objects in this directory. */ if (vn_ismntpt(odvp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } /* * If there is an unshared filesystem mounted on this vnode, * do not allow to rename to this directory. */ if (vn_ismntpt(ndvp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } if (odvp->v_type != VDIR || ndvp->v_type != VDIR) { *cs->statusp = resp->status = NFS4ERR_NOTDIR; goto final; } if (cs->saved_exi != cs->exi) { *cs->statusp = resp->status = NFS4ERR_XDEV; goto final; } if (!utf8_dir_verify(&args->oldname)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if (!utf8_dir_verify(&args->newname)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } onm = utf8_to_fn(&args->oldname, &olen, NULL); if (onm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } nnm = utf8_to_fn(&args->newname, &nlen, NULL); if (nnm == NULL) { *cs->statusp = resp->status = NFS4ERR_INVAL; kmem_free(onm, olen); goto final; } if (olen > MAXNAMELEN || nlen > MAXNAMELEN) { *cs->statusp = resp->status = NFS4ERR_NAMETOOLONG; kmem_free(onm, olen); kmem_free(nnm, nlen); goto final; } if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; kmem_free(onm, olen); kmem_free(nnm, nlen); goto final; } /* check label of the target dir */ if (is_system_labeled()) { ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__oprename__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, ndvp, EQUALITY_CHECK, cs->exi)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } } } /* * Is the source a file and have a delegation? * We don't need to acquire va_seq before these lookups, if * it causes an update, cinfo.before will not match, which will * trigger a cache flush even if atomic is TRUE. */ sfp = mds_lookup_and_findfile(odvp, onm, &srcvp, &error, cs); if (srcvp != NULL) { if (rfs4_check_delegated(FWRITE, srcvp, TRUE, TRUE, TRUE, NULL)) { *cs->statusp = resp->status = NFS4ERR_DELAY; goto err_out; } } else { *cs->statusp = resp->status = puterrno4(error); kmem_free(onm, olen); kmem_free(nnm, nlen); goto final; } sfp_rele_grant_hold = 1; /* Does the destination exist and a file and have a delegation? */ fp = mds_lookup_and_findfile(ndvp, nnm, &targvp, NULL, cs); if (targvp != NULL) { if (rfs4_check_delegated(FWRITE, targvp, TRUE, TRUE, TRUE, NULL)) { *cs->statusp = resp->status = NFS4ERR_DELAY; goto err_out; } } fp_rele_grant_hold = 1; /* Check for NBMAND lock on both source and target */ if (nbl_need_check(srcvp)) { nbl_start_crit(srcvp, RW_READER); in_crit_src = 1; if (nbl_conflict(srcvp, NBL_RENAME, 0, 0, 0, &ct)) { *cs->statusp = resp->status = NFS4ERR_FILE_OPEN; goto err_out; } } if (targvp && nbl_need_check(targvp)) { nbl_start_crit(targvp, RW_READER); in_crit_targ = 1; if (nbl_conflict(targvp, NBL_REMOVE, 0, 0, 0, &ct)) { *cs->statusp = resp->status = NFS4ERR_FILE_OPEN; goto err_out; } } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* Get source "before" change value */ obdva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(odvp, &obdva, 0, cs->cr, &ct); if (!error) { nbdva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(ndvp, &nbdva, 0, cs->cr, &ct); } if (error) { *cs->statusp = resp->status = puterrno4(error); goto err_out; } NFS4_SET_FATTR4_CHANGE(resp->source_cinfo.before, obdva.va_ctime); NFS4_SET_FATTR4_CHANGE(resp->target_cinfo.before, nbdva.va_ctime); if ((error = VOP_RENAME(odvp, onm, ndvp, nnm, cs->cr, &ct, 0)) == 0 && fp != NULL) { struct vattr va; vnode_t *tvp; rfs4_dbe_lock(fp->rf_dbe); tvp = fp->rf_vp; if (tvp) VN_HOLD(tvp); rfs4_dbe_unlock(fp->rf_dbe); if (tvp) { va.va_mask = AT_NLINK; if (!VOP_GETATTR(tvp, &va, 0, cs->cr, &ct) && va.va_nlink == 0) { /* The file is gone and so should the state */ if (in_crit_targ) { nbl_end_crit(targvp); in_crit_targ = 0; } rfs4_close_all_state(fp); } VN_RELE(tvp); } } if (error == 0) { char *tmp; /* fix the path name for the renamed file */ mutex_enter(&srcvp->v_lock); tmp = srcvp->v_path; srcvp->v_path = NULL; mutex_exit(&srcvp->v_lock); vn_setpath(rootdir, ndvp, srcvp, nnm, nlen - 1); if (tmp != NULL) kmem_free(tmp, strlen(tmp) + 1); } if (in_crit_src) nbl_end_crit(srcvp); if (srcvp) VN_RELE(srcvp); if (in_crit_targ) nbl_end_crit(targvp); if (targvp) VN_RELE(targvp); if (sfp) { rfs4_clear_dont_grant(cs->instp, sfp); rfs4_file_rele(sfp); sfp = NULL; } if (fp) { rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); fp = NULL; } kmem_free(onm, olen); kmem_free(nnm, nlen); /* * Get the initial "after" sequence number, if it fails, set to zero */ oidva.va_mask = AT_SEQ; if (VOP_GETATTR(odvp, &oidva, 0, cs->cr, &ct)) oidva.va_seq = 0; nidva.va_mask = AT_SEQ; if (VOP_GETATTR(ndvp, &nidva, 0, cs->cr, &ct)) nidva.va_seq = 0; /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(odvp, 0, cs->cr, &ct); (void) VOP_FSYNC(ndvp, 0, cs->cr, &ct); if (error) { *cs->statusp = resp->status = puterrno4(error); goto final; } /* * Get "after" change values, if it fails, simply return the * before value. */ oadva.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(odvp, &oadva, 0, cs->cr, &ct)) { oadva.va_ctime = obdva.va_ctime; oadva.va_seq = 0; } nadva.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(ndvp, &nadva, 0, cs->cr, &ct)) { nadva.va_ctime = nbdva.va_ctime; nadva.va_seq = 0; } NFS4_SET_FATTR4_CHANGE(resp->source_cinfo.after, oadva.va_ctime); NFS4_SET_FATTR4_CHANGE(resp->target_cinfo.after, nadva.va_ctime); /* * The cinfo.atomic = TRUE only if we have * non-zero va_seq's, and it has incremented by exactly one * during the VOP_RENAME and it didn't change during the VOP_FSYNC. */ if (obdva.va_seq && oidva.va_seq && oadva.va_seq && oidva.va_seq == (obdva.va_seq + 1) && oidva.va_seq == oadva.va_seq) resp->source_cinfo.atomic = TRUE; else resp->source_cinfo.atomic = FALSE; if (nbdva.va_seq && nidva.va_seq && nadva.va_seq && nidva.va_seq == (nbdva.va_seq + 1) && nidva.va_seq == nadva.va_seq) resp->target_cinfo.atomic = TRUE; else resp->target_cinfo.atomic = FALSE; *cs->statusp = resp->status = NFS4_OK; goto final; err_out: kmem_free(onm, olen); kmem_free(nnm, nlen); if (in_crit_src) nbl_end_crit(srcvp); if (in_crit_targ) nbl_end_crit(targvp); if (targvp) VN_RELE(targvp); if (srcvp) VN_RELE(srcvp); if (sfp) { if (sfp_rele_grant_hold) rfs4_clear_dont_grant(cs->instp, sfp); rfs4_file_rele(sfp); } if (fp) { if (fp_rele_grant_hold) rfs4_clear_dont_grant(cs->instp, fp); rfs4_file_rele(fp); } final: DTRACE_NFSV4_2(op__rename__done, struct compound_state *, cs, RENAME4res *, resp); } /* ARGSUSED */ static void mds_op_restorefh(nfs_argop4 *args, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { RESTOREFH4res *resp = &resop->nfs_resop4_u.oprestorefh; DTRACE_NFSV4_1(op__restorefh__start, struct compound_state *, cs); /* No need to check cs->access - we are not accessing any object */ if (!rfs4_cs_has_savedfh(cs)) { *cs->statusp = resp->status = NFS4ERR_RESTOREFH; goto final; } rfs4_cs_invalidate_fh(cs); cs->vp = cs->saved_vp; VN_HOLD(cs->vp); cs->exi = cs->saved_exi; nfs_fh4_copy(&cs->saved_fh, &cs->fh); *cs->statusp = resp->status = NFS4_OK; cs->deleg = FALSE; rfs4_cs_invalidate_savedfh(cs); final: DTRACE_NFSV4_2(op__restorefh__done, struct compound_state *, cs, RESTOREFH4res *, resp); } /* ARGSUSED */ static void mds_op_savefh(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { SAVEFH4res *resp = &resop->nfs_resop4_u.opsavefh; DTRACE_NFSV4_1(op__savefh__start, struct compound_state *, cs); /* No need to check cs->access - we are not accessing any object */ if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } rfs4_cs_invalidate_savedfh(cs); cs->saved_vp = cs->vp; VN_HOLD(cs->saved_vp); cs->saved_exi = cs->exi; /* * since SAVEFH is fairly rare, don't alloc space for its fh * unless necessary. */ if (cs->saved_fh.nfs_fh4_val == NULL) { cs->saved_fh.nfs_fh4_val = kmem_alloc(NFS4_FHSIZE, KM_SLEEP); } nfs_fh4_copy(&cs->fh, &cs->saved_fh); *cs->statusp = resp->status = NFS4_OK; final: DTRACE_NFSV4_2(op__savefh__done, struct compound_state *, cs, SAVEFH4res *, resp); } /* ARGSUSED */ static void mds_op_setattr(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { SETATTR4args *args = &argop->nfs_argop4_u.opsetattr; SETATTR4res *resp = &resop->nfs_resop4_u.opsetattr; bslabel_t *clabel; DTRACE_NFSV4_2(op__setattr__start, struct compound_state *, cs, SETATTR4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } /* * If there is an unshared filesystem mounted on this vnode, * do not allow to setattr on this vnode. */ if (vn_ismntpt(cs->vp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } resp->attrsset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); if (rdonly4(cs->exi, cs->vp, req)) { *cs->statusp = resp->status = NFS4ERR_ROFS; goto final; } /* check label before setting attributes */ if (is_system_labeled()) { ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__opsetattr__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, cs->vp, EQUALITY_CHECK, cs->exi)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } } } *cs->statusp = resp->status = mds_setattr(&resp->attrsset, &args->obj_attributes, cs, &args->stateid); final: DTRACE_NFSV4_2(op__setattr__done, struct compound_state *, cs, SETATTR4res *, resp); } /* ARGSUSED */ static void mds_op_write(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { WRITE4args *args = &argop->nfs_argop4_u.opwrite; WRITE4res *resp = &resop->nfs_resop4_u.opwrite; nnode_io_flags_t nnioflags = NNODE_IO_FLAG_WRITE; int error; nnode_t *nn = NULL; u_offset_t rlimit; struct uio uio; struct iovec iov[NFS_MAX_IOVECS]; struct iovec *iovp = iov; int iovcnt; int ioflag; cred_t *savecred, *cr; bool_t *deleg = &cs->deleg; nfsstat4 stat; caller_context_t ct; DTRACE_NFSV4_2(op__write__start, struct compound_state *, cs, WRITE4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } error = nnode_from_fh_v41(&nn, &cs->fh); if (error != 0) { *cs->statusp = resp->status = nnode_stat4(error, 1); goto final; } /* * cs->access is set in call_checkauth4 called in putfh code. The * current putfh code will not invoke these security functions on the * DS codepath since it goes by the filehandle, not by nnodes per se. */ if (cs->access == CS_ACCESS_DENIED) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto final; } cr = cs->cr; if ((cs->vp != NULL) && (rdonly4(cs->exi, cs->vp, req))) { *cs->statusp = resp->status = NFS4ERR_ROFS; goto final; } if ((stat = nnop_check_stateid(nn, cs, FWRITE, &args->stateid, FALSE, deleg, TRUE, &ct, NULL)) != NFS4_OK) { *cs->statusp = resp->status = stat; goto out; } error = nnop_io_prep(nn, &nnioflags, cr, &ct, args->offset, args->data_len, NULL); if (error != 0) goto err; if (args->data_len == 0) { *cs->statusp = resp->status = NFS4_OK; resp->count = 0; resp->committed = args->stable; resp->writeverf = cs->instp->Write4verf; goto out; } if (args->mblk != NULL) { mblk_t *m; uint_t bytes, round_len; iovcnt = 0; bytes = 0; round_len = roundup(args->data_len, BYTES_PER_XDR_UNIT); for (m = args->mblk; m != NULL && bytes < round_len; m = m->b_cont) { iovcnt++; bytes += MBLKL(m); } #ifdef DEBUG /* should have ended on an mblk boundary */ if (bytes != round_len) { printf("bytes=0x%x, round_len=0x%x, req len=0x%x\n", bytes, round_len, args->data_len); printf("args=%p, args->mblk=%p, m=%p", (void *)args, (void *)args->mblk, (void *)m); ASSERT(bytes == round_len); } #endif if (iovcnt <= NFS_MAX_IOVECS) { iovp = iov; } else { iovp = kmem_alloc(sizeof (*iovp) * iovcnt, KM_SLEEP); } mblk_to_iov(args->mblk, iovcnt, iovp); } else { iovcnt = 1; iovp = iov; iovp->iov_base = args->data_val; iovp->iov_len = args->data_len; } uio.uio_iov = iovp; uio.uio_iovcnt = iovcnt; uio.uio_segflg = UIO_SYSSPACE; uio.uio_extflg = UIO_COPY_DEFAULT; uio.uio_loffset = args->offset; uio.uio_resid = args->data_len; uio.uio_llimit = curproc->p_fsz_ctl; rlimit = uio.uio_llimit - args->offset; if (rlimit < (u_offset_t)uio.uio_resid) uio.uio_resid = (int)rlimit; if (args->stable == UNSTABLE4) ioflag = 0; else if (args->stable == FILE_SYNC4) ioflag = FSYNC; else if (args->stable == DATA_SYNC4) ioflag = FDSYNC; else { if (iovp != iov) kmem_free(iovp, sizeof (*iovp) * iovcnt); *cs->statusp = resp->status = NFS4ERR_INVAL; goto out; } /* * We're changing creds because VM may fault and we need * the cred of the current thread to be used if quota * checking is enabled. */ savecred = curthread->t_cred; curthread->t_cred = cr; error = nnop_write(nn, &nnioflags, &uio, ioflag, cr, &ct, NULL); curthread->t_cred = savecred; if (iovp != iov) kmem_free(iovp, sizeof (*iovp) * iovcnt); err: if (error) { *cs->statusp = resp->status = nnode_stat4(error, 1); goto out; } *cs->statusp = resp->status = NFS4_OK; resp->count = args->data_len - uio.uio_resid; if (ioflag == 0) resp->committed = UNSTABLE4; else resp->committed = FILE_SYNC4; resp->writeverf = cs->instp->Write4verf; nnop_update(nn, nnioflags, cr, &ct, args->offset + resp->count); out: nnop_io_release(nn, nnioflags, &ct); final: if (nn != NULL) nnode_rele(&nn); DTRACE_NFSV4_2(op__write__done, struct compound_state *, cs, WRITE4res *, resp); } static void rfs41_op_dispatch(compound_state_t *cs, COMPOUND4args *args, COMPOUND4res *resp, struct svc_req *req) { nfs_argop4 *argop; nfs_resop4 *resop; uint_t op; argop = &args->array[cs->op_ndx]; resop = &resp->array[cs->op_ndx]; op = (uint_t)argop->argop; resop->resop = op; if (op >= OP_ILLEGAL_IDX) { /* * This is effectively dead code since XDR code * will have already returned BADXDR if op doesn't * decode to legal value. This only done for a * day when XDR code doesn't verify v4 opcodes. * or some bozo didn't update the operation dispatch * table. */ rfsproccnt_v41_ptr[OP_ILLEGAL_IDX].value.ui64++; mds_op_illegal(argop, resop, req, cs); DTRACE_PROBE(nfss41__e__operation_tilt); goto bail; } /* * First if this is a bad operation stop * the compound processing right now! */ if (mds_disptab[op].op_flag == DISP_OP_BAD) { mds_op_illegal(argop, resop, req, cs); DTRACE_PROBE1(nfss41__e__disp_op_inval, int, op); goto bail; } if (seq_chk_limits(argop, resop, cs)) { DTRACE_PROBE2(nfss41__i__scl_error, char *, nfs4_op_to_str(op), char *, nfs41_strerror(*cs->statusp)); } else { (*mds_disptab[op].dis_op)(argop, resop, req, cs); } bail: if (*cs->statusp != NFS4_OK) cs->cont = FALSE; /* * If not at last op, and if we are to stop, then * compact the results array. */ if ((cs->op_ndx + 1) < cs->op_len && !cs->cont) { nfs_resop4 *new_res = kmem_alloc( (cs->op_ndx+1) * sizeof (nfs_resop4), KM_SLEEP); bcopy(resp->array, new_res, (cs->op_ndx+1) * sizeof (nfs_resop4)); kmem_free(resp->array, cs->op_len * sizeof (nfs_resop4)); resp->array_len = cs->op_ndx + 1; resp->array = new_res; } } void rfs41_err_resp(COMPOUND4args *args, COMPOUND4res *resp, nfsstat4 err) { size_t sz; resp->array_len = 1; sz = resp->array_len * sizeof (nfs_resop4); resp->array = kmem_zalloc(sz, KM_SLEEP); resp->array[0].resop = args->array[0].argop; resp->array[0].nfs_resop4_u.opillegal.status = err; } /* ARGSUSED */ void mds_compound(compound_state_t *cs, COMPOUND4args *args, COMPOUND4res *resp, struct exportinfo *exi, struct svc_req *req, int *rv) { cred_t *cr; size_t reslen; nfsstat4 err; if (rv != NULL) *rv = 0; ASSERT(exi == NULL); cr = crget(); ASSERT(cr != NULL); if (sec_svc_getcred(req, cr, &cs->principal, &cs->nfsflavor) == 0) { DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs, COMPOUND4args *, args); crfree(cr); DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs, COMPOUND4res *, resp); svcerr_badcred(req->rq_xprt); if (rv != NULL) *rv = 1; return; } if (cs->basecr != NULL) crfree(cs->basecr); cs->basecr = cr; cs->req = req; /* * Form a reply tag by copying over the reqeuest tag. */ resp->tag.utf8string_val = kmem_alloc(args->tag.utf8string_len, KM_SLEEP); resp->tag.utf8string_len = args->tag.utf8string_len; bcopy(args->tag.utf8string_val, resp->tag.utf8string_val, resp->tag.utf8string_len); DTRACE_NFSV4_2(compound__start, struct compound_state *, &cs, COMPOUND4args *, args); /* * For now, NFS4 compound processing must be protected by * exported_lock because it can access more than one exportinfo * per compound and share/unshare can now change multiple * exinfo structs. The NFS2/3 code only refs 1 exportinfo * per proc (excluding public exinfo), and exi_count design * is sufficient to protect concurrent execution of NFS2/3 * ops along with unexport. */ rw_enter(&exported_lock, RW_READER); /* * If this is the first compound we've seen, we need to start * the instances' grace period. */ if (cs->instp->seen_first_compound == 0) { rfs4_grace_start_new(cs->instp); cs->instp->seen_first_compound = 1; } if (args->array_len == 0) goto out; err = verify_compound_args(cs, args); if (err != NFS4_OK) { *cs->statusp = err; rfs41_err_resp(args, resp, *cs->statusp); goto out; } /* * Everything kosher; allocate results array */ reslen = cs->op_len = resp->array_len = args->array_len; resp->array = kmem_zalloc(reslen * sizeof (nfs_resop4), KM_SLEEP); /* * Iterate over the compound until we have exhausted the operations * or the compound state indicates that we should terminate. */ for (cs->op_ndx = 0; cs->op_ndx < cs->op_len && cs->cont == TRUE; cs->op_ndx++) rfs41_op_dispatch(cs, args, resp, req); out: rw_exit(&exported_lock); /* * done with this compound request, free the label */ if (req->rq_label != NULL) { kmem_free(req->rq_label, sizeof (bslabel_t)); req->rq_label = NULL; } DTRACE_NFSV4_2(compound__done, struct compound_state *, &cs, COMPOUND4res *, resp); } void rfs41_compound_free(COMPOUND4res *resp) { uint_t i; if (resp->tag.utf8string_val) { UTF8STRING_FREE(resp->tag) } for (i = 0; i < resp->array_len; i++) { nfs_resop4 *resop; uint_t op; resop = &resp->array[i]; op = (uint_t)resop->resop; if (op < OP_ILLEGAL_IDX) { (*mds_disptab[op].dis_resfree)(resop); } } if (resp->array != NULL) { kmem_free(resp->array, resp->array_len * sizeof (nfs_resop4)); resp->array = NULL; resp->array_len = 0; } } delegreq_t do_41_deleg_hack(int osa) { int want_deleg; want_deleg = (osa & OPEN4_SHARE_ACCESS_WANT_DELEG_MASK); switch (want_deleg) { case OPEN4_SHARE_ACCESS_WANT_READ_DELEG: return (DELEG_READ); case OPEN4_SHARE_ACCESS_WANT_WRITE_DELEG: return (DELEG_WRITE); case OPEN4_SHARE_ACCESS_WANT_ANY_DELEG: return (DELEG_ANY); case OPEN4_SHARE_ACCESS_WANT_NO_DELEG: return (DELEG_NONE); } return (DELEG_ANY); } /* * XXX: This will go away with the SMF work for npools. */ extern mds_layout_t *mds_gen_default_layout(nfs_server_instance_t *, vnode_t *); /* * We are going to create the file, so we need to get * a layout in play for it. */ nfsstat4 mds_createfile_get_layout(struct svc_req *req, vnode_t *vp, struct compound_state *cs, caller_context_t *ct, mds_layout_t **plo) { vattr_t spe_va; int i; layout_core_t lc; int error; struct netbuf *claddr; nfsstat4 status = NFS4_OK; if (!pnfs_enabled(cs->exi)) return (NFS4ERR_LAYOUTUNAVAILABLE); spe_va.va_mask = AT_GID|AT_UID; error = VOP_GETATTR(vp, &spe_va, 0, cs->cr, ct); if (error) return (puterrno4(error)); /* * Taken from nfsauth_cache_get(): */ claddr = svc_getrpccaller(req->rq_xprt); lc.lc_mds_sids = NULL; /* * XXX: We may not be able to trust vp->v_path, * but if it is filled in, we will use it. Otherwise * we will evaluate polices ignoring the path components. */ error = nfs41_spe_allocate(&spe_va, claddr, vp->v_path, &lc, TRUE); if (error) { /* * XXX: Until we get the SMF code * in place, we handle all errors by * using the default layout of the * old prototype code * * At that point, we should return the * given error. */ *plo = mds_gen_default_layout(mds_server, vp); if (*plo == NULL) status = NFS4ERR_LAYOUTUNAVAILABLE; return (status); } *plo = pnfs_add_mds_layout(vp, &lc); if (lc.lc_mds_sids) { for (i = 0; i < lc.lc_stripe_count; i++) { kmem_free(lc.lc_mds_sids[i].val, lc.lc_mds_sids[i].len); } kmem_free(lc.lc_mds_sids, lc.lc_stripe_count * sizeof (mds_sid)); } if (*plo == NULL) status = NFS4ERR_LAYOUTUNAVAILABLE; return (status); } static inline struct fattr4 * args_to_fattr(OPEN4args *args, createmode4 mode) { struct fattr4 *fattr; ASSERT(mode != EXCLUSIVE4); if (mode == EXCLUSIVE4_1) fattr = &args->createhow4_u.ch_createboth.cva_attrs; else fattr = &args->createhow4_u.createattrs; return (fattr); } /* * If we call the spe in here, we return the new layout in *plo. */ static nfsstat4 mds_createfile(OPEN4args *args, struct svc_req *req, struct compound_state *cs, change_info4 *cinfo, attrmap4 *attrset, mds_layout_t **plo) { struct nfs4_svgetit_arg sarg; struct statvfs64 sb; nfsstat4 status = NFS4_OK; vnode_t *vp; vattr_t bva, ava, iva, cva, *vap; vnode_t *dvp; timespec32_t *mtime; char *nm = NULL; uint_t buflen; bool_t created; bool_t setsize = FALSE; len_t reqsize; int error; bool_t trunc; caller_context_t ct; component4 *component; bslabel_t *clabel; attrvers_t avers; createmode4 mode = args->mode; avers = nfs4_attrvers(cs); ASSERT(avers == AV_NFS41); sarg.sbp = &sb; dvp = cs->vp; /* Check if the file system is read only */ if (rdonly4(cs->exi, dvp, req)) return (NFS4ERR_ROFS); /* check the label of including directory */ if (is_system_labeled()) { ASSERT(req->rq_label != NULL); clabel = req->rq_label; DTRACE_PROBE2(tx__rfs4__log__info__opremove__clabel, char *, "got client label from request(1)", struct svc_req *, req); if (!blequal(&l_admin_low->tsl_label, clabel)) { if (!do_rfs_label_check(clabel, dvp, EQUALITY_CHECK, cs->exi)) { return (NFS4ERR_ACCESS); } } } /* * Get the last component of path name in nm. cs will reference * the including directory on success. */ component = &args->open_claim4_u.file; if (!utf8_dir_verify(component)) return (NFS4ERR_INVAL); nm = utf8_to_fn(component, &buflen, NULL); if (nm == NULL) return (NFS4ERR_RESOURCE); if (buflen > MAXNAMELEN) { kmem_free(nm, buflen); return (NFS4ERR_NAMETOOLONG); } bva.va_mask = AT_TYPE|AT_CTIME|AT_SEQ; error = VOP_GETATTR(dvp, &bva, 0, cs->cr, NULL); if (error) { kmem_free(nm, buflen); return (puterrno4(error)); } if (bva.va_type != VDIR) { kmem_free(nm, buflen); return (NFS4ERR_NOTDIR); } NFS4_SET_FATTR4_CHANGE(cinfo->before, bva.va_ctime); vap = sarg.vap; bzero(vap, sizeof (vattr_t)); if (mode == GUARDED4 || mode == UNCHECKED4 || mode == EXCLUSIVE4_1) { struct nfs4_ntov_table ntov; struct fattr4 *fattr; fattr = args_to_fattr(args, mode); *attrset = NFS4_EMPTY_ATTRMAP(avers); nfs4_ntov_table_init(&ntov, avers); status = do_rfs4_set_attrs(attrset, fattr, cs, &sarg, &ntov, NFS4ATTR_SETIT); nfs4_ntov_table_free(&ntov, &sarg); if (status != NFS4_OK) goto err_free; if ((vap->va_mask & AT_TYPE) && vap->va_type != VREG) { if (vap->va_type == VDIR) status = NFS4ERR_ISDIR; else if (vap->va_type == VLNK) status = NFS4ERR_SYMLINK; else status = NFS4ERR_INVAL; goto err_free; } if ((vap->va_mask & AT_MODE) == 0) { vap->va_mask |= AT_MODE; vap->va_mode = (mode_t)0600; } if (vap->va_mask & AT_SIZE) { reqsize = sarg.vap->va_size; /* Disallow create with a non-zero size */ if (reqsize != 0) { status = NFS4ERR_INVAL; goto err_free; } setsize = TRUE; } vap->va_type = VREG; vap->va_mask |= AT_TYPE; } if (mode == EXCLUSIVE4 || mode == EXCLUSIVE4_1) { verifier4 *v = &args->createhow4_u.ch_createboth.cva_verf; if (mode == EXCLUSIVE4) v = &args->createhow4_u.createverf; /* prohibit EXCL create of named attributes */ if (dvp->v_flag & V_XATTRDIR) { status = NFS4ERR_INVAL; goto err_free; } vap->va_mask |= AT_TYPE | AT_MTIME | AT_MODE; vap->va_type = VREG; /* * Ensure no time overflows. Assumes underlying * filesystem supports at least 32 bits. * Truncate nsec to usec resolution to allow valid * compares even if the underlying filesystem truncates. */ mtime = (timespec32_t *)v; vap->va_mtime.tv_sec = mtime->tv_sec % TIME32_MAX; vap->va_mtime.tv_nsec = (mtime->tv_nsec / 1000) * 1000; } status = create_vnode(dvp, nm, vap, mode, mtime, cs->cr, &vp, &created); kmem_free(nm, buflen); if (status != NFS4_OK) { *attrset = NFS4_EMPTY_ATTRMAP(avers); return (status); } trunc = (setsize && !created); if (mode != EXCLUSIVE4) { struct fattr4 *fattr = args_to_fattr(args, mode); attrmap4 createmask = fattr->attrmask; /* * True verification that object was created with correct * attrs is impossible. The attrs could have been changed * immediately after object creation. If attributes did * not verify, the only recourse for the server is to * destroy the object. Maybe if some attrs (like gid) * are set incorrectly, the object should be destroyed; * however, seems bad as a default policy. Do we really * want to destroy an object over one of the times not * verifying correctly? For these reasons, the server * currently sets bits in attrset for createattrs * that were set; however, no verification is done. * * vmask_to_nmask accounts for vattr bits set on create * [do_rfs4_set_attrs() only sets resp bits for * non-vattr/vfs bits.] * Mask off any bits we set by default so as not to return * more attrset bits than were requested in createattrs */ if (created) { nfs4_vmask_to_nmask(sarg.vap->va_mask, attrset, avers); ATTRMAP_MASK(*attrset, createmask); } else { /* * We did not create the vnode (we tried but it * already existed). In this case, the only createattr * that the spec allows the server to set is size, * and even then, it can only be set if it is 0. */ *attrset = NFS4_EMPTY_ATTRMAP(avers); if (trunc) ATTR_SET(*attrset, SIZE); } } /* * Get the initial "after" sequence number, if it fails, * set to zero, time to before. */ iva.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &iva, 0, cs->cr, NULL)) { iva.va_seq = 0; iva.va_ctime = bva.va_ctime; } /* * create_vnode attempts to create the file exclusive, * if it already exists the VOP_CREATE will fail and * may not increase va_seq. It is atomic if * we haven't changed the directory, but if it has changed * we don't know what changed it. */ if (!created) { if (bva.va_seq && iva.va_seq && bva.va_seq == iva.va_seq) cinfo->atomic = TRUE; else cinfo->atomic = FALSE; NFS4_SET_FATTR4_CHANGE(cinfo->after, iva.va_ctime); } else { /* * The entry was created, we need to sync the * directory metadata. */ (void) VOP_FSYNC(dvp, 0, cs->cr, NULL); /* * Get "after" change value, if it fails, simply return the * before value. */ ava.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &ava, 0, cs->cr, NULL)) { ava.va_ctime = bva.va_ctime; ava.va_seq = 0; } NFS4_SET_FATTR4_CHANGE(cinfo->after, ava.va_ctime); /* * The cinfo->atomic = TRUE only if we have * non-zero va_seq's, and it has incremented by exactly one * during the create_vnode and it didn't * change during the VOP_FSYNC. */ if (bva.va_seq && iva.va_seq && ava.va_seq && iva.va_seq == (bva.va_seq + 1) && iva.va_seq == ava.va_seq) cinfo->atomic = TRUE; else cinfo->atomic = FALSE; } /* Check for mandatory locking and that the size gets set. */ cva.va_mask = AT_MODE; if (setsize) cva.va_mask |= AT_SIZE; /* Assume the worst */ cs->mandlock = TRUE; if (VOP_GETATTR(vp, &cva, 0, cs->cr, NULL) == 0) { cs->mandlock = MANDLOCK(cs->vp, cva.va_mode); /* * Truncate the file if necessary; this would be * the case for create over an existing file. */ if (trunc) { int in_crit = 0; rfs4_file_t *fp; bool_t create = FALSE; /* * We are writing over an existing file. * Check to see if we need to recall a delegation. */ rfs4_hold_deleg_policy(cs->instp); if ((fp = rfs4_findfile(cs->instp, vp, NULL, &create)) != NULL) { if (rfs4_check_delegated_byfp(cs->instp, FWRITE, fp, (reqsize == 0), FALSE, FALSE, &cs->cp->rc_clientid)) { rfs4_file_rele(fp); rfs4_rele_deleg_policy(cs->instp); VN_RELE(vp); *attrset = NFS4_EMPTY_ATTRMAP(avers); return (NFS4ERR_DELAY); } rfs4_file_rele(fp); } rfs4_rele_deleg_policy(cs->instp); if (nbl_need_check(vp)) { in_crit = 1; ASSERT(reqsize == 0); nbl_start_crit(vp, RW_READER); if (nbl_conflict(vp, NBL_WRITE, 0, cva.va_size, 0, NULL)) { in_crit = 0; nbl_end_crit(vp); VN_RELE(vp); *attrset = NFS4_EMPTY_ATTRMAP(avers); return (NFS4ERR_ACCESS); } } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; cva.va_mask = AT_SIZE; cva.va_size = reqsize; (void) VOP_SETATTR(vp, &cva, 0, cs->cr, &ct); if (in_crit) nbl_end_crit(vp); } } error = rfs4_cs_update_fh(cs, vp); VN_RELE(vp); /* * Force modified data and metadata out to stable storage. */ (void) VOP_FSYNC(vp, FNODSYNC, cs->cr, NULL); if (error) { *attrset = NFS4_EMPTY_ATTRMAP(avers); return (puterrno4(error)); } /* if parent dir is attrdir, set namedattr fh flag */ if (dvp->v_flag & V_XATTRDIR) FH41_SET_FLAG((nfs41_fh_fmt_t *)cs->fh.nfs_fh4_val, FH41_NAMEDATTR); /* * if we did not create the file, we will need to check * the access bits on the file */ if (!created) { if (setsize) args->share_access |= OPEN4_SHARE_ACCESS_WRITE; status = check_open_access(args->share_access, cs, req); if (status != NFS4_OK) *attrset = NFS4_EMPTY_ATTRMAP(avers); } else { status = mds_createfile_get_layout(req, vp, cs, &ct, plo); /* * Allow mds_createfile_get_layout() to be verbose * in what it presents as a status, but be aware * that it is permissible to not generate a * layout. */ if (status == NFS4ERR_LAYOUTUNAVAILABLE) { status = NFS4_OK; } } return (status); err_free: kmem_free(nm, buflen); *attrset = NFS4_EMPTY_ATTRMAP(avers); return (status); } /* * 1) CB RACE <kill stored rs record> [done] * 2) slot reuse <kill stored rs record> [done] * 3) CB_RECALL <(typical/normal case) use new sessid in deleg_state * to find session originally granted the delegation to * issue recall over _that_ session's back channel> * XXX - <<< check spec >>> */ void rfs41_rs_record(struct compound_state *cs, stateid_type_t type, void *p) { rfs4_deleg_state_t *dsp; slot_ent_t *slotent; #ifdef DEBUG_VERBOSE /* * XXX - Do not change this to a static D probe; * this is not intended for production !!! */ ulong_t offset; char *who; who = modgetsymname((uintptr_t)caller(), &offset); #endif /* DEBUG_VERBOSE */ switch (type) { case DELEGID: /* sessid/slot/seqid + rsid */ ASSERT(cs != NULL && cs->sp != NULL); dsp = (rfs4_deleg_state_t *)p; ASSERT(dsp != NULL); #ifdef DEBUG_VERBOSE cmn_err(CE_NOTE, "rfs41_rs_record: (%s, dsp = 0x%p)", who, dsp); #endif /* DEBUG_VERBOSE */ /* delegation state id stored in rfs4_deleg_state_t */ bcopy(cs->sp->sn_sessid, dsp->rds_rs.sessid, sizeof (sessionid4)); dsp->rds_rs.seqid = cs->seqid; dsp->rds_rs.slotno = cs->slotno; rfs41_deleg_rs_hold(dsp); /* add it to slrc slot to track slot-reuse case */ slotent = slrc_slot_get(cs->sp->sn_replay, cs->slotno); ASSERT(slotent != NULL); ASSERT(slotent->se_p == NULL); mutex_enter(&slotent->se_lock); slotent->se_p = (rfs4_deleg_state_t *)dsp; mutex_exit(&slotent->se_lock); rfs4_dbe_hold(dsp->rds_dbe); /* added ref to deleg_state */ break; case LAYOUTID: /* * Layout stateid race detection will be done * using the stateid's embedded seqid field. */ /* FALLTHROUGH */ default: break; } } void rfs41_rs_erase(void *p) { rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)p; #ifdef DEBUG_VERBOSE /* * XXX - Do not change this to a static D probe; * this is not intended for production !!! */ ulong_t offset; char *who; who = modgetsymname((uintptr_t)caller(), &offset); cmn_err(CE_NOTE, "rfs41_rs_erase: (%s, dsp = 0x%p)", who, dsp); #endif /* DEBUG_VERBOSE */ ASSERT(dsp != NULL); if (dsp->rds_rs.refcnt > 0) { rfs41_deleg_rs_rele(dsp); rfs4_deleg_state_rele(dsp); } } #ifdef DEBUG /* * XXX - This is a handy way to force the server to "wait" before * granting a delegation to the requesting client (thereby * forcing the CB_RACE condition). rsec == # of secs to wait. */ int rsec = 0; #endif /*ARGSUSED*/ static void mds_do_open(struct compound_state *cs, struct svc_req *req, rfs4_openowner_t *oo, delegreq_t deleg, uint32_t access, uint32_t deny, OPEN4res *resp, int deleg_cur, mds_layout_t *plo) { rfs4_state_t *sp; rfs4_file_t *fp; bool_t screate = TRUE; bool_t fcreate = TRUE; uint32_t amodes; uint32_t dmodes; rfs4_deleg_state_t *dsp; sysid_t sysid; nfsstat4 status; caller_context_t ct; int fflags = 0; int recall = 0; int err; int first_open; /* get the file struct and hold a lock on it during initial open */ fp = rfs4_findfile_withlock(cs->instp, cs->vp, &cs->fh, &fcreate); if (fp == NULL) { DTRACE_PROBE(nfss__e__no_file); resp->status = NFS4ERR_SERVERFAULT; return; } sp = rfs4_findstate_by_owner_file(cs, oo, fp, &screate); if (sp == NULL) { DTRACE_PROBE(nfss__e__no_state); resp->status = NFS4ERR_RESOURCE; /* No need to keep any reference */ rfs4_file_rele_withunlock(fp); return; } /* try to get the sysid before continuing */ if ((status = rfs4_client_sysid(oo->ro_client, &sysid)) != NFS4_OK) { resp->status = status; rfs4_file_rele(fp); /* Not a fully formed open; "close" it */ if (screate == TRUE) rfs4_state_close(sp, FALSE, FALSE, cs->cr); rfs4_state_rele(sp); return; } /* * Assign the layout if there is one * Note that this means the file was just created. */ if (plo) { ASSERT(fp->rf_mlo == NULL); if (fp->rf_mlo) { rfs4_dbe_rele(fp->rf_mlo->mlo_dbe); } fp->rf_mlo = plo; } /* Calculate the fflags for this OPEN */ if (access & OPEN4_SHARE_ACCESS_READ) fflags |= FREAD; if (access & OPEN4_SHARE_ACCESS_WRITE) fflags |= FWRITE; rfs4_dbe_lock(sp->rs_dbe); /* * Calculate the new deny and access mode that this open is adding to * the file for this open owner; */ dmodes = (deny & ~sp->rs_share_deny); amodes = (access & ~sp->rs_share_access); first_open = (sp->rs_share_access & OPEN4_SHARE_ACCESS_BOTH) == 0; /* * Check to see the client has already sent an open for this * open owner on this file with the same share/deny modes. * If so, we don't need to check for a conflict and we don't * need to add another shrlock. If not, then we need to * check for conflicts in deny and access before checking for * conflicts in delegation. We don't want to recall a * delegation based on an open that will eventually fail based * on shares modes. */ if (dmodes || amodes) { if ((err = rfs4_share(sp, access, deny)) != 0) { rfs4_dbe_unlock(sp->rs_dbe); resp->status = err; rfs4_file_rele(fp); /* Not a fully formed open; "close" it */ if (screate == TRUE) rfs4_state_close(sp, FALSE, FALSE, cs->cr); rfs4_state_rele(sp); return; } } rfs4_dbe_lock(fp->rf_dbe); /* * Check to see if this file is delegated and if so, if a * recall needs to be done. * This only checke the delegations for this instance. If another * instance has a delegation for this file, then the conflict * detection will be done in the monitor on OPEN. We just need to * check if we have a delegation and if the calling client is the * owner. The monitor doesn't have enough info to determine if the * caller is the owner of the delegation or not. */ if (rfs4_check_recall(sp, access)) { rfs4_dbe_unlock(fp->rf_dbe); rfs4_dbe_unlock(sp->rs_dbe); rfs4_recall_deleg(fp, FALSE, sp->rs_owner->ro_client); delay(NFS4_DELEGATION_CONFLICT_DELAY); rfs4_dbe_lock(sp->rs_dbe); /* if state closed while lock was dropped */ if (sp->rs_closed) { if (dmodes || amodes) (void) rfs4_unshare(sp); rfs4_dbe_unlock(sp->rs_dbe); rfs4_file_rele(fp); /* Not a fully formed open; "close" it */ if (screate == TRUE) rfs4_state_close(sp, FALSE, FALSE, cs->cr); rfs4_state_rele(sp); resp->status = NFS4ERR_OLD_STATEID; return; } rfs4_dbe_lock(fp->rf_dbe); /* Let's see if the delegation was returned */ if (rfs4_check_recall(sp, access)) { rfs4_dbe_unlock(fp->rf_dbe); if (dmodes || amodes) (void) rfs4_unshare(sp); rfs4_dbe_unlock(sp->rs_dbe); rfs4_file_rele(fp); rfs4_update_lease(sp->rs_owner->ro_client); /* Not a fully formed open; "close" it */ if (screate == TRUE) rfs4_state_close(sp, FALSE, FALSE, cs->cr); rfs4_state_rele(sp); resp->status = NFS4ERR_DELAY; return; } } /* * the share check passed and any delegation conflict has been * taken care of, now call vop_open. * if this is the first open then call vop_open with fflags. * if not, call vn_open_upgrade with just the upgrade flags. * * if the file has been opened already, it will have the current * access mode in the state struct. if it has no share access, then * this is a new open. * * However, if this is open with CLAIM_DELEGATE_CUR, then don't * call VOP_OPEN(), just do the open upgrade. */ if (first_open && !deleg_cur) { ct.cc_sysid = sysid; ct.cc_pid = rfs4_dbe_getid(sp->rs_owner->ro_dbe); ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; err = VOP_OPEN(&cs->vp, fflags, cs->cr, &ct); if (err) { rfs4_dbe_unlock(fp->rf_dbe); if (dmodes || amodes) (void) rfs4_unshare(sp); rfs4_dbe_unlock(sp->rs_dbe); rfs4_file_rele(fp); /* Not a fully formed open; "close" it */ if (screate == TRUE) rfs4_state_close(sp, FALSE, FALSE, cs->cr); rfs4_state_rele(sp); if (err == EAGAIN && (ct.cc_flags & CC_WOULDBLOCK)) resp->status = NFS4ERR_DELAY; else resp->status = NFS4ERR_SERVERFAULT; return; } } else { /* open upgrade */ /* * calculate the fflags for the new mode that is being added * by this upgrade. */ fflags = 0; if (amodes & OPEN4_SHARE_ACCESS_READ) fflags |= FREAD; if (amodes & OPEN4_SHARE_ACCESS_WRITE) fflags |= FWRITE; vn_open_upgrade(cs->vp, fflags); } sp->rs_opened = TRUE; if (dmodes & OPEN4_SHARE_DENY_READ) fp->rf_deny_read++; if (dmodes & OPEN4_SHARE_DENY_WRITE) fp->rf_deny_write++; fp->rf_share_deny |= deny; if (amodes & OPEN4_SHARE_ACCESS_READ) fp->rf_access_read++; if (amodes & OPEN4_SHARE_ACCESS_WRITE) fp->rf_access_write++; fp->rf_share_access |= access; /* * Check for delegation here. if the deleg argument is not * DELEG_ANY, then this is a reclaim from a client and * we must honor the delegation requested. If necessary we can * set the recall flag. */ dsp = rfs4_grant_delegation(cs, deleg, sp, &recall); cs->deleg = (fp->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE); next_stateid(&sp->rs_stateid); resp->stateid = sp->rs_stateid.stateid; rfs4_dbe_unlock(fp->rf_dbe); rfs4_dbe_unlock(sp->rs_dbe); if (dsp) { rfs4_set_deleg_response(dsp, &resp->delegation, NULL, recall); rfs41_rs_record(cs, DELEGID, dsp); rfs4_deleg_state_rele(dsp); #ifdef DEBUG if (rsec) { /* add delay here to force CB_RACE; rick */ delay(SEC_TO_TICK(rsec)); } #endif } rfs4_file_rele(fp); rfs4_state_rele(sp); resp->status = NFS4_OK; } nfsstat4 mds_lookupfile(component4 *component, struct svc_req *req, struct compound_state *cs, uint32_t access, change_info4 *cinfo) { nfsstat4 status; char *nm; uint32_t len; vnode_t *dvp = cs->vp; vattr_t bva, ava, fva; int error; if (dvp == NULL) { return (NFS4ERR_NOFILEHANDLE); } if (dvp->v_type != VDIR) { return (NFS4ERR_NOTDIR); } if (!utf8_dir_verify(component)) return (NFS4ERR_INVAL); nm = utf8_to_fn(component, &len, NULL); if (nm == NULL) { return (NFS4ERR_INVAL); } if (len > MAXNAMELEN) { kmem_free(nm, len); return (NFS4ERR_NAMETOOLONG); } /* Get "before" change value */ bva.va_mask = AT_CTIME|AT_SEQ; error = VOP_GETATTR(dvp, &bva, 0, cs->cr, NULL); if (error) return (puterrno4(error)); /* mds_lookup may VN_RELE directory */ VN_HOLD(dvp); status = mds_do_lookup(nm, len, req, cs); kmem_free(nm, len); if (status != NFS4_OK) { VN_RELE(dvp); return (status); } /* * Get "after" change value, if it fails, simply return the * before value. */ ava.va_mask = AT_CTIME|AT_SEQ; if (VOP_GETATTR(dvp, &ava, 0, cs->cr, NULL)) { ava.va_ctime = bva.va_ctime; ava.va_seq = 0; } VN_RELE(dvp); /* * Validate the file is a file */ fva.va_mask = AT_TYPE|AT_MODE; error = VOP_GETATTR(cs->vp, &fva, 0, cs->cr, NULL); if (error) return (puterrno4(error)); if (fva.va_type != VREG) { if (fva.va_type == VDIR) return (NFS4ERR_ISDIR); if (fva.va_type == VLNK) return (NFS4ERR_SYMLINK); return (NFS4ERR_INVAL); } NFS4_SET_FATTR4_CHANGE(cinfo->before, bva.va_ctime); NFS4_SET_FATTR4_CHANGE(cinfo->after, ava.va_ctime); /* * It is undefined if VOP_LOOKUP will change va_seq, so * cinfo.atomic = TRUE only if we have * non-zero va_seq's, and they have not changed. */ if (bva.va_seq && ava.va_seq && ava.va_seq == bva.va_seq) cinfo->atomic = TRUE; else cinfo->atomic = FALSE; /* Check for mandatory locking */ cs->mandlock = MANDLOCK(cs->vp, fva.va_mode); return (check_open_access(access, cs, req)); } /*ARGSUSED*/ static void mds_do_openfh(struct compound_state *cs, struct svc_req *req, OPEN4args *args, rfs4_openowner_t *oo, OPEN4res *resp) { /* cs->vp and cs->fh have been updated by putfh. */ mds_do_open(cs, req, oo, do_41_deleg_hack(args->share_access), (args->share_access & 0xff), args->share_deny, resp, 0, NULL); } /*ARGSUSED*/ static void mds_do_opennull(struct compound_state *cs, struct svc_req *req, OPEN4args *args, rfs4_openowner_t *oo, OPEN4res *resp) { change_info4 *cinfo = &resp->cinfo; attrmap4 *attrset = &resp->attrset; mds_layout_t *plo = NULL; if (args->opentype == OPEN4_NOCREATE) resp->status = mds_lookupfile(&args->open_claim4_u.file, req, cs, (args->share_access & 0xff), cinfo); else { /* inhibit delegation grants during exclusive create */ if (args->mode == EXCLUSIVE4) rfs4_disable_delegation(cs->instp); /* * Create the file and get the layout. */ resp->status = mds_createfile(args, req, cs, cinfo, attrset, &plo); } if (resp->status == NFS4_OK) { /* cs->vp and cs->fh now references the desired file */ mds_do_open(cs, req, oo, do_41_deleg_hack(args->share_access), (args->share_access & 0xff), args->share_deny, resp, 0, plo); /* * If rfs4_createfile set attrset, we must * clear this attrset before the response is copied. */ if (resp->status != NFS4_OK) resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); } else *cs->statusp = resp->status; if (args->mode == EXCLUSIVE4) rfs4_enable_delegation(cs->instp); } /*ARGSUSED*/ static void mds_do_openprev(struct compound_state *cs, struct svc_req *req, OPEN4args *args, rfs4_openowner_t *oo, OPEN4res *resp) { change_info4 *cinfo = &resp->cinfo; vattr_t va; vtype_t v_type = cs->vp->v_type; int error = 0; caller_context_t ct; /* Verify that we have a regular file */ if (v_type != VREG) { if (v_type == VDIR) resp->status = NFS4ERR_ISDIR; else if (v_type == VLNK) resp->status = NFS4ERR_SYMLINK; else resp->status = NFS4ERR_INVAL; return; } ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; va.va_mask = AT_MODE|AT_UID; error = VOP_GETATTR(cs->vp, &va, 0, cs->cr, &ct); if (error) { resp->status = puterrno4(error); return; } cs->mandlock = MANDLOCK(cs->vp, va.va_mode); /* * Check if we have access to the file, Note the the file * could have originally been open UNCHECKED or GUARDED * with mode bits that will now fail, but there is nothing * we can really do about that except in the case that the * owner of the file is the one requesting the open. */ if (crgetuid(cs->cr) != va.va_uid) { resp->status = check_open_access((args->share_access & 0xff), cs, req); if (resp->status != NFS4_OK) { return; } } /* * cinfo on a CLAIM_PREVIOUS is undefined, initialize to zero */ cinfo->before = 0; cinfo->after = 0; cinfo->atomic = FALSE; mds_do_open(cs, req, oo, NFS4_DELEG4TYPE2REQTYPE(args->open_claim4_u.delegate_type), (args->share_access && 0xff), args->share_deny, resp, 0, NULL); } static void mds_do_opendelcur(struct compound_state *cs, struct svc_req *req, OPEN4args *args, rfs4_openowner_t *oo, OPEN4res *resp) { int error; nfsstat4 status; stateid4 stateid = args->open_claim4_u.delegate_cur_info.delegate_stateid; rfs4_deleg_state_t *dsp; /* * Find the state info from the stateid and confirm that the * file is delegated. If the state openowner is the same as * the supplied openowner we're done. If not, get the file * info from the found state info. Use that file info to * create the state for this lock owner. Note solaris doen't * really need the pathname to find the file. We may want to * lookup the pathname and make sure that the vp exist and * matches the vp in the file structure. However it is * possible that the pathname nolonger exists (local process * unlinks the file), so this may not be that useful. */ status = rfs4_get_deleg_state(cs, &stateid, &dsp); if (status != NFS4_OK) { resp->status = status; return; } ASSERT(dsp->rds_finfo->rf_dinfo->rd_dtype != OPEN_DELEGATE_NONE); /* * New lock owner, create state. Since this was probably called * in response to a CB_RECALL we set deleg to DELEG_NONE */ ASSERT(cs->vp != NULL); error = rfs4_cs_update_fh(cs, dsp->rds_finfo->rf_vp); if (error != 0) { rfs4_deleg_state_rele(dsp); *cs->statusp = resp->status = puterrno4(error); return; } /* Mark progress for delegation returns */ dsp->rds_finfo->rf_dinfo->rd_time_lastwrite = nfs_sys_uptime(); rfs4_deleg_state_rele(dsp); mds_do_open(cs, req, oo, DELEG_NONE, (args->share_access & 0xff), args->share_deny, resp, 1, NULL); } /*ARGSUSED*/ static void mds_do_opendelprev(struct compound_state *cs, struct svc_req *req, OPEN4args *args, rfs4_openowner_t *oo, OPEN4res *resp) { /* * Lookup the pathname, it must already exist since this file * was delegated. * * Find the file and state info for this vp and open owner pair. * check that they are in fact delegated. * check that the state access and deny modes are the same. * * Return the delgation possibly seting the recall flag. */ rfs4_file_t *fp; rfs4_state_t *sp; bool_t create = FALSE; bool_t dcreate = FALSE; rfs4_deleg_state_t *dsp; nfsace4 *ace; /* Note we ignore oflags */ resp->status = mds_lookupfile(&args->open_claim4_u.file_delegate_prev, req, cs, (args->share_access & 0xff), &resp->cinfo); if (resp->status != NFS4_OK) { return; } /* get the file struct and hold a lock on it during initial open */ fp = rfs4_findfile_withlock(cs->instp, cs->vp, NULL, &create); if (fp == NULL) { DTRACE_PROBE(nfss__e__no_file); resp->status = NFS4ERR_SERVERFAULT; return; } sp = rfs4_findstate_by_owner_file(cs, oo, fp, &create); if (sp == NULL) { DTRACE_PROBE(nfss__e__no_state); resp->status = NFS4ERR_SERVERFAULT; rfs4_file_rele_withunlock(fp); return; } rfs4_dbe_lock(sp->rs_dbe); rfs4_dbe_lock(fp->rf_dbe); if ((args->share_access & 0xff) != sp->rs_share_access || args->share_deny != sp->rs_share_deny || sp->rs_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_NONE) { DTRACE_PROBE2(nfss__e__state_mixup, rfs4_state_t *, sp, OPEN4args *, args); rfs4_dbe_unlock(fp->rf_dbe); rfs4_dbe_unlock(sp->rs_dbe); rfs4_file_rele(fp); rfs4_state_rele(sp); resp->status = NFS4ERR_SERVERFAULT; return; } rfs4_dbe_unlock(fp->rf_dbe); rfs4_dbe_unlock(sp->rs_dbe); dsp = rfs4_finddeleg(cs, sp, &dcreate); if (dsp == NULL) { DTRACE_PROBE(nfss__e__no_deleg); rfs4_state_rele(sp); rfs4_file_rele(fp); resp->status = NFS4ERR_SERVERFAULT; return; } next_stateid(&sp->rs_stateid); resp->stateid = sp->rs_stateid.stateid; resp->delegation.delegation_type = dsp->rds_dtype; if (dsp->rds_dtype == OPEN_DELEGATE_READ) { open_read_delegation4 *rv = &resp->delegation.open_delegation4_u.read; rv->stateid = dsp->rds_delegid.stateid; rv->recall = FALSE; /* no policy in place to set to TRUE */ ace = &rv->permissions; } else { open_write_delegation4 *rv = &resp->delegation.open_delegation4_u.write; rv->stateid = dsp->rds_delegid.stateid; rv->recall = FALSE; /* no policy in place to set to TRUE */ ace = &rv->permissions; rv->space_limit.limitby = NFS_LIMIT_SIZE; rv->space_limit.nfs_space_limit4_u.filesize = UINT64_MAX; } /* XXX For now */ ace->type = ACE4_ACCESS_ALLOWED_ACE_TYPE; ace->flag = 0; ace->access_mask = 0; ace->who.utf8string_len = 0; ace->who.utf8string_val = 0; rfs4_deleg_state_rele(dsp); rfs4_state_rele(sp); rfs4_file_rele(fp); } static void mds_op_open(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { OPEN4args *args = &argop->nfs_argop4_u.opopen; OPEN4res *resp = &resop->nfs_resop4_u.opopen; open_owner4 *owner = &args->owner; open_claim_type4 claim = args->claim; rfs4_client_t *cp; rfs4_openowner_t *oo; bool_t create; int can_reclaim; int share_access; DTRACE_NFSV4_2(op__open__start, struct compound_state *, cs, OPEN4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } cp = cs->cp; owner->clientid = cp->rc_clientid; can_reclaim = cp->rc_can_reclaim; retry: create = TRUE; oo = mds_findopenowner(cs->instp, owner, &create); if (oo == NULL) { /* XXX: this seems a little fishy... */ *cs->statusp = resp->status = NFS4ERR_STALE_CLIENTID; goto final; } /* Need to serialize access to the stateid space */ rfs4_sw_enter(&oo->ro_sw); /* Grace only applies to regular-type OPENs */ if (rfs4_clnt_in_grace(cp) && (claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR || claim == CLAIM_FH)) { *cs->statusp = resp->status = NFS4ERR_GRACE; goto out; } /* * If previous state at the server existed then can_reclaim * will be set. If not reply NFS4ERR_NO_GRACE to the * client. */ if (rfs4_clnt_in_grace(cp) && claim == CLAIM_PREVIOUS && !can_reclaim) { *cs->statusp = resp->status = NFS4ERR_NO_GRACE; goto out; } /* * Reject the open if the client has missed the grace period */ if (!rfs4_clnt_in_grace(cp) && claim == CLAIM_PREVIOUS) { *cs->statusp = resp->status = NFS4ERR_NO_GRACE; goto out; } /* * OPEN_CONFIRM is mandatory not to impl in 4.1. */ oo->ro_need_confirm = FALSE; resp->rflags |= OPEN4_RESULT_LOCKTYPE_POSIX; /* * If there is an unshared filesystem mounted on this vnode, * do not allow to open/create in this directory. */ if (vn_ismntpt(cs->vp)) { *cs->statusp = resp->status = NFS4ERR_ACCESS; goto out; } share_access = (args->share_access && 0xff); /* * access must READ, WRITE, or BOTH. No access is invalid. * deny can be READ, WRITE, BOTH, or NONE. * bits not defined for access/deny are invalid. */ if (! (share_access & OPEN4_SHARE_ACCESS_BOTH) || (share_access & ~OPEN4_SHARE_ACCESS_BOTH) || (args->share_deny & ~OPEN4_SHARE_DENY_BOTH)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto out; } /* * make sure attrset is zero before response is built. */ resp->attrset = NFS4_EMPTY_ATTRMAP(nfs4_attrvers(cs)); switch (claim) { case CLAIM_NULL: mds_do_opennull(cs, req, args, oo, resp); break; case CLAIM_PREVIOUS: mds_do_openprev(cs, req, args, oo, resp); break; case CLAIM_DELEGATE_CUR: mds_do_opendelcur(cs, req, args, oo, resp); break; case CLAIM_DELEGATE_PREV: mds_do_opendelprev(cs, req, args, oo, resp); break; case CLAIM_FH: mds_do_openfh(cs, req, args, oo, resp); break; default: resp->status = NFS4ERR_INVAL; break; } out: switch (resp->status) { case NFS4ERR_BADXDR: case NFS4ERR_BAD_SEQID: case NFS4ERR_BAD_STATEID: case NFS4ERR_NOFILEHANDLE: case NFS4ERR_RESOURCE: case NFS4ERR_STALE_CLIENTID: case NFS4ERR_STALE_STATEID: /* * The protocol states that if any of these errors are * being returned, the sequence id should not be * incremented. Any other return requires an * increment. */ break; } *cs->statusp = resp->status; rfs4_sw_exit(&oo->ro_sw); rfs4_openowner_rele(oo); final: DTRACE_NFSV4_2(op__open__done, struct compound_state *, cs, OPEN4res *, resp); } /*ARGSUSED*/ static void mds_free_reply(nfs_resop4 *resop) { /* Common function for NFSv4.0 and NFSv4.1 */ rfs4_free_reply(resop); } /*ARGSUSED*/ void mds_op_open_downgrade(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { OPEN_DOWNGRADE4args *args = &argop->nfs_argop4_u.opopen_downgrade; OPEN_DOWNGRADE4res *resp = &resop->nfs_resop4_u.opopen_downgrade; uint32_t access = (args->share_access & 0xff); uint32_t deny = args->share_deny; nfsstat4 status; rfs4_state_t *sp; rfs4_file_t *fp; int fflags = 0; int rc; DTRACE_NFSV4_2(op__open__downgrade__start, struct compound_state *, cs, OPEN_DOWNGRADE4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (mds_strict_seqid && args->seqid) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } status = rfs4_get_state(cs, &args->open_stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; goto final; } /* Ensure specified filehandle matches */ if (cs->vp != sp->rs_finfo->rf_vp) { rfs4_state_rele(sp); *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto final; } /* hold off other access to open_owner while we tinker */ rfs4_sw_enter(&sp->rs_owner->ro_sw); rc = mds_check_stateid_seqid(sp, &args->open_stateid); switch (rc) { case NFS4_CHECK_STATEID_OKAY: break; case NFS4_CHECK_STATEID_OLD: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_BAD: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_EXPIRED: *cs->statusp = resp->status = NFS4ERR_EXPIRED; goto end; case NFS4_CHECK_STATEID_CLOSED: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_UNCONFIRMED: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_REPLAY: ASSERT(0); break; default: ASSERT(FALSE); break; } rfs4_dbe_lock(sp->rs_dbe); /* * Check that the new access modes and deny modes are valid. * Check that no invalid bits are set. */ if ((access & ~(OPEN4_SHARE_ACCESS_READ | OPEN4_SHARE_ACCESS_WRITE)) || (deny & ~(OPEN4_SHARE_DENY_READ | OPEN4_SHARE_DENY_WRITE))) { *cs->statusp = resp->status = NFS4ERR_INVAL; rfs4_dbe_unlock(sp->rs_dbe); goto end; } /* * The new modes must be a subset of the current modes and * the access must specify at least one mode. To test that * the new mode is a subset of the current modes we bitwise * AND them together and check that the result equals the new * mode. For example: * New mode, access == R and current mode, sp->share_access == RW * access & sp->share_access == R == access, so the new access mode * is valid. Consider access == RW, sp->share_access = R * access & sp->share_access == R != access, so the new access mode * is invalid. */ if ((access & sp->rs_share_access) != access || (deny & sp->rs_share_deny) != deny || (access & (OPEN4_SHARE_ACCESS_READ | OPEN4_SHARE_ACCESS_WRITE)) == 0) { *cs->statusp = resp->status = NFS4ERR_INVAL; rfs4_dbe_unlock(sp->rs_dbe); goto end; } /* * Release any share locks associated with this stateID. * Strictly speaking, this violates the spec because the * spec effectively requires that open downgrade be atomic. * At present, fs_shrlock does not have this capability. */ rfs4_unshare(sp); fp = sp->rs_finfo; rfs4_dbe_lock(fp->rf_dbe); /* * If the current mode has deny read and the new mode * does not, decrement the number of deny read mode bits * and if it goes to zero turn off the deny read bit * on the file. */ if ((sp->rs_share_deny & OPEN4_SHARE_DENY_READ) && (deny & OPEN4_SHARE_DENY_READ) == 0) { fp->rf_deny_read--; if (fp->rf_deny_read == 0) fp->rf_share_deny &= ~OPEN4_SHARE_DENY_READ; } /* * If the current mode has deny write and the new mode * does not, decrement the number of deny write mode bits * and if it goes to zero turn off the deny write bit * on the file. */ if ((sp->rs_share_deny & OPEN4_SHARE_DENY_WRITE) && (deny & OPEN4_SHARE_DENY_WRITE) == 0) { fp->rf_deny_write--; if (fp->rf_deny_write == 0) fp->rf_share_deny &= ~OPEN4_SHARE_DENY_WRITE; } /* * If the current mode has access read and the new mode * does not, decrement the number of access read mode bits * and if it goes to zero turn off the access read bit * on the file. set fflags to FREAD for the call to * vn_open_downgrade(). */ if ((sp->rs_share_access & OPEN4_SHARE_ACCESS_READ) && (access & OPEN4_SHARE_ACCESS_READ) == 0) { fp->rf_access_read--; if (fp->rf_access_read == 0) fp->rf_share_access &= ~OPEN4_SHARE_ACCESS_READ; fflags |= FREAD; } /* * If the current mode has access write and the new mode * does not, decrement the number of access write mode bits * and if it goes to zero turn off the access write bit * on the file. set fflags to FWRITE for the call to * vn_open_downgrade(). */ if ((sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE) && (access & OPEN4_SHARE_ACCESS_WRITE) == 0) { fp->rf_access_write--; if (fp->rf_access_write == 0) fp->rf_share_deny &= ~OPEN4_SHARE_ACCESS_WRITE; fflags |= FWRITE; } /* Check that the file is still accessible */ ASSERT(fp->rf_share_access); rfs4_dbe_unlock(fp->rf_dbe); status = rfs4_share(sp, access, deny); rfs4_dbe_unlock(sp->rs_dbe); if (status != NFS4_OK) { *cs->statusp = resp->status = NFS4ERR_SERVERFAULT; goto end; } /* * we successfully downgraded the share lock, now we need to downgrade * the open. it is possible that the downgrade was only for a deny * mode and we have nothing else to do. */ if ((fflags & (FREAD|FWRITE)) != 0) vn_open_downgrade(cs->vp, fflags); rfs4_dbe_lock(sp->rs_dbe); /* Update the stateid */ next_stateid(&sp->rs_stateid); resp->open_stateid = sp->rs_stateid.stateid; rfs4_dbe_unlock(sp->rs_dbe); *cs->statusp = resp->status = NFS4_OK; /* Update the lease */ rfs4_update_lease(sp->rs_owner->ro_client); end: rfs4_sw_exit(&sp->rs_owner->ro_sw); rfs4_state_rele(sp); final: DTRACE_NFSV4_2(op__open__downgrade__done, struct compound_state *, cs, OPEN_DOWNGRADE4res *, resp); } /*ARGSUSED*/ void mds_op_close(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { /* XXX Currently not using req arg */ CLOSE4args *args = &argop->nfs_argop4_u.opclose; CLOSE4res *resp = &resop->nfs_resop4_u.opclose; rfs4_state_t *sp; nfsstat4 status; int rc; DTRACE_NFSV4_2(op__close__start, struct compound_state *, cs, CLOSE4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (mds_strict_seqid && args->seqid) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } status = rfs4_get_state(cs, &args->open_stateid, &sp, RFS4_DBS_INVALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; goto final; } /* Ensure specified filehandle matches */ if (cs->vp != sp->rs_finfo->rf_vp) { rfs4_state_rele(sp); *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto final; } /* hold off other access to open_owner while we tinker */ rfs4_sw_enter(&sp->rs_owner->ro_sw); rc = mds_check_stateid_seqid(sp, &args->open_stateid); switch (rc) { case NFS4_CHECK_STATEID_OKAY: break; case NFS4_CHECK_STATEID_OLD: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_BAD: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_EXPIRED: *cs->statusp = resp->status = NFS4ERR_EXPIRED; goto end; case NFS4_CHECK_STATEID_CLOSED: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_UNCONFIRMED: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_REPLAY: ASSERT(0); break; default: ASSERT(FALSE); break; } rfs4_dbe_lock(sp->rs_dbe); /* Update the stateid. */ next_stateid(&sp->rs_stateid); resp->open_stateid = sp->rs_stateid.stateid; rfs4_dbe_unlock(sp->rs_dbe); rfs4_update_lease(sp->rs_owner->ro_client); rfs4_state_close(sp, FALSE, FALSE, cs->cr); *cs->statusp = resp->status = status; end: rfs4_sw_exit(&sp->rs_owner->ro_sw); rfs4_state_rele(sp); final: DTRACE_NFSV4_2(op__close__done, struct compound_state *, cs, CLOSE4res *, resp); } /* * lock_denied: Fill in a LOCK4deneid structure given an flock64 structure. */ static nfsstat4 mds_lock_denied(nfs_server_instance_t *instp, LOCK4denied *dp, struct flock64 *flk) { rfs4_lockowner_t *lo; rfs4_client_t *cp; uint32_t len; lo = findlockowner_by_pid(instp, flk->l_pid); if (lo != NULL) { cp = lo->rl_client; if (rfs4_lease_expired(cp)) { rfs4_lockowner_rele(lo); rfs4_dbe_hold(cp->rc_dbe); rfs4_client_close(cp); return (NFS4ERR_EXPIRED); } dp->owner.clientid = lo->rl_owner.clientid; len = lo->rl_owner.owner_len; dp->owner.owner_val = kmem_alloc(len, KM_SLEEP); bcopy(lo->rl_owner.owner_val, dp->owner.owner_val, len); dp->owner.owner_len = len; rfs4_lockowner_rele(lo); goto finish; } /* * Its not a NFS4 lock. We take advantage that the upper 32 bits * of the client id contain the boot time for a NFS4 lock. So we * fabricate and identity by setting clientid to the sysid, and * the lock owner to the pid. */ dp->owner.clientid = flk->l_sysid; len = sizeof (pid_t); dp->owner.owner_len = len; dp->owner.owner_val = kmem_alloc(len, KM_SLEEP); bcopy(&flk->l_pid, dp->owner.owner_val, len); finish: dp->offset = flk->l_start; dp->length = flk->l_len; if (flk->l_type == F_RDLCK) dp->locktype = READ_LT; else if (flk->l_type == F_WRLCK) dp->locktype = WRITE_LT; else return (NFS4ERR_INVAL); /* no mapping from POSIX ltype to v4 */ return (NFS4_OK); } /*ARGSUSED*/ void mds_op_lock(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { /* XXX Currently not using req arg */ LOCK4args *args = &argop->nfs_argop4_u.oplock; LOCK4res *resp = &resop->nfs_resop4_u.oplock; nfsstat4 status; stateid4 *stateid; rfs4_lockowner_t *lo; rfs4_client_t *cp; rfs4_state_t *sp = NULL; rfs4_lo_state_t *lsp = NULL; bool_t ls_sw_held = FALSE; bool_t create = TRUE; bool_t lcreate = TRUE; int rc; DTRACE_NFSV4_2(op__lock__start, struct compound_state *, cs, LOCK4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (args->locker.new_lock_owner) { /* Create a new lockowner for this instance */ open_to_lock_owner4 *olo = &args->locker.locker4_u.open_owner; /* * validate that open_seqid, lock_seqid and the * clientid in lock_owner are all zero. */ if (mds_strict_seqid && (olo->open_seqid || olo->lock_seqid || olo->lock_owner.clientid)) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } /* * get/validate the open stateid */ stateid = &olo->open_stateid; status = rfs4_get_state(cs, stateid, &sp, RFS4_DBS_VALID); if (status != NFS4_OK) { *cs->statusp = resp->status = status; goto final; } /* Ensure specified filehandle matches */ if (cs->vp != sp->rs_finfo->rf_vp) { rfs4_state_rele(sp); *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto final; } /* hold off other access to open_owner while we tinker */ rfs4_sw_enter(&sp->rs_owner->ro_sw); rc = mds_check_stateid_seqid(sp, stateid); switch (rc) { case NFS4_CHECK_STATEID_OLD: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_BAD: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_EXPIRED: *cs->statusp = resp->status = NFS4ERR_EXPIRED; goto end; case NFS4_CHECK_STATEID_UNCONFIRMED: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_CLOSED: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_OKAY: case NFS4_CHECK_STATEID_REPLAY: break; } /* * Use the clientid4 from the session. * * XXX a quick hack is to plop the clientid4 from the * XXX compound state into the lock_owner structure, * XXX since the _hash and _compare functions use * XXX that field. */ olo->lock_owner.clientid = cs->cp->rc_clientid; lo = findlockowner(cs->instp, &olo->lock_owner, &lcreate); if (lo == NULL) { *cs->statusp = resp->status = NFS4ERR_RESOURCE; goto end; } lsp = mds_findlo_state_by_owner(lo, sp, &create); if (lsp == NULL) { rfs4_update_lease(sp->rs_owner->ro_client); *cs->statusp = resp->status = NFS4ERR_SERVERFAULT; rfs4_lockowner_rele(lo); goto end; } /* * This is the new_lock_owner branch and the client is * supposed to be associating a new lock_owner with * the open file at this point. If we find that a * lock_owner/state association already exists and a * successful LOCK request was returned to the client, * an error is returned to the client since this is * not appropriate. The client should be using the * existing lock_owner branch. */ if (create == FALSE) { if (lsp->rls_lock_completed == TRUE) { *cs->statusp = resp->status = NFS4ERR_BAD_SEQID; rfs4_lockowner_rele(lo); goto end; } } rfs4_update_lease(sp->rs_owner->ro_client); rfs4_dbe_lock(lsp->rls_dbe); /* hold off other access to lsp while we tinker */ rfs4_sw_enter(&lsp->rls_sw); ls_sw_held = TRUE; rfs4_dbe_unlock(lsp->rls_dbe); rfs4_lockowner_rele(lo); } else { /* * validate lock_seqid is zero. */ if (mds_strict_seqid && args->locker.locker4_u.lock_owner.lock_seqid) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } stateid = &args->locker.locker4_u.lock_owner.lock_stateid; /* get lsp and hold the lock on the underlying file struct */ if ((status = rfs4_get_lo_state(cs, stateid, &lsp, TRUE)) != NFS4_OK) { *cs->statusp = resp->status = status; goto final; } create = FALSE; /* We didn't create lsp */ /* Ensure specified filehandle matches */ if (cs->vp != lsp->rls_state->rs_finfo->rf_vp) { rfs4_lo_state_rele(lsp, TRUE); *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto final; } /* hold off other access to lsp while we tinker */ rfs4_sw_enter(&lsp->rls_sw); ls_sw_held = TRUE; switch (rfs4_check_lo_stateid_seqid(lsp, stateid, cs->flags)) { /* * The stateid looks like it was okay (expected to be * the next one) */ case NFS4_CHECK_STATEID_OKAY: /* * The sequence id is now checked. Determine * if this is a replay or if it is in the * expected (next) sequence. In the case of a * replay, there are two replay conditions * that may occur. The first is the normal * condition where a LOCK is done with a * NFS4_OK response and the stateid is * updated. That case is handled below when * the stateid is identified as a REPLAY. The * second is the case where an error is * returned, like NFS4ERR_DENIED, and the * sequence number is updated but the stateid * is not updated. This second case is dealt * with here. So it may seem odd that the * stateid is okay but the sequence id is a * replay but it is okay. */ /* XXX: rbg -- missing code ? :-) */ break; case NFS4_CHECK_STATEID_OLD: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_BAD: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_EXPIRED: *cs->statusp = resp->status = NFS4ERR_EXPIRED; goto end; case NFS4_CHECK_STATEID_CLOSED: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_REPLAY: ASSERT(0); break; default: ASSERT(FALSE); break; } rfs4_update_lease(lsp->rls_locker->rl_client); } /* * NFS4 only allows locking on regular files, so * verify type of object. */ if (cs->vp->v_type != VREG) { if (cs->vp->v_type == VDIR) status = NFS4ERR_ISDIR; else status = NFS4ERR_INVAL; goto out; } cp = lsp->rls_state->rs_owner->ro_client; if (rfs4_clnt_in_grace(cp) && !args->reclaim) { status = NFS4ERR_GRACE; goto out; } if (rfs4_clnt_in_grace(cp) && args->reclaim && !cp->rc_can_reclaim) { status = NFS4ERR_NO_GRACE; goto out; } if (!rfs4_clnt_in_grace(cp) && args->reclaim) { status = NFS4ERR_NO_GRACE; goto out; } if (lsp->rls_state->rs_finfo->rf_dinfo->rd_dtype == OPEN_DELEGATE_WRITE) cs->deleg = TRUE; status = rfs4_do_lock(lsp, args->locktype, args->locker.locker4_u.lock_owner.lock_seqid, args->offset, args->length, cs->cr, resop); out: *cs->statusp = resp->status = status; if (status == NFS4_OK) { resp->LOCK4res_u.lock_stateid = lsp->rls_lockid.stateid; lsp->rls_lock_completed = TRUE; } end: if (lsp) { if (ls_sw_held) rfs4_sw_exit(&lsp->rls_sw); /* * If an sp obtained, then the lsp does not represent * a lock on the file struct. */ if (sp != NULL) rfs4_lo_state_rele(lsp, FALSE); else rfs4_lo_state_rele(lsp, TRUE); } if (sp) { rfs4_sw_exit(&sp->rs_owner->ro_sw); rfs4_state_rele(sp); } final: DTRACE_NFSV4_2(op__lock__done, struct compound_state *, cs, LOCK4res *, resp); } /* free function for LOCK/LOCKT */ /*ARGSUSED*/ static void mds_lock_denied_free(nfs_resop4 *resop) { /* Common function for NFSv4.0 and NFSv4.1 */ lock_denied_free(resop); } /*ARGSUSED*/ void mds_op_locku(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { /* XXX Currently not using req arg */ LOCKU4args *args = &argop->nfs_argop4_u.oplocku; LOCKU4res *resp = &resop->nfs_resop4_u.oplocku; nfsstat4 status; stateid4 *stateid = &args->lock_stateid; rfs4_lo_state_t *lsp; DTRACE_NFSV4_2(op__locku__start, struct compound_state *, cs, LOCKU4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } if (mds_strict_seqid && args->seqid) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } if ((status = rfs4_get_lo_state(cs, stateid, &lsp, TRUE)) != NFS4_OK) { *cs->statusp = resp->status = status; goto final; } /* Ensure specified filehandle matches */ if (cs->vp != lsp->rls_state->rs_finfo->rf_vp) { rfs4_lo_state_rele(lsp, TRUE); *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto final; } /* hold off other access to lsp while we tinker */ rfs4_sw_enter(&lsp->rls_sw); switch (rfs4_check_lo_stateid_seqid(lsp, stateid, cs->flags)) { case NFS4_CHECK_STATEID_OKAY: break; case NFS4_CHECK_STATEID_OLD: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_BAD: *cs->statusp = resp->status = NFS4ERR_BAD_STATEID; goto end; case NFS4_CHECK_STATEID_EXPIRED: *cs->statusp = resp->status = NFS4ERR_EXPIRED; goto end; case NFS4_CHECK_STATEID_CLOSED: *cs->statusp = resp->status = NFS4ERR_OLD_STATEID; goto end; case NFS4_CHECK_STATEID_REPLAY: ASSERT(0); break; default: ASSERT(FALSE); break; } rfs4_update_lock_sequence(lsp); rfs4_update_lease(lsp->rls_locker->rl_client); /* * NFS4 only allows locking on regular files, so * verify type of object. */ if (cs->vp->v_type != VREG) { if (cs->vp->v_type == VDIR) status = NFS4ERR_ISDIR; else status = NFS4ERR_INVAL; goto out; } if (rfs4_clnt_in_grace(lsp->rls_state->rs_owner->ro_client)) { status = NFS4ERR_GRACE; goto out; } status = rfs4_do_lock(lsp, args->locktype, args->seqid, args->offset, args->length, cs->cr, resop); out: *cs->statusp = resp->status = status; if (status == NFS4_OK) resp->lock_stateid = lsp->rls_lockid.stateid; end: rfs4_sw_exit(&lsp->rls_sw); rfs4_lo_state_rele(lsp, TRUE); final: DTRACE_NFSV4_2(op__locku__done, struct compound_state *, cs, LOCKU4res *, resp); } /* * LOCKT is a best effort routine, the client can not be guaranteed that * the status return is still in effect by the time the reply is received. * They are numerous race conditions in this routine, but we are not required * and can not be accurate. */ /*ARGSUSED*/ void mds_op_lockt(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { LOCKT4args *args = &argop->nfs_argop4_u.oplockt; LOCKT4res *resp = &resop->nfs_resop4_u.oplockt; rfs4_lockowner_t *lo; bool_t create = FALSE; struct flock64 flk; int error; int flag = FREAD | FWRITE; int ltype; length4 posix_length; sysid_t sysid; pid_t pid; caller_context_t ct; DTRACE_NFSV4_2(op__lockt__start, struct compound_state *, cs, LOCKT4args *, args); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->status = NFS4ERR_NOFILEHANDLE; goto final; } args->owner.clientid = cs->cp->rc_clientid; ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; /* * NFS4 only allows locking on regular files, so * verify type of object. */ if (cs->vp->v_type != VREG) { if (cs->vp->v_type == VDIR) *cs->statusp = resp->status = NFS4ERR_ISDIR; else *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } resp->status = NFS4_OK; switch (args->locktype) { case READ_LT: case READW_LT: ltype = F_RDLCK; break; case WRITE_LT: case WRITEW_LT: ltype = F_WRLCK; break; } posix_length = args->length; /* Check for zero length. To lock to end of file use all ones for V4 */ if (posix_length == 0) { *cs->statusp = resp->status = NFS4ERR_INVAL; goto final; } else if (posix_length == (length4)(~0)) { posix_length = 0; /* Posix to end of file */ } /* Find or create a lockowner */ lo = findlockowner(cs->instp, &args->owner, &create); if (lo) { pid = lo->rl_pid; if ((resp->status = rfs4_client_sysid(lo->rl_client, &sysid)) != NFS4_OK) goto out; } else { pid = 0; sysid = cs->instp->lockt_sysid; } retry: flk.l_type = ltype; flk.l_whence = 0; /* SEEK_SET */ flk.l_start = args->offset; flk.l_len = posix_length; flk.l_sysid = sysid; flk.l_pid = pid; flag |= F_REMOTELOCK; /* Note that length4 is uint64_t but l_len and l_start are off64_t */ if (flk.l_len < 0 || flk.l_start < 0) { resp->status = NFS4ERR_INVAL; goto out; } error = VOP_FRLOCK(cs->vp, F_GETLK, &flk, flag, (u_offset_t)0, NULL, cs->cr, &ct); /* * N.B. We map error values to nfsv4 errors. This is differrent * than puterrno4 routine. */ switch (error) { case 0: if (flk.l_type == F_UNLCK) resp->status = NFS4_OK; else { if (mds_lock_denied(cs->instp, &resp->denied, &flk) == NFS4ERR_EXPIRED) goto retry; resp->status = NFS4ERR_DENIED; } break; case EOVERFLOW: resp->status = NFS4ERR_INVAL; break; case EINVAL: resp->status = NFS4ERR_NOTSUPP; break; default: cmn_err(CE_WARN, "rfs4_op_lockt: unexpected errno (%d)", error); resp->status = NFS4ERR_SERVERFAULT; break; } out: if (lo) rfs4_lockowner_rele(lo); *cs->statusp = resp->status; final: DTRACE_NFSV4_2(op__lockt__done, struct compound_state *, cs, LOCKT4res *, resp); } /* * NFSv4.1 Server Sessions */ /* Renew Lease */ void mds_refresh(mds_session_t *sp) { rfs4_client_t *cp; ASSERT(sp != NULL && sp->sn_clnt != NULL); rfs4_dbe_lock(sp->sn_dbe); cp = sp->sn_clnt; sp->sn_laccess = nfs_sys_uptime(); rfs4_dbe_unlock(sp->sn_dbe); rfs4_dbe_hold(cp->rc_dbe); rfs4_update_lease(cp); rfs4_client_rele(cp); } nfsstat4 mds_lease_chk(mds_session_t *sp) { rfs4_client_t *cp; nfsstat4 error = NFS4_OK; /* * If the client lease expired, go ahead and invalidate * all the sessions associated with this clientid. */ ASSERT(sp != NULL && sp->sn_clnt != NULL); cp = sp->sn_clnt; if (rfs4_lease_expired(cp)) { error = NFS4ERR_BADSESSION; } return (error); } /* * Rudimentary server implementation (XXX - for now) */ void mds_get_server_impl_id(EXCHANGE_ID4resok *resp) { timestruc_t currtime; char *sol_impl = "Solaris NFSv4.1 Server Implementation"; char *sol_idom = "nfsv41.ietf.org"; void *p; uint_t len = 0; nfs_impl_id4 *nip; resp->eir_server_impl_id.eir_server_impl_id_len = 1; nip = kmem_zalloc(sizeof (nfs_impl_id4), KM_SLEEP); resp->eir_server_impl_id.eir_server_impl_id_val = nip; /* Domain */ nip->nii_domain.utf8string_len = len = strlen(sol_idom); p = kmem_zalloc(len * sizeof (char), KM_SLEEP); nip->nii_domain.utf8string_val = p; bcopy(sol_idom, p, len); /* Implementation */ nip->nii_name.utf8string_len = len = strlen(sol_impl); p = kmem_zalloc(len * sizeof (char), KM_SLEEP); nip->nii_name.utf8string_val = p; bcopy(sol_impl, p, len); /* Time */ gethrestime(&currtime); (void) nfs4_time_vton(&currtime, &nip->nii_date); } /* * Principal handling routines */ void rfs4_free_cred_princ(rfs4_client_t *cp) { cred_princ_t *p; rpc_gss_principal_t ppl; ASSERT(cp != NULL); if ((p = cp->rc_cr_set) == NULL) return; switch (p->cp_aflavor) { case AUTH_DES: kmem_free(p->cp_princ, strlen(p->cp_princ) + 1); break; case RPCSEC_GSS: ppl = (rpc_gss_principal_t)p->cp_princ; kmem_free(ppl, ppl->len + sizeof (int)); break; } kmem_free(p, sizeof (cred_princ_t)); cp->rc_cr_set = NULL; } static rpc_gss_principal_t rfs4_dup_princ(rpc_gss_principal_t ppl) { rpc_gss_principal_t pdup; int len; if (ppl == NULL) return (NULL); len = sizeof (int) + ppl->len; pdup = (rpc_gss_principal_t)kmem_alloc(len, KM_SLEEP); bcopy(ppl, pdup, len); return (pdup); } void rfs4_set_cred_princ(cred_princ_t **pp, struct compound_state *cs) { cred_princ_t *p; caddr_t t; ASSERT(pp != NULL); if (*pp == NULL) *pp = kmem_zalloc(sizeof (cred_princ_t), KM_SLEEP); p = *pp; p->cp_cr = crdup(cs->basecr); p->cp_aflavor = cs->req->rq_cred.oa_flavor; p->cp_secmod = cs->nfsflavor; /* secmod != flavor for RPCSEC_GSS */ /* * Set principal as per security flavor */ switch (p->cp_aflavor) { case AUTH_DES: p->cp_princ = kstrdup(cs->principal); break; case RPCSEC_GSS: t = (caddr_t)rfs4_dup_princ((rpc_gss_principal_t)cs->principal); p->cp_princ = (caddr_t)t; break; case AUTH_SYS: case AUTH_NONE: default: break; } } /* returns 0 if no match; or 1 for a match */ int rfs4_cmp_cred_princ(cred_princ_t *p, struct compound_state *cs) { int rc = 0; rpc_gss_principal_t recp; /* cached clnt princ */ rpc_gss_principal_t ibrp; /* inbound req princ */ if (p == NULL) return (rc); /* nothing to compare with */ if (p->cp_aflavor != cs->req->rq_cred.oa_flavor) return (rc); if (p->cp_secmod != cs->nfsflavor) return (rc); if (crcmp(p->cp_cr, cs->basecr)) return (rc); switch (p->cp_aflavor) { case AUTH_DES: rc = (strcmp(p->cp_princ, cs->principal) == 0); break; case RPCSEC_GSS: recp = (rpc_gss_principal_t)p->cp_princ; ibrp = (rpc_gss_principal_t)cs->principal; if (recp->len != ibrp->len) break; rc = (bcmp(recp->name, ibrp->name, ibrp->len) == 0); break; case AUTH_SYS: case AUTH_NONE: default: rc = 1; break; } return (rc); } /* { co_ownerid, co_verifier, principal, clientid, confirmed } */ rfs4_client_t * client_record(nfs_client_id4 *cip, struct compound_state *cs) { rfs4_client_t *cp; bool_t create = TRUE; /* * 1. co_ownerid * 2. co_verifier */ cp = findclient(cs->instp, cip, &create, NULL); /* 3. principal */ if (cp != NULL) rfs4_set_cred_princ(&cp->rc_cr_set, cs); /* * Both of the following items of the 5-tuple are built as * part of creating the rfs4_client_t. * * 4. clientid; created as part of findclient() * 5. confirmed; cp->need_confirmed is initialized to TRUE */ return (cp); } rfs4_client_t * client_lookup(nfs_client_id4 *cip, struct compound_state *cs) { bool_t create = FALSE; return (findclient(cs->instp, cip, &create, NULL)); } bool_t nfs_clid4_cmp(nfs_client_id4 *s1, nfs_client_id4 *s2) { if (s1->verifier != s2->verifier) return (FALSE); if (bcmp(s1->id_val, s2->id_val, s2->id_len)) return (FALSE); return (TRUE); } /* * Compute the "use bits", i.e. the flags specifying the permissible * regular, MDS, and data server ops for the returned clientid. * * The minorversion1 specification allows a server implementor two * alternatives: allow PNFS_MDS and PNFS_DS on the same clientid, or * force the client to create separate clientids to distinguish * MDS versus DS operations. * * Our design distinguishes operations based upon filehandle, and thus * there is no reason to force the client to create separate clientids. * Thus, we give the client as much as possible, while keeping the result * within the allowed combinations as specified in the specification. * * Our constraints are: use a subset of the client's request, unless * the client requested nothing, in which case we may return any * legal combination; and, the combination of NON_PNFS and PNFS_MDS * may not both be set in the results. These constraints are reflected * in the ASSERT()s at the end. */ static uint32_t compute_use_pnfs_flags(uint32_t request) { uint32_t rc; /* Start with the client's initial request */ rc = request & EXCHGID4_FLAG_MASK_PNFS; /* If the client requested nothing, return the most permissive. */ if (rc == 0) { rc = (EXCHGID4_FLAG_USE_PNFS_MDS | EXCHGID4_FLAG_USE_PNFS_DS); goto done; } /* Don't permit the illegal combination of MDS and NON_PNFS */ if ((rc & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS)) == (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS)) rc &= ~EXCHGID4_FLAG_USE_NON_PNFS; done: ASSERT(((request & EXCHGID4_FLAG_MASK_PNFS) == 0) || ((rc & ~(request & EXCHGID4_FLAG_MASK_PNFS)) == 0)); ASSERT((rc & (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS)) != (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS)); ASSERT(rc != 0); return (rc); } /* * Session Trunking Support */ static struct netbuf * netbuf_dup(struct netbuf *obp) { struct netbuf *np = NULL; np = (struct netbuf *)kmem_zalloc(sizeof (struct netbuf), KM_SLEEP); np->maxlen = np->len = obp->len; np->buf = (char *)kmem_zalloc(obp->len, KM_SLEEP); bcopy(obp->buf, np->buf, obp->len); return (np); } static void netbuf_destroy(struct netbuf *np) { kmem_free((char *)np->buf, np->len); kmem_free((struct netbuf *)np, sizeof (struct netbuf)); } static t_scalar_t svc_get_type(SVCXPRT *xprt) { t_scalar_t xtype; xtype = svc_gettype(xprt); switch (xtype) { case T_RDMA: break; case T_COTS: case T_COTS_ORD: xtype = T_COTS_ORD; break; case T_CLTS: default: cmn_err(CE_WARN, "svc_get_type: Bad service type %d\n", xtype); xtype = 0; } return (xtype); } static rfs41_tie_t * rfs41_tie_init(SVCXPRT *xprt) { rfs41_tie_t *tip = NULL; struct sockaddr *sa; struct sockaddr_in *sa4; struct sockaddr_in6 *sa6; tip = kmem_zalloc(sizeof (rfs41_tie_t), KM_SLEEP); sa = (struct sockaddr *)svc_getendpoint(xprt); tip->t_famly = sa->sa_family; tip->t_xtype = svc_get_type(xprt); tip->t_netbf = netbuf_dup(svc_getlocaladdr(xprt)); switch (tip->t_famly) { case AF_INET: sa4 = (struct sockaddr_in *)(tip->t_netbf->buf); bcopy(&sa4->sin_addr, &tip->t_ipaddr_u.ip4, sizeof (struct in_addr)); break; case AF_INET6: sa6 = (struct sockaddr_in6 *)(tip->t_netbf->buf); bcopy(&sa6->sin6_addr, &tip->t_ipaddr_u.ip6, sizeof (struct in6_addr)); break; default: cmn_err(CE_WARN, "rfs41_tie_init: Bad family (%d)\n", tip->t_famly); netbuf_destroy(tip->t_netbf); kmem_free(tip, sizeof (rfs41_tie_t)); tip = NULL; break; } return (tip); } static void rfs41_exid_so_major(struct server_owner4 *sop, struct compound_state *cs) { int len = sizeof (void *) / sizeof (char); sop->so_major_id.so_major_id_len = (len * 2) + 1; sop->so_major_id.so_major_id_val = tohex(cs->instp, len); } /* * XXX - rfs4_srv_trunk_test is disabled by default; enabling it will * cause ip_dump() to spew addresses of inbound EXCHANGE_ID's * to the console. rfs4_srv_trunk_test and ip_dump() will go * away after client trunking is done. This is handy info to * have for debugging. */ int rfs4_srv_trunk_test = 0; static void ip_dump(struct netbuf *np, char *msg) { struct sockaddr_in *sa4; struct sockaddr_in6 *sa6; if (np == NULL || np->buf == NULL || !rfs4_srv_trunk_test) return; sa4 = (struct sockaddr_in *)(np->buf); switch (sa4->sin_family) { case AF_INET: cmn_err(CE_WARN, "\n%s ip: %d.%d.%d.%d", msg, sa4->sin_addr.S_un.S_un_b.s_b1, sa4->sin_addr.S_un.S_un_b.s_b2, sa4->sin_addr.S_un.S_un_b.s_b3, sa4->sin_addr.S_un.S_un_b.s_b4); break; case AF_INET6: sa6 = (struct sockaddr_in6 *)(np->buf); cmn_err(CE_WARN, "\n%s ip6: " "%2x%2x:%0x%0x:%0x%0x:%0x%0x:%2x%2x:%2x%2x:%2x%2x:%2x%2x", msg, sa6->sin6_addr._S6_un._S6_u8[0], sa6->sin6_addr._S6_un._S6_u8[1], sa6->sin6_addr._S6_un._S6_u8[2], sa6->sin6_addr._S6_un._S6_u8[3], sa6->sin6_addr._S6_un._S6_u8[4], sa6->sin6_addr._S6_un._S6_u8[5], sa6->sin6_addr._S6_un._S6_u8[6], sa6->sin6_addr._S6_un._S6_u8[7], sa6->sin6_addr._S6_un._S6_u8[8], sa6->sin6_addr._S6_un._S6_u8[9], sa6->sin6_addr._S6_un._S6_u8[10], sa6->sin6_addr._S6_un._S6_u8[11], sa6->sin6_addr._S6_un._S6_u8[12], sa6->sin6_addr._S6_un._S6_u8[13], sa6->sin6_addr._S6_un._S6_u8[14], sa6->sin6_addr._S6_un._S6_u8[15]); break; default: cmn_err(CE_WARN, "%s <cannot translate ip>", msg); break; } } static int ip_addr_cmp(rfs41_tie_t *tip, rfs41_tie_t *p) { int match = 0; ASSERT(tip != NULL); ASSERT(p != NULL); if (tip->t_famly != p->t_famly) return (0); if (tip->t_famly == AF_INET) { if (bcmp(&tip->t_ipaddr_u.ip4, &p->t_ipaddr_u.ip4, sizeof (struct in_addr)) == 0) { match = 1; /* IPv4 addr match */ } } else if (tip->t_famly == AF_INET6) { if (bcmp(&tip->t_ipaddr_u.ip6, &p->t_ipaddr_u.ip6, sizeof (struct in6_addr)) == 0) { match = 1; /* IPv6 addr match */ } } return (match); } static void rfs41_set_trunkinfo(SVCXPRT *xprt, struct compound_state *cs, rfs4_client_t *cp, EXCHANGE_ID4resok *rok) { rfs41_tie_t *tip; rfs41_tie_t *p; ASSERT(cs != NULL && cp != NULL && rok != NULL); if (cs == NULL || cp == NULL || rok == NULL) return; /* * start out w/some sane defaults * XXX - scope needs to be revisited. */ rok->eir_clientid = cp->rc_clientid; rfs41_exid_so_major(&rok->eir_server_owner, cs); ip_dump(svc_getlocaladdr(xprt), "inbound"); /* build trunkinfo entry */ if (xprt == NULL || (tip = rfs41_tie_init(xprt)) == NULL) return; /* fastpath for 1st exid */ rfs4_dbe_lock(cp->rc_dbe); if (list_is_empty(&cp->rc_trunkinfo)) { list_insert_head(&cp->rc_trunkinfo, tip); ip_dump(tip->t_netbf, "first-in-list"); rok->eir_server_owner.so_minor_id = (uint64_t)(uintptr_t)&cp->rc_trunkinfo; rfs4_dbe_unlock(cp->rc_dbe); return; } /* run thru trunkinfo list to see if IP has been seen */ for (p = list_head(&cp->rc_trunkinfo); p != NULL; p = list_next(&cp->rc_trunkinfo, p)) { /* * Is the IP already in list ? */ if (ip_addr_cmp(tip, p)) { ip_dump(p->t_netbf, "already-in-list"); rok->eir_server_owner.so_minor_id = (uint64_t)(uintptr_t)&cp->rc_trunkinfo; rfs4_dbe_unlock(cp->rc_dbe); return; } } /* IP hasn't been seen; rerun list to see if equivalent exists */ for (p = list_head(&cp->rc_trunkinfo); p != NULL; p = list_next(&cp->rc_trunkinfo, p)) { /* * Do we have an equivalent (ie. transport) entry */ if (p->t_xtype == tip->t_xtype) { list_insert_head(&cp->rc_trunkinfo, tip); ip_dump(tip->t_netbf, "Equiv FOUND: inserted-in-list"); rok->eir_server_owner.so_minor_id = (uint64_t)(uintptr_t)&cp->rc_trunkinfo; rfs4_dbe_unlock(cp->rc_dbe); return; } } /* nothing in list has same IP addr or is equivalent to tip */ list_insert_head(&cp->rc_trunkinfo, tip); ip_dump(tip->t_netbf, "No IP or Equiv FOUND: inserted-in-list"); rfs4_dbe_unlock(cp->rc_dbe); rok->eir_server_owner.so_minor_id = (uint64_t)(uintptr_t)&tip; } void mds_clean_up_trunkinfo(rfs4_client_t *cp) { rfs41_tie_t *p; ASSERT(cp != NULL); if (cp == NULL) return; rfs4_dbe_lock(cp->rc_dbe); while (p = list_remove_head(&cp->rc_trunkinfo)) { netbuf_destroy(p->t_netbf); kmem_free(p, sizeof (rfs41_tie_t)); } list_destroy(&cp->rc_trunkinfo); rfs4_dbe_unlock(cp->rc_dbe); } /*ARGSUSED*/ static void mds_op_exid_free(nfs_resop4 *resop) { EXCHANGE_ID4res *resp = &resop->nfs_resop4_u.opexchange_id; EXCHANGE_ID4resok *rok = &resp->EXCHANGE_ID4res_u.eir_resok4; struct server_owner4 *sop = &rok->eir_server_owner; nfs_impl_id4 *nip; int len = 0; /* Server Owner: major */ if ((len = sop->so_major_id.so_major_id_len) != 0) kmem_free(sop->so_major_id.so_major_id_val, len); if ((nip = rok->eir_server_impl_id.eir_server_impl_id_val) != NULL) { /* Immplementation */ len = nip->nii_name.utf8string_len; kmem_free(nip->nii_name.utf8string_val, len * sizeof (char)); /* Domain */ len = nip->nii_domain.utf8string_len; kmem_free(nip->nii_domain.utf8string_val, len * sizeof (char)); /* Server Impl */ kmem_free(nip, sizeof (nfs_impl_id4)); } } /* XXX - NOTE: EXCHANGE_ID conforms to draft-19 behavior */ /*ARGSUSED*/ void mds_op_exchange_id(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { EXCHANGE_ID4args *args = &argop->nfs_argop4_u.opexchange_id; EXCHANGE_ID4res *resp = &resop->nfs_resop4_u.opexchange_id; EXCHANGE_ID4resok *rok = &resp->EXCHANGE_ID4res_u.eir_resok4; rfs4_client_t *cp; rfs4_client_t *ocp; bool_t update; client_owner4 *cop; nfs_client_id4 *cip; verifier4 old_verifier_arg; DTRACE_NFSV4_2(op__exchange__id__start, struct compound_state *, cs, EXCHANGE_ID4args *, args); /* * EXCHANGE_ID's may be preceded by SEQUENCE * * Check that eia_flags only has "valid" spec bits * and that no 'eir_flag' ONLY bits are specified. */ if (args->eia_flags & ~EXID4_FLAG_MASK || args->eia_flags & EXID4_FLAG_INVALID_ARGS) { *cs->statusp = resp->eir_status = NFS4ERR_INVAL; goto final; } update = (args->eia_flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A); cop = &args->eia_clientowner; cip = (nfs_client_id4 *)cop; cip->cl_addr = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf; /* * Refer to Section 18.35.4 of draft 19 */ cp = client_lookup(cip, cs); if (cp == NULL) { /* no record exists */ if (!update) { case1: /* case 1 - utok */ cp = client_record(cip, cs); ASSERT(cp != NULL); *cs->statusp = resp->eir_status = NFS4_OK; rok->eir_clientid = cp->rc_clientid; rok->eir_sequenceid = cp->rc_contrived.xi_sid; goto out; } /* no record and trying to update */ *cs->statusp = resp->eir_status = NFS4ERR_NOENT; goto final; } /* record exists */ old_verifier_arg = cp->rc_nfs_client.verifier; if (CLID_REC_CONFIRMED(cp)) { if (!update) { if (old_verifier_arg != cip->verifier) { /* case 5 - utok */ /* * previous incarnation of clientid is first * hidden such that any subsequent lookups * will not find it in DB, then the current * reference to it is dropped; this will * force the reaper thread to clean it up. */ ocp = cp; mds_clean_up_sessions(ocp); rfs4_dbe_hide(ocp->rc_dbe); rfs4_client_rele(ocp); cp = client_record(cip, cs); ASSERT(cp != NULL); *cs->statusp = resp->eir_status = NFS4_OK; rok->eir_clientid = cp->rc_clientid; rok->eir_sequenceid = cp->rc_contrived.xi_sid; /* trickle down to "out" */ } else if (!rfs4_cmp_cred_princ(cp->rc_cr_set, cs)) { /* case 3 */ if (rfs4_lease_expired(cp)) { rfs4_client_close(cp); goto case1; } /* * case 3: clid_in_use - utok * old_client_ret has unexpired lease w/state. */ *cs->statusp = NFS4ERR_CLID_INUSE; resp->eir_status = NFS4ERR_CLID_INUSE; rfs4_client_rele(cp); goto final; } else if (nfs_clid4_cmp(&cp->rc_nfs_client, cip)) { /* case 2 - utok */ *cs->statusp = NFS4_OK; resp->eir_status = NFS4_OK; rok->eir_clientid = cp->rc_clientid; rok->eir_sequenceid = cp->rc_contrived.xi_sid; /* trickle down to "out" */ } else { /* something is really wacky in srv state */ *cs->statusp = resp->eir_status = NFS4ERR_SERVERFAULT; rfs4_client_rele(cp); goto final; } } else { /* UPDATE */ if (old_verifier_arg != cip->verifier) { /* 18.35.4 case 8 */ *cs->statusp = resp->eir_status = NFS4ERR_NOT_SAME; rfs4_client_rele(cp); goto final; } if (!rfs4_cmp_cred_princ(cp->rc_cr_set, cs)) { /* 18.35.4 case 9 */ *cs->statusp = resp->eir_status = NFS4ERR_PERM; rfs4_client_rele(cp); goto final; } /* case 6 - utok */ *cs->statusp = resp->eir_status = NFS4_OK; rok->eir_clientid = cp->rc_clientid; rok->eir_sequenceid = cp->rc_contrived.xi_sid; /* trickle down to "out" */ } } else { /* UNCONFIRMED */ if (!update) { /* case 4 - utok */ rfs4_client_close(cp); goto case1; } else { /* case 7 - utok */ *cs->statusp = resp->eir_status = NFS4ERR_NOENT; rfs4_client_rele(cp); goto final; } } out: rok->eir_flags = 0; if (resp->eir_status == NFS4_OK && CLID_REC_CONFIRMED(cp)) rok->eir_flags |= EXCHGID4_FLAG_CONFIRMED_R; /* * State Protection Mojo */ cp->rc_state_prot.sp_type = args->eia_state_protect.spa_how; switch (cp->rc_state_prot.sp_type) { case SP4_NONE: break; case SP4_MACH_CRED: /* XXX - Some of Karen's secret sauce here... */ break; case SP4_SSV: /* XXX - ... and here */ if (args->ssv_args.ssp_ops.spo_must_allow & OP_EXCHANGE_ID) /* * if the client ID was created specifying SP4_SSV * state protection and EXCHANGE_ID as the one of * the operations in spo_must_allow, then server MUST * authorize EXCHANGE_IDs with the SSV principal in * addition to the principal that created the client * ID. */ /* EMPTY */; break; } /* * XXX - Still need to clarify if the NFSv4.1 * server will be supporting referrals. */ if (args->eia_flags & EXCHGID4_FLAG_SUPP_MOVED_REFER) /* EMPTY */; /* * Migration/Replication not (yet) supported */ if (args->eia_flags & EXCHGID4_FLAG_SUPP_MOVED_MIGR) rok->eir_flags &= ~EXCHGID4_FLAG_SUPP_MOVED_MIGR; /* * Add the appropriate "use_pnfs" flags. */ rok->eir_flags |= compute_use_pnfs_flags(args->eia_flags); /* force no state protection for now */ rok->eir_state_protect.spr_how = SP4_NONE; /* Implementation specific mojo */ if (args->eia_client_impl_id.eia_client_impl_id_len != 0) /* EMPTY */; /* XXX - jw - best guess */ rfs4_ss_clid(cs, cp, req); /* Server's implementation */ mds_get_server_impl_id(rok); /* compute trunking capabilities */ bzero(&rok->eir_server_scope, sizeof (rok->eir_server_scope)); bzero(&rok->eir_server_owner, sizeof (server_owner4)); rfs41_set_trunkinfo(req->rq_xprt, cs, cp, rok); /* * XXX - jw - best guess * Check to see if client can perform reclaims */ rfs4_ss_chkclid(cs, cp); rfs4_client_rele(cp); final: DTRACE_NFSV4_2(op__exchange__id__done, struct compound_state *, cs, EXCHANGE_ID4res *, resp); } /*ARGSUSED*/ void mds_op_create_session(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { CREATE_SESSION4args *args = &argop->nfs_argop4_u.opcreate_session; CREATE_SESSION4res *resp = &resop->nfs_resop4_u.opcreate_session; CREATE_SESSION4resok *rok = &resp->CREATE_SESSION4res_u.csr_resok4; CREATE_SESSION4resok *crp; rfs4_client_t *cp; mds_session_t *sp; session41_create_t sca; sequenceid4 stseq; sequenceid4 agseq; DTRACE_NFSV4_2(op__create__session__start, struct compound_state *, cs, CREATE_SESSION4args *, args); /* * A CREATE_SESSION request can be prefixed by OP_SEQUENCE. * In this case, the newly created session has no relation * to the sessid used for the OP_SEQUENCE. */ /* * Find the clientid */ cp = findclient_by_id(cs->instp, args->csa_clientid); if (cp == NULL) { *cs->statusp = resp->csr_status = NFS4ERR_STALE_CLIENTID; goto final; } /* * Make sure the lease is still valid. */ if (rfs4_lease_expired(cp)) { rfs4_client_close(cp); *cs->statusp = resp->csr_status = NFS4ERR_STALE_CLIENTID; goto final; } /* * Sequenceid processing (handling replay's, etc) */ agseq = args->csa_sequence; stseq = cp->rc_contrived.cs_slot.seqid; if (stseq == agseq) { /* * If the same sequenceid, then must be a replay of a * previous CREATE_SESSION; return the cached result. */ crp = (CREATE_SESSION4resok *)&cp->rc_contrived.cs_res; *cs->statusp = resp->csr_status = cp->rc_contrived.cs_slot.status; rok->csr_sequence = cp->rc_contrived.cs_slot.seqid; bcopy(crp->csr_sessionid, rok->csr_sessionid, sizeof (sessionid4)); rok->csr_flags = crp->csr_flags; rok->csr_fore_chan_attrs = crp->csr_fore_chan_attrs; rok->csr_back_chan_attrs = crp->csr_back_chan_attrs; rfs4_update_lease(cp); rfs4_client_rele(cp); goto final; } if (stseq + 1 == agseq) { /* Valid sequencing */ cp->rc_contrived.cs_slot.seqid = args->csa_sequence; } else { /* * No way to differentiate MISORD_NEWREQ vs. MISORD_REPLAY, * so anything else, we simply treat as SEQ_MISORDERED. */ *cs->statusp = resp->csr_status = NFS4ERR_SEQ_MISORDERED; rfs4_client_rele(cp); goto final; } /* * Clientid confirmation */ if (cp->rc_need_confirm && cp->rc_clientid == args->csa_clientid) { if (rfs4_cmp_cred_princ(cp->rc_cr_set, cs)) { cp->rc_need_confirm = FALSE; } else { *cs->statusp = resp->csr_status = NFS4ERR_CLID_INUSE; rfs4_client_rele(cp); goto final; } } /* * Session creation */ sca.cs_error = 0; sca.cs_req = req; sca.cs_client = cp; sca.cs_aotw = *args; sp = mds_createsession(cs->instp, &sca); if (sca.cs_error) { *cs->statusp = resp->csr_status = sca.cs_error; rfs4_client_rele(cp); if (sp != NULL) rfs41_session_rele(sp); goto final; } if (sp == NULL) { *cs->statusp = resp->csr_status = NFS4ERR_SERVERFAULT; rfs4_client_rele(cp); goto final; } /* * Need to store the result in the rfs4_client_t's contrived * result slot and then respond from there. This way, when the * csa_sequence == contrived.cc_sid, we can return the latest * cached result. (see replay: above) */ crp = (CREATE_SESSION4resok *)&cp->rc_contrived.cs_res; *cs->statusp = resp->csr_status = cp->rc_contrived.cs_slot.status = NFS4_OK; rok->csr_sequence = cp->rc_contrived.xi_sid; bcopy(sp->sn_sessid, rok->csr_sessionid, sizeof (sessionid4)); bcopy(sp->sn_sessid, crp->csr_sessionid, sizeof (sessionid4)); rok->csr_flags = crp->csr_flags = sp->sn_csflags; /* * XXX: struct assignment of channel4_attrs is broken because * ca_rdma_ird is specified as a single element array. A struct * assignment will copy the ca_rdma_ird array ptr to multiple args/ * res structs, and the ptr will be free'd multiple times. * Without RDMA, ca_rdma_ird is a zero element array so its ptr * is NULL (which is why this doesn't cause problems right now). * * Struct assignment is convenient, and it would be best to enable * it by creating an in-kernel channel4_attrs struct which didn't * contain the single element array, but just contained the inbound * receive queue depth. Let the XDR encode/decode funcs convert * from the in-kernel form to the OTW form. */ rok->csr_fore_chan_attrs = crp->csr_fore_chan_attrs = sp->sn_fore->cn_attrs; rok->csr_back_chan_attrs = crp->csr_back_chan_attrs = args->csa_back_chan_attrs; rfs4_update_lease(cp); /* * References from the session to the client are * accounted for while session is being created. */ rfs4_client_rele(cp); rfs41_session_rele(sp); final: DTRACE_NFSV4_2(op__create__session__done, struct compound_state *, cs, CREATE_SESSION4res *, resp); } /*ARGSUSED*/ void mds_op_destroy_session(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { DESTROY_SESSION4args *args = &argop->nfs_argop4_u.opdestroy_session; DESTROY_SESSION4res *resp = &resop->nfs_resop4_u.opdestroy_session; mds_session_t *sp; rfs4_client_t *cp; int vsc = 0; DTRACE_NFSV4_2(op__destroy__session__start, struct compound_state *, cs, DESTROY_SESSION4args *, args); /* * As noted in section 18.37.3 of draft-29, the DESTROY_SESSION * MAY be the only op in the compound... */ if (cs->sp != NULL) { /* we must be in a compound with a sequence */ if (bcmp(args->dsa_sessionid, cs->sp->sn_sessid, sizeof (sessionid4)) == 0) { /* * ... if the compound is SEQUENCE'd _AND_ the * sessid's of the SEQUENCE and DESTROY_SESSION * ops match, then the DESTROY_SESSION op MUST * be the last op in the compound. */ if ((cs->op_len - 1) != cs->op_ndx) { /* * XXX - What's the right error to * return if DESTROY_SESSION is NOT * the last op in compound ??? Using * UNSAFE_COMPOUND for now. */ *cs->statusp = resp->dsr_status = NFS4ERR_UNSAFE_COMPOUND; goto final; } /* utok */ } else { /* * if we're here, it's because the compound is * SEQUENCE'd and the session being destroyed is * NOT the same session being used for SEQUENCE. * Hi/low watermarks accounted in seq_chk_limits. */ DTRACE_PROBE(nfss41__i__destroy_encap_session); /* * XXX - Remember that if the sessid in SEQUENCE * differs from DESTROY_SESSION, we'll want * to verify that both sessions are sharing * the connection. */ vsc = 1; } } /* * Find session and check for clientid and lease expiration */ if ((sp = mds_findsession_by_id(cs->instp, args->dsa_sessionid)) == NULL) { *cs->statusp = resp->dsr_status = NFS4ERR_BADSESSION; goto final; } /* * Verify that "this" connection is associated * w/the session being targeted for destruction. */ if (vsc) { /* * XXX - Still need to Code. placeholder * for verification of shared conn */ DTRACE_PROBE(nfss41__i__destroy_session_conn_verify); } /* * Once we can trace back to the rfs4_client struct, verify the * cred that was used to create the session matches and is in * concordance w/the state protection type used. */ if ((cp = sp->sn_clnt) != NULL) { switch (cp->rc_state_prot.sp_type) { case SP4_MACH_CRED: cmn_err(CE_NOTE, "op_destroy_session: SP4_MACH_CRED"); if (!rfs4_cmp_cred_princ(cp->rc_cr_set, cs)) { *cs->statusp = resp->dsr_status = NFS4ERR_PERM; rfs41_session_rele(sp); goto final; } break; case SP4_SSV: /* * XXX - Need some of Karen's secret ssv sauce * here. For now, just allow the destroy. */ cmn_err(CE_NOTE, "op_destroy_session: SP4_SSV"); break; case SP4_NONE: cmn_err(CE_NOTE, "op_destroy_session: SP4_NONE"); break; default: break; } } /* session rele taken care of in mds_destroysession */ *cs->statusp = resp->dsr_status = mds_destroysession(sp); final: DTRACE_NFSV4_2(op__destroy__session__done, struct compound_state *, cs, DESTROY_SESSION4res *, resp); } /*ARGSUSED*/ void mds_op_backchannel_ctl(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { } /* * The thread will traverse the entire list pinging the connections * that need it and refreshing any stale/dead connections. */ static void ping_cb_null_thr(mds_session_t *sp) { CLIENT *ch = NULL; struct timeval tv; enum clnt_stat cs; int conn_num, attempts = 5; tv.tv_sec = 30; tv.tv_usec = 0; if ((ch = rfs41_cb_getch(sp)) == NULL) goto out; /* * Flag to let RPC know these are ping calls. RPC will only use * untested connections. */ CLNT_CONTROL(ch, CLSET_CB_TEST, (void *)NULL); /* * If another thread is working on the pings then * just exit. */ rfs4_dbe_lock(sp->sn_dbe); if (sp->sn_bc.pnginprog != 0) { rfs4_dbe_unlock(sp->sn_dbe); goto out; } sp->sn_bc.pnginprog = 1; rfs4_dbe_unlock(sp->sn_dbe); /* * Get the number of untested conections */ if (!CLNT_CONTROL(ch, CLGET_CB_UNTESTED, (void *)&conn_num)) goto out; /* * If number of untested connections is zero, either * - another thread's already tested it * - a previously tested connection is being reused * So no further testing is required */ if (conn_num == 0) { rfs4_dbe_lock(sp->sn_dbe); sp->sn_bc.paths++; if (sp->sn_bc.pngcnt) sp->sn_bc.pngcnt--; rfs4_dbe_unlock(sp->sn_dbe); goto out; } call_again: while (conn_num-- > 0) { /* * With CB_TEST flag set, RPC iterates over untested * connections for each of these CLNT_CALL() */ cs = CLNT_CALL(ch, CB_NULL, xdr_void, NULL, xdr_void, NULL, tv); if (cs == RPC_SUCCESS) { rfs4_dbe_lock(sp->sn_dbe); sp->sn_bc.paths++; sp->sn_bc.pngcnt--; rfs4_dbe_unlock(sp->sn_dbe); } } rfs4_dbe_lock(sp->sn_dbe); if (sp->sn_bc.paths == 0) sp->sn_bc.failed = 1; rfs4_dbe_unlock(sp->sn_dbe); if (!CLNT_CONTROL(ch, CLGET_CB_UNTESTED, (void *)&conn_num)) goto out; if (conn_num != 0) { /* * Pause inbetween attempts and * only try 5 times. */ attempts--; if (attempts > 0) { delay(2 * drv_usectohz(1000000)); goto call_again; } DTRACE_PROBE(nfss41__i__cb_null_failed_attempts); } out: rfs4_dbe_lock(sp->sn_dbe); sp->sn_bc.pnginprog = 0; rfs4_dbe_unlock(sp->sn_dbe); if (ch != NULL) { CLNT_CONTROL(ch, CLSET_CB_TEST_CLEAR, (void *)NULL); rfs41_cb_freech(sp, ch); } rfs41_session_rele(sp); thread_exit(); } /* * Process the SEQUENCE operation. The session pointer has already been * cached in the compound state, so we just dereference */ /*ARGSUSED*/ void mds_op_sequence(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { SEQUENCE4args *args = &argop->nfs_argop4_u.opsequence; SEQUENCE4res *resp = &resop->nfs_resop4_u.opsequence; SEQUENCE4resok *rok = &resp->SEQUENCE4res_u.sr_resok4; mds_session_t *sp = cs->sp; slot_ent_t *slt; slotid4 slot = args->sa_slotid; nfsstat4 status = NFS4_OK; uint32_t cbstat = 0x0; DTRACE_NFSV4_2(op__sequence__start, struct compound_state *, cs, SEQUENCE4args *, args); if (cs->op_ndx != 0) { *cs->statusp = resp->sr_status = NFS4ERR_SEQUENCE_POS; goto final; } if ((status = mds_lease_chk(sp)) != NFS4_OK) { *cs->statusp = resp->sr_status = status; goto final; } /* * If the back channel has been established... * . if the channel has _not_ been marked as failed _AND_ * there are connections that have pings outstanding, * we go ahead and fire the thread to traverse all of * the session's conns, issuing CB_NULL's to those that * need a ping. * . if the channel is _not_ OK (ie. failed), then notify * client that there is currently a problem with the CB * path. */ rfs4_dbe_lock(sp->sn_dbe); if (SN_CB_CHAN_EST(sp)) { if (SN_CB_CHAN_OK(sp)) { if (sp->sn_bc.pngcnt > 0 && !sp->sn_bc.pnginprog) { kthread_t *t; rfs41_session_hold(sp); t = thread_create(NULL, 0, ping_cb_null_thr, sp, 0, &p0, TS_RUN, minclsyspri); if (!t) rfs41_session_rele(sp); } } else { cbstat |= SEQ4_STATUS_CB_PATH_DOWN; } } cs->cp = sp->sn_clnt; rfs4_dbe_hold(cs->cp->rc_dbe); /* compound state ref */ DTRACE_PROBE1(nfss41__i__compound_clid, clientid4, cs->cp->rc_clientid); /* * Valid range is [0, N-1] */ if (slot < 0 || slot >= sp->sn_replay->st_currw) { /* slot not in valid range */ cmn_err(CE_WARN, "mds_op_sequence: Bad Slot"); *cs->statusp = resp->sr_status = NFS4ERR_BADSLOT; goto sessrel; } /* * valid slot ! * * Duplicates/retransmissions have already been handled by * rfs41_slrc_prologue(), so if we're here, it _must_ mean * this is indeed a new request. We perform some sanity * checks and return NFS4_OK if everything looks kosher; * this reply will need to be cached by our caller. */ slt = slrc_slot_get(sp->sn_replay, slot); ASSERT(slt != NULL); if (args->sa_sequenceid != slt->se_seqid + 1) { cmn_err(CE_WARN, "mds_op_sequence: Misordered New Request"); *cs->statusp = resp->sr_status = NFS4ERR_SEQ_MISORDERED; goto sessrel; } if (args->sa_sequenceid == slt->se_seqid + 1) { /* * New request. */ mutex_enter(&slt->se_lock); slt->se_seqid = args->sa_sequenceid; if (slt->se_p != NULL) { /* * slot previously used to return recallable state; * since slot reused (NEW request) we are guaranteed * the client saw the reply, so it's safe to nuke the * race-detection accounting info. */ rfs41_rs_erase(slt->se_p); slt->se_p = NULL; } mutex_exit(&slt->se_lock); } /* * Update access time and lease */ cs->slotno = slot; cs->seqid = slt->se_seqid; sp->sn_laccess = nfs_sys_uptime(); rfs4_update_lease(cs->cp); /* * Let's keep it simple for now */ bcopy(sp->sn_sessid, rok->sr_sessionid, sizeof (sessionid4)); rok->sr_sequenceid = slt->se_seqid; rok->sr_slotid = slot; rok->sr_highest_slotid = sp->sn_replay->st_currw; rok->sr_target_highest_slotid = sp->sn_replay->st_currw; rok->sr_status_flags |= cbstat; *cs->statusp = resp->sr_status = NFS4_OK; sessrel: rfs4_dbe_unlock(sp->sn_dbe); final: DTRACE_NFSV4_2(op__sequence__done, struct compound_state *, cs, SEQUENCE4res *, resp); } void rfs41_bc_setup(mds_session_t *sp) { sess_channel_t *bcp; sess_bcsd_t *bsdp; bool_t bcp_init = FALSE; ASSERT(sp != NULL); rfs4_dbe_lock(sp->sn_dbe); /* * If sn_back == NULL, setup and initialize the * back channel. */ if (sp->sn_back == NULL) { bcp = rfs41_create_session_channel(CDFS4_BACK); bcp_init = TRUE; } else { bcp = sp->sn_back; } rfs4_dbe_unlock(sp->sn_dbe); ASSERT(bcp != NULL); rw_enter(&bcp->cn_lock, RW_WRITER); bcp->cn_dir |= CDFS4_BACK; /* now set the conn's state so we know a ping is needed */ atomic_add_32(&sp->sn_bc.pngcnt, 1); /* * Initialize back channel specific data */ if (bcp_init) { if ((bsdp = CTOBSD(bcp)) == NULL) { cmn_err(CE_PANIC, "Back Chan Spec Data Not Set\t" "<Internal Inconsistency>"); } rw_enter(&bsdp->bsd_rwlock, RW_WRITER); /* * XXX - 08/15/2008 (rick) if we're barely creating the * back channel, then the back channel attrs should * have been saved off by the originating CREATE_SESSION * call. If that's not the case, default to MAXSLOTS. */ slrc_table_create(&bsdp->bsd_stok, MAXSLOTS); rw_exit(&bsdp->bsd_rwlock); } rw_exit(&bcp->cn_lock); /* * If no back channel yet, then we must've created one above. * Make sure we set the session's back channel appropriately. */ rfs4_dbe_lock(sp->sn_dbe); if (sp->sn_back == NULL) sp->sn_back = bcp; rfs4_dbe_unlock(sp->sn_dbe); } /*ARGSUSED*/ void mds_op_bind_conn_to_session(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *req, compound_state_t *cs) { BIND_CONN_TO_SESSION4args *args = &argop->a_bc2s; BIND_CONN_TO_SESSION4res *resp = &resop->r_bc2s; BIND_CONN_TO_SESSION4resok *rok = &resp->rok_bc2s; mds_session_t *sp; SVCMASTERXPRT *mxprt; rpcprog_t prog; SVCCB_ARGS cbargs; DTRACE_NFSV4_2(op__bind__conn__to__session__start, struct compound_state *, cs, BIND_CONN_TO_SESSION4args *, args); if (cs->op_ndx != 0) { *cs->statusp = resp->bctsr_status = NFS4ERR_NOT_ONLY_OP; goto final; } /* * Find session and check for clientid and lease expiration */ if ((sp = mds_findsession_by_id(cs->instp, args->bctsa_sessid)) == NULL) { *cs->statusp = resp->bctsr_status = NFS4ERR_BADSESSION; goto final; } mds_refresh(sp); bzero(rok, sizeof (SEQUENCE4resok)); rfs4_dbe_lock(sp->sn_dbe); prog = sp->sn_bc.progno; rfs4_dbe_unlock(sp->sn_dbe); rok->bctsr_use_conn_in_rdma_mode = FALSE; mxprt = (SVCMASTERXPRT *)req->rq_xprt->xp_master; switch (args->bctsa_dir) { case CDFC4_FORE: case CDFC4_FORE_OR_BOTH: /* always map to Fore */ rok->bctsr_dir = CDFS4_FORE; SVC_CTL( req->rq_xprt, SVCCTL_SET_TAG, (void *)sp->sn_sessid); break; case CDFC4_BACK: case CDFC4_BACK_OR_BOTH: /* always map to Back */ rok->bctsr_dir = CDFS4_BACK; rfs41_bc_setup(sp); SVC_CTL( req->rq_xprt, SVCCTL_SET_TAG, (void *)sp->sn_sessid); cbargs.xprt = mxprt; cbargs.prog = prog; cbargs.vers = NFS_CB; cbargs.family = AF_INET; cbargs.tag = (void *)sp->sn_sessid; SVC_CTL(req->rq_xprt, SVCCTL_SET_CBCONN, (void *)&cbargs); /* Recall: these bits denote # of active back chan conns */ rfs41_seq4_hold(&sp->sn_seq4, SEQ4_STATUS_CB_PATH_DOWN_SESSION); rfs41_seq4_hold(&sp->sn_clnt->rc_seq4, SEQ4_STATUS_CB_PATH_DOWN); break; default: break; } /* * Handcraft the results ! */ bcopy(sp->sn_sessid, rok->bctsr_sessid, sizeof (sessionid4)); *cs->statusp = NFS4_OK; rfs41_session_rele(sp); final: DTRACE_NFSV4_2(op__bind__conn__to__session__done, struct compound_state *, cs, BIND_CONN_TO_SESSION4res *, resp); } /* located in nfs4_state */ extern void mds_mpd_list(rfs4_entry_t, void *); nfsstat4 mds_getdevicelist(nfs_server_instance_t *instp, deviceid4 **dlpp, int *len) { mds_device_list_t mdl; int sz; /* * no table updates till we're done with this... */ rw_enter(&instp->mds_mpd_lock, RW_READER); sz = instp->mds_mpd_tab->dbt_count * sizeof (deviceid4); mdl.mdl_dl = kmem_alloc(sz, KM_SLEEP); mdl.mdl_count = 0; rfs4_dbe_walk(instp->mds_mpd_tab, mds_mpd_list, &mdl); rw_exit(&instp->mds_mpd_lock); *len = mdl.mdl_count; *dlpp = mdl.mdl_dl; return (NFS4_OK); } /*ARGSUSED*/ static void mds_op_get_devlist(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *reqp, compound_state_t *cs) { GETDEVICELIST4res *resp = &resop->nfs_resop4_u.opgetdevicelist; deviceid4 *devlist = NULL; nfsstat4 nfsstat = NFS4_OK; int len = 0; DTRACE_NFSV4_1(op__getdevicelist__start, struct compound_state *, cs); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->gdlr_status = NFS4ERR_NOFILEHANDLE; goto final; } /* mds_getdevicelist will allocate space for devlist. */ nfsstat = mds_getdevicelist(cs->instp, &devlist, &len); *cs->statusp = resp->gdlr_status = nfsstat; if (nfsstat == NFS4_OK) { resp->GETDEVICELIST4res_u.gdlr_resok4.gdlr_cookie = 1234; resp->GETDEVICELIST4res_u.gdlr_resok4.gdlr_cookieverf = 1234; resp->GETDEVICELIST4res_u.gdlr_resok4.gdlr_deviceid_list.\ gdlr_deviceid_list_len = len; resp->GETDEVICELIST4res_u.gdlr_resok4.gdlr_deviceid_list.\ gdlr_deviceid_list_val = devlist; resp->GETDEVICELIST4res_u.gdlr_resok4.gdlr_eof = TRUE; } final: DTRACE_NFSV4_2(op__getdevicelist__done, struct compound_state *, cs, GETDEVICELIST4res *, resp); } /*ARGSUSED*/ static void mds_op_get_devlist_free(nfs_resop4 *resop) { } /*ARGSUSED*/ static void mds_op_get_devinfo(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *reqp, compound_state_t *cs) { ba_devid_t devid; mds_mpd_t *mp = NULL; GETDEVICEINFO4args *argp = &argop->nfs_argop4_u.opgetdeviceinfo; GETDEVICEINFO4res *resp = &resop->nfs_resop4_u.opgetdeviceinfo; DTRACE_NFSV4_2(op__getdeviceinfo__start, struct compound_state *, cs, GETDEVICEINFO4args *, argp); /* preset for failure */ *cs->statusp = resp->gdir_status = NFS4ERR_INVAL; if (argp->gdia_layout_type != LAYOUT4_NFSV4_1_FILES) goto final; bcopy(&argp->gdia_device_id, &devid, sizeof (devid)); mp = mds_find_mpd(cs->instp, devid.i.did); if (mp == NULL) goto final; /* * Do the too_small check */ if (mp->mpd_encoded_len > argp->gdia_maxcount) { *cs->statusp = resp->gdir_status = NFS4ERR_TOOSMALL; goto final; } /* * If the client requests notifications, then say we're * willing to send them. */ resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_notification = 0; if (argp->gdia_notify_types & NOTIFY_DEVICEID4_CHANGE_MASK) resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_notification |= NOTIFY_DEVICEID4_CHANGE_MASK; if (argp->gdia_notify_types & NOTIFY_DEVICEID4_DELETE_MASK) resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_notification |= NOTIFY_DEVICEID4_DELETE_MASK; resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_device_addr.\ da_layout_type = LAYOUT4_NFSV4_1_FILES; resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_device_addr.\ da_addr_body.da_addr_body_len = mp->mpd_encoded_len; resp->GETDEVICEINFO4res_u.gdir_resok4.gdir_device_addr.\ da_addr_body.da_addr_body_val = mp->mpd_encoded_val; *cs->statusp = resp->gdir_status = NFS4_OK; final: DTRACE_NFSV4_2(op__getdeviceinfo__done, struct compound_state *, cs, GETDEVICEINFO4res *, resp); if (mp != NULL) rfs4_dbe_rele(mp->mpd_dbe); } int mds_alloc_ds_fh(fsid_t fsid, const nfs41_fid_t *fid, mds_sid *sid, nfs_fh4 *fhp) { mds_ds_fh dsfh; unsigned long hostid = 0; (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); bzero(&dsfh, sizeof (mds_ds_fh)); dsfh.vers = DS_FH_v1; dsfh.type = FH41_TYPE_DMU_DS; dsfh.fh.v1.mds_id = (uint64_t)hostid; /* * Use the FSID for the MDS Dataset ID for now. In the future * the MDS MDS Dataset ID will be made up of information from the * ZFS dataset (i.e. dataset guid) on the MDS. Regardless, * the mds_dataset_id portion of the file handle is opaque so * we can put what we want there (as long as it identifies the * MDS file system). */ dsfh.fh.v1.mds_dataset_id.len = sizeof (fsid); bcopy(&fsid, dsfh.fh.v1.mds_dataset_id.val, dsfh.fh.v1.mds_dataset_id.len); /* * The MDS SID portion identifies which dataset * on the DS to use... */ dsfh.fh.v1.mds_sid.len = sid->len; /* * The mds_dataset_id already has storage allocated * for the value. The mds_sid does not. We could * allocate it here, but why? We are abotu to just * throw it away. So instead, we copy the pointer * and avoid the free case in the error cleanup. */ dsfh.fh.v1.mds_sid.val = sid->val; dsfh.fh.v1.mds_fid.len = fid->len; bcopy(fid->val, dsfh.fh.v1.mds_fid.val, fid->len); if (!xdr_encode_ds_fh(&dsfh, fhp)) return (EINVAL); return (0); } int mds_layout_is_dense = 1; /* * get file layout * XXX? should the rfs4_file_t be cached in compound state? */ nfsstat4 mds_get_file_layout(vnode_t *vp, mds_layout_t **plp) { mds_layout_t *layout; ASSERT(vp); layout = pnfs_get_mds_layout(vp); if (layout == NULL) return (NFS4ERR_LAYOUTUNAVAILABLE); *plp = layout; return (NFS4_OK); } static void mds_free_fh_list(nfs_fh4 *nfl_fh_list, int count) { int i; for (i = 0; i < count; i++) xdr_free_ds_fh(&(nfl_fh_list[i])); kmem_free(nfl_fh_list, count * sizeof (nfs_fh4)); } static void mds_free_devices_list() { } /*ARGSUSED*/ nfsstat4 mds_fetch_layout(struct compound_state *cs, LAYOUTGET4args *argp, LAYOUTGET4res *resp) { mds_layout_t *lp; mds_mpd_t *mp; layout4 *logrp; nfsv4_1_file_layout4 otw_flo; nfs_fh4 *nfl_fh_list; rfs4_file_t *fp; mds_layout_grant_t *lg; mds_ever_grant_t *eg; int i, err, nfl_size; bool_t create; XDR xdr; int xdr_size = 0; char *xdr_buffer; if (!pnfs_enabled(cs->exi) || mds_get_file_layout(cs->vp, &lp) != NFS4_OK) return (NFS4ERR_LAYOUTUNAVAILABLE); /* * validate the device id */ mp = mds_find_mpd(cs->instp, lp->mlo_mpd_id); if (mp == NULL) { DTRACE_PROBE1(nfss41__e__bad_devid, uint32_t, lp->mlo_mpd_id); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_LAYOUTUNAVAILABLE); } rfs4_dbe_rele(mp->mpd_dbe); bzero(&otw_flo, sizeof (otw_flo)); mds_set_deviceid(lp->mlo_mpd_id, &otw_flo.nfl_deviceid); /* * NFL4_UFLG_COMMIT_THRU_MDS is FALSE */ otw_flo.nfl_util = (lp->mlo_lc.lc_stripe_unit & NFL4_UFLG_STRIPE_UNIT_SIZE_MASK); if (mds_layout_is_dense) otw_flo.nfl_util |= NFL4_UFLG_DENSE; /* * Always start at the begining of the device array */ otw_flo.nfl_first_stripe_index = 0; /* */ nfl_size = lp->mlo_lc.lc_stripe_count * sizeof (nfs_fh4); nfl_fh_list = kmem_zalloc(nfl_size, KM_NOSLEEP); if (nfl_fh_list == NULL) { rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_LAYOUTTRYLATER); } /* * this of course is still somewhat bogus and this * whole function might be re-whacked in the * product. */ for (i = 0; i < lp->mlo_lc.lc_stripe_count; i++) { nfs41_fid_t fid = ((nfs41_fh_fmt_t *)cs->fh.nfs_fh4_val)->fh.v1.obj_fid; /* * Build DS Filehandles. */ err = mds_alloc_ds_fh(cs->exi->exi_fsid, &fid, &lp->mlo_lc.lc_mds_sids[i], &(nfl_fh_list[i])); if (err) { mds_free_fh_list(nfl_fh_list, lp->mlo_lc.lc_stripe_count); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_LAYOUTUNAVAILABLE); } } otw_flo.nfl_fh_list.nfl_fh_list_len = lp->mlo_lc.lc_stripe_count; otw_flo.nfl_fh_list.nfl_fh_list_val = nfl_fh_list; xdr_size = xdr_sizeof(xdr_nfsv4_1_file_layout4, &otw_flo); ASSERT(xdr_size); /* * Not a big deal if we are resource constrained * and the kmem_alloc fails. NFS Client will have * to do IO through MDS. */ xdr_buffer = kmem_alloc(xdr_size, KM_NOSLEEP); if (xdr_buffer == NULL) { mds_free_fh_list(nfl_fh_list, lp->mlo_lc.lc_stripe_count); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_LAYOUTTRYLATER); } /* * Lets XDR Encode like we did last summer.. * (or twist again..) */ xdrmem_create(&xdr, xdr_buffer, xdr_size, XDR_ENCODE); if (xdr_nfsv4_1_file_layout4(&xdr, &otw_flo) == FALSE) { kmem_free(xdr_buffer, xdr_size); mds_free_fh_list(nfl_fh_list, lp->mlo_lc.lc_stripe_count); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_LAYOUTTRYLATER); } /* * create the layout grant */ create = FALSE; fp = rfs4_findfile(cs->instp, cs->vp, NULL, &create); /* what if fp == NULL??? */ create = TRUE; lg = rfs41_findlogrant(cs, fp, cs->cp, &create); if (lg == NULL) { printf("rfs41_findlogrant() returned NULL; create=%d\n ", create); kmem_free(xdr_buffer, xdr_size); mds_free_fh_list(nfl_fh_list, lp->mlo_lc.lc_stripe_count); rfs4_file_rele(fp); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4ERR_SERVERFAULT); } if (create == TRUE) { /* Insert the grant on the client's list */ rfs4_dbe_lock(cs->cp->rc_dbe); insque(&lg->lo_clientgrantlist, cs->cp->rc_clientgrantlist.prev); rfs4_dbe_unlock(cs->cp->rc_dbe); /* Insert the grant on the file's list */ rfs4_dbe_lock(fp->rf_dbe); insque(&lg->lo_grant_list, fp->rf_lo_grant_list.prev); rfs4_dbe_unlock(fp->rf_dbe); } rfs4_file_rele(fp); lg->lo_lop = lp; /* * We may have just created the layout grant, or it could have * already existed. The client could be asking for "more" of the * file. However, it doesn't matter since we always hand out the * entire file, we can remove any range fragments that might * be on this grant and just have one range in the list that * covers the whole file. */ (void) nfs_range_set(lg->lo_range, 0, -1); /* * create the ever grant structure */ create = TRUE; eg = rfs41_findevergrant(cs->cp, cs->vp, &create); if (create == FALSE) rfs41_ever_grant_rele(eg); /* * Build layout get reply */ logrp = kmem_zalloc(sizeof (layout4), KM_SLEEP); resp->LAYOUTGET4res_u.logr_resok4.logr_layout.logr_layout_len = 1; resp->LAYOUTGET4res_u.logr_resok4.logr_layout.logr_layout_val = logrp; resp->LAYOUTGET4res_u.logr_resok4.logr_return_on_close = FALSE; resp->LAYOUTGET4res_u.logr_will_signal_layout_avail = FALSE; rfs41_lo_seqid(&lg->lo_stateid); resp->LAYOUTGET4res_u.logr_resok4.logr_stateid = lg->lo_stateid.stateid; logrp->lo_offset = 0; logrp->lo_length = -1; logrp->lo_iomode = LAYOUTIOMODE4_RW; logrp->lo_content.loc_type = lp->mlo_type; logrp->lo_content.loc_body.loc_body_len = xdr_size; logrp->lo_content.loc_body.loc_body_val = xdr_buffer; mds_free_fh_list(nfl_fh_list, lp->mlo_lc.lc_stripe_count); rfs4_dbe_rele(lp->mlo_dbe); return (NFS4_OK); } extern nfsstat4 mds_validate_logstateid(struct compound_state *, stateid_t *); static mds_layout_grant_t * mds_get_lo_grant_by_cp(struct compound_state *cs) { rfs4_file_t *fp; rfs4_client_t *cp = cs->cp; mds_layout_grant_t *lg; bool_t create = FALSE; if (cp->rc_clientgrantlist.next->lg == NULL) return (NULL); fp = rfs4_findfile(cs->instp, cs->vp, NULL, &create); if (fp == NULL) return (NULL); lg = rfs41_findlogrant(cs, fp, cp, &create); rfs4_file_rele(fp); return (lg); } /* * XXX - Eventually, we will support multi-range layouts. range_overlap * would be set to true if a new LAYOUTGET for a range (or part of * a range) that was already recalled, is obtained. For now, the * server does not handle multi-range layouts, but this is needed * for eventual CB_LAYOUT_RECALL race detection. Refer to section * 12.5.5.2.1.3 of draft-25 for further details. */ uint32_t range_overlap = 0; /*ARGSUSED*/ static void mds_op_layout_get(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *reqp, compound_state_t *cs) { LAYOUTGET4args *argp = &argop->nfs_argop4_u.oplayoutget; LAYOUTGET4res *resp = &resop->nfs_resop4_u.oplayoutget; nfsstat4 nfsstat = NFS4_OK; stateid_t *arg_stateid; mds_layout_grant_t *lg = NULL; sequenceid4 log_seqid; DTRACE_NFSV4_2(op__layoutget__start, struct compound_state *, cs, LAYOUTGET4args *, argp); if (!rfs4_cs_has_fh(cs)) { nfsstat = NFS4ERR_NOFILEHANDLE; goto final; } if (argp->loga_layout_type != LAYOUT4_NFSV4_1_FILES) { nfsstat = NFS4ERR_UNKNOWN_LAYOUTTYPE; goto final; } if (argp->loga_iomode == LAYOUTIOMODE4_ANY) { nfsstat = NFS4ERR_BADIOMODE; goto final; } if (argp->loga_length < argp->loga_minlength) { nfsstat = NFS4ERR_INVAL; goto final; } /* * Validate the provided stateid */ arg_stateid = (stateid_t *)&(argp->loga_stateid); log_seqid = arg_stateid->v41_bits.chgseq; /* * We may already have given this client a layout. Better * get it, if it exists. */ lg = mds_get_lo_grant_by_cp(cs); /* * first, only open, deleg, lock * or layout stateids are permitted. * currently open, deleg and lock stateids are handed * out by v4 routines and are of v4 format. * check if it is a layout stateid and treat differently. */ if (arg_stateid->v41_bits.type == LAYOUTID) { /* * if they are using a layout stateid, then we must * have already handed it out and should have found * it above. */ if (lg == NULL) { nfsstat = NFS4ERR_BAD_STATEID; goto final; } /* * Layout race detection */ if (lg->lo_status & LO_RECALL_INPROG) { ASSERT(lg->lor_seqid != 0); if (log_seqid == (lg->lor_seqid - 2)) { /* case 1 - pending recall in prog */ nfsstat = NFS4ERR_RECALLCONFLICT; rfs41_lo_grant_rele(lg); goto final; } else if (log_seqid >= lg->lor_seqid && !lg->lor_reply) { /* * case 2 - server has NOT received * the cb_lorecall reply yet. */ if (range_overlap) { nfsstat = NFS4ERR_RECALLCONFLICT; rfs41_lo_grant_rele(lg); goto final; } } else if (log_seqid == lg->lor_seqid && lg->lor_reply) { /* * case 3 - server HAS received * reply to cb_lorecall */ nfsstat = NFS4ERR_RETURNCONFLICT; rfs41_lo_grant_rele(lg); goto final; } } rfs41_lo_grant_rele(lg); } else { /* * not using a layout stateid, so we better not * have handed one out already for this file or * this client gets an error. */ if (lg != NULL) { rfs41_lo_grant_rele(lg); nfsstat = NFS4ERR_BAD_STATEID; goto final; } nfsstat = mds_validate_logstateid(cs, arg_stateid); if (nfsstat != NFS4_OK) { goto final; } } nfsstat = mds_fetch_layout(cs, argp, resp); final: *cs->statusp = resp->logr_status = nfsstat; DTRACE_NFSV4_2(op__layoutget__done, struct compound_state *, cs, LAYOUTGET4res *, resp); } /* * Freeing the layoutget response. */ /*ARGSUSED*/ static void mds_op_layout_get_free(nfs_resop4 *resop) { LAYOUTGET4res *resp = &(resop->nfs_resop4_u.oplayoutget); layout4 *lo = resp->LAYOUTGET4res_u.logr_resok4.logr_layout.logr_layout_val; if ((resp->logr_status == NFS4_OK) && (lo != NULL)) { kmem_free(lo->lo_content.loc_body.loc_body_val, lo->lo_content.loc_body.loc_body_len); kmem_free(lo, sizeof (layout4) * resp->LAYOUTGET4res_u.logr_resok4. logr_layout.logr_layout_len); } } /*ARGSUSED*/ static void mds_op_layout_commit(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *reqp, compound_state_t *cs) { LAYOUTCOMMIT4args *argp = &argop->nfs_argop4_u.oplayoutcommit; LAYOUTCOMMIT4res *resp = &resop->nfs_resop4_u.oplayoutcommit; nfsstat4 nfsstat = NFS4_OK; vnode_t *vp = cs->vp; cred_t *cr = cs->cr; rfs4_client_t *cp = cs->cp; offset4 newsize; caller_context_t ct; vattr_t va; uio_t uio; iovec_t iov; char null_byte; int error; DTRACE_NFSV4_2(op__layoutcommit__start, struct compound_state *, cs, LAYOUTCOMMIT4args *, argp); if (!rfs4_cs_has_fh(cs)) { *cs->statusp = resp->locr_status = NFS4ERR_NOFILEHANDLE; goto final; } if (rfs4_clnt_in_grace(cp) && !argp->loca_reclaim) { *cs->statusp = resp->locr_status = NFS4ERR_GRACE; goto final; } if (argp->loca_reclaim) { if (!rfs4_clnt_in_grace(cp) || !cp->rc_can_reclaim) { *cs->statusp = resp->locr_status = NFS4ERR_NO_GRACE; goto final; } } #ifdef NOT_FIXED_6846909 else { /* * validate loca_stateid */ mds_layout_grant_t *lg; if ((lg = mds_get_lo_grant_by_cp(cs)) == NULL) { *cs->statusp = resp->locr_status = NFS4ERR_BADLAYOUT; goto final; } if (layout_match(lg->lo_stateid, argp->loca_stateid, &nfsstat) == 0) { rfs41_lo_grant_rele(lg); *cs->statusp = resp->locr_status = NFS4ERR_BADLAYOUT; goto final; } rfs41_lo_grant_rele(lg); } #endif resp->LAYOUTCOMMIT4res_u.locr_resok4.locr_newsize.\ ns_sizechanged = FALSE; if (argp->loca_last_write_offset.no_newoffset) { newsize = argp->loca_last_write_offset.newoffset4_u.no_offset + 1; ct.cc_sysid = 0; ct.cc_pid = 0; ct.cc_caller_id = cs->instp->caller_id; ct.cc_flags = CC_DONTBLOCK; va.va_mask = AT_SIZE; error = VOP_GETATTR(vp, &va, 0, cr, &ct); if (error != 0) { *cs->statusp = resp->locr_status = puterrno4(error); goto final; } /* * write a null byte at newsize-1 so that the size * is correct. VOP_SETATTR may fail if the mode of * the file denies the implementation. */ if (newsize > va.va_size) { null_byte = '\0'; iov.iov_base = &null_byte; iov.iov_len = 1; bzero(&uio, sizeof (uio)); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = newsize - 1; uio.uio_segflg = UIO_SYSSPACE; uio.uio_fmode = FWRITE; uio.uio_extflg = 0; uio.uio_limit = newsize; uio.uio_resid = 1; (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, &ct); error = VOP_WRITE(vp, &uio, FWRITE, cr, &ct); VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, &ct); if (error != 0) { *cs->statusp = resp->locr_status = \ puterrno4(error); goto final; } resp->LAYOUTCOMMIT4res_u.locr_resok4.locr_newsize.\ ns_sizechanged = TRUE; resp->LAYOUTCOMMIT4res_u.locr_resok4.locr_newsize.\ newsize4_u.ns_size = va.va_size; } } *cs->statusp = resp->locr_status = nfsstat; final: DTRACE_NFSV4_2(op__layoutcommit__done, struct compound_state *, cs, LAYOUTCOMMIT4res *, resp); } int layout_match(stateid_t lo_stateid, stateid4 lrf_id, nfsstat4 *status) { stateid_t *id = (stateid_t *)&lrf_id; if (id->v41_bits.type != LAYOUTID) { *status = NFS4ERR_BAD_STATEID; return (0); } if (lo_stateid.v41_bits.boottime == id->v41_bits.boottime && lo_stateid.v41_bits.state_ident == id->v41_bits.state_ident) { *status = NFS4_OK; return (1); } else { *status = NFS4ERR_BAD_STATEID; return (0); } } nfsstat4 mds_return_layout_file(layoutreturn_file4 *lorf, struct compound_state *cs, LAYOUTRETURN4res *resp) { rfs4_file_t *fp; mds_layout_grant_t *lg; nfsstat4 status; bool_t create = FALSE; nfs_range_query_t remain; if (!rfs4_cs_has_fh(cs)) { cmn_err(CE_WARN, "lo_return(): putfh first"); return (NFS4ERR_NOFILEHANDLE); } fp = rfs4_findfile(cs->instp, cs->vp, NULL, &create); if (fp == NULL) { cmn_err(CE_WARN, "lo_return(): findfile returned NULL"); return (NFS4ERR_SERVERFAULT); } lg = rfs41_findlogrant(cs, fp, cs->cp, &create); if (lg == NULL) { /* * Is this really so bad? If the server reboots and then * the client returns a layout, we won't have a grant * structure for it. */ cmn_err(CE_WARN, "lo_return(): findlogrant returned NULL"); rfs4_file_rele(fp); return (NFS4ERR_SERVERFAULT); } if (!layout_match(lg->lo_stateid, lorf->lrf_stateid, &status)) { rfs41_lo_grant_rele(lg); rfs4_file_rele(fp); return (status); } /* * Refer to Section 18.44.3 of draft-25 for the right * lrs_present mojo and corresponding stateid setting. */ rfs41_lo_seqid(&lg->lo_stateid); resp->lorr_stid_u.lrs_stateid = lg->lo_stateid.stateid; remain = nfs_range_clear(lg->lo_range, lorf->lrf_offset, lorf->lrf_length); #ifdef NOT_DONE /* XXX - could (should?) be async operation */ /* need to add range to this */ mds_invalidate_ds_state(lg->lo_stateid, cs->cp, LAYOUT); #endif /* if entire layout has been returned, clean up */ if (remain == NFS_RANGE_NONE) { mutex_enter(&lg->lo_lock); if (lg->lo_status == LO_RECALL_INPROG) { lg->lor_seqid = 0; /* reset */ lg->lo_status = LO_RECALLED; } else { lg->lo_status = LO_RETURNED; } mutex_exit(&lg->lo_lock); /* remove the layout grant from both lists */ rfs4_dbe_lock(lg->lo_cp->rc_dbe); remque(&lg->lo_clientgrantlist); lg->lo_clientgrantlist.next = lg->lo_clientgrantlist.prev = &lg->lo_clientgrantlist; rfs4_dbe_unlock(lg->lo_cp->rc_dbe); lg->lo_cp = NULL; rfs4_dbe_lock(lg->lo_fp->rf_dbe); remque(&lg->lo_grant_list); lg->lo_grant_list.next = lg->lo_grant_list.prev = &lg->lo_grant_list; rfs4_dbe_unlock(lg->lo_fp->rf_dbe); rfs4_file_rele(lg->lo_fp); lg->lo_fp = NULL; rfs4_dbe_invalidate(lg->lo_dbe); rfs41_lo_grant_rele(lg); #ifdef RECALL_ENGINE if (&fp->rf_lo_grant_list == fp->rf_lo_grant_list.next) { fp->rf_mlo->mlo_flags = LAYOUT_RETURNED; rfs4_dbe_cv_broadcast(fp->rf_dbe); } #endif } else { /* * XXX - just in case our client does this * remove this when the control protocol work is complete * and we are telling the DS about partial returns. */ cmn_err(CE_WARN, "SURPRISE, partial layout return"); #ifdef RECALL_ENGINE /* * Was a recall in progress? Reset timers due to progress? */ #endif } rfs4_file_rele(fp); rfs41_lo_grant_rele(lg); return (NFS4_OK); } #define JW_STARTED 1 #ifdef JW_STARTED void mds_return_layout_fsid(struct compound_state *cs) { mds_ever_grant_t *eg; bool_t create = FALSE; /* * hg nits doesn't like any of this so i'm making it a comment block * loop through the DS's * mds_invalidate_ds_state(fsid, cp, LAYOUT_FSID) * } * * clean up state * 1) ever_grant * 2) layout_grants */ eg = rfs41_findevergrant(cs->cp, cs->vp, &create); if (eg != NULL) { rfs4_dbe_lock(eg->eg_dbe); eg->eg_cp = NULL; rfs4_dbe_invalidate(eg->eg_dbe); rfs4_dbe_unlock(eg->eg_dbe); rfs41_ever_grant_rele(eg); } mds_clean_grants_by_fsid(cs->cp, cs->vp); } void mds_return_layout_all(rfs4_client_t *cp) { /* * loop throug the DS's * mds_invalidate_ds_state(NULL, cp, LAYOUT_ALL) * } */ /* * clean up state * 1) layout_grants * 2) ever_grants */ mds_clean_up_grants(cp); } #endif /*ARGSUSED*/ static void mds_op_layout_return(nfs_argop4 *argop, nfs_resop4 *resop, struct svc_req *reqp, compound_state_t *cs) { LAYOUTRETURN4args *args = &argop->nfs_argop4_u.oplayoutreturn; LAYOUTRETURN4res *resp = &resop->nfs_resop4_u.oplayoutreturn; nfsstat4 nfsstat = NFS4_OK; rfs4_client_t *cp = cs->cp; layoutreturn_file4 *lorf; bool_t lora_reclaim = args->lora_reclaim; int ingrace; DTRACE_NFSV4_1(op__layoutreturn__start, struct compound_state *, cs); /* * Layout type */ switch (args->lora_layout_type) { case LAYOUT4_NFSV4_1_FILES: break; case LAYOUT4_OSD2_OBJECTS: case LAYOUT4_BLOCK_VOLUME: nfsstat = NFS4ERR_NOTSUPP; goto final; default: nfsstat = NFS4ERR_UNKNOWN_LAYOUTTYPE; goto final; } /* * XXX what if it is a bulk recall for FSID, but this client is * doing a return of type FILE for a file on a different FSID? */ if (cp->rc_bulk_recall && cp->rc_bulk_recall != args->lora_layoutreturn.lr_returntype) { nfsstat = NFS4ERR_RECALLCONFLICT; goto final; } switch (args->lora_layoutreturn.lr_returntype) { case LAYOUTRETURN4_FILE: ingrace = rfs4_clnt_in_grace(cp); if (lora_reclaim && !ingrace) { nfsstat = NFS4ERR_NO_GRACE; goto final; } else if (ingrace && lora_reclaim) { nfsstat = NFS4ERR_GRACE; goto final; } lorf = &args->lora_layoutreturn.layoutreturn4_u.lr_layout; nfsstat = mds_return_layout_file(lorf, cs, resp); break; case LAYOUTRETURN4_FSID: if (lora_reclaim) { nfsstat = NFS4ERR_INVAL; goto final; } #ifdef JW_STARTED mds_return_layout_fsid(cs); nfsstat = NFS4_OK; #else cmn_err(CE_NOTE, "loreturn: LAYOUTRETURN4_FSID"); nfsstat = NFS4ERR_NOTSUPP; #endif cp->rc_bulk_recall = 0; break; case LAYOUTRETURN4_ALL: if (lora_reclaim) { nfsstat = NFS4ERR_INVAL; goto final; } #ifdef JW_STARTED mds_return_layout_all(cp); nfsstat = NFS4_OK; #else cmn_err(CE_NOTE, "loreturn: LAYOUTRETURN4_ALL"); nfsstat = NFS4ERR_NOTSUPP; #endif cp->rc_bulk_recall = 0; break; default: nfsstat = NFS4ERR_INVAL; } final: *cs->statusp = resp->lorr_status = nfsstat; DTRACE_NFSV4_2(op__layoutreturn__done, struct compound_state *, cs, LAYOUTRETURN4res *, resp); } char * tohex(const void *bytes, int len) { static char *hexvals = "0123456789ABCDEF"; char *rc; const unsigned char *c = bytes; int i; rc = kmem_alloc(len * 2 + 1, KM_SLEEP); rc[len * 2] = '\0'; for (i = 0; i < len; i++) { rc[2 * i] = hexvals[c[i] >> 4]; rc[2 * i + 1] = hexvals[c[i] & 0xf]; } return (rc); } /* * The function checks if given compound operation is allowed * to be the very fist operation in compound array. */ static bool_t valid_first_compound_op(nfs_opnum4 op) { if (op == OP_BIND_CONN_TO_SESSION || op == OP_SEQUENCE || op == OP_EXCHANGE_ID || op == OP_CREATE_SESSION || op == OP_DESTROY_SESSION || op == OP_ILLEGAL) return (TRUE); return (FALSE); } /* * The function verifies arguments passed to mds_op_compound. * If agrguments are valid, NFS4_OK is returned, otherwise * function returns correspoinding NFS4 error code. */ static nfsstat4 verify_compound_args(compound_state_t *cs, COMPOUND4args *args) { if (args->array_len == 0) return (NFS4_OK); if (!valid_first_compound_op(args->array[0].argop)) return (NFS4ERR_OP_NOT_IN_SESSION); if (cs->sp != NULL) { sess_channel_t *fore_chan = cs->sp->sn_fore; if (args->array_len > fore_chan->cn_attrs.ca_maxoperations) return (NFS4ERR_TOO_MANY_OPS); } else if (args->array_len != 1) { /* * Compound is outside the session, then there must be * only one operation in request. */ return (NFS4ERR_NOT_ONLY_OP); } return (NFS4_OK); } extern slotid4 slrc_slot_size; nfsstat4 sess_chan_limits(sess_channel_t *scp) { count4 maxreqs; maxreqs = scp->cn_attrs.ca_maxrequests; if (maxreqs > slrc_slot_size) { scp->cn_attrs.ca_maxrequests = slrc_slot_size; DTRACE_PROBE4(nfss41__i__sesschanlim, char *, "maxreqs: ", count4, maxreqs, char *, "\tAdjusting to: ", count4, slrc_slot_size); } if (scp->cn_attrs.ca_maxoperations > NFS4_COMPOUND_LIMIT) scp->cn_attrs.ca_maxoperations = NFS4_COMPOUND_LIMIT; /* * Lower limit should be set to smallest sane COMPOUND. Even * though a singleton SEQUENCE op is the very smallest COMPOUND, * it's also quite boring. For all practical purposes, the lower * limit for creating a sess is limited to: * * [SEQUENCE + PUTROOTFH + GETFH] * * XXX - can't limit READ's to a specific threshold, otherwise * we artificially limit the clients to perform reads of * AT LEAST that granularity, which is WRONG !!! Same goes * for READDIR's and GETATTR's. */ if (scp->cn_attrs.ca_maxresponsesize < (sizeof (SEQUENCE4res) + sizeof (PUTROOTFH4res) + sizeof (GETFH4res))) return (NFS4ERR_TOOSMALL); return (NFS4_OK); } /* check csa_flags for OP_CREATE_SESSION */ bool_t nfs41_csa_flags_valid(uint32_t flags) { if (flags & ~CREATE_SESSION41_FLAG_MASK) return (FALSE); return (TRUE); } static int seq_chk_limits(nfs_argop4 *argop, nfs_resop4 *resop, compound_state_t *cs) { sess_channel_t *fcp; void *args; void *resp; attrmap4 am; ASSERT(cs != NULL); if (!cs->sp) return (0); ASSERT(argop->argop == resop->resop); ASSERT(cs->sp->sn_fore != NULL); fcp = cs->sp->sn_fore; switch (argop->argop) { case OP_READ: args = (READ4args *)&argop->nfs_argop4_u.opread; resp = (READ4res *)&resop->nfs_resop4_u.opread; cs->rqst_sz += sizeof (READ4args); cs->resp_sz += sizeof (READ4res) + ((READ4args *)args)->count; break; case OP_READDIR: args = (READDIR4args *)&argop->nfs_argop4_u.opreaddir; resp = (READDIR4res *)&resop->nfs_resop4_u.opreaddir; cs->rqst_sz += sizeof (READDIR4args); cs->resp_sz += sizeof (READDIR4res) + ((READDIR4args *)args)->maxcount; break; case OP_GETATTR: args = (GETATTR4args *)&argop->nfs_argop4_u.opgetattr; resp = (GETATTR4res *)&resop->nfs_resop4_u.opgetattr; /* * ACL and FS_LOCATIONS attrs can be variable sized; * we'll need to post-process this COMPOUND to make * sure the GETATTR reply is w/in session limits. * This must occur w/in mds_op_getattr itself. */ cs->rqst_sz += sizeof (GETATTR4args); am = ((GETATTR4args *)args)->attr_request; if (ATTR_ISSET(am, ACL) || ATTR_ISSET(am, FS_LOCATIONS)) { DTRACE_PROBE(nfss41__i__seqchklim); cs->post_proc = 1; return (0); } break; case OP_GETDEVICEINFO: args = (GETDEVICEINFO4args *) &argop->nfs_argop4_u.opgetdeviceinfo; resp = (GETDEVICEINFO4res *) &resop->nfs_resop4_u.opgetdeviceinfo; cs->rqst_sz += sizeof (GETDEVICEINFO4args); cs->resp_sz += sizeof (GETDEVICEINFO4res) + ((GETDEVICEINFO4args *)args)->gdia_maxcount; break; case OP_LAYOUTGET: args = (LAYOUTGET4args *)&argop->nfs_argop4_u.oplayoutget; resp = (LAYOUTGET4res *)&resop->nfs_resop4_u.oplayoutget; cs->rqst_sz += sizeof (LAYOUTGET4args); cs->resp_sz += sizeof (LAYOUTGET4res) + ((LAYOUTGET4args *)args)->loga_maxcount; break; default: break; } /* Request */ if (cs->rqst_sz > fcp->cn_attrs.ca_maxrequestsize) { *cs->statusp = NFS4ERR_REQ_TOO_BIG; SET_RESOP4(resop, NFS4ERR_REQ_TOO_BIG); return (1); } /* Response */ if (cs->resp_sz > fcp->cn_attrs.ca_maxresponsesize) { *cs->statusp = NFS4ERR_REP_TOO_BIG; SET_RESOP4(resop, NFS4ERR_REP_TOO_BIG); return (1); } /* sa_cachethis == 1; max cached response */ if (cs->sact) { if (cs->resp_sz > fcp->cn_attrs.ca_maxresponsesize_cached) { *cs->statusp = NFS4ERR_REP_TOO_BIG_TO_CACHE; SET_RESOP4(resop, NFS4ERR_REP_TOO_BIG_TO_CACHE); return (1); } } return (0); } static void pnfs_ds_statfs_walk(rfs4_entry_t entry, void *arg) { ds_guid_info_t *pgi = (ds_guid_info_t *)entry; struct statvfs64 *sbp = (struct statvfs64 *)arg; if (rfs4_dbe_skip_or_invalid(pgi->dbe)) return; sbp->f_blocks += pgi->space_total; sbp->f_bfree += pgi->space_free; sbp->f_bavail += pgi->space_free; } void pnfs_correct_statfs(struct compound_state *cs, struct statvfs64 *sbp) { sbp->f_frsize = 1; sbp->f_blocks = sbp->f_bfree = sbp->f_bavail = 0; rw_enter(&mds_server->ds_guid_info_lock, RW_READER); rfs4_dbe_walk(mds_server->ds_guid_info_tab, pnfs_ds_statfs_walk, sbp); rw_exit(&mds_server->ds_guid_info_lock); }