view usr/src/uts/common/fs/nfs/nfs4_subr.c @ 13823:3abbdbfdfaf3

3156 nfs: '.', '..', and filename with '/' return wrong error code Reviewed by: Eric Schrock <eric.schrock@delphix.com> Approved by: Richard Lowe <richlowe@richlowe.net>
author Daniil Lunev <d.lunev.mail@gmail.com>
date Thu, 30 Aug 2012 15:48:18 -0500
parents 34a2ada0dd49
children 17189d594419
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 */

/*
 *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
 *	All Rights Reserved
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/session.h>
#include <sys/thread.h>
#include <sys/dnlc.h>
#include <sys/cred.h>
#include <sys/priv.h>
#include <sys/list.h>
#include <sys/sdt.h>
#include <sys/policy.h>

#include <rpc/types.h>
#include <rpc/xdr.h>

#include <nfs/nfs.h>

#include <nfs/nfs_clnt.h>

#include <nfs/nfs4.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

/*
 * Client side statistics.
 *
 * Table of named 64-bit kstat counters for the NFSv4 client.  The last
 * four entries exist only in DEBUG builds.  The "_tmpl" suffix suggests
 * this is a template that gets copied elsewhere; the copy site is not in
 * this part of the file.
 */
static const struct clstat4 clstat4_tmpl = {
	{ "calls",	KSTAT_DATA_UINT64 },
	{ "badcalls",	KSTAT_DATA_UINT64 },
	{ "referrals",	KSTAT_DATA_UINT64 },
	{ "referlinks",	KSTAT_DATA_UINT64 },
	{ "clgets",	KSTAT_DATA_UINT64 },	/* bumped on every clget4() */
	{ "cltoomany",	KSTAT_DATA_UINT64 },	/* clget4() had to create */
#ifdef DEBUG
	{ "clalloc",	KSTAT_DATA_UINT64 },	/* client handles allocated */
	{ "noresponse",	KSTAT_DATA_UINT64 },	/* server failed to respond */
	{ "failover",	KSTAT_DATA_UINT64 },
	{ "remap",	KSTAT_DATA_UINT64 },
#endif
};

#ifdef DEBUG
/*
 * Additional named 64-bit kstat counters that exist only in DEBUG
 * builds.  Unlike clstat4_tmpl this object is neither static nor const;
 * it is updated in place (for example, "clreclaim" is incremented by
 * clreclaim4_zone()).
 */
struct clstat4_debug clstat4_debug = {
	{ "nrnode",	KSTAT_DATA_UINT64 },
	{ "access",	KSTAT_DATA_UINT64 },
	{ "dirent",	KSTAT_DATA_UINT64 },
	{ "dirents",	KSTAT_DATA_UINT64 },
	{ "reclaim",	KSTAT_DATA_UINT64 },
	{ "clreclaim",	KSTAT_DATA_UINT64 },
	{ "f_reclaim",	KSTAT_DATA_UINT64 },
	{ "a_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_reclaim",	KSTAT_DATA_UINT64 },
	{ "r_path",	KSTAT_DATA_UINT64 },
};
#endif

/*
 * We keep a global list of per-zone client data, so we can clean up all zones
 * if we get low on memory.  clreclaim4() walks nfs4_clnt_list while holding
 * nfs4_clnt_list_lock.
 */
static list_t nfs4_clnt_list;
static kmutex_t nfs4_clnt_list_lock;
zone_key_t nfs4clnt_zone_key;

/* kmem cache from which clget4() allocates struct chtab entries */
static struct kmem_cache *chtab4_cache;

#ifdef DEBUG
/* DEBUG-only tunables; nonzero enables the corresponding debug output */
static int nfs4_rfscall_debug;		/* gates NFS4_DEBUG in nfs4_rfscall() */
static int nfs4_try_failover_any;
int nfs4_utf8_debug = 0;		/* gates the warning in utf8_to_fn() */
#endif

/*
 * NFSv4 readdir cache implementation
 *
 * Wraps the rddir4_cache element with the bookkeeping needed to manage
 * it: a reference count (guarded by its own lock) and the AVL linkage.
 */
typedef struct rddir4_cache_impl {
	rddir4_cache	rc;		/* readdir cache element */
	kmutex_t	lock;		/* lock protects count */
	uint_t		count;		/* reference count */
	avl_node_t	tree;		/* AVL tree link */
} rddir4_cache_impl;

/*
 * Forward declarations of file-local helpers; their definitions are not
 * in this part of the file.
 */
static int rddir4_cache_compar(const void *, const void *);
static void rddir4_cache_free(rddir4_cache_impl *);
static rddir4_cache *rddir4_cache_alloc(int);
static void rddir4_cache_hold(rddir4_cache *);
static int try_failover(enum clnt_stat);

/*
 * Readdir cache counters (hits, waits and misses, going by the names);
 * the code that updates them is not in this part of the file.
 */
static int nfs4_readdir_cache_hits = 0;
static int nfs4_readdir_cache_waits = 0;
static int nfs4_readdir_cache_misses = 0;

/*
 * Shared nfs4 functions
 */

/*
 * Duplicate the filehandle "from" into "to": both the length and the
 * handle bytes are copied.  No memory is allocated here, so the caller
 * must already have pointed to->nfs_fh4_val at storage large enough for
 * from->nfs_fh4_len bytes.
 */

void
nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
{
	bcopy(from->nfs_fh4_val, to->nfs_fh4_val, from->nfs_fh4_len);
	to->nfs_fh4_len = from->nfs_fh4_len;
}

/*
 * nfs4cmpfh - order two NFSv4 filehandles.
 *
 * A shorter handle always sorts before a longer one.  Handles of equal
 * length are compared byte by byte (as plain char values) and the first
 * differing byte decides.  Returns -1, 0 or +1 when the first handle is
 * respectively less than, equal to, or greater than the second.
 */

int
nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
{
	const char *p1, *p2, *end1;

	if (fh4p1->nfs_fh4_len != fh4p2->nfs_fh4_len)
		return ((fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len) ? -1 : 1);

	p1 = fh4p1->nfs_fh4_val;
	p2 = fh4p2->nfs_fh4_val;
	end1 = fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;

	while (p1 < end1) {
		if (*p1 != *p2)
			return ((*p1 < *p2) ? -1 : 1);
		p1++;
		p2++;
	}

	return (0);
}

/*
 * Equality test for two filehandles in nfs4_fhandle_t form.  Returns
 * zero when the handles have the same length and identical contents,
 * non-zero otherwise.  Unlike nfs4cmpfh() the result carries no
 * ordering information.
 */

int
nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
{
	/* different lengths can never match */
	if (fh1->fh_len != fh2->fh_len)
		return (1);

	return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
}

/*
 * Report whether two stateids are byte-for-byte identical.  Note the
 * sense of the result: 1 means "same", 0 means "different" (the
 * opposite of the bcmp() convention).
 */
int
stateid4_cmp(stateid4 *s1, stateid4 *s2)
{
	return ((bcmp(s1, s2, sizeof (stateid4)) == 0) ? 1 : 0);
}

/*
 * Translate a local errno value into an NFSv4 status code.
 *
 * Errnos that share a status are grouped together.  EAGAIN is special:
 * if the current thread has T_WOULDBLOCK set, the flag is cleared and
 * NFS4ERR_DELAY is returned; otherwise the result is NFS4ERR_LOCKED.
 * Any errno without an explicit mapping is returned unchanged, cast to
 * the status type.
 */
nfsstat4
puterrno4(int error)
{
	switch (error) {
	case 0:
		return (NFS4_OK);

	/* several errnos collapse into the generic I/O error */
	case EINTR:
	case EIO:
	case EBUSY:
	case ENODEV:
		return (NFS4ERR_IO);

	/* out of memory / out of descriptors */
	case ENOMEM:
	case EMFILE:
		return (NFS4ERR_RESOURCE);

	case EINVAL:
	case EOVERFLOW:
		return (NFS4ERR_INVAL);

	case ENOTSUP:
	case ENOSYS:
	case EOPNOTSUPP:
		return (NFS4ERR_NOTSUPP);

	/* one-to-one translations */
	case EPERM:
		return (NFS4ERR_PERM);
	case ENOENT:
		return (NFS4ERR_NOENT);
	case ENXIO:
		return (NFS4ERR_NXIO);
	case EACCES:
		return (NFS4ERR_ACCESS);
	case EEXIST:
		return (NFS4ERR_EXIST);
	case EXDEV:
		return (NFS4ERR_XDEV);
	case ENOTDIR:
		return (NFS4ERR_NOTDIR);
	case EISDIR:
		return (NFS4ERR_ISDIR);
	case EFBIG:
		return (NFS4ERR_FBIG);
	case ENOSPC:
		return (NFS4ERR_NOSPC);
	case EROFS:
		return (NFS4ERR_ROFS);
	case EMLINK:
		return (NFS4ERR_MLINK);
	case EDEADLK:
		return (NFS4ERR_DEADLOCK);
	case ENOLCK:
		return (NFS4ERR_DENIED);
	case EREMOTE:
		return (NFS4ERR_SERVERFAULT);
	case EDQUOT:
		return (NFS4ERR_DQUOT);
	case ENAMETOOLONG:
		return (NFS4ERR_NAMETOOLONG);
	case ENOTEMPTY:
		return (NFS4ERR_NOTEMPTY);
	case ESTALE:
		return (NFS4ERR_STALE);

	case EAGAIN:
		if (!(curthread->t_flag & T_WOULDBLOCK))
			return (NFS4ERR_LOCKED);
		/* consume the would-block indication */
		curthread->t_flag &= ~T_WOULDBLOCK;
		return (NFS4ERR_DELAY);

	default:
		return ((enum nfsstat4)error);
	}
}

int
geterrno4(enum nfsstat4 status)
{
	switch (status) {
	case NFS4_OK:
		return (0);
	case NFS4ERR_PERM:
		return (EPERM);
	case NFS4ERR_NOENT:
		return (ENOENT);
	case NFS4ERR_IO:
		return (EIO);
	case NFS4ERR_NXIO:
		return (ENXIO);
	case NFS4ERR_ACCESS:
		return (EACCES);
	case NFS4ERR_EXIST:
		return (EEXIST);
	case NFS4ERR_XDEV:
		return (EXDEV);
	case NFS4ERR_NOTDIR:
		return (ENOTDIR);
	case NFS4ERR_ISDIR:
		return (EISDIR);
	case NFS4ERR_INVAL:
		return (EINVAL);
	case NFS4ERR_FBIG:
		return (EFBIG);
	case NFS4ERR_NOSPC:
		return (ENOSPC);
	case NFS4ERR_ROFS:
		return (EROFS);
	case NFS4ERR_MLINK:
		return (EMLINK);
	case NFS4ERR_NAMETOOLONG:
		return (ENAMETOOLONG);
	case NFS4ERR_NOTEMPTY:
		return (ENOTEMPTY);
	case NFS4ERR_DQUOT:
		return (EDQUOT);
	case NFS4ERR_STALE:
		return (ESTALE);
	case NFS4ERR_BADHANDLE:
		return (ESTALE);
	case NFS4ERR_BAD_COOKIE:
		return (EINVAL);
	case NFS4ERR_NOTSUPP:
		return (EOPNOTSUPP);
	case NFS4ERR_TOOSMALL:
		return (EINVAL);
	case NFS4ERR_SERVERFAULT:
		return (EIO);
	case NFS4ERR_BADTYPE:
		return (EINVAL);
	case NFS4ERR_DELAY:
		return (ENXIO);
	case NFS4ERR_SAME:
		return (EPROTO);
	case NFS4ERR_DENIED:
		return (ENOLCK);
	case NFS4ERR_EXPIRED:
		return (EPROTO);
	case NFS4ERR_LOCKED:
		return (EACCES);
	case NFS4ERR_GRACE:
		return (EAGAIN);
	case NFS4ERR_FHEXPIRED:	/* if got here, failed to get a new fh */
		return (ESTALE);
	case NFS4ERR_SHARE_DENIED:
		return (EACCES);
	case NFS4ERR_WRONGSEC:
		return (EPERM);
	case NFS4ERR_CLID_INUSE:
		return (EAGAIN);
	case NFS4ERR_RESOURCE:
		return (EAGAIN);
	case NFS4ERR_MOVED:
		return (EPROTO);
	case NFS4ERR_NOFILEHANDLE:
		return (EIO);
	case NFS4ERR_MINOR_VERS_MISMATCH:
		return (ENOTSUP);
	case NFS4ERR_STALE_CLIENTID:
		return (EIO);
	case NFS4ERR_STALE_STATEID:
		return (EIO);
	case NFS4ERR_OLD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_STATEID:
		return (EIO);
	case NFS4ERR_BAD_SEQID:
		return (EIO);
	case NFS4ERR_NOT_SAME:
		return (EPROTO);
	case NFS4ERR_LOCK_RANGE:
		return (EPROTO);
	case NFS4ERR_SYMLINK:
		return (EPROTO);
	case NFS4ERR_RESTOREFH:
		return (EPROTO);
	case NFS4ERR_LEASE_MOVED:
		return (EPROTO);
	case NFS4ERR_ATTRNOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_NO_GRACE:
		return (EPROTO);
	case NFS4ERR_RECLAIM_BAD:
		return (EPROTO);
	case NFS4ERR_RECLAIM_CONFLICT:
		return (EPROTO);
	case NFS4ERR_BADXDR:
		return (EINVAL);
	case NFS4ERR_LOCKS_HELD:
		return (EIO);
	case NFS4ERR_OPENMODE:
		return (EACCES);
	case NFS4ERR_BADOWNER:
		/*
		 * Client and server are in different DNS domains
		 * and the NFSMAPID_DOMAIN in /etc/default/nfs
		 * doesn't match.  No good answer here.  Return
		 * EACCESS, which translates to "permission denied".
		 */
		return (EACCES);
	case NFS4ERR_BADCHAR:
		return (EINVAL);
	case NFS4ERR_BADNAME:
		return (EINVAL);
	case NFS4ERR_BAD_RANGE:
		return (EIO);
	case NFS4ERR_LOCK_NOTSUPP:
		return (ENOTSUP);
	case NFS4ERR_OP_ILLEGAL:
		return (EINVAL);
	case NFS4ERR_DEADLOCK:
		return (EDEADLK);
	case NFS4ERR_FILE_OPEN:
		return (EACCES);
	case NFS4ERR_ADMIN_REVOKED:
		return (EPROTO);
	case NFS4ERR_CB_PATH_DOWN:
		return (EPROTO);
	default:
#ifdef DEBUG
		zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
		    status);
#endif
		return ((int)status);
	}
}

/*
 * Record that an NFS4ERR_BADOWNER was seen for operation "op" on mount
 * "mi".
 *
 * Two things happen, each at most once:
 *  - per server: a console warning about the NFSMAPID_DOMAIN mismatch,
 *    remembered via N4S_BADOWNER_DEBUG in the server's s_flags;
 *  - per mount: an RF_BADOWNER fact is queued, remembered via
 *    MI4_BADOWNER_DEBUG in mi_flags.
 *
 * The function returns silently if the mount has already been flagged,
 * if taking mi_recovlock as reader fails, or if no nfs4_server_t is
 * found for the mount.
 */
void
nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
{
	nfs4_server_t *server;

	/*
	 * Return if already printed/queued a msg
	 * for this mount point.  (Unlocked fast-path check; the flag is
	 * re-tested under mi_lock below.)
	 */
	if (mi->mi_flags & MI4_BADOWNER_DEBUG)
		return;
	/*
	 * Happens once per client <-> server pair.
	 */
	if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
	    mi->mi_flags & MI4_INT))
		return;

	server = find_nfs4_server(mi);
	if (server == NULL) {
		nfs_rw_exit(&mi->mi_recovlock);
		return;
	}

	/*
	 * NOTE(review): find_nfs4_server() apparently returns with
	 * server->s_lock held and a reference taken, since both are
	 * released below without being acquired here -- confirm.
	 */
	if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
		zcmn_err(mi->mi_zone->zone_id, CE_WARN,
		    "!NFSMAPID_DOMAIN does not match"
		    " the server: %s domain.\n"
		    "Please check configuration",
		    mi->mi_curr_serv->sv_hostname);
		server->s_flags |= N4S_BADOWNER_DEBUG;
	}
	mutex_exit(&server->s_lock);
	nfs4_server_rele(server);
	nfs_rw_exit(&mi->mi_recovlock);

	/*
	 * Happens once per mntinfo4_t.
	 * This error is deemed as one of the recovery facts "RF_BADOWNER",
	 * queue this in the mesg queue for this mount_info. This message
	 * is not printed, meaning it's absent from id_to_dump_solo_fact(),
	 * but it's there for inspection if the queue is ever dumped/inspected.
	 */
	mutex_enter(&mi->mi_lock);
	if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
		nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
		    FALSE, NULL, 0, NULL);
		mi->mi_flags |= MI4_BADOWNER_DEBUG;
	}
	mutex_exit(&mi->mi_lock);
}

/*
 * Convert an NFSv4 time (nfstime4) into a system timestruc_t.
 *
 * Returns 0 on success, EINVAL if the nanoseconds field is one billion
 * or more, and (on non-_LP64 kernels only) EOVERFLOW if the seconds
 * value fails NFS4_TIME_OK().  On failure *vatime is left untouched.
 *
 * For negative seconds the result is stored as (seconds + 1) together
 * with a negative nanosecond value (nseconds - 1000000000).
 */
int
nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
{
	int64_t secs;
	int32_t nsecs;

	/*
	 * The nfsv4 time value is a signed 64-bit quantity, while the
	 * system time may be int64_t or int32_t depending on the kernel.
	 * On a 32-bit kernel the nfsv4 value may therefore not fit, so
	 * verify it first.
	 */
#ifndef _LP64
	if (! NFS4_TIME_OK(ntime->seconds)) {
		return (EOVERFLOW);
	}
#endif

	/* A nanosecond count of 1 billion or more is not valid */
	if (ntime->nseconds >= 1000000000)
		return (EINVAL);

	secs = ntime->seconds;
	nsecs = ntime->nseconds;
	if (ntime->seconds < 0) {
		/* shift one second's worth into the nanosecond field */
		secs += 1;
		nsecs -= 1000000000;
	}

	vatime->tv_sec = secs;
	vatime->tv_nsec = nsecs;

	return (0);
}

/*
 * Convert a system timestruc_t into an NFSv4 time (nfstime4).
 *
 * A negative tv_nsec is normalized by borrowing one second, so the
 * nanoseconds value stored in *ntime comes out non-negative.  Always
 * returns 0.
 */
int
nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
{
	int64_t secs;
	uint32_t nsecs;

	/*
	 * The nfsv4 seconds field is a signed 64-bit value, so every
	 * system time value (int64_t or int32_t, depending on the
	 * kernel) fits; no range check is required.
	 */
	if (vatime->tv_nsec < 0) {
		secs = vatime->tv_sec - 1;
		nsecs = 1000000000 + vatime->tv_nsec;
	} else {
		secs = vatime->tv_sec;
		nsecs = vatime->tv_nsec;
	}

	ntime->seconds = secs;
	ntime->nseconds = nsecs;

	return (0);
}

/*
 * Turn a utf8 string into a valid, null terminated filename string.
 *
 * Returns NULL when the input is NULL or empty, or when it contains a
 * '/' character; otherwise the conversion is delegated to utf8_to_str()
 * with the same lenp and s arguments.
 *
 * XXX - The UTF-8 string is not actually translated as per RFC 2279.
 *	 For now we only validate that the string off the wire does
 *	 not have characters that will freak out UFS, and leave it at
 *	 that.
 */
char *
utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
{
	ASSERT(lenp != NULL);

	if (u8s == NULL)
		return (NULL);
	if (u8s->utf8string_len <= 0 || u8s->utf8string_val == NULL)
		return (NULL);

	/*
	 * No obviously illegal filename characters: do the conversion.
	 */
	if (utf8_strchr(u8s, '/') == NULL)
		return (utf8_to_str(u8s, lenp, s));

#ifdef DEBUG
	if (nfs4_utf8_debug) {
		char *badname;
		int nbytes = u8s->utf8string_len;

		/* make a terminated copy so the name can be printed */
		badname = kmem_alloc(nbytes + 1, KM_SLEEP);
		bcopy(u8s->utf8string_val, badname, nbytes);
		badname[nbytes] = '\0';

		zcmn_err(getzoneid(), CE_WARN,
		    "Invalid UTF-8 filename: %s", badname);

		kmem_free(badname, nbytes + 1);
	}
#endif
	return (NULL);
}

/*
 * Convert a utf8 string into a null terminated C string.
 *
 * If "s" is non-NULL the result is written there (the caller provides
 * the space); otherwise a buffer of utf8string_len + 1 bytes is
 * kmem_alloc'ed.  On success the string is returned and *lenp is set
 * to the buffer size, i.e. the string length plus the terminator.
 *
 * Returns NULL if str is NULL, if the string is empty (in which case a
 * caller-supplied "s" is set to the empty string), or if the string
 * contains an embedded null.  In the embedded-null case a buffer
 * allocated here is freed again; a caller-supplied buffer has been
 * written up to and including the offending byte.
 */
char *
utf8_to_str(utf8string *str, uint_t *lenp, char *s)
{
	char	*src;
	char	*dst;
	int	nbytes;
	int	idx;

	ASSERT(lenp != NULL);

	if (str == NULL)
		return (NULL);

	src = str->utf8string_val;
	nbytes = str->utf8string_len;
	if (nbytes <= 0 || src == NULL) {
		if (s != NULL)
			*s = '\0';
		return (NULL);
	}

	dst = (s != NULL) ? s : kmem_alloc(nbytes + 1, KM_SLEEP);

	/*
	 * Copy byte by byte, rejecting the string as soon as an
	 * embedded null shows up.
	 */
	for (idx = 0; idx < nbytes; idx++) {
		dst[idx] = src[idx];
		if (src[idx] != '\0')
			continue;
#ifdef	DEBUG
		zcmn_err(getzoneid(), CE_WARN,
		    "Embedded NULL in UTF-8 string");
#endif
		if (s == NULL)
			kmem_free(dst, nbytes + 1);
		return (NULL);
	}

	dst[nbytes] = '\0';
	*lenp = nbytes + 1;

	return (dst);
}

/*
 * str_to_utf8 - converts a null-terminated C string to a utf8 string
 *
 * Fills in the caller-supplied "str" and returns it; returns NULL only
 * when "str" itself is NULL.
 *
 * A NULL or empty "nm" produces an empty utf8string (length 0, value
 * NULL).  Otherwise utf8string_val is a freshly kmem_alloc'ed buffer of
 * exactly strlen(nm) bytes holding the characters of nm WITHOUT a
 * terminating null, and utf8string_len is that byte count.
 */
utf8string *
str_to_utf8(char *nm, utf8string *str)
{
	int len;

	if (str == NULL)
		return (NULL);

	if (nm == NULL || *nm == '\0') {
		str->utf8string_len = 0;
		str->utf8string_val = NULL;
		/*
		 * Nothing to copy.  Returning here is essential: falling
		 * through would hand a NULL nm to strlen().
		 */
		return (str);
	}

	len = strlen(nm);

	str->utf8string_val = kmem_alloc(len, KM_SLEEP);
	str->utf8string_len = len;
	bcopy(nm, str->utf8string_val, len);

	return (str);
}

/*
 * Make "dest" a copy of the utf8 string "src".
 *
 * A non-empty source gets a newly kmem_alloc'ed value buffer of the
 * same length; an empty source yields an empty destination (NULL
 * value, zero length).  Returns dest, or NULL if either argument is
 * NULL.
 */
utf8string *
utf8_copy(utf8string *src, utf8string *dest)
{
	if (src == NULL || dest == NULL)
		return (NULL);

	if (src->utf8string_len <= 0) {
		/* empty source: produce an empty destination */
		dest->utf8string_val = NULL;
		dest->utf8string_len = 0;
		return (dest);
	}

	dest->utf8string_val = kmem_alloc(src->utf8string_len, KM_SLEEP);
	bcopy(src->utf8string_val, dest->utf8string_val,
	    src->utf8string_len);
	dest->utf8string_len = src->utf8string_len;

	return (dest);
}

/*
 * Three-way comparison of two utf8 strings; returns -1, 0 or 1.
 *
 * A NULL pointer sorts before any non-NULL string, and an empty string
 * (zero length or NULL value) sorts before any non-empty one; two
 * NULLs, or two empty strings, compare equal.  Otherwise the common
 * prefix is compared with strncmp(), and when that is equal the
 * shorter string sorts first.
 */
int
utf8_compare(const utf8string *a, const utf8string *b)
{
	const char *aval, *bval;
	int alen, blen;
	int a_empty, b_empty;
	int order;

	if (a == NULL)
		return ((b == NULL) ? 0 : -1);
	if (b == NULL)
		return (1);

	alen = a->utf8string_len;
	blen = b->utf8string_len;
	aval = a->utf8string_val;
	bval = b->utf8string_val;

	a_empty = (alen == 0 || aval == NULL);
	b_empty = (blen == 0 || bval == NULL);
	if (a_empty)
		return (b_empty ? 0 : -1);
	if (b_empty)
		return (1);

	order = strncmp(aval, bval, MIN(alen, blen));
	if (order != 0)
		return ((order < 0) ? -1 : 1);

	/* common prefix matches; the length decides */
	if (alen == blen)
		return (0);
	return ((alen < blen) ? -1 : 1);
}

/*
 * utf8_dir_verify - validate a utf8 string used as a directory entry name
 *
 * Returns NFS4ERR_INVAL for a NULL or empty string, NFS4ERR_BADNAME
 * for "." or "..", or for a name containing '/' or a null character,
 * and NFS4_OK otherwise.
 */
nfsstat4
utf8_dir_verify(utf8string *str)
{
	char *name;
	int nlen;

	if (str == NULL)
		return (NFS4ERR_INVAL);

	name = str->utf8string_val;
	nlen = str->utf8string_len;
	if (name == NULL || nlen == 0)
		return (NFS4ERR_INVAL);

	/* "." and ".." are never acceptable names */
	if ((nlen == 1 && name[0] == '.') ||
	    (nlen == 2 && name[0] == '.' && name[1] == '.'))
		return (NFS4ERR_BADNAME);

	/* neither are names with a path separator or an embedded null */
	if (utf8_strchr(str, '/') != NULL ||
	    utf8_strchr(str, '\0') != NULL)
		return (NFS4ERR_BADNAME);

	return (NFS4_OK);
}

/*
 * from rpcsec module (common/rpcsec)
 *
 * sec_clnt_geth() is used by authget() to obtain an AUTH handle for a
 * client from a sec_data description and a credential; it returns zero
 * on success.  sec_clnt_freeh() is used by clfree4() to release such a
 * handle.  sec_clnt_freeinfo() is not called in this part of the file.
 */
extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);
extern void sec_clnt_freeh(AUTH *);
extern void sec_clnt_freeinfo(struct sec_data *);

/*
 * authget() gets an auth handle based on the security
 * information from the servinfo in mountinfo.
 * The auth handle is stored in ch_client->cl_auth.
 *
 * First security flavor of choice is to use sv_secdata
 * which is initiated by the client. If that fails, get
 * secinfo from the server and then select one from the
 * server secinfo list.
 *
 * For RPCSEC_GSS flavor, upon success, a secure context is
 * established between client and server.
 *
 * Returns 0 on success, otherwise the error from the last
 * sec_clnt_geth() attempt.  If SV4_TRYSECINFO is set but the secinfo
 * list has no entry left to try, EACCES is returned.
 *
 * sv_lock is held as writer for the duration of the call.
 */
int
authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
{
	/*
	 * Start out with an error: if the secinfo loop below runs zero
	 * times (sv_secinfo->index >= sv_secinfo->count), no handle was
	 * obtained and the caller must not see an indeterminate value
	 * or a false success.
	 */
	int error = EACCES;
	int i;

	/*
	 * SV4_TRYSECINFO indicates to try the secinfo list from
	 * sv_secinfo until a successful one is reached. Point
	 * sv_currsec to the selected security mechanism for
	 * later sessions.
	 */
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
	if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
		for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
		    i++) {
			if (!(error = sec_clnt_geth(ch_client,
			    &svp->sv_secinfo->sdata[i],
			    cr, &ch_client->cl_auth))) {

				svp->sv_currsec = &svp->sv_secinfo->sdata[i];
				svp->sv_secinfo->index = i;
				/* done */
				svp->sv_flags &= ~SV4_TRYSECINFO;
				break;
			}

			/*
			 * Allow the caller retry with the security flavor
			 * pointed by svp->sv_secinfo->index when
			 * ETIMEDOUT/ECONNRESET occurs.
			 */
			if (error == ETIMEDOUT || error == ECONNRESET) {
				svp->sv_secinfo->index = i;
				break;
			}
		}
	} else {
		/* sv_currsec points to one of the entries in sv_secinfo */
		if (svp->sv_currsec) {
			error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
			    &ch_client->cl_auth);
		} else {
			/* If it's null, use sv_secdata. */
			error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
			    &ch_client->cl_auth);
		}
	}
	nfs_rw_exit(&svp->sv_lock);

	return (error);
}

/*
 * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 *
 * Hands back an RPC client handle for the program/version described by
 * "ci" and the transport described by "svp", with an auth handle
 * obtained through authget().  A free handle cached in the per-zone
 * table nfscl->nfscl_chtable4 is reused when one exists; otherwise a
 * new one is created with clnt_tli_kcreate().
 *
 * On success returns 0 with *newcl set to the client handle and *chp
 * set to its chtab entry (to be given back through clfree4()).  On
 * failure returns an errno with *newcl and *chp left NULL: EINVAL for
 * NULL newcl/chp/ci, the clnt_tli_kcreate() or authget() error, or
 * EINTR when authget() reported success but left cl_auth NULL.
 */
int
clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	struct chhead *ch, *newch;
	struct chhead **plistp;
	struct chtab *cp;
	int error;
	k_sigset_t smask;

	if (newcl == NULL || chp == NULL || ci == NULL)
		return (EINVAL);

	*newcl = NULL;
	*chp = NULL;

	/*
	 * Find an unused handle or create one
	 */
	newch = NULL;
	nfscl->nfscl_stat.clgets.value.ui64++;
top:
	/*
	 * Find the correct entry in the cache to check for free
	 * client handles.  The search is based on the RPC program
	 * number, program version number, dev_t for the transport
	 * device, and the protocol family.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	/* plistp trails ch: it addresses the link that points at ch */
	plistp = &nfscl->nfscl_chtable4;
	for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
		if (ch->ch_prog == ci->cl_prog &&
		    ch->ch_vers == ci->cl_vers &&
		    ch->ch_dev == svp->sv_knconf->knc_rdev &&
		    (strcmp(ch->ch_protofmly,
		    svp->sv_knconf->knc_protofmly) == 0))
			break;
		plistp = &ch->ch_next;
	}

	/*
	 * If we didn't find a cache entry for this quadruple, then
	 * create one.  If we don't have one already preallocated,
	 * then drop the cache lock, create one, and then start over.
	 * If we did have a preallocated entry, then just add it to
	 * the front of the list.
	 */
	if (ch == NULL) {
		if (newch == NULL) {
			/*
			 * The KM_SLEEP allocations are done with the
			 * table lock dropped; the search is then redone
			 * from "top".
			 */
			mutex_exit(&nfscl->nfscl_chtable4_lock);
			newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
			newch->ch_timesused = 0;
			newch->ch_prog = ci->cl_prog;
			newch->ch_vers = ci->cl_vers;
			newch->ch_dev = svp->sv_knconf->knc_rdev;
			newch->ch_protofmly = kmem_alloc(
			    strlen(svp->sv_knconf->knc_protofmly) + 1,
			    KM_SLEEP);
			(void) strcpy(newch->ch_protofmly,
			    svp->sv_knconf->knc_protofmly);
			newch->ch_list = NULL;
			goto top;
		}
		/* newch is consumed by the table; don't free it below */
		ch = newch;
		newch = NULL;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	/*
	 * We found a cache entry, but if it isn't on the front of the
	 * list, then move it to the front of the list to try to take
	 * advantage of locality of operations.
	 */
	} else if (ch != nfscl->nfscl_chtable4) {
		*plistp = ch->ch_next;
		ch->ch_next = nfscl->nfscl_chtable4;
		nfscl->nfscl_chtable4 = ch;
	}

	/*
	 * If there was a free client handle cached, then remove it
	 * from the list, init it, and use it.
	 */
	if (ch->ch_list != NULL) {
		cp = ch->ch_list;
		ch->ch_list = cp->ch_list;
		mutex_exit(&nfscl->nfscl_chtable4_lock);
		/*
		 * A preallocated head that turned out to be unneeded
		 * (a matching entry appeared while the lock was
		 * dropped) is discarded here.
		 */
		if (newch != NULL) {
			kmem_free(newch->ch_protofmly,
			    strlen(newch->ch_protofmly) + 1);
			kmem_free(newch, sizeof (*newch));
		}
		(void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
		    &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);

		/*
		 * Get an auth handle.
		 */
		error = authget(svp, cp->ch_client, cr);
		if (error || cp->ch_client->cl_auth == NULL) {
			/*
			 * NOTE(review): the handle is destroyed here
			 * without the DEBUG clalloc decrement that the
			 * two failure paths further down perform --
			 * confirm whether the counter should be adjusted
			 * on this path as well.
			 */
			CLNT_DESTROY(cp->ch_client);
			kmem_cache_free(chtab4_cache, cp);
			return ((error != 0) ? error : EINTR);
		}
		ch->ch_timesused++;
		*newcl = cp->ch_client;
		*chp = cp;
		return (0);
	}

	/*
	 * There weren't any free client handles which fit, so allocate
	 * a new one and use that.
	 */
#ifdef DEBUG
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, 1);
#endif
	mutex_exit(&nfscl->nfscl_chtable4_lock);

	nfscl->nfscl_stat.cltoomany.value.ui64++;
	if (newch != NULL) {
		kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
		kmem_free(newch, sizeof (*newch));
	}

	cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
	cp->ch_head = ch;

	/* signals are adjusted (per MI4_INT) only around handle creation */
	sigintr(&smask, (int)ci->cl_flags & MI4_INT);
	error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
	    ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
	sigunintr(&smask);

	if (error != 0) {
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		/* undo the optimistic clalloc increment above */
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		/*
		 * Warning is unnecessary if error is EINTR.
		 */
		if (error != EINTR) {
			nfs_cmn_err(error, CE_WARN,
			    "clget: couldn't create handle: %m\n");
		}
		return (error);
	}
	(void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
	/* drop the auth the new handle came with; authget() supplies ours */
	auth_destroy(cp->ch_client->cl_auth);

	/*
	 * Get an auth handle.
	 */
	error = authget(svp, cp->ch_client, cr);
	if (error || cp->ch_client->cl_auth == NULL) {
		CLNT_DESTROY(cp->ch_client);
		kmem_cache_free(chtab4_cache, cp);
#ifdef DEBUG
		atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -1);
#endif
		return ((error != 0) ? error : EINTR);
	}
	ch->ch_timesused++;
	*newcl = cp->ch_client;
	ASSERT(cp->ch_client->cl_nosignal == FALSE);
	*chp = cp;
	return (0);
}

/*
 * Get a client handle for mount "mi" talking to server "svp".
 *
 * Builds the clinfo_t from the mount's settings and calls clget4(),
 * retrying while it fails with ETIMEDOUT or ECONNRESET -- but only for
 * hard, non-failover mounts whose filesystem/zone is still around.
 * Returns 0 on success, otherwise the clget4() error, or EIO when the
 * filesystem or zone is going away.
 */
static int
nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
    struct chtab **chp, struct nfs4_clnt *nfscl)
{
	clinfo_t ci;
	bool_t is_recov;
	int firstcall;
	int error;

	ci.cl_prog = mi->mi_prog;
	ci.cl_vers = mi->mi_vers;
	ci.cl_flags = mi->mi_flags;

	/*
	 * The read buffer size is rsize plus room for the RPC
	 * headers; a size of zero is passed through unchanged.
	 */
	ci.cl_readsize = mi->mi_tsize;
	if (ci.cl_readsize != 0)
		ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);

	/*
	 * A soft mount whose server is down gets exactly one try,
	 * i.e. no retransmissions.
	 */
	ci.cl_retrans = (!(mi->mi_flags & MI4_HARD) &&
	    (mi->mi_flags & MI4_DOWN)) ? 0 : mi->mi_retrans;

	/*
	 * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
	 * security flavor, the client tries to establish a security context
	 * by contacting the server. If the connection is timed out or reset,
	 * e.g. server reboot, we will try again.
	 */
	is_recov = (curthread == mi->mi_recovthread);

	for (firstcall = 1; ; firstcall = 0) {
		error = clget4(&ci, svp, cr, newcl, chp, nfscl);
		if (error == 0)
			break;

		/*
		 * On forced unmount or zone shutdown give up, except
		 * that the recovery thread is allowed one attempt.
		 */
		if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
		    (!is_recov || !firstcall)) {
			error = EIO;
			break;
		}

		/* soft mounts never retry */
		if (!(mi->mi_flags & MI4_HARD))
			break;

		/* failover is the caller's business */
		if (FAILOVER_MOUNT4(mi))
			break;

		/* only timeouts and connection resets are retried */
		if (error != ETIMEDOUT && error != ECONNRESET)
			break;
	}

	return (error);
}

/*
 * Give a client handle obtained from clget4() back to the cache.
 *
 * The handle's auth (if any) is released, the entry is stamped with
 * the current time, and it is pushed onto the front of its head's free
 * list under the table lock.
 */
void
clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
{
	struct chhead *head;

	if (cl->cl_auth != NULL) {
		sec_clnt_freeh(cl->cl_auth);
		cl->cl_auth = NULL;
	}

	/*
	 * Record when this entry was last used so that the reclaim
	 * code can tell how long it has been idle.
	 */
	cp->ch_freed = gethrestime_sec();

	/*
	 * Insert at the head of the free list, which keeps the list
	 * ordered from youngest to oldest.
	 */
	mutex_enter(&nfscl->nfscl_chtable4_lock);
	head = cp->ch_head;
	cp->ch_list = head->ch_list;
	head->ch_list = cp;
	mutex_exit(&nfscl->nfscl_chtable4_lock);
}

#define	CL_HOLDTIME	60	/* time to hold client handles */

/*
 * Destroy the cached client handles of one zone ("nfscl") that have
 * been sitting on a free list for at least cl_holdtime seconds.
 *
 * The stale entries are unlinked from the table while holding the
 * table lock and are destroyed after the lock has been dropped.
 */
static void
clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
{
	struct chhead *head;
	struct chtab *reclaim = NULL;	/* entries that can be reclaimed */
	struct chtab *stale;
	struct chtab *tail;
	struct chtab *next;
	struct chtab **linkp;
#ifdef DEBUG
	int n = 0;
	clstat4_debug.clreclaim.value.ui64++;
#endif

	mutex_enter(&nfscl->nfscl_chtable4_lock);

	/*
	 * Visit every quadruple and collect its stale entries on the
	 * reclaim list.  Each newly found run of stale entries goes on
	 * the front, with what was collected so far chained behind it,
	 * so when the walk is over the entries of the last quadruple
	 * visited lead the list.
	 */
	for (head = nfscl->nfscl_chtable4; head != NULL;
	    head = head->ch_next) {
		/*
		 * Free lists are kept youngest to oldest, so once the
		 * first entry older than cl_holdtime seconds is found
		 * every entry after it is old enough too.  linkp tracks
		 * the link that points at the entry being examined.
		 */
		linkp = &head->ch_list;
		for (stale = head->ch_list; stale != NULL &&
		    stale->ch_freed + cl_holdtime > gethrestime_sec();
		    stale = stale->ch_list)
			linkp = &stale->ch_list;

		if (stale == NULL)
			continue;

		/* cut the stale tail off this free list */
		*linkp = NULL;
		if (reclaim != NULL) {
			for (tail = stale; tail->ch_list != NULL;
			    tail = tail->ch_list)
				;
			tail->ch_list = reclaim;
		}
		reclaim = stale;
	}

	mutex_exit(&nfscl->nfscl_chtable4_lock);

	/*
	 * Nothing was collected, so there is nothing to do.
	 */
	if (reclaim == NULL)
		return;

	/*
	 * Walk the collected entries, destroying each client handle
	 * and returning the entry to its kmem cache.
	 */
	while (reclaim != NULL) {
#ifdef DEBUG
		n++;
#endif
		CLNT_DESTROY(reclaim->ch_client);
		next = reclaim->ch_list;
		kmem_cache_free(chtab4_cache, reclaim);
		reclaim = next;
	}

#ifdef DEBUG
	/*
	 * Adjust clalloc so that nfsstat reports the current number
	 * of allocated client handles.
	 */
	atomic_add_64(&nfscl->nfscl_stat.clalloc.value.ui64, -n);
#endif
}

/*
 * Memory is tight: run clreclaim4_zone() with the CL_HOLDTIME
 * threshold over the client data of every zone on the global list.
 * The list lock is held for the whole walk; the argument is unused.
 */
/* ARGSUSED */
static void
clreclaim4(void *all)
{
	struct nfs4_clnt *zone_clnt;

	mutex_enter(&nfs4_clnt_list_lock);
	for (zone_clnt = list_head(&nfs4_clnt_list); zone_clnt != NULL;
	    zone_clnt = list_next(&nfs4_clnt_list, zone_clnt)) {
		clreclaim4_zone(zone_clnt, CL_HOLDTIME);
	}
	mutex_exit(&nfs4_clnt_list_lock);
}

/*
 * Minimum time-out values indexed by call type
 * These units are in "eighths" of a second to avoid multiplies
 */
static unsigned int minimum_timeo[] = {
	6, 7, 10
};

/*
 * Shortened wait used by nfs4_rfscall() during zone shutdown, where it
 * is combined with mi_timeo via MIN() and so shares its unit.
 */
#define	SHORTWAIT	(NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in clock
 * ticks (20 * hz).  backoff() doubles a timeout that is still below
 * MAXTIMO, capping the result at MAXTIMO, and leaves a timeout that has
 * already reached MAXTIMO unchanged.  Both macros evaluate their
 * argument more than once.
 */
#define	MAXTIMO	(20*hz)
#define	backoff(tim)	(((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define	dobackoff(tim)	((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))

static int
nfs4_rfscall(mntinfo4_t *mi, rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
    xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
    enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
{
	CLIENT *client;
	struct chtab *ch;
	cred_t *cr = icr;
	struct rpc_err rpcerr, rpcerr_tmp;
	enum clnt_stat status;
	int error;
	struct timeval wait;
	int timeo;		/* in units of hz */
	bool_t tryagain, is_recov;
	bool_t cred_cloned = FALSE;
	k_sigset_t smask;
	servinfo4_t *svp;
#ifdef DEBUG
	char *bufp;
#endif
	int firstcall;

	rpcerr.re_status = RPC_SUCCESS;

	/*
	 * If we know that we are rebooting then let's
	 * not bother with doing any over the wireness.
	 */
	mutex_enter(&mi->mi_lock);
	if (mi->mi_flags & MI4_SHUTDOWN) {
		mutex_exit(&mi->mi_lock);
		return (EIO);
	}
	mutex_exit(&mi->mi_lock);

	/* For TSOL, use a new cred which has net_mac_aware flag */
	if (!cred_cloned && is_system_labeled()) {
		cred_cloned = TRUE;
		cr = crdup(icr);
		(void) setpflags(NET_MAC_AWARE, 1, cr);
	}

	/*
	 * clget() calls clnt_tli_kinit() which clears the xid, so we
	 * are guaranteed to reprocess the retry as a new request.
	 */
	svp = mi->mi_curr_serv;
	rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
	if (rpcerr.re_errno != 0)
		return (rpcerr.re_errno);

	timeo = (mi->mi_timeo * hz) / 10;

	/*
	 * If hard mounted fs, retry call forever unless hard error
	 * occurs.
	 *
	 * For forced unmount, let the recovery thread through but return
	 * an error for all others.  This is so that user processes can
	 * exit quickly.  The recovery thread bails out after one
	 * transmission so that it can tell if it needs to continue.
	 *
	 * For zone shutdown, behave as above to encourage quick
	 * process exit, but also fail quickly when servers have
	 * timed out before and reduce the timeouts.
	 */
	is_recov = (curthread == mi->mi_recovthread);
	firstcall = 1;
	do {
		tryagain = FALSE;

		NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
		    "nfs4_rfscall: vfs_flag=0x%x, %s",
		    mi->mi_vfsp->vfs_flag,
		    is_recov ? "recov thread" : "not recov thread"));

		/*
		 * It's possible while we're retrying the admin
		 * decided to reboot.
		 */
		mutex_enter(&mi->mi_lock);
		if (mi->mi_flags & MI4_SHUTDOWN) {
			mutex_exit(&mi->mi_lock);
			clfree4(client, ch, nfscl);
			if (cred_cloned)
				crfree(cr);
			return (EIO);
		}
		mutex_exit(&mi->mi_lock);

		if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
		    (!is_recov || !firstcall)) {
			clfree4(client, ch, nfscl);
			if (cred_cloned)
				crfree(cr);
			return (EIO);
		}

		if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
			mutex_enter(&mi->mi_lock);
			if ((mi->mi_flags & MI4_TIMEDOUT) ||
			    !is_recov || !firstcall) {
				mutex_exit(&mi->mi_lock);
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (EIO);
			}
			mutex_exit(&mi->mi_lock);
			timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
		}

		firstcall = 0;
		TICK_TO_TIMEVAL(timeo, &wait);

		/*
		 * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
		 * and SIGTERM. (Preserving the existing masks).
		 * Mask out SIGINT if mount option nointr is specified.
		 */
		sigintr(&smask, (int)mi->mi_flags & MI4_INT);
		if (!(mi->mi_flags & MI4_INT))
			client->cl_nosignal = TRUE;

		/*
		 * If there is a current signal, then don't bother
		 * even trying to send out the request because we
		 * won't be able to block waiting for the response.
		 * Simply assume RPC_INTR and get on with it.
		 */
		if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
			status = RPC_INTR;
		else {
			status = CLNT_CALL(client, which, xdrargs, argsp,
			    xdrres, resp, wait);
		}

		if (!(mi->mi_flags & MI4_INT))
			client->cl_nosignal = FALSE;
		/*
		 * restore original signal mask
		 */
		sigunintr(&smask);

		switch (status) {
		case RPC_SUCCESS:
			break;

		case RPC_INTR:
			/*
			 * There is no way to recover from this error,
			 * even if mount option nointr is specified.
			 * SIGKILL, for example, cannot be blocked.
			 */
			rpcerr.re_status = RPC_INTR;
			rpcerr.re_errno = EINTR;
			break;

		case RPC_UDERROR:
			/*
			 * If the NFS server is local (vold) and
			 * it goes away then we get RPC_UDERROR.
			 * This is a retryable error, so we would
			 * loop, so check to see if the specific
			 * error was ECONNRESET, indicating that
			 * target did not exist at all.  If so,
			 * return with RPC_PROGUNAVAIL and
			 * ECONNRESET to indicate why.
			 */
			CLNT_GETERR(client, &rpcerr);
			if (rpcerr.re_errno == ECONNRESET) {
				rpcerr.re_status = RPC_PROGUNAVAIL;
				rpcerr.re_errno = ECONNRESET;
				break;
			}
			/*FALLTHROUGH*/

		default:		/* probably RPC_TIMEDOUT */

			if (IS_UNRECOVERABLE_RPC(status))
				break;

			/*
			 * increment server not responding count
			 */
			mutex_enter(&mi->mi_lock);
			mi->mi_noresponse++;
			mutex_exit(&mi->mi_lock);
#ifdef DEBUG
			nfscl->nfscl_stat.noresponse.value.ui64++;
#endif
			/*
			 * On zone shutdown, mark server dead and move on.
			 */
			if (zone_status_get(curproc->p_zone) >=
			    ZONE_IS_SHUTTING_DOWN) {
				mutex_enter(&mi->mi_lock);
				mi->mi_flags |= MI4_TIMEDOUT;
				mutex_exit(&mi->mi_lock);
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				return (EIO);
			}

			/*
			 * NFS client failover support:
			 * return and let the caller take care of
			 * failover.  We only return for failover mounts
			 * because otherwise we want the "not responding"
			 * message, the timer updates, etc.
			 */
			if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
			    (error = try_failover(status)) != 0) {
				clfree4(client, ch, nfscl);
				if (cred_cloned)
					crfree(cr);
				*rpc_statusp = status;
				return (error);
			}

			if (flags & RFSCALL_SOFT)
				break;

			tryagain = TRUE;

			/*
			 * The call is in progress (over COTS).
			 * Try the CLNT_CALL again, but don't
			 * print a noisy error message.
			 */
			if (status == RPC_INPROGRESS)
				break;

			timeo = backoff(timeo);
			CLNT_GETERR(client, &rpcerr_tmp);

			mutex_enter(&mi->mi_lock);
			if (!(mi->mi_flags & MI4_PRINTED)) {
				mi->mi_flags |= MI4_PRINTED;
				mutex_exit(&mi->mi_lock);
				if ((status == RPC_CANTSEND) &&
				    (rpcerr_tmp.re_errno == ENOBUFS))
					nfs4_queue_fact(RF_SENDQ_FULL, mi, 0,
					    0, 0, FALSE, NULL, 0, NULL);
				else
					nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
					    0, 0, 0, FALSE, NULL, 0, NULL);
			} else
				mutex_exit(&mi->mi_lock);

			if (*doqueue && nfs_has_ctty()) {
				*doqueue = 0;
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					if ((status == RPC_CANTSEND) &&
					    (rpcerr_tmp.re_errno == ENOBUFS))
						nfs4_queue_fact(RF_SENDQ_FULL,
						    mi, 0, 0, 0, FALSE, NULL,
						    0, NULL);
					else
						nfs4_queue_fact(
						    RF_SRV_NOT_RESPOND, mi, 0,
						    0, 0, FALSE, NULL, 0, NULL);
				}
			}
		}
	} while (tryagain);

	DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
	    int, rpcerr.re_errno);

	if (status != RPC_SUCCESS) {
		zoneid_t zoneid = mi->mi_zone->zone_id;

		/*
		 * Let soft mounts use the timed out message.
		 */
		if (status == RPC_INPROGRESS)
			status = RPC_TIMEDOUT;
		nfscl->nfscl_stat.badcalls.value.ui64++;
		if (status != RPC_INTR) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags |= MI4_DOWN;
			mutex_exit(&mi->mi_lock);
			CLNT_GETERR(client, &rpcerr);
#ifdef DEBUG
			bufp = clnt_sperror(client, svp->sv_hostname);
			zprintf(zoneid, "NFS%d %s failed for %s\n",
			    mi->mi_vers, mi->mi_rfsnames[which], bufp);
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					uprintf("NFS%d %s failed for %s\n",
					    mi->mi_vers, mi->mi_rfsnames[which],
					    bufp);
				}
			}
			kmem_free(bufp, MAXPATHLEN);
#else
			zprintf(zoneid,
			    "NFS %s failed for server %s: error %d (%s)\n",
			    mi->mi_rfsnames[which], svp->sv_hostname,
			    status, clnt_sperrno(status));
			if (nfs_has_ctty()) {
				if (!(mi->mi_flags & MI4_NOPRINT)) {
					uprintf(
				"NFS %s failed for server %s: error %d (%s)\n",
					    mi->mi_rfsnames[which],
					    svp->sv_hostname, status,
					    clnt_sperrno(status));
				}
			}
#endif
			/*
			 * when CLNT_CALL() fails with RPC_AUTHERROR,
			 * re_errno is set appropriately depending on
			 * the authentication error
			 */
			if (status == RPC_VERSMISMATCH ||
			    status == RPC_PROGVERSMISMATCH)
				rpcerr.re_errno = EIO;
		}
	} else {
		/*
		 * Test the value of mi_down and mi_printed without
		 * holding the mi_lock mutex.  If they are both zero,
		 * then it is okay to skip the down and printed
		 * processing.  This saves on a mutex_enter and
		 * mutex_exit pair for a normal, successful RPC.
		 * This was just complete overhead.
		 */
		if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
			mutex_enter(&mi->mi_lock);
			mi->mi_flags &= ~MI4_DOWN;
			if (mi->mi_flags & MI4_PRINTED) {
				mi->mi_flags &= ~MI4_PRINTED;
				mutex_exit(&mi->mi_lock);
				if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
					nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
					    0, FALSE, NULL, 0, NULL);
			} else
				mutex_exit(&mi->mi_lock);
		}

		if (*doqueue == 0) {
			if (!(mi->mi_flags & MI4_NOPRINT) &&
			    !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
				nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
				    FALSE, NULL, 0, NULL);

			*doqueue = 1;
		}
	}

	clfree4(client, ch, nfscl);
	if (cred_cloned)
		crfree(cr);

	ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);

	TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
	    rpcerr.re_errno);

	*rpc_statusp = status;
	return (rpcerr.re_errno);
}

/*
 * rfs4call - general wrapper for RPC calls initiated by the client
 *
 * Bumps the per-zone and per-mount COMPOUND call counters, primes *resp
 * for XDR decoding and hands the request to nfs4_rfscall().  The outcome
 * is reported through *ep:
 *	ep->error	value returned by nfs4_rfscall() (0 on success)
 *	ep->stat	resp->status
 *	ep->rpc_status	RPC status reported back by nfs4_rfscall()
 *
 * When nfs4_rfscall() returns 0, every decoded result op whose number lies
 * in [NFSPROC4_NULL, OP_WRITE] is also counted in mi->mi_reqs[].
 */
void
rfs4call(mntinfo4_t *mi, COMPOUND4args_clnt *argsp, COMPOUND4res_clnt *resp,
    cred_t *cr, int *doqueue, int flags, nfs4_error_t *ep)
{
	int i, error;
	/* This is an RPC status, so start from the RPC "success" value. */
	enum clnt_stat rpc_status = RPC_SUCCESS;
	int num_resops;
	struct nfs4_clnt *nfscl;

	ASSERT(nfs_zone() == mi->mi_zone);
	nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
	ASSERT(nfscl != NULL);

	nfscl->nfscl_stat.calls.value.ui64++;
	mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;

	/* Set up the results struct for XDR usage */
	resp->argsp = argsp;
	resp->array = NULL;
	resp->status = 0;
	resp->decode_len = 0;

	error = nfs4_rfscall(mi, NFSPROC4_COMPOUND,
	    xdr_COMPOUND4args_clnt, (caddr_t)argsp,
	    xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
	    doqueue, &rpc_status, flags, nfscl);

	/* Return now if it was an RPC error */
	if (error) {
		ep->error = error;
		ep->stat = resp->status;
		ep->rpc_status = rpc_status;
		return;
	}

	/* else we'll count the processed operations */
	num_resops = resp->decode_len;
	for (i = 0; i < num_resops; i++) {
		/*
		 * Count the individual operations
		 * processed by the server.  Op numbers outside the
		 * checked range are skipped rather than counted.
		 */
		if (resp->array[i].resop >= NFSPROC4_NULL &&
		    resp->array[i].resop <= OP_WRITE)
			mi->mi_reqs[resp->array[i].resop].value.ui64++;
	}

	ep->error = 0;
	ep->stat = resp->status;
	ep->rpc_status = rpc_status;
}

/*
 * nfs4rename_update - bring stored state up to date after a rename.
 *
 * The shared filehandle of the renamed vnode (renvp) is updated with
 * *nfh4p, and the vnode's stored name is handed to fn_move() together
 * with the new directory's name and the new component name nnm.
 */
void
nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
{
	nfs4_sharedfh_t *ren_sfh;
	nfs4_fname_t *ren_name;
	nfs4_fname_t *ndir_name;

	/* filehandle first ... */
	ren_sfh = VTOR4(renvp)->r_fh;
	sfh4_update(ren_sfh, nfh4p);

	/* ... then the name */
	ren_name = VTOSV(renvp)->sv_name;
	ndir_name = VTOSV(ndvp)->sv_name;
	fn_move(ren_name, ndir_name, nnm);
}

/*
 * Routine to look up the filehandle for the given path and rootvp.
 *
 * filetype is one of the RML_* values below; it selects which attributes
 * the lookup asks for and which compound tag is used.  Any other value
 * fails with ep->error = EINVAL before anything is sent.
 *
 * Nothing is returned directly; results are reported through *ep:
 * - success: ep->error is zero and ep->stat is NFS4_OK.  *fhp receives a
 *   kmem_alloc'd copy of the object's filehandle and, when garp is
 *   non-NULL and the last result op is a GETATTR, *garp receives its
 *   attributes.  When the reply holds more than five result ops and the
 *   caller supplied pfhp, *pfhp and *pgarp are filled in the same way for
 *   the parent.
 * - error: ep->error (errno value) and/or ep->stat is set appropriately.
 *   A malformed reply is reported as NFS4ERR_SERVERFAULT in ep->stat.
 *
 * The buffers stored in *fhp and *pfhp belong to the caller.  *fhp may
 * already have been filled in when a later part of the reply is rejected.
 */
#define	RML_ORDINARY	1
#define	RML_NAMED_ATTR	2
#define	RML_ATTRDIR	3

static void
remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
    int filetype, cred_t *cr,
    nfs_fh4 *fhp, nfs4_ga_res_t *garp,		/* fh, attrs for object */
    nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,	/* fh, attrs for parent */
    nfs4_error_t *ep)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int num_argops;
	lookup4_param_t lookuparg;
	nfs_fh4 *tmpfhp;
	int doqueue = 1;
	char *path;
	mntinfo4_t *mi;

	ASSERT(fname != NULL);
	ASSERT(rootvp->v_type == VDIR);

	mi = VTOMI4(rootvp);
	switch (filetype) {
	case RML_NAMED_ATTR:
		lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
		args.ctag = TAG_REMAP_LOOKUP_NA;
		break;
	case RML_ATTRDIR:
		lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
		args.ctag = TAG_REMAP_LOOKUP_AD;
		break;
	case RML_ORDINARY:
		lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
		args.ctag = TAG_REMAP_LOOKUP;
		break;
	default:
		ep->error = EINVAL;
		return;
	}

	/*
	 * The path string is only built once the filetype is known to be
	 * valid, so the EINVAL return above has nothing to free.
	 */
	path = fn_path(fname);

	lookuparg.argsp = &args;
	lookuparg.resp = &res;
	lookuparg.header_len = 1;	/* Putfh */
	lookuparg.trailer_len = 0;
	lookuparg.ga_bits = NFS4_VATTR_MASK;
	lookuparg.mi = mi;

	(void) nfs4lookup_setup(path, &lookuparg, 1);

	/* 0: putfh directory */
	argop = args.array;
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;

	num_argops = args.array_len;

	rfs4call(mi, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);

	if (ep->error || res.status != NFS4_OK)
		goto exit;

	/* get the object filehandle */
	resop = &res.array[res.array_len - 2];
	if (resop->resop != OP_GETFH) {
		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
	nfs_fh4_copy(tmpfhp, fhp);

	/* get the object attributes */
	resop = &res.array[res.array_len - 1];
	if (garp && resop->resop == OP_GETATTR)
		*garp = resop->nfs_resop4_u.opgetattr.ga_res;

	/*
	 * See if there are enough fields in the response for parent info.
	 * A caller that passed no parent filehandle slot does not want the
	 * parent information, so there is nowhere to store it either.
	 */
	if (pfhp == NULL || (int)res.array_len - 5 <= 0)
		goto exit;

	/* get the parent filehandle */
	resop = &res.array[res.array_len - 5];
	if (resop->resop != OP_GETFH) {
		nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
		    0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
	if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
		nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
		    tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
		    TAG_NONE, 0, 0);
		ep->stat = NFS4ERR_SERVERFAULT;
		goto exit;
	}
	pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
	nfs_fh4_copy(tmpfhp, pfhp);

	/* get the parent attributes */
	resop = &res.array[res.array_len - 4];
	if (pgarp && resop->resop == OP_GETATTR)
		*pgarp = resop->nfs_resop4_u.opgetattr.ga_res;

exit:
	/*
	 * It is too hard to remember where all the OP_LOOKUPs are
	 */
	nfs4args_lookup_free(argop, num_argops);
	kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));

	/* the decoded reply is only freed when the call itself succeeded */
	if (!ep->error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(path, strlen(path)+1);
}

/*
 * NFS client failover / volatile filehandle support
 *
 * Recover the filehandle for the given rnode.
 *
 * The root vnode is handed to nfs4_remap_root().  For anything else the
 * name stored in the rnode is looked up again relative to the root vnode
 * with remap_lookup(), and the result is used to update the rnode's
 * filehandle and attributes (and the parent's, when the reply carried
 * them).  If NFS4_REMAP_CKATTRS is set in flags, a change in vnode type
 * or (for non-directories) size makes the recovery fail instead.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */

void
nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
{
	int is_stub;
	rnode4_t *rp = VTOR4(vp);
	vnode_t *rootvp = NULL;
	vnode_t *dvp = NULL;
	cred_t *cr, *cred_otw;
	nfs4_ga_res_t gar, pgar;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	int filetype = RML_ORDINARY;
	nfs4_recov_state_t recov = {NULL, 0, 0};
	int badfhcount = 0;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remapping %s", rnode4info(rp)));
	ASSERT(nfs4_consistent_type(vp));

	if (vp->v_flag & VROOT) {
		nfs4_remap_root(mi, ep, flags);
		return;
	}

	/*
	 * Given the root fh, use the path stored in
	 * the rnode to find the fh for the new server.
	 */
	ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (ep->error != 0)
		return;

	cr = curthread->t_cred;
	ASSERT(cr != NULL);
get_remap_cred:
	/*
	 * Releases the osp, if it is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	ASSERT(cred_otw != NULL);

	if (rp->r_flags & R4ISXATTR) {
		filetype = RML_NAMED_ATTR;
		/*
		 * The result of vtodv() is deliberately not checked; dvp is
		 * simply used below if it ends up non-NULL.
		 */
		(void) vtodv(vp, &dvp, cred_otw, FALSE);
	}

	if (vp->v_flag & V_XATTRDIR) {
		filetype = RML_ATTRDIR;
	}

	if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
		/* file mount, doesn't need a remap */
		goto done;
	}

again:
	remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
	    &newfh, &gar, &newpfh, &pgar, ep);

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_remap_file: remap_lookup returned %d/%d",
	    ep->error, ep->stat));

	if (last_time == FALSE && ep->error == EACCES) {
		crfree(cred_otw);
		if (dvp != NULL) {
			VN_RELE(dvp);
			/*
			 * Forget the released vnode: the retry calls
			 * vtodv() again without checking its result, and a
			 * stale pointer would be released a second time at
			 * "done".
			 */
			dvp = NULL;
		}
		goto get_remap_cred;
	}
	if (ep->error != 0)
		goto done;

	switch (ep->stat) {
	case NFS4_OK:
		badfhcount = 0;
		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
			mutex_enter(&rp->r_statelock);
			rp->r_delay_interval = 0;
			mutex_exit(&rp->r_statelock);
			uprintf("NFS File Available..\n");
		}
		break;
	case NFS4ERR_FHEXPIRED:
	case NFS4ERR_BADHANDLE:
	case NFS4ERR_STALE:
		/*
		 * If we ran into filehandle problems, we should try to
		 * remap the root vnode first and hope life gets better.
		 * But we need to avoid loops.
		 */
		if (badfhcount++ > 0)
			goto done;
		if (newfh.nfs_fh4_len != 0) {
			kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
			newfh.nfs_fh4_len = 0;
		}
		if (newpfh.nfs_fh4_len != 0) {
			kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
			newpfh.nfs_fh4_len = 0;
		}
		/* relative path - remap rootvp then retry */
		VN_RELE(rootvp);
		rootvp = NULL;
		nfs4_remap_root(mi, ep, flags);
		if (ep->error != 0 || ep->stat != NFS4_OK)
			goto done;
		ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_DELAY:
		badfhcount = 0;
		nfs4_set_delay_wait(vp);
		ep->error = nfs4_wait_for_delay(vp, &recov);
		if (ep->error != 0)
			goto done;
		goto again;
	case NFS4ERR_ACCESS:
		/* get new cred, try again */
		if (last_time == TRUE)
			goto done;
		if (dvp != NULL) {
			VN_RELE(dvp);
			/* see the EACCES retry above */
			dvp = NULL;
		}
		crfree(cred_otw);
		goto get_remap_cred;
	default:
		goto done;
	}

	/*
	 * Check on the new and old rnodes before updating;
	 * if the vnode type or size changes, issue a warning
	 * and mark the file dead.
	 */
	mutex_enter(&rp->r_statelock);
	if (flags & NFS4_REMAP_CKATTRS) {
		if (vp->v_type != gar.n4g_va.va_type ||
		    (vp->v_type != VDIR &&
		    rp->r_size != gar.n4g_va.va_size)) {
			NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
			    "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
			    (int)rp->r_size, (int)gar.n4g_va.va_size,
			    vp->v_type, gar.n4g_va.va_type));
			mutex_exit(&rp->r_statelock);
			nfs4_queue_event(RE_FILE_DIFF, mi,
			    rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
			    TAG_NONE, TAG_NONE, 0, 0);
			nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
			goto done;
		}
	}
	ASSERT(gar.n4g_va.va_type != VNON);
	rp->r_server = mi->mi_curr_serv;

	/*
	 * Turn this object into a "stub" object if we
	 * crossed an underlying server fs boundary.
	 *
	 * This stub will be for a mirror-mount.
	 * A referral would look like a boundary crossing
	 * as well, but would not be the same type of object,
	 * so we would expect to mark the object dead.
	 *
	 * See comment in r4_do_attrcache() for more details.
	 */
	is_stub = 0;
	if (gar.n4g_fsid_valid) {
		(void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
		rp->r_srv_fsid = gar.n4g_fsid;
		if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
			is_stub = 1;
		nfs_rw_exit(&rp->r_server->sv_lock);
#ifdef DEBUG
	} else {
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "remap_file: fsid attr not provided by server.  rp=%p",
		    (void *)rp));
#endif
	}
	if (is_stub)
		r4_stub_mirrormount(rp);
	else
		r4_stub_none(rp);
	mutex_exit(&rp->r_statelock);
	nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
	sfh4_update(rp->r_fh, &newfh);
	ASSERT(nfs4_consistent_type(vp));

	/*
	 * If we got parent info, use it to update the parent
	 */
	if (newpfh.nfs_fh4_len != 0) {
		if (rp->r_svnode.sv_dfh != NULL)
			sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
		if (dvp != NULL) {
			/* force update of attrs */
			nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
		}
	}
done:
	/* common exit: drop every buffer and hold still owned here */
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (cred_otw != NULL)
		crfree(cred_otw);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	if (dvp != NULL)
		VN_RELE(dvp);
	if (osp != NULL)
		open_stream_rele(osp, rp);
}

/*
 * Client-side failover support: remap the filehandle for vp if it appears
 * necessary.  A remap is attempted only when all of the following hold:
 *	- vp is non-NULL,
 *	- vp's filesystem has VFS_RDONLY set,
 *	- the server recorded in vp's rnode differs from mi->mi_curr_serv.
 * Otherwise the routine does nothing and *ep is left untouched.
 *
 * Errors are returned via the nfs4_error_t parameter; though, if there is
 * a problem, we will just try again later.
 */

void
nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
{
	if (vp != NULL &&
	    (vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
	    VTOR4(vp)->r_server != mi->mi_curr_serv)
		nfs4_remap_file(mi, vp, flags, ep);
}

/*
 * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
 *
 * Our caller has a filehandle for ".." relative to a particular
 * directory object.  We want to find or create a parent vnode
 * with that filehandle and return it.  We can of course create
 * a vnode from this filehandle, but we need to also make sure
 * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
 * that we have a parent FH for future reopens as well.  If
 * we have a remap failure, we won't be able to reopen this
 * file, but we won't treat that as fatal because a reopen
 * is at least unlikely.  Someday nfs4_reopen() should look
 * for a missing parent FH and try a remap to recover from it.
 *
 * need_start_op argument indicates whether this function should
 * do a start_op before calling remap_lookup().  This should
 * be FALSE, if you are the recovery thread or in an op; otherwise,
 * set it to TRUE.
 *
 * Returns 0 with the vnode stored in *vpp, or an errno value, in which
 * case *vpp is set to NULL.
 */
int
nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
    cred_t *cr, vnode_t **vpp, int need_start_op)
{
	mntinfo4_t *mi = VTOMI4(dvp);
	nfs4_fname_t *np = NULL, *pnp = NULL;
	vnode_t *vp = NULL, *rootvp = NULL;
	rnode4_t *rp;
	nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
	nfs4_ga_res_t gar, pgar;
	vattr_t va, pva;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
	nfs4_recov_state_t recov_state;

#ifdef DEBUG
	/*
	 * ensure need_start_op is correct
	 */
	{
		int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
		    (curthread == mi->mi_recovthread));
		/* C needs a ^^ operator! */
		ASSERT(((need_start_op) && (!no_need_start_op)) ||
		    ((! need_start_op) && (no_need_start_op)));
	}
#endif
	ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());

	NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
	    "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
	    rnode4info(VTOR4(dvp))));

	/*
	 * rootvp might be needed eventually. Holding it now will
	 * ensure that r4find_unlocked() will find it, if ".." is the root.
	 */
	e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
	if (e.error != 0)
		goto out;
	rp = r4find_unlocked(fhp, mi->mi_vfsp);
	if (rp != NULL) {
		*vpp = RTOV4(rp);
		VN_RELE(rootvp);
		return (0);
	}

	/*
	 * Since we don't have the rnode, we have to go over the wire.
	 * remap_lookup() can get all of the filehandles and attributes
	 * we need in one operation.
	 */
	np = fn_parent(VTOSV(dvp)->sv_name);
	/* if a parent was not found return an error */
	if (np == NULL) {
		e.error = ENOENT;
		goto out;
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry:
	/*
	 * A retry repeats both lookups from scratch.  Drop whatever an
	 * earlier pass collected first; the lookups below would otherwise
	 * overwrite (and leak) the filehandle buffers and the pnp hold.
	 * On the first pass all three are empty and this is a no-op.
	 */
	if (newfh.nfs_fh4_len != 0) {
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
		newfh.nfs_fh4_val = NULL;
		newfh.nfs_fh4_len = 0;
	}
	if (newpfh.nfs_fh4_len != 0) {
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
		newpfh.nfs_fh4_val = NULL;
		newpfh.nfs_fh4_len = 0;
	}
	if (pnp != NULL) {
		fn_rele(&pnp);
		pnp = NULL;
	}

	if (need_start_op) {
		e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
		    &recov_state, NULL);
		if (e.error != 0) {
			goto out;
		}
	}

	/* VNON marks "no attributes returned" for the checks below */
	pgar.n4g_va.va_type = VNON;
	gar.n4g_va.va_type = VNON;

	remap_lookup(np, rootvp, RML_ORDINARY, cr,
	    &newfh, &gar, &newpfh, &pgar, &e);
	if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
		if (need_start_op) {
			bool_t abort;

			abort = nfs4_start_recovery(&e, mi,
			    rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL, NULL,
			    NULL);
			if (abort) {
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);
			goto recov_retry;
		}
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	va = gar.n4g_va;
	pva = pgar.n4g_va;

	if ((e.error != 0) ||
	    (va.va_type != VDIR)) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		if (e.error == 0)
			e.error = EIO;
		goto out;
	}

	if (e.stat != NFS4_OK) {
		if (need_start_op)
			nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
			    &recov_state, FALSE);
		e.error = EIO;
		goto out;
	}

	/*
	 * It is possible for remap_lookup() to return with no error,
	 * but without providing the parent filehandle and attrs.
	 */
	if (pva.va_type != VDIR) {
		/*
		 * Call remap_lookup() again, this time with the
		 * newpfh and pgar args in the first position.
		 */
		pnp = fn_parent(np);
		if (pnp != NULL) {
			/*
			 * The first lookup may have stored a parent
			 * filehandle even though the attributes did not
			 * check out; release it before it is overwritten.
			 */
			if (newpfh.nfs_fh4_len != 0) {
				kmem_free(newpfh.nfs_fh4_val,
				    newpfh.nfs_fh4_len);
				newpfh.nfs_fh4_val = NULL;
				newpfh.nfs_fh4_len = 0;
			}

			remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
			    &newpfh, &pgar, NULL, NULL, &e);
			/*
			 * This remap_lookup call modifies pgar. The following
			 * line prevents trouble when checking the va_type of
			 * pva later in this code.
			 */
			pva = pgar.n4g_va;

			if (nfs4_needs_recovery(&e, FALSE,
			    mi->mi_vfsp)) {
				if (need_start_op) {
					bool_t abort;

					abort = nfs4_start_recovery(&e, mi,
					    rootvp, NULL, NULL, NULL,
					    OP_LOOKUP, NULL, NULL, NULL);
					if (abort) {
						nfs4_end_fop(mi, rootvp, NULL,
						    OH_LOOKUP, &recov_state,
						    FALSE);
						if (e.error == 0)
							e.error = EIO;
						goto out;
					}
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, TRUE);
					goto recov_retry;
				}
				if (e.error == 0)
					e.error = EIO;
				goto out;
			}

			if (e.stat != NFS4_OK) {
				if (need_start_op)
					nfs4_end_fop(mi, rootvp, NULL,
					    OH_LOOKUP, &recov_state, FALSE);
				e.error = EIO;
				goto out;
			}
		}
		if ((pnp == NULL) ||
		    (e.error != 0) ||
		    (pva.va_type == VNON)) {
			if (need_start_op)
				nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			if (e.error == 0)
				e.error = EIO;
			goto out;
		}
	}
	ASSERT(newpfh.nfs_fh4_len != 0);
	if (need_start_op)
		nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
	psfh = sfh4_get(&newpfh, mi);

	sfh = sfh4_get(&newfh, mi);
	vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);

out:
	/* common exit: release every name, buffer and hold still owned */
	if (np != NULL)
		fn_rele(&np);
	if (pnp != NULL)
		fn_rele(&pnp);
	if (newfh.nfs_fh4_len != 0)
		kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
	if (newpfh.nfs_fh4_len != 0)
		kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
	if (sfh != NULL)
		sfh4_rele(&sfh);
	if (psfh != NULL)
		sfh4_rele(&psfh);
	if (rootvp != NULL)
		VN_RELE(rootvp);
	*vpp = vp;
	return (e.error);
}

#ifdef DEBUG
/*
 * DEBUG-only global, initialised to zero.  Nothing in this part of the
 * file reads or updates it.
 * NOTE(review): the name suggests a running total of memory used for
 * rnode path strings -- confirm against its users before relying on that.
 */
size_t r_path_memuse = 0;
#endif

/*
 * NFS client failover support
 *
 * sv4_free() tears down a list of servinfo4_t structures.  Starting at
 * svp and following sv_next, each element has its security data, saved
 * and current secinfo, hostname, both knetconfig copies, address buffer
 * and path released, its sv_lock destroyed, and finally the element
 * itself freed.  A NULL svp is a no-op.
 */
void
sv4_free(servinfo4_t *svp)
{
	servinfo4_t *cur, *following;
	struct knetconfig *kncs[2];
	struct knetconfig *knc;
	int k;

	for (cur = svp; cur != NULL; cur = following) {
		/* grab the link before the element goes away */
		following = cur->sv_next;

		if (cur->sv_dhsec)
			sec_clnt_freeinfo(cur->sv_dhsec);
		if (cur->sv_secdata)
			sec_clnt_freeinfo(cur->sv_secdata);

		/* the saved secinfo may alias the current one; free once */
		if (cur->sv_save_secinfo &&
		    cur->sv_save_secinfo != cur->sv_secinfo)
			secinfo_free(cur->sv_save_secinfo);
		if (cur->sv_secinfo)
			secinfo_free(cur->sv_secinfo);

		if (cur->sv_hostname && cur->sv_hostnamelen > 0)
			kmem_free(cur->sv_hostname, cur->sv_hostnamelen);

		/* current and original knetconfig get identical treatment */
		kncs[0] = cur->sv_knconf;
		kncs[1] = cur->sv_origknconf;
		for (k = 0; k < 2; k++) {
			knc = kncs[k];
			if (knc == NULL)
				continue;
			if (knc->knc_protofmly != NULL)
				kmem_free(knc->knc_protofmly, KNC_STRSIZE);
			if (knc->knc_proto != NULL)
				kmem_free(knc->knc_proto, KNC_STRSIZE);
			kmem_free(knc, sizeof (*knc));
		}

		if (cur->sv_addr.buf != NULL && cur->sv_addr.maxlen != 0)
			kmem_free(cur->sv_addr.buf, cur->sv_addr.maxlen);
		if (cur->sv_path != NULL)
			kmem_free(cur->sv_path, cur->sv_pathlen);

		nfs_rw_destroy(&cur->sv_lock);
		kmem_free(cur, sizeof (*cur));
	}
}

/*
 * nfs4_printfhandle - log the contents of a filehandle.
 *
 * The handle is formatted as "(file handle: <hex> <hex> ...)\n", with one
 * hex value for each int-sized word of fh_buf that starts before fh_len,
 * and emitted with zcmn_err(CE_CONT) for the current zone.  The formatting
 * buffer is allocated KM_NOSLEEP; if that fails nothing is printed.
 */
void
nfs4_printfhandle(nfs4_fhandle_t *fhp)
{
	int *ip;
	char *buf;
	size_t bufsize;
	size_t nwords;
	char *cp;

	/*
	 * Size the buffer from the number of words the loop below will
	 * actually print, so the two can never disagree:
	 *
	 * 13 == "(file handle:"
	 * for every int-sized word of fh_buf that starts before fh_len
	 * (a trailing partial word counts as one)
	 *	1 == ' '
	 *	8 == maximum strlen of "%x"
	 * 3 == ")\n\0"
	 */
	nwords = 0;
	if (fhp->fh_len > 0) {
		nwords = ((size_t)fhp->fh_len + sizeof (*ip) - 1) /
		    sizeof (*ip);
	}
	bufsize = 13 + (nwords * (1 + 8)) + 3;
	buf = kmem_alloc(bufsize, KM_NOSLEEP);
	if (buf == NULL)
		return;

	cp = buf;
	(void) strcpy(cp, "(file handle:");
	while (*cp != '\0')
		cp++;
	for (ip = (int *)fhp->fh_buf;
	    ip < (int *)&fhp->fh_buf[fhp->fh_len];
	    ip++) {
		/* bounded by what is left of buf, as a second line of defence */
		(void) snprintf(cp, bufsize - (size_t)(cp - buf), " %x", *ip);
		while (*cp != '\0')
			cp++;
	}
	(void) strcpy(cp, ")\n");

	zcmn_err(getzoneid(), CE_CONT, "%s", buf);

	kmem_free(buf, bufsize);
}

/*
 * The NFSv4 readdir cache subsystem.
 *
 * We provide a set of interfaces to allow the rest of the system to utilize
 * a caching mechanism while encapsulating the details of the actual
 * implementation.  This should allow for better maintainability and
 * extensibility by consolidating the implementation details in one location.
 */

/*
 * Comparator used by AVL routines.
 *
 * Entries are ordered by nfs4_cookie first and by buflen second.
 * Returns -1, 0 or 1 when x sorts before, equal to or after y.
 */
static int
rddir4_cache_compar(const void *x, const void *y)
{
	rddir4_cache *lhs = &((rddir4_cache_impl *)x)->rc;
	rddir4_cache *rhs = &((rddir4_cache_impl *)y)->rc;

	/* primary key: the cookie */
	if (lhs->nfs4_cookie != rhs->nfs4_cookie)
		return ((lhs->nfs4_cookie < rhs->nfs4_cookie) ? -1 : 1);

	/* same cookie: fall back to the buffer length */
	if (lhs->buflen != rhs->buflen)
		return ((lhs->buflen < rhs->buflen) ? -1 : 1);

	return (0);
}

/*
 * Allocate an opaque handle for the readdir cache.
 *
 * The rnode must not already have one.  The handle is an AVL tree of
 * rddir4_cache_impl entries ordered by rddir4_cache_compar().
 */
void
rddir4_cache_create(rnode4_t *rp)
{
	avl_tree_t *dircache;

	ASSERT(rp->r_dir == NULL);

	dircache = kmem_alloc(sizeof (*dircache), KM_SLEEP);
	rp->r_dir = dircache;

	avl_create(dircache, rddir4_cache_compar,
	    sizeof (rddir4_cache_impl), offsetof(rddir4_cache_impl, tree));
}

/*
 * Purge the cache of all cached readdir responses.
 *
 * The caller must hold r_statelock.  Every entry is removed from the
 * tree, has RDDIRCACHED cleared and has one reference released.  An
 * rnode without a readdir cache is left alone.
 */
void
rddir4_cache_purge(rnode4_t *rp)
{
	rddir4_cache_impl	*cur;
	rddir4_cache_impl	*following;

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	if (rp->r_dir == NULL)
		return;

	for (cur = avl_first(rp->r_dir); cur != NULL; cur = following) {
		/* find the successor before cur leaves the tree */
		following = AVL_NEXT(rp->r_dir, cur);
		avl_remove(rp->r_dir, cur);
		cur->rc.flags &= ~RDDIRCACHED;
		rddir4_cache_rele(rp, &cur->rc);
	}
	ASSERT(avl_numnodes(rp->r_dir) == 0);
}

/*
 * Destroy the readdir cache.
 *
 * The caller must hold r_statelock.  If the rnode has a cache it is
 * purged, the tree is destroyed and freed, and r_dir is reset to NULL.
 */
void
rddir4_cache_destroy(rnode4_t *rp)
{
	ASSERT(MUTEX_HELD(&rp->r_statelock));

	if (rp->r_dir != NULL) {
		rddir4_cache_purge(rp);
		avl_destroy(rp->r_dir);
		kmem_free(rp->r_dir, sizeof (avl_tree_t));
		rp->r_dir = NULL;
	}
}

/*
 * Locate a readdir response from the readdir cache.
 *
 * The caller must hold r_rwlock as a reader and r_statelock; both are
 * held again on return, but either may be dropped and reacquired while
 * this routine allocates or waits.
 *
 * Return values:
 *
 * NULL - If there is an unrecoverable situation like the operation may have
 *	  been interrupted.
 *
 * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
 *		    The flags are set appropriately, such that the caller knows
 *		    what state the entry is in.
 */
rddir4_cache *
rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
{
	rddir4_cache_impl	*rdip = NULL;
	rddir4_cache_impl	srdip;
	rddir4_cache		*srdc;
	rddir4_cache		*rdc = NULL;
	rddir4_cache		*nrdc = NULL;
	avl_index_t		where;

top:
	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
	ASSERT(MUTEX_HELD(&rp->r_statelock));
	/*
	 * Check to see if the readdir cache has been disabled.  If so, then
	 * simply allocate an rddir4_cache entry and return it, since caching
	 * operations do not apply.
	 */
	if (rp->r_dir == NULL) {
		if (nrdc == NULL) {
			/*
			 * Drop the lock because we are doing a sleeping
			 * allocation.
			 */
			mutex_exit(&rp->r_statelock);
			rdc = rddir4_cache_alloc(KM_SLEEP);
			rdc->nfs4_cookie = cookie;
			rdc->buflen = count;
			mutex_enter(&rp->r_statelock);
			return (rdc);
		}
		return (nrdc);
	}

	/* build a search key on the stack; only the key fields are set */
	srdc = &srdip.rc;
	srdc->nfs4_cookie = cookie;
	srdc->buflen = count;

	rdip = avl_find(rp->r_dir, &srdip, &where);

	/*
	 * If we didn't find an entry then create one and insert it
	 * into the cache.
	 */
	if (rdip == NULL) {
		/*
		 * Check for the case where we have made a second pass through
		 * the cache due to a lockless allocation.  If we find that no
		 * thread has already inserted this entry, do the insert now
		 * and return.
		 */
		if (nrdc != NULL) {
			avl_insert(rp->r_dir, nrdc->data, where);
			nrdc->flags |= RDDIRCACHED;
			rddir4_cache_hold(nrdc);
			return (nrdc);
		}

#ifdef DEBUG
		nfs4_readdir_cache_misses++;
#endif
		/*
		 * First, try to allocate an entry without sleeping.  If that
		 * fails then drop the lock and do a sleeping allocation.
		 */
		nrdc = rddir4_cache_alloc(KM_NOSLEEP);
		if (nrdc != NULL) {
			nrdc->nfs4_cookie = cookie;
			nrdc->buflen = count;
			avl_insert(rp->r_dir, nrdc->data, where);
			nrdc->flags |= RDDIRCACHED;
			rddir4_cache_hold(nrdc);
			return (nrdc);
		}

		/*
		 * Drop the lock and do a sleeping allocation.	We incur
		 * additional overhead by having to search the cache again,
		 * but this case should be rare.
		 */
		mutex_exit(&rp->r_statelock);
		nrdc = rddir4_cache_alloc(KM_SLEEP);
		nrdc->nfs4_cookie = cookie;
		nrdc->buflen = count;
		mutex_enter(&rp->r_statelock);
		/*
		 * We need to take another pass through the cache
		 * since we dropped our lock to perform the alloc.
		 * Another thread may have come by and inserted the
		 * entry we are interested in.
		 */
		goto top;
	}

	/*
	 * Check to see if we need to free our entry.  This can happen if
	 * another thread came along and beat us to the insert.  We can
	 * safely call rddir4_cache_free directly because no other thread
	 * would have a reference to this entry.
	 *
	 * The pointer is cleared as well: this routine can still loop back
	 * to "top" below, and the paths there treat a non-NULL nrdc as a
	 * live spare entry to insert or return.
	 */
	if (nrdc != NULL) {
		rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
		nrdc = NULL;
	}

#ifdef DEBUG
	nfs4_readdir_cache_hits++;
#endif
	/*
	 * Found something.  Make sure it's ready to return.
	 */
	rdc = &rdip->rc;
	rddir4_cache_hold(rdc);
	/*
	 * If the cache entry is in the process of being filled in, wait
	 * until this completes.  The RDDIRWAIT bit is set to indicate that
	 * someone is waiting and when the thread currently filling the entry
	 * is done, it should do a cv_broadcast to wakeup all of the threads
	 * waiting for it to finish. If the thread wakes up to find that
	 * someone new is now trying to complete the entry, go back
	 * to sleep.
	 */
	while (rdc->flags & RDDIR) {
		/*
		 * The entry is not complete.
		 */
		nfs_rw_exit(&rp->r_rwlock);
		rdc->flags |= RDDIRWAIT;
#ifdef DEBUG
		nfs4_readdir_cache_waits++;
#endif
		while (rdc->flags & RDDIRWAIT) {
			if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
				/*
				 * We got interrupted, probably the user
				 * typed ^C or an alarm fired.  Give back
				 * the hold taken above and retake the
				 * locks the caller expects to own.
				 */
				rddir4_cache_rele(rp, rdc);
				mutex_exit(&rp->r_statelock);
				(void) nfs_rw_enter_sig(&rp->r_rwlock,
				    RW_READER, FALSE);
				mutex_enter(&rp->r_statelock);
				return (NULL);
			}
		}
		mutex_exit(&rp->r_statelock);
		(void) nfs_rw_enter_sig(&rp->r_rwlock,
		    RW_READER, FALSE);
		mutex_enter(&rp->r_statelock);
	}

	/*
	 * The entry we were waiting on may have been purged from
	 * the cache and should no longer be used, release it and
	 * start over.
	 */
	if (!(rdc->flags & RDDIRCACHED)) {
		rddir4_cache_rele(rp, rdc);
		goto top;
	}

	/*
	 * The entry is completed.  Return it.
	 */
	return (rdc);
}

/*
 * Allocate a cache element and return it.  Can return NULL if memory is
 * low.
 */
static rddir4_cache *
rddir4_cache_alloc(int flags)
{
	rddir4_cache_impl	*rdip = NULL;
	rddir4_cache		*rc = NULL;

	rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);

	if (rdip != NULL) {
		rc = &rdip->rc;
		rc->data = (void *)rdip;
		rc->nfs4_cookie = 0;
		rc->nfs4_ncookie = 0;
		rc->entries = NULL;
		rc->eof = 0;
		rc->entlen = 0;
		rc->buflen = 0;
		rc->actlen = 0;
		/*
		 * A readdir is required so set the flag.
		 */
		rc->flags = RDDIRREQ;
		cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
		rc->error = 0;
		mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
		rdip->count = 1;
#ifdef DEBUG
		atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
#endif
	}
	return (rc);
}

/*
 * Take an additional reference on a cache element.  The count lives in
 * the rddir4_cache_impl that rc->data points back to and is modified
 * under that structure's lock.
 */
static void
rddir4_cache_hold(rddir4_cache *rc)
{
	rddir4_cache_impl *impl;

	impl = (rddir4_cache_impl *)rc->data;

	mutex_enter(&impl->lock);
	impl->count += 1;
	mutex_exit(&impl->lock);
}

/*
 * Drop one reference on a cache element and free the element when the
 * last reference goes away.
 *
 * The caller must hold rp->r_statelock (asserted below).  Any threads
 * sleeping on the element are woken first: RDDIRWAIT is cleared and the
 * element's condition variable is broadcast.
 */
void
rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
{
	rddir4_cache_impl *impl = (rddir4_cache_impl *)rdc->data;
	int last;

	ASSERT(MUTEX_HELD(&rp->r_statelock));

	/* Let any waiters proceed before the reference is dropped. */
	if (rdc->flags & RDDIRWAIT) {
		rdc->flags &= ~RDDIRWAIT;
		cv_broadcast(&rdc->cv);
	}

	/* Decide under the lock whether this was the final reference. */
	mutex_enter(&impl->lock);
	ASSERT(impl->count > 0);
	impl->count--;
	last = (impl->count == 0);
	mutex_exit(&impl->lock);

	if (last)
		rddir4_cache_free(impl);
}

/*
 * Tear down a cache element: release the entries buffer if one is
 * attached (it is buflen bytes long), destroy the lock and condition
 * variable, and return the element itself to the allocator.
 */
static void
rddir4_cache_free(rddir4_cache_impl *rdip)
{
	rddir4_cache *rdc = &rdip->rc;

#ifdef DEBUG
	atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
#endif
	if (rdc->entries != NULL) {
		kmem_free(rdc->entries, rdc->buflen);
	}
	mutex_destroy(&rdip->lock);
	cv_destroy(&rdc->cv);
	kmem_free(rdip, sizeof (rddir4_cache_impl));
}

/*
 * kstat snapshot callback for nfs:0:nfs4_client.
 *
 * The snapshot buffer holds a clstat4_tmpl-sized block of per-zone
 * counters (backed by ksp->ks_private) and, on DEBUG kernels, the
 * system-wide clstat4_debug counters immediately after it.  A
 * KSTAT_WRITE request copies from buf into those stores; any other
 * request copies them out to buf.  Always returns 0.
 */
static int
cl4_snapshot(kstat_t *ksp, void *buf, int rw)
{
	size_t tmpl_len = sizeof (clstat4_tmpl);

	ksp->ks_snaptime = gethrtime();

	if (rw != KSTAT_WRITE) {
		/* Read: hand out the per-zone counters ... */
		bcopy(ksp->ks_private, buf, tmpl_len);
#ifdef DEBUG
		/*
		 * ... and the global debug counters, shown as-is to every
		 * zone because they describe the system as a whole.
		 */
		bcopy(&clstat4_debug, (char *)buf + tmpl_len,
		    sizeof (clstat4_debug));
#endif
		return (0);
	}

	/* Write: take the new per-zone counter values from the caller. */
	bcopy(buf, ksp->ks_private, tmpl_len);
#ifdef DEBUG
	/*
	 * Only the global zone is expected to be able to write kstats
	 * today; the check is kept purely out of paranoia.
	 */
	if (INGLOBALZONE(curproc))
		bcopy((char *)buf + tmpl_len, &clstat4_debug,
		    sizeof (clstat4_debug));
#endif
	return (0);
}



/*
 * Zone support
 *
 * clinit4_zone builds the per-zone NFSv4 client state: an empty client
 * handle table with its lock, a private copy of the statistics
 * template, and a virtual, writable "nfs4_client" kstat serviced by
 * cl4_snapshot().  The new structure is linked onto nfs4_clnt_list and
 * returned.  A failure to create the kstat is tolerated; the zone then
 * simply has no nfs4_client kstat.
 */
static void *
clinit4_zone(zoneid_t zoneid)
{
	struct nfs4_clnt *nfscl;
	kstat_t *ksp;
	uint_t nstats;

	nfscl = kmem_alloc(sizeof (struct nfs4_clnt), KM_SLEEP);
	nfscl->nfscl_zoneid = zoneid;
	nfscl->nfscl_chtable4 = NULL;
	mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
	bcopy(&clstat4_tmpl, &nfscl->nfscl_stat, sizeof (clstat4_tmpl));

	/* Number of named statistics the kstat will expose. */
	nstats = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
#ifdef DEBUG
	nstats += sizeof (clstat4_debug) / sizeof (kstat_named_t);
#endif

	ksp = kstat_create_zone("nfs", 0, "nfs4_client", "misc",
	    KSTAT_TYPE_NAMED, nstats,
	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid);
	if (ksp != NULL) {
		ksp->ks_private = &nfscl->nfscl_stat;
		ksp->ks_snapshot = cl4_snapshot;
		kstat_install(ksp);
	}

	mutex_enter(&nfs4_clnt_list_lock);
	list_insert_head(&nfs4_clnt_list, nfscl);
	mutex_exit(&nfs4_clnt_list_lock);

	return (nfscl);
}

/*
 * Undo clinit4_zone for one zone.  "arg" is the struct nfs4_clnt that
 * clinit4_zone returned; a NULL arg means there is nothing to do.
 *
 * The structure is unlinked from nfs4_clnt_list, its cached client
 * handles are reclaimed, the (by then empty) client handle headers are
 * freed, and finally the zone's kstat, the table lock and the structure
 * itself are destroyed.
 */
/*ARGSUSED*/
static void
clfini4_zone(zoneid_t zoneid, void *arg)
{
	struct nfs4_clnt *nfscl = arg;
	chhead_t *chp;

	if (nfscl == NULL)
		return;

	mutex_enter(&nfs4_clnt_list_lock);
	list_remove(&nfs4_clnt_list, nfscl);
	mutex_exit(&nfs4_clnt_list_lock);

	clreclaim4_zone(nfscl, 0);

	chp = nfscl->nfscl_chtable4;
	while (chp != NULL) {
		/* Remember the successor before this header is freed. */
		chhead_t *next = chp->ch_next;

		ASSERT(chp->ch_list == NULL);
		kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
		kmem_free(chp, sizeof (*chp));
		chp = next;
	}

	kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
	mutex_destroy(&nfscl->nfscl_chtable4_lock);
	kmem_free(nfscl, sizeof (*nfscl));
}

/*
 * Called by endpnt_destructor so that a zone's client handles are
 * cleaned up before its RPC endpoints.  The ZSD callbacks have no
 * ordering mechanism, so there is no way to guarantee that
 * clfini4_zone runs before endpnt_destructor; hence this separate
 * entry point.  If clfini4_zone has already run, the zone is no longer
 * on nfs4_clnt_list and this is a no-op.
 */
void
clcleanup4_zone(zoneid_t zoneid)
{
	struct nfs4_clnt *nfscl;

	mutex_enter(&nfs4_clnt_list_lock);

	/* Find the first entry that belongs to the zone, if any. */
	nfscl = list_head(&nfs4_clnt_list);
	while (nfscl != NULL && nfscl->nfscl_zoneid != zoneid)
		nfscl = list_next(&nfs4_clnt_list, nfscl);

	if (nfscl != NULL)
		clreclaim4_zone(nfscl, 0);

	mutex_exit(&nfs4_clnt_list_lock);
}

/*
 * One-time setup for this file: create the client handle kmem cache,
 * the list of per-zone client structures and the zone key that drives
 * clinit4_zone/clfini4_zone, and give nfs4err_delay_time its default
 * if it has not been set.  Always returns 0.
 */
int
nfs4_subr_init(void)
{
	/* Cache from which client handles are allocated. */
	chtab4_cache = kmem_cache_create("client_handle4_cache",
	    sizeof (struct chtab), 0,
	    NULL, NULL, clreclaim4, NULL, NULL, 0);

	/*
	 * The per-zone client list has to exist before the zone key is
	 * created, so keep this ahead of zone_key_create().
	 */
	list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
	    offsetof(struct nfs4_clnt, nfscl_node));

	/* Zone key for the per-zone client handle lists. */
	zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);

	/* Fall back to the built-in delay when none was configured. */
	if (!nfs4err_delay_time)
		nfs4err_delay_time = NFS4ERR_DELAY_TIME;

	return (0);
}

/*
 * Counterpart of nfs4_subr_init: destroy the client handle cache and
 * then delete the per-zone client zone key.  The result of
 * zone_key_delete() is deliberately ignored.  Always returns 0.
 */
int
nfs4_subr_fini(void)
{
	/* Give the client handle cache back. */
	kmem_cache_destroy(chtab4_cache);

	/* Drop the zone key for the per-zone client data. */
	(void) zone_key_delete(nfs4clnt_zone_key);

	return (0);
}
/*
 * Set or clear the direct I/O flag (R4DIRECTIO) on a file.
 *
 * When turning direct I/O on, VOP_RWLOCK() is held for write access to
 * close the race with a process that is in the middle of a write while
 * the flag gets set; without it not all pages might get flushed.
 *
 * Returns 0 on success, EINVAL for a cmd that is neither DIRECTIO_ON
 * nor DIRECTIO_OFF, or the error from VOP_PUTPAGE() if flushing the
 * page cache failed.
 *
 * This is a copy of nfs_directio; changes here may need to be made
 * there and vice versa.
 */

int
nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	int error = 0;

	if (cmd != DIRECTIO_ON) {
		if (cmd != DIRECTIO_OFF)
			return (EINVAL);

		/* Disable direct mode. */
		mutex_enter(&rp->r_statelock);
		rp->r_flags &= ~R4DIRECTIO;
		mutex_exit(&rp->r_statelock);
		return (0);
	}

	/* Nothing to do if direct I/O is already enabled. */
	if (rp->r_flags & R4DIRECTIO)
		return (0);

	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);

	/* Look again now that writers are locked out. */
	if (rp->r_flags & R4DIRECTIO)
		goto out;

	/*
	 * Flush the page cache if there is anything dirty or any
	 * asynchronous write still outstanding.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
		error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
		    B_INVAL, cr, NULL);
		if (error != 0) {
			/*
			 * Out-of-space and over-quota errors are also
			 * recorded in the rnode, unless an earlier error
			 * is already there.
			 */
			if (error == ENOSPC || error == EDQUOT) {
				mutex_enter(&rp->r_statelock);
				if (rp->r_error == 0)
					rp->r_error = error;
				mutex_exit(&rp->r_statelock);
			}
			goto out;
		}
	}

	mutex_enter(&rp->r_statelock);
	rp->r_flags |= R4DIRECTIO;
	mutex_exit(&rp->r_statelock);

out:
	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
	return (error);
}

/*
 * Return TRUE if the file has any cached pages.  Shadow vnodes never
 * have pages of their own, so a shadow is first replaced by the master
 * vnode (RTOV4 always yields the master) before v_pages is checked.
 */

bool_t
nfs4_has_pages(vnode_t *vp)
{
	rnode4_t *rp = VTOR4(vp);
	vnode_t *master = IS_SHADOW(vp, rp) ? RTOV4(rp) : vp;

	return (vn_has_cached_data(master));
}

/*
 * Failover decision table, indexed directly by the clnt_stat value that
 * CLNT_CALL returned.  A non-zero "error" in the selected entry is both
 * the error to hand back AND the signal that failover should be
 * attempted; a zero "error" means no failover.
 *
 * Special note: each entry also records the RPC_ value it was written
 * for.  Should the RPC_ values ever change, direct indexing stops being
 * valid, and the stored value lets the lookup code notice the mismatch
 * and issue a warning.  In that case the code will always attempt
 * failover as a defensive measure.
 */

static struct try_failover_tab {
	enum clnt_stat	cstat;
	int		error;
} try_failover_table [] = {
	{ RPC_SUCCESS,			0 },
	{ RPC_CANTENCODEARGS,		0 },
	{ RPC_CANTDECODERES,		0 },
	{ RPC_CANTSEND,			ECOMM },
	{ RPC_CANTRECV,			ECOMM },
	{ RPC_TIMEDOUT,			ETIMEDOUT },
	{ RPC_VERSMISMATCH,		0 },
	{ RPC_AUTHERROR,		0 },
	{ RPC_PROGUNAVAIL,		0 },
	{ RPC_PROGVERSMISMATCH,		0 },
	{ RPC_PROCUNAVAIL,		0 },
	{ RPC_CANTDECODEARGS,		0 },
	{ RPC_SYSTEMERROR,		ENOSR },
	{ RPC_UNKNOWNHOST,		EHOSTUNREACH },
	{ RPC_RPCBFAILURE,		ENETUNREACH },
	{ RPC_PROGNOTREGISTERED,	ECONNREFUSED },
	{ RPC_FAILED,			ETIMEDOUT },
	{ RPC_UNKNOWNPROTO,		EHOSTUNREACH },
	{ RPC_INTR,			0 },
	{ RPC_UNKNOWNADDR,		EHOSTUNREACH },
	{ RPC_TLIERROR,			0 },
	{ RPC_NOBROADCAST,		EHOSTUNREACH },
	{ RPC_N2AXLATEFAILURE,		ECONNREFUSED },
	{ RPC_UDERROR,			0 },
	{ RPC_INPROGRESS,		0 },
	{ RPC_STALERACHANDLE,		EINVAL },
	{ RPC_CANTCONNECT,		ECONNREFUSED },
	{ RPC_XPRTFAILED,		ECONNABORTED },
	{ RPC_CANTCREATESTREAM,		ECONNREFUSED },
	{ RPC_CANTSTORE,		ENOBUFS }
};

/*
 * nfs4_try_failover - decide from an nfs4_error_t whether the client
 * should attempt failover.
 *
 * A timeout (ETIMEDOUT) or an NFS4ERR_RESOURCE status always asks for
 * failover.  Otherwise failover is considered only when there is an
 * error together with an RPC status other than RPC_SUCCESS, and then
 * the per-status answer from try_failover() decides.  Returns TRUE or
 * FALSE.
 */
int
nfs4_try_failover(nfs4_error_t *ep)
{
	if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
		return (TRUE);

	/* No error, or no RPC-level failure: nothing to fail over from. */
	if (ep->error == 0 || ep->rpc_status == RPC_SUCCESS)
		return (FALSE);

	if (try_failover(ep->rpc_status) != 0)
		return (TRUE);

	return (FALSE);
}

/*
 * try_failover - internal helper behind nfs4_try_failover.  Given the
 * clnt_stat of a failed call, return the error number to use when
 * failover is warranted, or 0 when it is not.
 */
static int
try_failover(enum clnt_stat rpc_status)
{
	int err;
	int forced = 0;

	if (rpc_status == RPC_SUCCESS)
		return (0);

#ifdef	DEBUG
	/* Debug knob: treat every failing status as a reason to fail over. */
	forced = (rpc_status != 0 && nfs4_try_failover_any);
#endif

	if (forced) {
		err = ETIMEDOUT;
	} else if (rpc_status < RPC_SUCCESS || rpc_status >=
	    sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
	    try_failover_table[rpc_status].cstat != rpc_status) {
		/*
		 * The status is used as a direct index into the table.  If
		 * it lies outside the table, or the entry found there was
		 * written for a different status (the rpc error numbers
		 * have changed since the table was built), warn on DEBUG
		 * kernels and try failover anyway.
		 */
		err = ETIMEDOUT;
#ifdef	DEBUG
		cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
		    rpc_status);
#endif
	} else {
		/* Normal case: the table supplies the answer. */
		err = try_failover_table[rpc_status].error;
	}

	if (rpc_status)
		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
		    "nfs4_try_failover: %strying failover on error %d",
		    err ? "" : "NOT ", rpc_status));

	return (err);
}

/*
 * Reset an nfs4_error_t to the "no error" state: errno 0, NFS status
 * NFS4_OK and RPC status RPC_SUCCESS.
 */
void
nfs4_error_zinit(nfs4_error_t *ep)
{
	ep->rpc_status = RPC_SUCCESS;
	ep->stat = NFS4_OK;
	ep->error = 0;
}

/*
 * Initialize an nfs4_error_t so that it carries only the given errno
 * value; the NFS status is set to NFS4_OK and the RPC status to
 * RPC_SUCCESS.
 */
void
nfs4_error_init(nfs4_error_t *ep, int error)
{
	ep->rpc_status = RPC_SUCCESS;
	ep->stat = NFS4_OK;
	ep->error = error;
}


#ifdef DEBUG

/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 *
 * The buffer is consumed as whole 32-bit words, each folded into the
 * key together with its own upper half; any 1-3 bytes left over after
 * the last whole word are then XORed in one at a time.  The result is
 * the low 16 bits of the key.  A length of zero or less hashes to 0.
 *
 * The word-folding step is the one described as "the same algorithm as
 * for NFS v3".  NOTE(review): confirm how the v3 code treats trailing
 * bytes if the two hashes are expected to match for lengths that are
 * not a multiple of four.
 *
 * NOTE(review): p is read through a 32-bit pointer, so callers are
 * presumably expected to pass a suitably aligned buffer - confirm.
 */
int
hash16(void *p, int len)
{
	uint32_t *wp = (uint32_t *)p;
	uint8_t *tail;
	uint32_t key = 0;
	int i, rem;

	/* Nothing to hash; also keeps the tail offset below non-negative. */
	if (len <= 0)
		return (0);

	/* Split the length into whole words and left-over bytes. */
	rem = len & 3;
	len -= rem;

	for (i = 0; i < len; i += 4, wp++)
		key ^= (*wp >> 16) ^ *wp;

	/*
	 * Hash the left-over bytes.  These are the bytes that follow the
	 * last whole word, not the first bytes of the buffer.
	 */
	tail = (uint8_t *)p + len;
	for (i = 0; i < rem; i++)
		key ^= tail[i];

	return (key & 0xffff);
}

/*
 * rnode4info - format a one-line description of an rnode: its address,
 * path, a coarse type label, its flags and a 16-bit hash of its
 * filehandle.  A NULL rnode yields the string "null".
 *
 * XXX MT issues: the result lives in a single static buffer that every
 * caller shares, and the path is read without any locking.
 */
char *
rnode4info(rnode4_t *rp)
{
	static char buf[80];
	nfs4_fhandle_t fhandle;
	char *path;
	char *type;

	if (rp == NULL)
		return ("null");

	/* Pick the label; the first matching property wins. */
	if (rp->r_flags & R4ISXATTR) {
		type = "attr";
	} else {
		vnode_t *vp = RTOV4(rp);

		if (vp->v_flag & V_XATTRDIR)
			type = "attrdir";
		else if (vp->v_flag & VROOT)
			type = "root";
		else if (vp->v_type == VDIR)
			type = "dir";
		else if (vp->v_type == VREG)
			type = "file";
		else
			type = "other";
	}

	sfh4_copyval(rp->r_fh, &fhandle);
	path = fn_path(rp->r_svnode.sv_name);

	(void) snprintf(buf, sizeof (buf),
	    "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
	    (void *)rp, path, type, rp->r_flags,
	    hash16((void *)&fhandle.fh_buf, fhandle.fh_len));

	/* The path string is freed here using its string length plus NUL. */
	kmem_free(path, strlen(path) + 1);

	return (buf);
}
#endif