changeset 10004:474324f166a9

6216670 NFS server needs a bigger transmit buffer
author Dai Ngo <dai.ngo@sun.com>
date Tue, 30 Jun 2009 16:17:37 -0700
parents f07f995d4507
children b6940fb2404d
files usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c usr/src/uts/common/rpc/clnt_cots.c
diffstat 2 files changed, 255 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c	Tue Jun 30 11:47:15 2009 -0700
+++ b/usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c	Tue Jun 30 16:17:37 2009 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -27,8 +27,6 @@
  * nfs_tbind.c, common part for nfsd and lockd.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #define	PORTMAP
 
 #include <tiuser.h>
@@ -82,6 +80,11 @@
  */
 #define	NOFILE_INC_SIZE	64
 
+/*
+ * Minimum TCP send and receive buffer size (bytes) the NFS server asks for.
+ */
+#define	NFSD_TCP_BUFSZ	(1024*1024)
+
 struct conn_ind {
 	struct conn_ind *conn_next;
 	struct conn_ind *conn_prev;
@@ -121,6 +124,9 @@
 static	int	num_conns;		/* Current number of connections */
 int		(*Mysvc4)(int, struct netbuf *, struct netconfig *, int,
 		struct netbuf *);
+static int	setopt(int fd, int level, int name, int value);
+static int	get_opt(int fd, int level, int name);
+static void	nfslib_set_sockbuf(int fd);
 
 extern bool_t __pmap_set(const rpcprog_t program, const rpcvers_t version,
     const struct netconfig *nconf, const struct netbuf *address);
@@ -240,6 +246,40 @@
 	return (0);
 }
 
+/* Grow fd's SO_SNDBUF/SO_RCVBUF to NFSD_TCP_BUFSZ when currently smaller. */
+static void
+nfslib_set_sockbuf(int fd)
+{
+	const int want = NFSD_TCP_BUFSZ;
+	int have = get_opt(fd, SOL_SOCKET, SO_SNDBUF);
+
+	syslog(LOG_DEBUG, "Current SO_SNDBUF value is %d", have);
+	if (have != -1 && have < want) {
+		syslog(LOG_DEBUG, "Set SO_SNDBUF  option to %d", want);
+		if (setopt(fd, SOL_SOCKET, SO_SNDBUF, want) < 0) {
+			syslog(LOG_ERR,
+			    "couldn't set SO_SNDBUF to %d - t_errno = %d",
+			    want, t_errno);
+			syslog(LOG_ERR,
+			    "Check and increase system-wide tcp_max_buf");
+		}
+	}
+
+	/* Likewise for the receive buffer; -1 means the lookup failed. */
+	have = get_opt(fd, SOL_SOCKET, SO_RCVBUF);
+	syslog(LOG_DEBUG, "Current SO_RCVBUF value is %d", have);
+	if (have != -1 && have < want) {
+		syslog(LOG_DEBUG, "Set SO_RCVBUF  option to %d", want);
+		if (setopt(fd, SOL_SOCKET, SO_RCVBUF, want) < 0) {
+			syslog(LOG_ERR,
+			    "couldn't set SO_RCVBUF to %d - t_errno = %d",
+			    want, t_errno);
+			syslog(LOG_ERR,
+			    "Check and increase system-wide tcp_max_buf");
+		}
+	}
+}
+
 int
 nfslib_bindit(struct netconfig *nconf, struct netbuf **addr,
 	struct nd_hostserv *hs, int backlog)
@@ -402,12 +442,43 @@
 	"couldn't set NODELAY option for proto %s: t_errno = %d, %m",
 			    nconf->nc_proto, t_errno);
 		}
+
+		nfslib_set_sockbuf(fd);
 	}
 
 	return (fd);
 }
 
 static int
+get_opt(int fd, int level, int name)
+{
+	struct t_optmgmt req, res;
+	struct {
+		struct opthdr opt;
+		int value;
+	} reqbuf;
+
+	reqbuf.opt.level = level;
+	reqbuf.opt.name = name;
+	reqbuf.opt.len = sizeof (int);
+	reqbuf.value = 0;
+
+	req.flags = T_CURRENT;
+	req.opt.len = sizeof (reqbuf);
+	req.opt.buf = (char *)&reqbuf;
+
+	res.flags = 0;
+	res.opt.buf = (char *)&reqbuf;
+	res.opt.maxlen = sizeof (reqbuf);
+
+	if (t_optmgmt(fd, &req, &res) < 0 || res.flags != T_SUCCESS) {
+		t_error("t_optmgmt");
+		return (-1);
+	}
+	return (reqbuf.value);
+}
+
+static int
 setopt(int fd, int level, int name, int value)
 {
 	struct t_optmgmt req, resp;
@@ -582,6 +653,7 @@
 	 */
 	add_to_poll_list(sock, retnconf);
 }
+
 /*
  * Set up the NFS service over all the available transports.
  * Returns -1 for failure, 0 for success.
--- a/usr/src/uts/common/rpc/clnt_cots.c	Tue Jun 30 11:47:15 2009 -0700
+++ b/usr/src/uts/common/rpc/clnt_cots.c	Tue Jun 30 16:17:37 2009 -0700
@@ -381,6 +381,13 @@
 				int, calllist_t *, int *, bool_t reconnect,
 				const struct timeval *, bool_t, cred_t *);
 
+static void	*connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
+				t_uscalar_t length, uint_t align_size);
+static bool_t	connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr);
+static bool_t	connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+				calllist_t *e, cred_t *cr);
+static bool_t	connmgr_setopt_int(queue_t *wq, int level, int name, int val,
+				calllist_t *e, cred_t *cr);
 static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
 static void	connmgr_sndrel(struct cm_xprt *);
 static void	connmgr_snddis(struct cm_xprt *);
@@ -503,6 +510,20 @@
 static zone_key_t zone_cots_key;
 
 /*
+ * Default TCP send and receive buffer sizes, in bytes, for RPC connections.
+ * These values can be tuned via /etc/system.
+ */
+int rpc_send_bufsz = 1024*1024;
+int rpc_recv_bufsz = 1024*1024;
+/*
+ * To use system-wide default for TCP send and receive buffer size,
+ * use /etc/system to set rpc_default_tcp_bufsz to 1:
+ *
+ * set rpcmod:rpc_default_tcp_bufsz=1
+ */
+int rpc_default_tcp_bufsz = 0;
+
+/*
  * We need to do this after all kernel threads in the zone have exited.
  */
 /* ARGSUSED */
@@ -2558,6 +2579,41 @@
 }
 
 /*
+ * Grow the TCP receive/transmit buffers of an RPC connection's stream to
+ * rpc_recv_bufsz/rpc_send_bufsz.  Returns FALSE, leaving the stream alone,
+ * if rpc_default_tcp_bufsz is set; otherwise TRUE even if an option failed.
+ */
+static bool_t
+connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr)
+{
+	int ok, val;
+
+	if (rpc_default_tcp_bufsz)
+		return (FALSE);
+
+	/*
+	 * Only ever raise a buffer above its current size.  To keep a smaller
+	 * system default, set rpc_default_tcp_bufsz to 1 in /etc/system.
+	 */
+	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr);
+	if ((ok == TRUE) && (val < rpc_recv_bufsz)) {
+		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF,
+		    rpc_recv_bufsz, e, cr);
+		DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz,
+		    int, ok, calllist_t *, e);
+	}
+
+	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr);
+	if ((ok == TRUE) && (val < rpc_send_bufsz)) {
+		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF,
+		    rpc_send_bufsz, e, cr);
+		DTRACE_PROBE2(krpc__i__connmgr_sndbufsz,
+		    int, ok, calllist_t *, e);
+	}
+	return (TRUE);
+}
+
+/*
  * Given an open stream, connect to the remote.  Returns true if connected,
  * false otherwise.
  */
@@ -2609,6 +2665,10 @@
 		return (FALSE);
 	}
 
+	/* Set TCP buffer size for RPC connections if needed */
+	if (addrfmly == AF_INET || addrfmly == AF_INET6)
+		(void) connmgr_setbufsz(e, wq, cr);
+
 	mp->b_datap->db_type = M_PROTO;
 	tcr = (struct T_conn_req *)mp->b_rptr;
 	bzero(tcr, sizeof (*tcr));
@@ -2764,10 +2824,122 @@
 }
 
 /*
+ * Turn an (offset, length) pair into a pointer into mp's data.  The
+ * whole range must lie between b_rptr and b_wptr and its start must be
+ * aligned to align_size; otherwise NULL is returned.
+ * Modelled on the same check in fs/sockfs/socksubr.c.
+ */
+static void *
+connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
+    t_uscalar_t length, uint_t align_size)
+{
+	uintptr_t base, start, end;
+
+	ASSERT(mp && mp->b_wptr >= mp->b_rptr);
+	base = (uintptr_t)mp->b_rptr;
+	start = base + offset;
+	end = start + length;
+	/* start < base means the offset addition wrapped around. */
+	if (start < base || end > (uintptr_t)mp->b_wptr)
+		return (NULL);
+	/* The mask test presumes align_size is a power of two. */
+	if ((start & (align_size - 1)) != 0)
+		return (NULL);
+	return ((void *)start);
+}
+
+/*
+ * Fetch the current int value of option level/name with a T_CURRENT
+ * T_SVR4_OPTMGMT_REQ sent down wq, and store it in *val.  Returns TRUE on
+ * success.  Returns FALSE, leaving *val untouched, if the request cannot
+ * be built or sent, no good ack arrives, or the ack's options are bad.
+ */
+static bool_t
+connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+    calllist_t *e, cred_t *cr)
+{
+	mblk_t *mp;
+	struct opthdr *opt, *opt_res;
+	struct T_optmgmt_req *tor;
+	struct T_optmgmt_ack *opt_ack;
+	struct timeval waitp;
+	int error;
+
+	mp = allocb_cred(sizeof (struct T_optmgmt_req) +
+	    sizeof (struct opthdr) + sizeof (int), cr, NOPID);
+	if (mp == NULL)
+		return (FALSE);
+
+	mp->b_datap->db_type = M_PROTO;
+	tor = (struct T_optmgmt_req *)(mp->b_rptr);
+	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
+	tor->MGMT_flags = T_CURRENT;
+	tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
+	tor->OPT_offset = sizeof (struct T_optmgmt_req);
+
+	opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
+	opt->level = level;
+	opt->name = name;
+	opt->len = sizeof (int);
+	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
+	    sizeof (int);
+
+	/*
+	 * We will use this connection regardless
+	 * of whether or not the option is readable.
+	 */
+	if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
+		DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend);
+		freemsg(mp);
+		return (FALSE);
+	}
+
+	mutex_enter(&clnt_pending_lock);
+	waitp.tv_sec = clnt_cots_min_conntout;
+	waitp.tv_usec = 0;
+	error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
+	/* Unlink e from the clnt_pending list whatever the outcome. */
+	if (e->call_prev)
+		e->call_prev->call_next = e->call_next;
+	else
+		clnt_pending = e->call_next;
+	if (e->call_next)
+		e->call_next->call_prev = e->call_prev;
+	mutex_exit(&clnt_pending_lock);
+	/* get reply message */
+	mp = e->call_reply;
+	e->call_reply = NULL;
+	if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) {
+		DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name,
+		    int, e->call_status, int, error, mblk_t *, mp);
+		if (mp)
+			freemsg(mp);
+		return (FALSE);
+	}
+
+	/* The option area must be in bounds and hold an opthdr plus an int. */
+	opt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
+	opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset,
+	    opt_ack->OPT_length, __TPI_ALIGN_SIZE);
+	if (!opt_res ||
+	    opt_ack->OPT_length < sizeof (*opt_res) + sizeof (int)) {
+		DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name,
+		    int, opt_ack->OPT_offset, int, opt_ack->OPT_length);
+		freemsg(mp);
+		return (FALSE);
+	}
+	*val = *(int *)&opt_res[1];
+	DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val);
+	freemsg(mp);
+	return (TRUE);
+}
+
+/*
  * Called by connmgr_connect to set an option on the new stream.
  */
 static bool_t
-connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+connmgr_setopt_int(queue_t *wq, int level, int name, int val,
+    calllist_t *e, cred_t *cr)
 {
 	mblk_t *mp;
 	struct opthdr *opt;
@@ -2794,7 +2966,7 @@
 	opt->level = level;
 	opt->name = name;
 	opt->len = sizeof (int);
-	*(int *)((char *)opt + sizeof (*opt)) = 1;
+	*(int *)((char *)opt + sizeof (*opt)) = val;
 	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
 	    sizeof (int);
 
@@ -2835,6 +3007,12 @@
 	return (TRUE);
 }
 
+static bool_t	/* sets option level/name to 1 via connmgr_setopt_int() */
+connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+{
+	return (connmgr_setopt_int(wq, level, name, 1, e, cr));
+}
+
 #ifdef	DEBUG
 
 /*