Mercurial > illumos > illumos-gate
changeset 10004:474324f166a9
6216670 NFS server needs a bigger transmit buffer
author | Dai Ngo <dai.ngo@sun.com> |
---|---|
date | Tue, 30 Jun 2009 16:17:37 -0700 |
parents | f07f995d4507 |
children | b6940fb2404d |
files | usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c usr/src/uts/common/rpc/clnt_cots.c |
diffstat | 2 files changed, 255 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c Tue Jun 30 11:47:15 2009 -0700 +++ b/usr/src/cmd/fs.d/nfs/lib/nfs_tbind.c Tue Jun 30 16:17:37 2009 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,8 +27,6 @@ * nfs_tbind.c, common part for nfsd and lockd. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #define PORTMAP #include <tiuser.h> @@ -82,6 +80,11 @@ */ #define NOFILE_INC_SIZE 64 +/* + * Default TCP send and receive buffer size of NFS server. + */ +#define NFSD_TCP_BUFSZ (1024*1024) + struct conn_ind { struct conn_ind *conn_next; struct conn_ind *conn_prev; @@ -121,6 +124,9 @@ static int num_conns; /* Current number of connections */ int (*Mysvc4)(int, struct netbuf *, struct netconfig *, int, struct netbuf *); +static int setopt(int fd, int level, int name, int value); +static int get_opt(int fd, int level, int name); +static void nfslib_set_sockbuf(int fd); extern bool_t __pmap_set(const rpcprog_t program, const rpcvers_t version, const struct netconfig *nconf, const struct netbuf *address); @@ -240,6 +246,40 @@ return (0); } +static void +nfslib_set_sockbuf(int fd) +{ + int curval, val; + + val = NFSD_TCP_BUFSZ; + + curval = get_opt(fd, SOL_SOCKET, SO_SNDBUF); + syslog(LOG_DEBUG, "Current SO_SNDBUF value is %d", curval); + if ((curval != -1) && (curval < val)) { + syslog(LOG_DEBUG, "Set SO_SNDBUF option to %d", val); + if (setopt(fd, SOL_SOCKET, SO_SNDBUF, val) < 0) { + syslog(LOG_ERR, + "couldn't set SO_SNDBUF to %d - t_errno = %d", + val, t_errno); + syslog(LOG_ERR, + "Check and increase system-wide tcp_max_buf"); + } + } + + curval = get_opt(fd, SOL_SOCKET, SO_RCVBUF); + syslog(LOG_DEBUG, "Current SO_RCVBUF value is %d", curval); + if ((curval != -1) && (curval < val)) { + syslog(LOG_DEBUG, "Set SO_RCVBUF option to %d", val); + if (setopt(fd, SOL_SOCKET, SO_RCVBUF, val) < 0) { + syslog(LOG_ERR, + 
"couldn't set SO_RCVBUF to %d - t_errno = %d", + val, t_errno); + syslog(LOG_ERR, + "Check and increase system-wide tcp_max_buf"); + } + } +} + int nfslib_bindit(struct netconfig *nconf, struct netbuf **addr, struct nd_hostserv *hs, int backlog) @@ -402,12 +442,43 @@ "couldn't set NODELAY option for proto %s: t_errno = %d, %m", nconf->nc_proto, t_errno); } + + nfslib_set_sockbuf(fd); } return (fd); } static int +get_opt(int fd, int level, int name) +{ + struct t_optmgmt req, res; + struct { + struct opthdr opt; + int value; + } reqbuf; + + reqbuf.opt.level = level; + reqbuf.opt.name = name; + reqbuf.opt.len = sizeof (int); + reqbuf.value = 0; + + req.flags = T_CURRENT; + req.opt.len = sizeof (reqbuf); + req.opt.buf = (char *)&reqbuf; + + res.flags = 0; + res.opt.buf = (char *)&reqbuf; + res.opt.maxlen = sizeof (reqbuf); + + if (t_optmgmt(fd, &req, &res) < 0 || res.flags != T_SUCCESS) { + t_error("t_optmgmt"); + return (-1); + } + return (reqbuf.value); +} + +static int setopt(int fd, int level, int name, int value) { struct t_optmgmt req, resp; @@ -582,6 +653,7 @@ */ add_to_poll_list(sock, retnconf); } + /* * Set up the NFS service over all the available transports. * Returns -1 for failure, 0 for success.
--- a/usr/src/uts/common/rpc/clnt_cots.c Tue Jun 30 11:47:15 2009 -0700 +++ b/usr/src/uts/common/rpc/clnt_cots.c Tue Jun 30 16:17:37 2009 -0700 @@ -381,6 +381,13 @@ int, calllist_t *, int *, bool_t reconnect, const struct timeval *, bool_t, cred_t *); +static void *connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, + t_uscalar_t length, uint_t align_size); +static bool_t connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr); +static bool_t connmgr_getopt_int(queue_t *wq, int level, int name, int *val, + calllist_t *e, cred_t *cr); +static bool_t connmgr_setopt_int(queue_t *wq, int level, int name, int val, + calllist_t *e, cred_t *cr); static bool_t connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr); static void connmgr_sndrel(struct cm_xprt *); static void connmgr_snddis(struct cm_xprt *); @@ -503,6 +510,20 @@ static zone_key_t zone_cots_key; /* + * Default TCP send and receive buffer sizes for RPC connections. + * These values can be tuned by /etc/system. + */ +int rpc_send_bufsz = 1024*1024; +int rpc_recv_bufsz = 1024*1024; +/* + * To use system-wide default for TCP send and receive buffer size, + * use /etc/system to set rpc_default_tcp_bufsz to 1: + * + * set rpcmod:rpc_default_tcp_bufsz=1 + */ +int rpc_default_tcp_bufsz = 0; + +/* * We need to do this after all kernel threads in the zone have exited. */ /* ARGSUSED */ @@ -2558,6 +2579,41 @@ } /* + * Set TCP receive and xmit buffer size for RPC connections. + */ +static bool_t +connmgr_setbufsz(calllist_t *e, queue_t *wq, cred_t *cr) +{ + int ok = FALSE; + int val; + + if (rpc_default_tcp_bufsz) + return (FALSE); + + /* + * Only set new buffer size if it's larger than the system + * default buffer size. If smaller buffer size is needed + * then use /etc/system to set rpc_default_tcp_bufsz to 1. 
+ */ + ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr); + if ((ok == TRUE) && (val < rpc_send_bufsz)) { + ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF, + rpc_send_bufsz, e, cr); + DTRACE_PROBE2(krpc__i__connmgr_rcvbufsz, + int, ok, calllist_t *, e); + } + + ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr); + if ((ok == TRUE) && (val < rpc_recv_bufsz)) { + ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF, + rpc_recv_bufsz, e, cr); + DTRACE_PROBE2(krpc__i__connmgr_sndbufsz, + int, ok, calllist_t *, e); + } + return (TRUE); +} + +/* * Given an open stream, connect to the remote. Returns true if connected, * false otherwise. */ @@ -2609,6 +2665,10 @@ return (FALSE); } + /* Set TCP buffer size for RPC connections if needed */ + if (addrfmly == AF_INET || addrfmly == AF_INET6) + (void) connmgr_setbufsz(e, wq, cr); + mp->b_datap->db_type = M_PROTO; tcr = (struct T_conn_req *)mp->b_rptr; bzero(tcr, sizeof (*tcr)); @@ -2764,10 +2824,122 @@ } /* + * Verify that the specified offset falls within the mblk and + * that the resulting pointer is aligned. + * Returns NULL if not. 
+ * + * code from fs/sockfs/socksubr.c + */ +static void * +connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset, + t_uscalar_t length, uint_t align_size) +{ + uintptr_t ptr1, ptr2; + + ASSERT(mp && mp->b_wptr >= mp->b_rptr); + ptr1 = (uintptr_t)mp->b_rptr + offset; + ptr2 = (uintptr_t)ptr1 + length; + if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) { + return (NULL); + } + if ((ptr1 & (align_size - 1)) != 0) { + return (NULL); + } + return ((void *)ptr1); +} + +static bool_t +connmgr_getopt_int(queue_t *wq, int level, int name, int *val, + calllist_t *e, cred_t *cr) +{ + mblk_t *mp; + struct opthdr *opt, *opt_res; + struct T_optmgmt_req *tor; + struct T_optmgmt_ack *opt_ack; + struct timeval waitp; + int error; + + mp = allocb_cred(sizeof (struct T_optmgmt_req) + + sizeof (struct opthdr) + sizeof (int), cr, NOPID); + if (mp == NULL) + return (FALSE); + + mp->b_datap->db_type = M_PROTO; + tor = (struct T_optmgmt_req *)(mp->b_rptr); + tor->PRIM_type = T_SVR4_OPTMGMT_REQ; + tor->MGMT_flags = T_CURRENT; + tor->OPT_length = sizeof (struct opthdr) + sizeof (int); + tor->OPT_offset = sizeof (struct T_optmgmt_req); + + opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req)); + opt->level = level; + opt->name = name; + opt->len = sizeof (int); + mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + + sizeof (int); + + /* + * We will use this connection regardless + * of whether or not the option is readable. 
+ */ + if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) { + DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend); + freemsg(mp); + return (FALSE); + } + + mutex_enter(&clnt_pending_lock); + + waitp.tv_sec = clnt_cots_min_conntout; + waitp.tv_usec = 0; + error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1); + + if (e->call_prev) + e->call_prev->call_next = e->call_next; + else + clnt_pending = e->call_next; + if (e->call_next) + e->call_next->call_prev = e->call_prev; + mutex_exit(&clnt_pending_lock); + + /* get reply message */ + mp = e->call_reply; + e->call_reply = NULL; + + if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) { + + DTRACE_PROBE4(krpc__e__connmgr_getopt, int, name, + int, e->call_status, int, error, mblk_t *, mp); + + if (mp) + freemsg(mp); + return (FALSE); + } + + opt_ack = (struct T_optmgmt_ack *)mp->b_rptr; + opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset, + opt_ack->OPT_length, __TPI_ALIGN_SIZE); + + if (!opt_res) { + DTRACE_PROBE4(krpc__e__connmgr_optres, mblk_t *, mp, int, name, + int, opt_ack->OPT_offset, int, opt_ack->OPT_length); + freemsg(mp); + return (FALSE); + } + *val = *(int *)&opt_res[1]; + + DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val); + + freemsg(mp); + return (TRUE); +} + +/* * Called by connmgr_connect to set an option on the new stream. 
*/ static bool_t -connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr) +connmgr_setopt_int(queue_t *wq, int level, int name, int val, + calllist_t *e, cred_t *cr) { mblk_t *mp; struct opthdr *opt; @@ -2794,7 +2966,7 @@ opt->level = level; opt->name = name; opt->len = sizeof (int); - *(int *)((char *)opt + sizeof (*opt)) = 1; + *(int *)((char *)opt + sizeof (*opt)) = val; mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + sizeof (int); @@ -2835,6 +3007,12 @@ return (TRUE); } +static bool_t +connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr) +{ + return (connmgr_setopt_int(wq, level, name, 1, e, cr)); +} + #ifdef DEBUG /*