view usr/src/uts/common/inet/tcp/tcp_opt_data.c @ 13774:80b84226b49f

3065 some functions in the tcp module can be static Reviewed by: Sebastien Roy <sebastien.roy@delphix.com> Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Gary Mills <gary_mills@fastmail.fm> Approved by: Richard Lowe <richlowe@richlowe.net>
author David Höppner <0xffea@gmail.com>
date Fri, 17 Aug 2012 14:11:30 -0400
parents 4dad26dd1fb9
children 6d652ac77401
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/socket.h>
#include <sys/xti_xtiopt.h>
#include <sys/xti_inet.h>
#include <sys/policy.h>

#include <inet/common.h>
#include <netinet/ip6.h>
#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <inet/optcom.h>
#include <inet/proto_set.h>
#include <inet/tcp_impl.h>

static int	tcp_opt_default(queue_t *, int, int, uchar_t *);

/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 *       and is the superset of options that can be performed on a TCP over IP
 *       stack.
 */
opdes_t	tcp_opt_arr[] = {

{ SO_LINGER,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct linger), 0 },

{ SO_DEBUG,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_BROADCAST,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,	SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
	sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
	0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE,	SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ TCP_NODELAY,	IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},
{ TCP_MAXSEG,	IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
	536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
	OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
	0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
	sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
	},

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
	sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0	},

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT),
	IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL,	IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
	sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int),	0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
	sizeof (in_addr_t),	-1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
	sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_NODEFAULT|OP_VARLEN),
	sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	(OP_VARLEN|OP_NODEFAULT), 255*8,
	-1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
	OP_NODEFAULT,
	sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
	sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
	sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },
};

/*
 * Table of all supported levels
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options so we need this info separately.
 *
 * This is needed only for topmost tpi providers and is used only by
 * XTI interfaces.
 */
optlevel_t	tcp_valid_levels_arr[] = {
	XTI_GENERIC,
	SOL_SOCKET,
	IPPROTO_TCP,
	IPPROTO_IP,
	IPPROTO_IPV6
};


#define	TCP_OPT_ARR_CNT		A_CNT(tcp_opt_arr)
#define	TCP_VALID_LEVELS_CNT	A_CNT(tcp_valid_levels_arr)

uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */

/*
 * Initialize option database object for TCP
 *
 * This object represents database of options to search passed to
 * {sock,tpi}optcom_req() interface routine to take care of option
 * management and associated methods.
 */

optdb_obj_t tcp_opt_obj = {
	tcp_opt_default,	/* TCP default value function pointer */
	tcp_tpi_opt_get,	/* TCP get function pointer */
	tcp_tpi_opt_set,	/* TCP set function pointer */
	TCP_OPT_ARR_CNT,	/* TCP option database count of entries */
	tcp_opt_arr,		/* TCP option database */
	TCP_VALID_LEVELS_CNT,	/* TCP valid level count of entries */
	tcp_valid_levels_arr	/* TCP valid level array */
};

/* Maximum TCP initial cwin (start/restart). */
#define	TCP_MAX_INIT_CWND	16

static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;

/*
 * Some TCP options can be "set" by requesting them in the option
 * buffer. This is needed for XTI feature test though we do not
 * allow it in general. We interpret that this mechanism is more
 * applicable to OSI protocols and need not be allowed in general.
 * This routine filters out options for which it is not allowed (most)
 * and lets through those (few) for which it is. [ The XTI interface
 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
 * ever implemented will have to be allowed here ].
 */
static boolean_t
tcp_allow_connopt_set(int level, int name)
{

	switch (level) {
	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			return (B_TRUE);
		default:
			return (B_FALSE);
		}
		/*NOTREACHED*/
	default:
		return (B_FALSE);
	}
	/*NOTREACHED*/
}

/*
 * This routine gets default values of certain options whose default
 * values are maintained by protocol specific code
 */
/* ARGSUSED */
static int
tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
{
	int32_t	*i1 = (int32_t *)ptr;
	tcp_stack_t	*tcps = Q_TO_TCP(q)->tcp_tcps;

	switch (level) {
	case IPPROTO_TCP:
		switch (name) {
		case TCP_NOTIFY_THRESHOLD:
			*i1 = tcps->tcps_ip_notify_interval;
			break;
		case TCP_ABORT_THRESHOLD:
			*i1 = tcps->tcps_ip_abort_interval;
			break;
		case TCP_CONN_NOTIFY_THRESHOLD:
			*i1 = tcps->tcps_ip_notify_cinterval;
			break;
		case TCP_CONN_ABORT_THRESHOLD:
			*i1 = tcps->tcps_ip_abort_cinterval;
			break;
		default:
			return (-1);
		}
		break;
	case IPPROTO_IP:
		switch (name) {
		case IP_TTL:
			*i1 = tcps->tcps_ipv4_ttl;
			break;
		default:
			return (-1);
		}
		break;
	case IPPROTO_IPV6:
		switch (name) {
		case IPV6_UNICAST_HOPS:
			*i1 = tcps->tcps_ipv6_hoplimit;
			break;
		default:
			return (-1);
		}
		break;
	default:
		return (-1);
	}
	return (sizeof (int));
}

/*
 * TCP routine to get the values of options.
 */
int
tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
	int		*i1 = (int *)ptr;
	tcp_t		*tcp = connp->conn_tcp;
	conn_opt_arg_t	coas;
	int		retval;

	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_SND_COPYAVOID:
			*i1 = tcp->tcp_snd_zcopy_on ?
			    SO_SND_COPYAVOID : 0;
			return (sizeof (int));
		case SO_ACCEPTCONN:
			*i1 = (tcp->tcp_state == TCPS_LISTEN);
			return (sizeof (int));
		}
		break;
	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			*i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
			return (sizeof (int));
		case TCP_MAXSEG:
			*i1 = tcp->tcp_mss;
			return (sizeof (int));
		case TCP_NOTIFY_THRESHOLD:
			*i1 = (int)tcp->tcp_first_timer_threshold;
			return (sizeof (int));
		case TCP_ABORT_THRESHOLD:
			*i1 = tcp->tcp_second_timer_threshold;
			return (sizeof (int));
		case TCP_CONN_NOTIFY_THRESHOLD:
			*i1 = tcp->tcp_first_ctimer_threshold;
			return (sizeof (int));
		case TCP_CONN_ABORT_THRESHOLD:
			*i1 = tcp->tcp_second_ctimer_threshold;
			return (sizeof (int));
		case TCP_INIT_CWND:
			*i1 = tcp->tcp_init_cwnd;
			return (sizeof (int));
		case TCP_KEEPALIVE_THRESHOLD:
			*i1 = tcp->tcp_ka_interval;
			return (sizeof (int));

		/*
		 * TCP_KEEPIDLE expects value in seconds, but
		 * tcp_ka_interval is in milliseconds.
		 */
		case TCP_KEEPIDLE:
			*i1 = tcp->tcp_ka_interval / 1000;
			return (sizeof (int));
		case TCP_KEEPCNT:
			*i1 = tcp->tcp_ka_cnt;
			return (sizeof (int));

		/*
		 * TCP_KEEPINTVL expects value in seconds, but
		 * tcp_ka_rinterval is in milliseconds.
		 */
		case TCP_KEEPINTVL:
			*i1 = tcp->tcp_ka_rinterval / 1000;
			return (sizeof (int));
		case TCP_KEEPALIVE_ABORT_THRESHOLD:
			*i1 = tcp->tcp_ka_abort_thres;
			return (sizeof (int));
		case TCP_CORK:
			*i1 = tcp->tcp_cork;
			return (sizeof (int));
		case TCP_RTO_INITIAL:
			*i1 = tcp->tcp_rto_initial;
			return (sizeof (uint32_t));
		case TCP_RTO_MIN:
			*i1 = tcp->tcp_rto_min;
			return (sizeof (uint32_t));
		case TCP_RTO_MAX:
			*i1 = tcp->tcp_rto_max;
			return (sizeof (uint32_t));
		case TCP_LINGER2:
			*i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
			return (sizeof (int));
		}
		break;
	case IPPROTO_IP:
		if (connp->conn_family != AF_INET)
			return (-1);
		switch (name) {
		case IP_OPTIONS:
		case T_IP_OPTIONS:
			/* Caller ensures enough space */
			return (ip_opt_get_user(connp, ptr));
		default:
			break;
		}
		break;

	case IPPROTO_IPV6:
		/*
		 * IPPROTO_IPV6 options are only supported for sockets
		 * that are using IPv6 on the wire.
		 */
		if (connp->conn_ipversion != IPV6_VERSION) {
			return (-1);
		}
		switch (name) {
		case IPV6_PATHMTU:
			if (tcp->tcp_state < TCPS_ESTABLISHED)
				return (-1);
			break;
		}
		break;
	}
	mutex_enter(&connp->conn_lock);
	retval = conn_opt_get(&coas, level, name, ptr);
	mutex_exit(&connp->conn_lock);
	return (retval);
}

/*
 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 * Parameters are assumed to be verified by the caller.
 */
/* ARGSUSED */
int
tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	tcp_t	*tcp = connp->conn_tcp;
	int	*i1 = (int *)invalp;
	boolean_t onoff = (*i1 == 0) ? 0 : 1;
	boolean_t checkonly;
	int	reterr;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_opt_arg_t	coas;
	uint32_t	val = *((uint32_t *)invalp);

	coas.coa_connp = connp;
	coas.coa_ixa = connp->conn_ixa;
	coas.coa_ipp = &connp->conn_xmit_ipp;
	coas.coa_ancillary = B_FALSE;
	coas.coa_changed = 0;

	switch (optset_context) {
	case SETFN_OPTCOM_CHECKONLY:
		checkonly = B_TRUE;
		/*
		 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
		 * inlen != 0 implies value supplied and
		 * 	we have to "pretend" to set it.
		 * inlen == 0 implies that there is no
		 * 	value part in T_CHECK request and just validation
		 * done elsewhere should be enough, we just return here.
		 */
		if (inlen == 0) {
			*outlenp = 0;
			return (0);
		}
		break;
	case SETFN_OPTCOM_NEGOTIATE:
		checkonly = B_FALSE;
		break;
	case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
	case SETFN_CONN_NEGOTIATE:
		checkonly = B_FALSE;
		/*
		 * Negotiating local and "association-related" options
		 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
		 * primitives is allowed by XTI, but we choose
		 * to not implement this style negotiation for Internet
		 * protocols (We interpret it is a must for OSI world but
		 * optional for Internet protocols) for all options.
		 * [ Will do only for the few options that enable test
		 * suites that our XTI implementation of this feature
		 * works for transports that do allow it ]
		 */
		if (!tcp_allow_connopt_set(level, name)) {
			*outlenp = 0;
			return (EINVAL);
		}
		break;
	default:
		/*
		 * We should never get here
		 */
		*outlenp = 0;
		return (EINVAL);
	}

	ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
	    (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

	/*
	 * For TCP, we should have no ancillary data sent down
	 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
	 * has to be zero.
	 */
	ASSERT(thisdg_attrs == NULL);

	/*
	 * For fixed length options, no sanity check
	 * of passed in length is done. It is assumed *_optcom_req()
	 * routines do the right thing.
	 */
	switch (level) {
	case SOL_SOCKET:
		switch (name) {
		case SO_KEEPALIVE:
			if (checkonly) {
				/* check only case */
				break;
			}

			if (!onoff) {
				if (connp->conn_keepalive) {
					if (tcp->tcp_ka_tid != 0) {
						(void) TCP_TIMER_CANCEL(tcp,
						    tcp->tcp_ka_tid);
						tcp->tcp_ka_tid = 0;
					}
					connp->conn_keepalive = 0;
				}
				break;
			}
			if (!connp->conn_keepalive) {
				/* Crank up the keepalive timer */
				tcp->tcp_ka_last_intrvl = 0;
				tcp->tcp_ka_tid = TCP_TIMER(tcp,
				    tcp_keepalive_timer, tcp->tcp_ka_interval);
				connp->conn_keepalive = 1;
			}
			break;
		case SO_SNDBUF: {
			if (*i1 > tcps->tcps_max_buf) {
				*outlenp = 0;
				return (ENOBUFS);
			}
			if (checkonly)
				break;

			connp->conn_sndbuf = *i1;
			if (tcps->tcps_snd_lowat_fraction != 0) {
				connp->conn_sndlowat = connp->conn_sndbuf /
				    tcps->tcps_snd_lowat_fraction;
			}
			(void) tcp_maxpsz_set(tcp, B_TRUE);
			/*
			 * If we are flow-controlled, recheck the condition.
			 * There are apps that increase SO_SNDBUF size when
			 * flow-controlled (EWOULDBLOCK), and expect the flow
			 * control condition to be lifted right away.
			 */
			mutex_enter(&tcp->tcp_non_sq_lock);
			if (tcp->tcp_flow_stopped &&
			    TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
				tcp_clrqfull(tcp);
			}
			mutex_exit(&tcp->tcp_non_sq_lock);
			*outlenp = inlen;
			return (0);
		}
		case SO_RCVBUF:
			if (*i1 > tcps->tcps_max_buf) {
				*outlenp = 0;
				return (ENOBUFS);
			}
			/* Silently ignore zero */
			if (!checkonly && *i1 != 0) {
				*i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
				(void) tcp_rwnd_set(tcp, *i1);
			}
			/*
			 * XXX should we return the rwnd here
			 * and tcp_opt_get ?
			 */
			*outlenp = inlen;
			return (0);
		case SO_SND_COPYAVOID:
			if (!checkonly) {
				if (tcp->tcp_loopback ||
				    (onoff != 1) || !tcp_zcopy_check(tcp)) {
					*outlenp = 0;
					return (EOPNOTSUPP);
				}
				tcp->tcp_snd_zcopy_aware = 1;
			}
			*outlenp = inlen;
			return (0);
		}
		break;
	case IPPROTO_TCP:
		switch (name) {
		case TCP_NODELAY:
			if (!checkonly)
				tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
			break;
		case TCP_NOTIFY_THRESHOLD:
			if (!checkonly)
				tcp->tcp_first_timer_threshold = *i1;
			break;
		case TCP_ABORT_THRESHOLD:
			if (!checkonly)
				tcp->tcp_second_timer_threshold = *i1;
			break;
		case TCP_CONN_NOTIFY_THRESHOLD:
			if (!checkonly)
				tcp->tcp_first_ctimer_threshold = *i1;
			break;
		case TCP_CONN_ABORT_THRESHOLD:
			if (!checkonly)
				tcp->tcp_second_ctimer_threshold = *i1;
			break;
		case TCP_RECVDSTADDR:
			if (tcp->tcp_state > TCPS_LISTEN) {
				*outlenp = 0;
				return (EOPNOTSUPP);
			}
			/* Setting done in conn_opt_set */
			break;
		case TCP_INIT_CWND:
			if (checkonly)
				break;

			/*
			 * Only allow socket with network configuration
			 * privilege to set the initial cwnd to be larger
			 * than allowed by RFC 3390.
			 */
			if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
				if ((reterr = secpolicy_ip_config(cr, B_TRUE))
				    != 0) {
					*outlenp = 0;
					return (reterr);
				}
				if (val > tcp_max_init_cwnd) {
					*outlenp = 0;
					return (EINVAL);
				}
			}

			tcp->tcp_init_cwnd = val;

			/*
			 * If the socket is connected, AND no outbound data
			 * has been sent, reset the actual cwnd values.
			 */
			if (tcp->tcp_state == TCPS_ESTABLISHED &&
			    tcp->tcp_iss == tcp->tcp_snxt - 1) {
				tcp->tcp_cwnd =
				    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
			}
			break;

		/*
		 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
		 * is in milliseconds. TCP_KEEPIDLE is introduced for
		 * compatibility with other Unix flavors.
		 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
		 * converting the input to milliseconds.
		 */
		case TCP_KEEPIDLE:
			*i1 *= 1000;
			/* FALLTHRU */

		case TCP_KEEPALIVE_THRESHOLD:
			if (checkonly)
				break;

			if (*i1 < tcps->tcps_keepalive_interval_low ||
			    *i1 > tcps->tcps_keepalive_interval_high) {
				*outlenp = 0;
				return (EINVAL);
			}
			if (*i1 != tcp->tcp_ka_interval) {
				tcp->tcp_ka_interval = *i1;
				/*
				 * Check if we need to restart the
				 * keepalive timer.
				 */
				if (tcp->tcp_ka_tid != 0) {
					ASSERT(connp->conn_keepalive);
					(void) TCP_TIMER_CANCEL(tcp,
					    tcp->tcp_ka_tid);
					tcp->tcp_ka_last_intrvl = 0;
					tcp->tcp_ka_tid = TCP_TIMER(tcp,
					    tcp_keepalive_timer,
					    tcp->tcp_ka_interval);
				}
			}
			break;

		/*
		 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
		 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
		 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
		 * tcp_ka_cnt.
		 */
		case TCP_KEEPCNT:
			if (checkonly)
				break;

			if (*i1 == 0) {
				return (EINVAL);
			} else if (tcp->tcp_ka_rinterval == 0) {
				if ((tcp->tcp_ka_abort_thres / *i1) <
				    tcp->tcp_rto_min ||
				    (tcp->tcp_ka_abort_thres / *i1) >
				    tcp->tcp_rto_max)
					return (EINVAL);

				tcp->tcp_ka_rinterval =
				    tcp->tcp_ka_abort_thres / *i1;
			} else {
				if ((*i1 * tcp->tcp_ka_rinterval) <
				    tcps->tcps_keepalive_abort_interval_low ||
				    (*i1 * tcp->tcp_ka_rinterval) >
				    tcps->tcps_keepalive_abort_interval_high)
					return (EINVAL);
				tcp->tcp_ka_abort_thres =
				    (*i1 * tcp->tcp_ka_rinterval);
			}
			tcp->tcp_ka_cnt = *i1;
			break;
		case TCP_KEEPINTVL:
			/*
			 * TCP_KEEPINTVL is specified in seconds, but
			 * tcp_ka_rinterval is in milliseconds.
			 */

			if (checkonly)
				break;

			if ((*i1 * 1000) < tcp->tcp_rto_min ||
			    (*i1 * 1000) > tcp->tcp_rto_max)
				return (EINVAL);

			if (tcp->tcp_ka_cnt == 0) {
				tcp->tcp_ka_cnt =
				    tcp->tcp_ka_abort_thres / (*i1 * 1000);
			} else {
				if ((*i1 * tcp->tcp_ka_cnt * 1000) <
				    tcps->tcps_keepalive_abort_interval_low ||
				    (*i1 * tcp->tcp_ka_cnt * 1000) >
				    tcps->tcps_keepalive_abort_interval_high)
					return (EINVAL);
				tcp->tcp_ka_abort_thres =
				    (*i1 * tcp->tcp_ka_cnt * 1000);
			}
			tcp->tcp_ka_rinterval = *i1 * 1000;
			break;
		case TCP_KEEPALIVE_ABORT_THRESHOLD:
			if (!checkonly) {
				if (*i1 <
				    tcps->tcps_keepalive_abort_interval_low ||
				    *i1 >
				    tcps->tcps_keepalive_abort_interval_high) {
					*outlenp = 0;
					return (EINVAL);
				}
				tcp->tcp_ka_abort_thres = *i1;
				tcp->tcp_ka_cnt = 0;
				tcp->tcp_ka_rinterval = 0;
			}
			break;
		case TCP_CORK:
			if (!checkonly) {
				/*
				 * if tcp->tcp_cork was set and is now
				 * being unset, we have to make sure that
				 * the remaining data gets sent out. Also
				 * unset tcp->tcp_cork so that tcp_wput_data()
				 * can send data even if it is less than mss
				 */
				if (tcp->tcp_cork && onoff == 0 &&
				    tcp->tcp_unsent > 0) {
					tcp->tcp_cork = B_FALSE;
					tcp_wput_data(tcp, NULL, B_FALSE);
				}
				tcp->tcp_cork = onoff;
			}
			break;
		case TCP_RTO_INITIAL: {
			clock_t rto;

			if (checkonly || val == 0)
				break;

			/*
			 * Sanity checks
			 *
			 * The initial RTO should be bounded by the minimum
			 * and maximum RTO.  And it should also be smaller
			 * than the connect attempt abort timeout.  Otherwise,
			 * the connection won't be aborted in a period
			 * reasonably close to that timeout.
			 */
			if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
			    val > tcp->tcp_second_ctimer_threshold ||
			    val < tcps->tcps_rexmit_interval_initial_low ||
			    val > tcps->tcps_rexmit_interval_initial_high) {
				*outlenp = 0;
				return (EINVAL);
			}
			tcp->tcp_rto_initial = val;

			/*
			 * If TCP has not sent anything, need to re-calculate
			 * tcp_rto.  Otherwise, this option change does not
			 * really affect anything.
			 */
			if (tcp->tcp_state >= TCPS_SYN_SENT)
				break;

			tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
			tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
			rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
			    tcps->tcps_rexmit_interval_extra +
			    (tcp->tcp_rtt_sa >> 5) +
			    tcps->tcps_conn_grace_period;
			TCP_SET_RTO(tcp, rto);
			break;
		}
		case TCP_RTO_MIN:
			if (checkonly || val == 0)
				break;

			if (val < tcps->tcps_rexmit_interval_min_low ||
			    val > tcps->tcps_rexmit_interval_min_high ||
			    val > tcp->tcp_rto_max) {
				*outlenp = 0;
				return (EINVAL);
			}
			tcp->tcp_rto_min = val;
			if (tcp->tcp_rto < val)
				tcp->tcp_rto = val;
			break;
		case TCP_RTO_MAX:
			if (checkonly || val == 0)
				break;

			/*
			 * Sanity checks
			 *
			 * The maximum RTO should not be larger than the
			 * connection abort timeout.  Otherwise, the
			 * connection won't be aborted in a period reasonably
			 * close to that timeout.
			 */
			if (val < tcps->tcps_rexmit_interval_max_low ||
			    val > tcps->tcps_rexmit_interval_max_high ||
			    val < tcp->tcp_rto_min ||
			    val > tcp->tcp_second_timer_threshold) {
				*outlenp = 0;
				return (EINVAL);
			}
			tcp->tcp_rto_max = val;
			if (tcp->tcp_rto > val)
				tcp->tcp_rto = val;
			break;
		case TCP_LINGER2:
			if (checkonly || *i1 == 0)
				break;

			/*
			 * Note that the option value's unit is second.  And
			 * the value should be bigger than the private
			 * parameter tcp_fin_wait_2_flush_interval's lower
			 * bound and smaller than the current value of that
			 * parameter.  It should be smaller than the current
			 * value to avoid an app setting TCP_LINGER2 to a big
			 * value, causing resource to be held up too long in
			 * FIN-WAIT-2 state.
			 */
			if (*i1 < 0 ||
			    tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
			    *i1 ||
			    tcps->tcps_fin_wait_2_flush_interval/SECONDS <
			    *i1) {
				*outlenp = 0;
				return (EINVAL);
			}
			tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
			break;
		default:
			break;
		}
		break;
	case IPPROTO_IP:
		if (connp->conn_family != AF_INET) {
			*outlenp = 0;
			return (EINVAL);
		}
		switch (name) {
		case IP_SEC_OPT:
			/*
			 * We should not allow policy setting after
			 * we start listening for connections.
			 */
			if (tcp->tcp_state == TCPS_LISTEN) {
				return (EINVAL);
			}
			break;
		}
		break;
	case IPPROTO_IPV6:
		/*
		 * IPPROTO_IPV6 options are only supported for sockets
		 * that are using IPv6 on the wire.
		 */
		if (connp->conn_ipversion != IPV6_VERSION) {
			*outlenp = 0;
			return (EINVAL);
		}

		switch (name) {
		case IPV6_RECVPKTINFO:
			if (!checkonly) {
				/* Force it to be sent up with the next msg */
				tcp->tcp_recvifindex = 0;
			}
			break;
		case IPV6_RECVTCLASS:
			if (!checkonly) {
				/* Force it to be sent up with the next msg */
				tcp->tcp_recvtclass = 0xffffffffU;
			}
			break;
		case IPV6_RECVHOPLIMIT:
			if (!checkonly) {
				/* Force it to be sent up with the next msg */
				tcp->tcp_recvhops = 0xffffffffU;
			}
			break;
		case IPV6_PKTINFO:
			/* This is an extra check for TCP */
			if (inlen == sizeof (struct in6_pktinfo)) {
				struct in6_pktinfo *pkti;

				pkti = (struct in6_pktinfo *)invalp;
				/*
				 * RFC 3542 states that ipi6_addr must be
				 * the unspecified address when setting the
				 * IPV6_PKTINFO sticky socket option on a
				 * TCP socket.
				 */
				if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
					return (EINVAL);
			}
			break;
		case IPV6_SEC_OPT:
			/*
			 * We should not allow policy setting after
			 * we start listening for connections.
			 */
			if (tcp->tcp_state == TCPS_LISTEN) {
				return (EINVAL);
			}
			break;
		}
		break;
	}
	reterr = conn_opt_set(&coas, level, name, inlen, invalp,
	    checkonly, cr);
	if (reterr != 0) {
		*outlenp = 0;
		return (reterr);
	}

	/*
	 * Common case of OK return with outval same as inval
	 */
	if (invalp != outvalp) {
		/* don't trust bcopy for identical src/dst */
		(void) bcopy(invalp, outvalp, inlen);
	}
	*outlenp = inlen;

	if (coas.coa_changed & COA_HEADER_CHANGED) {
		/* If we are connected we rebuilt the headers */
		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			reterr = tcp_build_hdrs(tcp);
			if (reterr != 0)
				return (reterr);
		}
	}
	if (coas.coa_changed & COA_ROUTE_CHANGED) {
		in6_addr_t nexthop;

		/*
		 * If we are connected we re-cache the information.
		 * We ignore errors to preserve BSD behavior.
		 * Note that we don't redo IPsec policy lookup here
		 * since the final destination (or source) didn't change.
		 */
		ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
		    &connp->conn_faddr_v6, &nexthop);

		if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
		    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
			(void) ip_attr_connect(connp, connp->conn_ixa,
			    &connp->conn_laddr_v6, &connp->conn_faddr_v6,
			    &nexthop, connp->conn_fport, NULL, NULL,
			    IPDF_VERIFY_DST);
		}
	}
	if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
		connp->conn_wq->q_hiwat = connp->conn_sndbuf;
	}
	if (coas.coa_changed & COA_WROFF_CHANGED) {
		connp->conn_wroff = connp->conn_ht_iphc_allocated +
		    tcps->tcps_wroff_xtra;
		(void) proto_set_tx_wroff(connp->conn_rq, connp,
		    connp->conn_wroff);
	}
	if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
		if (IPCL_IS_NONSTR(connp))
			proto_set_rx_oob_opt(connp, onoff);
	}
	return (0);
}