view usr/src/uts/common/fs/sockfs/nl7curi.c @ 12714:711e35fbf43b

6878143 Assertion failure in sockfs:uri_delete
author Anil udupa <anil.udupa@sun.com>
date Mon, 28 Jun 2010 10:49:02 -0700
parents cebb50cbe4f9
children
line wrap: on
line source

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <sys/condvar_impl.h>
#include <sys/sendfile.h>
#include <fs/sockfs/nl7c.h>
#include <fs/sockfs/nl7curi.h>
#include <fs/sockfs/socktpi_impl.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/led.h>
#include <inet/mi.h>

#include <inet/nca/ncadoorhdr.h>
#include <inet/nca/ncalogd.h>
#include <inet/nca/ncandd.h>

#include <sys/promif.h>

/*
 * Some externs:
 */

extern boolean_t	nl7c_logd_enabled;
extern void		nl7c_logd_log(uri_desc_t *, uri_desc_t *,
			    time_t, ipaddr_t);
extern boolean_t	nl7c_close_addr(struct sonode *);
extern struct sonode	*nl7c_addr2portso(void *);
extern uri_desc_t	*nl7c_http_cond(uri_desc_t *, uri_desc_t *);

/*
 * Various global tuneables:
 */

clock_t		nl7c_uri_ttl = -1;	/* TTL in seconds (-1 == infinite) */

boolean_t	nl7c_use_kmem = B_FALSE; /* Force use of kmem (no segmap) */

uint64_t	nl7c_file_prefetch = 1; /* File cache prefetch pages */

uint64_t	nl7c_uri_max = 0;	/* Maximum bytes (0 == infinite) */
uint64_t	nl7c_uri_bytes = 0;	/* Bytes of kmem used by URIs */

/*
 * Locals:
 */

static int	uri_rd_response(struct sonode *, uri_desc_t *,
		    uri_rd_t *, boolean_t);
static int	uri_response(struct sonode *, uri_desc_t *);

/*
 * HTTP scheme functions called from nl7chttp.c:
 */

boolean_t nl7c_http_request(char **, char *, uri_desc_t *, struct sonode *);
boolean_t nl7c_http_response(char **, char *, uri_desc_t *, struct sonode *);
boolean_t nl7c_http_cmp(void *, void *);
mblk_t *nl7c_http_persist(struct sonode *);
void nl7c_http_free(void *arg);
void nl7c_http_init(void);

/*
 * Counters that need to move to kstat and/or be removed:
 */

/* Request/lookup disposition counters */
volatile uint64_t nl7c_uri_request = 0;
volatile uint64_t nl7c_uri_hit = 0;
volatile uint64_t nl7c_uri_pass = 0;
volatile uint64_t nl7c_uri_miss = 0;
volatile uint64_t nl7c_uri_temp = 0;
volatile uint64_t nl7c_uri_more = 0;
volatile uint64_t nl7c_uri_data = 0;
volatile uint64_t nl7c_uri_sendfilev = 0;
/* Reclaim activity counters (see uri_kmc_reclaim()/nl7c_uri_reclaim()) */
volatile uint64_t nl7c_uri_reclaim_calls = 0;
volatile uint64_t nl7c_uri_reclaim_cnt = 0;
/* Request parse outcome counters */
volatile uint64_t nl7c_uri_pass_urifail = 0;
volatile uint64_t nl7c_uri_pass_dupbfail = 0;
volatile uint64_t nl7c_uri_more_get = 0;
volatile uint64_t nl7c_uri_pass_method = 0;
volatile uint64_t nl7c_uri_pass_option = 0;
volatile uint64_t nl7c_uri_more_eol = 0;
volatile uint64_t nl7c_uri_more_http = 0;
volatile uint64_t nl7c_uri_pass_http = 0;
volatile uint64_t nl7c_uri_pass_addfail = 0;
volatile uint64_t nl7c_uri_pass_temp = 0;
/* Cache expire/purge and close/free counters */
volatile uint64_t nl7c_uri_expire = 0;
volatile uint64_t nl7c_uri_purge = 0;
volatile uint64_t nl7c_uri_NULL1 = 0;	/* nl7c_data() called with no URI */
volatile uint64_t nl7c_uri_NULL2 = 0;	/* nl7c_sendfilev() with no URI */
volatile uint64_t nl7c_uri_close = 0;
volatile uint64_t nl7c_uri_temp_close = 0;
volatile uint64_t nl7c_uri_free = 0;
volatile uint64_t nl7c_uri_temp_free = 0;
volatile uint64_t nl7c_uri_temp_mk = 0;
volatile uint64_t nl7c_uri_rd_EAGAIN = 0;

/*
 * Various kmem_cache_t's:
 */

kmem_cache_t *nl7c_uri_kmc;
kmem_cache_t *nl7c_uri_rd_kmc;
static kmem_cache_t *uri_desb_kmc;
static kmem_cache_t *uri_segmap_kmc;

static void uri_kmc_reclaim(void *);

static void nl7c_uri_reclaim(void);

/*
 * The URI hash is a dynamically sized A/B bucket hash, when the current
 * hash's average bucket chain length exceeds URI_HASH_AVRG a new hash of
 * the next P2Ps[] size is created.
 *
 * All lookups are done in the current hash then the new hash (if any).
 * If there is a new hash then, when a current hash bucket chain is
 * examined, any uri_desc_t members will be migrated to the new hash;
 * when the last uri_desc_t has been migrated the new hash becomes the
 * current one and the previous current hash is freed, leaving a single
 * hash.
 *
 * uri_hash_t - hash bucket (chain) type, contained in the uri_hash_ab[]
 * and can be accessed only after acquiring the uri_hash_access lock (for
 * READER or WRITER) then acquiring the lock uri_hash_t.lock; the uri_hash_t
 * and all linked uri_desc_t.hash members are then protected. Note, a
 * REF_HOLD() is placed on all uri_desc_t uri_hash_t list members.
 *
 * uri_hash_access - rwlock for all uri_hash_* variables, READER for read
 * access and WRITER for write access. Note, WRITER is only required for
 * hash geometry changes.
 *
 * uri_hash_which - which uri_hash_ab[] is the current hash.
 *
 * uri_hash_n[] - the P2Ps[] index for each uri_hash_ab[].
 *
 * uri_hash_sz[] - the size for each uri_hash_ab[].
 *
 * uri_hash_cnt[] - the total uri_desc_t members for each uri_hash_ab[].
 *
 * uri_hash_overflow[] - the uri_hash_cnt[] for each uri_hash_ab[] when
 * a new uri_hash_ab[] needs to be created.
 *
 * uri_hash_ab[] - the uri_hash_t entries.
 *
 * uri_hash_lru[] - the last uri_hash_ab[] walked for lru reclaim.
 */

/*
 * A/B hash state (see the block comment above). Indexes [0]/[1] are
 * the A and B hash instances; uri_hash_which selects the current one.
 * All of these variables are protected by uri_hash_access.
 */
typedef struct uri_hash_s {
	struct uri_desc_s	*list;		/* List of uri_t(s) */
	kmutex_t		lock;		/* Protects this chain */
} uri_hash_t;

#define	URI_HASH_AVRG	5	/* Desired average hash chain length */
#define	URI_HASH_N_INIT	9	/* P2Ps[] initial index */

static krwlock_t	uri_hash_access;
static uint32_t		uri_hash_which = 0;
static uint32_t		uri_hash_n[2] = {URI_HASH_N_INIT, 0};
static uint32_t		uri_hash_sz[2] = {0, 0};
static uint32_t		uri_hash_cnt[2] = {0, 0};
static uint32_t		uri_hash_overflow[2] = {0, 0};
static uri_hash_t	*uri_hash_ab[2] = {NULL, NULL};
static uri_hash_t	*uri_hash_lru[2] = {NULL, NULL};

/*
 * Primes for N of 3 - 24 where P is the first prime less than
 * (2^(N-1))+(2^(N-2)); these primes have been found to be useful for
 * prime sized hash tables.
 */

static const int P2Ps[] = {
	0, 0, 0, 5, 11, 23, 47, 89, 191, 383, 761, 1531, 3067,
	6143, 12281, 24571, 49139, 98299, 196597, 393209,
	786431, 1572853, 3145721, 6291449, 12582893, 0};

/*
 * Hash macros:
 *
 *    H2A(char *cp, char *ep, char c) - convert the escaped octet (ASCII)
 *    hex multichar of the format "%HH" pointed to by *cp to a char and
 *    return it in c; *ep points past the end of the (char *) string, and
 *    on return *cp will point to the last char consumed.
 *
 *    URI_HASH(unsigned hix, char *cp, char *ep) - hash the char(s) from
 *    *cp to *ep to the unsigned hix, cp nor ep are modified.
 *
 *    URI_HASH_IX(unsigned hix, int which) - convert the hash value hix to
 *    a hash index 0 - (uri_hash_sz[which] - 1).
 *
 *    URI_HASH_MIGRATE(from, hp, to) - migrate the uri_hash_t *hp list
 *    uri_desc_t members from hash from to hash to.
 *
 *    URI_HASH_UNLINK(cur, new, hp, puri, uri) - unlink the uri_desc_t
 *    *uri which is a member of the uri_hash_t *hp list with a previous
 *    list member of *puri for the uri_hash_ab[] cur. After unlinking
 *    check for cur hash empty, if so make new cur. Note, as this macro
 *    can change a hash chain it needs to be run under hash_access as
 *    RW_WRITER; further, as it can promote the new hash to cur, any
 *    subsequent access to the hash state must be done either by dropping
 *    locks and starting over or by making sure the global state is still
 *    consistent with what was seen before.
 */

/*
 * H2A(cp, ep, c) - decode an escaped octet of the form "%HH" starting
 * at *cp (which points at the '%') into the char (c). Consumes up to
 * two hex digits, stopping early at (ep); on return (cp) points at the
 * last character consumed. Note, the hex digit range tests must use
 * &&, not || (with || every character matched the 'a'-'f' test, so
 * uppercase hex digits were decoded with the wrong offset).
 */
#define	H2A(cp, ep, c) {						\
	int	_h = 2;							\
	int	_n = 0;							\
	char	_hc;							\
									\
	while (_h > 0 && ++(cp) < (ep)) {				\
		if (_h == 1)						\
			_n *= 0x10;					\
		_hc = *(cp);						\
		if (_hc >= '0' && _hc <= '9')				\
			_n += _hc - '0';				\
		else if (_hc >= 'a' && _hc <= 'f')			\
			_n += _hc - 'W';				\
		else if (_hc >= 'A' && _hc <= 'F')			\
			_n += _hc - '7';				\
		_h--;							\
	}								\
	(c) = _n;							\
}

/*
 * URI_HASH(hv, cp, ep) - fold the chars from *cp up to (but not
 * including) *ep into the hash accumulator hv via CHASH(), first
 * decoding any "%HH" escaped octets via H2A() so escaped and literal
 * spellings of the same URI hash identically. Neither cp nor ep is
 * modified (H2A() advances a private copy).
 */
#define	URI_HASH(hv, cp, ep) {						\
	char	*_s = (cp);						\
	char	_c;							\
									\
	while (_s < (ep)) {						\
		if ((_c = *_s) == '%') {				\
			H2A(_s, (ep), _c);				\
		}							\
		CHASH(hv, _c);						\
		_s++;							\
	}								\
}

/* Map a raw hash value to a bucket index for hash instance (which) */
#define	URI_HASH_IX(hix, which) (hix) = (hix) % (uri_hash_sz[(which)])

/*
 * Move every uri_desc_t on the *hp chain of hash (from) onto its
 * bucket in hash (to), re-deriving the bucket index from the saved
 * hvalue, maintaining the per-hash counts and resetting each URI's
 * hit counter. Both the source and destination chain locks are taken;
 * the caller must hold uri_hash_access (see block comment above).
 */
#define	URI_HASH_MIGRATE(from, hp, to) {				\
	uri_desc_t	*_nuri;						\
	uint32_t	_nhix;						\
	uri_hash_t	*_nhp;						\
									\
	mutex_enter(&(hp)->lock);					\
	while ((_nuri = (hp)->list) != NULL) {				\
		(hp)->list = _nuri->hash;				\
		atomic_add_32(&uri_hash_cnt[(from)], -1);		\
		atomic_add_32(&uri_hash_cnt[(to)], 1);			\
		_nhix = _nuri->hvalue;					\
		URI_HASH_IX(_nhix, to);					\
		_nhp = &uri_hash_ab[(to)][_nhix];			\
		mutex_enter(&_nhp->lock);				\
		_nuri->hash = _nhp->list;				\
		_nhp->list = _nuri;					\
		_nuri->hit = 0;						\
		mutex_exit(&_nhp->lock);				\
	}								\
	mutex_exit(&(hp)->lock);					\
}

/*
 * Unlink *uri from the *hp chain; (puri) is its immediate chain
 * predecessor or NULL when *uri is the chain head. If this removes
 * the last member of the cur hash and a new hash exists, the cur hash
 * is freed and the new hash is promoted to current. Caller must hold
 * uri_hash_access as RW_WRITER (see block comment above).
 */
#define	URI_HASH_UNLINK(cur, new, hp, puri, uri) {			\
	if ((puri) != NULL) {						\
		(puri)->hash = (uri)->hash;				\
	} else {							\
		(hp)->list = (uri)->hash;				\
	}								\
	if (atomic_add_32_nv(&uri_hash_cnt[(cur)], -1) == 0 &&		\
	    uri_hash_ab[(new)] != NULL) {				\
		kmem_free(uri_hash_ab[cur],				\
		    sizeof (uri_hash_t) * uri_hash_sz[cur]);		\
		uri_hash_ab[(cur)] = NULL;				\
		uri_hash_lru[(cur)] = NULL;				\
		uri_hash_which = (new);					\
	} else {							\
		uri_hash_lru[(cur)] = (hp);				\
	}								\
}

/*
 * One-time NL7C URI cache initialization: set up the hash access
 * rwlock, the initial (current) A/B hash instance, the kmem caches,
 * and the HTTP scheme private state.
 */
void
nl7c_uri_init(void)
{
	uint32_t	which = uri_hash_which;

	/* The rwlock guarding all of the A/B hash state */
	rw_init(&uri_hash_access, NULL, RW_DEFAULT, NULL);

	/* Size the initial hash from the P2Ps[] prime table */
	uri_hash_sz[which] = P2Ps[URI_HASH_N_INIT];
	uri_hash_overflow[which] = P2Ps[URI_HASH_N_INIT] * URI_HASH_AVRG;
	uri_hash_ab[which] = kmem_zalloc(
	    sizeof (uri_hash_t) * uri_hash_sz[which], KM_SLEEP);
	uri_hash_lru[which] = uri_hash_ab[which];

	/* kmem caches for URI descriptors and their support structures */
	nl7c_uri_kmc = kmem_cache_create("NL7C_uri_kmc", sizeof (uri_desc_t),
	    0, NULL, NULL, uri_kmc_reclaim, NULL, NULL, 0);
	nl7c_uri_rd_kmc = kmem_cache_create("NL7C_uri_rd_kmc",
	    sizeof (uri_rd_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	uri_desb_kmc = kmem_cache_create("NL7C_uri_desb_kmc",
	    sizeof (uri_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	uri_segmap_kmc = kmem_cache_create("NL7C_uri_segmap_kmc",
	    sizeof (uri_segmap_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/* Scheme (HTTP) private initialization */
	nl7c_http_init();
}

#define	CV_SZ	16

/*
 * Report hash geometry and a bucket chain-length histogram into *mp
 * for the current hash and, if a resize is in progress, the new hash.
 * Chain lengths of CV_SZ or more are lumped into the "Long" bucket.
 */
void
nl7c_mi_report_hash(mblk_t *mp)
{
	uri_hash_t	*bp, *bend;
	uri_desc_t	*up;
	uint32_t	walk;
	uint32_t	other;
	int		len, used, total;
	uint32_t	hist[CV_SZ + 1];
	boolean_t	more;

	rw_enter(&uri_hash_access, RW_READER);
	walk = uri_hash_which;
	other = walk ? 0 : 1;
	do {
		int	i;

		/* Build the chain-length histogram for this hash */
		for (i = 0; i <= CV_SZ; i++)
			hist[i] = 0;
		used = 0;
		total = 0;
		bp = &uri_hash_ab[walk][0];
		bend = &uri_hash_ab[walk][uri_hash_sz[walk]];
		for (; bp < bend; bp++) {
			len = 0;
			for (up = bp->list; up != NULL; up = up->hash)
				len++;
			total += len;
			if (len > 0)
				used++;
			if (len > CV_SZ)
				len = CV_SZ;
			hist[len]++;
		}

		/* Average is rounded to one decimal's worth of precision */
		(void) mi_mpprintf(mp, "\nHash=%s, Buckets=%d, "
		    "Avrg=%d\nCount by bucket:",
		    walk != other ? "CUR" : "NEW", uri_hash_sz[walk],
		    used != 0 ? ((total * 10 + 5) / used) / 10 : 0);
		(void) mi_mpprintf(mp, "Free=%d", hist[0]);
		for (i = 1; i < CV_SZ; i++) {
			char	pad[5];
			char	*pp = pad;
			int	w;

			/* Right-justify the count label to 4 columns */
			for (w = i; w < 1000; w *= 10)
				*pp++ = ' ';
			*pp = 0;
			(void) mi_mpprintf(mp, "%s%d=%d", pad, i, hist[i]);
		}
		(void) mi_mpprintf(mp, "Long=%d", hist[CV_SZ]);

		/* One more pass if a new hash exists (at most 2 passes) */
		more = (walk != other && uri_hash_ab[other] != NULL);
		walk = other;
	} while (more);
	rw_exit(&uri_hash_access);
}

/*
 * Walk all hash chains (current hash, then the new hash if a resize
 * is in progress) and report one line per cached URI into *mp. The
 * URI path is temporarily NUL terminated in place around the
 * mi_mpprintf() call. Stops early if mi_mpprintf() fails (-1).
 *
 * Note, ret must be initialized: a bucket's chain may be empty so
 * the inner while loop may never assign it, and the reads of ret
 * below would otherwise be of an indeterminate value.
 */
void
nl7c_mi_report_uri(mblk_t *mp)
{
	uri_hash_t	*hp;
	uri_desc_t	*uri;
	uint32_t	cur;
	uint32_t	new;
	int		ix;
	int		ret = 0;
	char		sc;

	rw_enter(&uri_hash_access, RW_READER);
	cur = uri_hash_which;
	new = cur ? 0 : 1;
next:
	for (ix = 0; ix < uri_hash_sz[cur]; ix++) {
		hp = &uri_hash_ab[cur][ix];
		mutex_enter(&hp->lock);
		uri = hp->list;
		while (uri != NULL) {
			/* NUL terminate the path for %s, then restore */
			sc = *(uri->path.ep);
			*(uri->path.ep) = 0;
			ret = mi_mpprintf(mp, "%s: %d %d %d",
			    uri->path.cp, (int)uri->resplen,
			    (int)uri->respclen, (int)uri->count);
			*(uri->path.ep) = sc;
			if (ret == -1)
				break;
			uri = uri->hash;
		}
		mutex_exit(&hp->lock);
		if (ret == -1)
			break;
	}
	if (ret != -1 && cur != new && uri_hash_ab[new] != NULL) {
		/* Have a new hash too, walk it next */
		cur = new;
		goto next;
	}
	rw_exit(&uri_hash_access);
}

/*
 * The uri_desc_t ref_t inactive function called on the last REF_RELE(),
 * free all resources contained in the uri_desc_t. Note, the uri_desc_t
 * will be freed by REF_RELE() on return.
 */

/*
 * ref_t inactive callback for a uri_desc_t, called on the last
 * REF_RELE(): release every resource the URI references. The
 * uri_desc_t itself is freed by REF_RELE() after we return.
 */
void
nl7c_uri_inactive(uri_desc_t *uri)
{
	int64_t	freed = 0;

	if (uri->tail) {
		/*
		 * Walk the response data list. The head uri_rd_t is
		 * embedded in the uri_desc_t, so only the chained
		 * entries go back to the kmem cache.
		 */
		uri_rd_t	*rdp = &uri->response;

		while (rdp != NULL) {
			uri_rd_t	*next = rdp->next;

			if (rdp->off == -1) {
				/* In-kernel copy of response data */
				freed += rdp->sz;
				kmem_free(rdp->data.kmem, rdp->sz);
			} else {
				/* File backed, release the vnode hold */
				VN_RELE(rdp->data.vnode);
			}
			if (rdp != &uri->response)
				kmem_cache_free(nl7c_uri_rd_kmc, rdp);
			rdp = next;
		}
	}
	if (freed != 0)
		atomic_add_64(&nl7c_uri_bytes, -freed);
	if (uri->scheme != NULL)
		nl7c_http_free(uri->scheme);
	if (uri->reqmp)
		freeb(uri->reqmp);
}

/*
 * The reclaim is called by the kmem subsystem when kmem is running
 * low. More work is needed to determine the best reclaim policy, for
 * now we just manipulate the nl7c_uri_max global maximum bytes threshold
 * value using a simple arithmetic backoff of the value every time this
 * function is called then call uri_reclaim() to enforce it.
 *
 * Note, this value remains in place and enforced for all subsequent
 * URI request/response processing.
 *
 * Note, nl7c_uri_max is currently initialized to 0 (i.e. infinite) such
 * that the first call here sets it to the current uri_bytes value and
 * backs off from there.
 *
 * XXX how do we determine when to increase nl7c_uri_max ???
 */

/*ARGSUSED*/
static void
uri_kmc_reclaim(void *arg)
{
	uint64_t	target = nl7c_uri_max;

	if (target == 0) {
		/* Currently infinite, start from the bytes in use now */
		target = nl7c_uri_bytes;
		nl7c_uri_max = target;
	}
	if (target > 1) {
		/* Back the byte threshold off to ~93% of its value */
		uint64_t	backoff = target >> 1;	/* 50% */

		backoff += (backoff >> 1);		/* 75% */
		backoff += (backoff >> 2);		/* 93% */
		if (backoff < nl7c_uri_max)
			nl7c_uri_max = backoff;
		else
			nl7c_uri_max = 1;
	}
	/* Enforce the (possibly lowered) threshold now */
	nl7c_uri_reclaim();
}

/*
 * Delete a uri_desc_t from the URI hash.
 */

static void
uri_delete(uri_desc_t *del)
{
	uint32_t	hix;
	uri_hash_t	*hp;
	uri_desc_t	*uri;
	uri_desc_t	*puri;
	uint32_t	cur;
	uint32_t	new;

	/* Temp URIs are never hashed so can't be deleted from the hash */
	ASSERT(del->hash != URI_TEMP);
	/*
	 * RW_WRITER as URI_HASH_UNLINK() below can modify hash chains
	 * and even retire the current hash in favor of the new one.
	 * Holding the rwlock as WRITER also excludes all chain mutex
	 * holders (they acquire it as READER first), so the per-chain
	 * lock need not be taken here.
	 */
	rw_enter(&uri_hash_access, RW_WRITER);
	cur = uri_hash_which;
	new = cur ? 0 : 1;
next:
	puri = NULL;
	hix = del->hvalue;
	URI_HASH_IX(hix, cur);
	hp = &uri_hash_ab[cur][hix];
	for (uri = hp->list; uri != NULL; uri = uri->hash) {
		if (uri != del) {
			/* Not ours, this URI becomes the predecessor */
			puri = uri;
			continue;
		}
		/*
		 * Found the URI, unlink from the hash chain,
		 * drop locks, ref release it.
		 */
		URI_HASH_UNLINK(cur, new, hp, puri, uri);
		rw_exit(&uri_hash_access);
		REF_RELE(uri);
		return;
	}
	if (cur != new && uri_hash_ab[new] != NULL) {
		/*
		 * Not found in current hash and have a new hash so
		 * check the new hash next.
		 */
		cur = new;
		goto next;
	}
	/* Not found in any hash (e.g. already unlinked), nothing to do */
	rw_exit(&uri_hash_access);
}

/*
 * Add a uri_desc_t to the URI hash.
 */

static void
uri_add(uri_desc_t *uri, krw_t rwlock, boolean_t nonblocking)
{
	uint32_t	hix;
	uri_hash_t	*hp;
	uint32_t	cur = uri_hash_which;
	uint32_t	new = cur ? 0 : 1;

	/*
	 * Caller of uri_add() must hold the uri_hash_access rwlock as
	 * (rwlock); on all return paths below the rwlock is dropped.
	 */
	ASSERT((rwlock == RW_READER && RW_READ_HELD(&uri_hash_access)) ||
	    (rwlock == RW_WRITER && RW_WRITE_HELD(&uri_hash_access)));
	/*
	 * uri_add() always succeeds so add a hash ref to the URI now.
	 */
	REF_HOLD(uri);
again:
	hix = uri->hvalue;
	URI_HASH_IX(hix, cur);
	if (uri_hash_ab[new] == NULL &&
	    uri_hash_cnt[cur] < uri_hash_overflow[cur]) {
		/*
		 * Easy case, no new hash and current hasn't overflowed,
		 * add URI to current hash and return.
		 *
		 * Note, the checks for uri_hash_cnt[] above aren't done
		 * atomically, i.e. multiple threads can be in this code
		 * as RW_READER and update the cnt[], this isn't a problem
		 * as the check is only advisory.
		 */
	fast:
		atomic_add_32(&uri_hash_cnt[cur], 1);
		hp = &uri_hash_ab[cur][hix];
		mutex_enter(&hp->lock);
		uri->hash = hp->list;
		hp->list = uri;
		mutex_exit(&hp->lock);
		rw_exit(&uri_hash_access);
		return;
	}
	if (uri_hash_ab[new] == NULL) {
		/*
		 * Need a new a or b hash, if not already RW_WRITER
		 * try to upgrade our lock to writer.
		 */
		if (rwlock != RW_WRITER && ! rw_tryupgrade(&uri_hash_access)) {
			/*
			 * Upgrade failed, we can't simply exit and reenter
			 * the lock as after the exit and before the reenter
			 * the whole world can change so just wait for writer
			 * then do everything again.
			 */
			if (nonblocking) {
				/*
				 * Can't block, use fast-path above.
				 *
				 * XXX should have a background thread to
				 * handle new ab[] in this case so as to
				 * not overflow the cur hash too much.
				 */
				goto fast;
			}
			rw_exit(&uri_hash_access);
			rwlock = RW_WRITER;
			rw_enter(&uri_hash_access, rwlock);
			/* World may have changed, resample the hash state */
			cur = uri_hash_which;
			new = cur ? 0 : 1;
			goto again;
		}
		rwlock = RW_WRITER;
		if (uri_hash_ab[new] == NULL) {
			/*
			 * Still need a new hash, allocate and initialize
			 * the new hash.
			 */
			uri_hash_n[new] = uri_hash_n[cur] + 1;
			if (uri_hash_n[new] == 0) {
				/*
				 * No larger P2Ps[] value so use current,
				 * i.e. 2 of the largest are better than 1 ?
				 */
				uri_hash_n[new] = uri_hash_n[cur];
				cmn_err(CE_NOTE, "NL7C: hash index overflow");
			}
			uri_hash_sz[new] = P2Ps[uri_hash_n[new]];
			ASSERT(uri_hash_cnt[new] == 0);
			uri_hash_overflow[new] = uri_hash_sz[new] *
			    URI_HASH_AVRG;
			uri_hash_ab[new] = kmem_zalloc(sizeof (uri_hash_t) *
			    uri_hash_sz[new], nonblocking ? KM_NOSLEEP :
			    KM_SLEEP);
			if (uri_hash_ab[new] == NULL) {
				/*
				 * Alloc failed, use fast-path above.
				 *
				 * XXX should have a background thread to
				 * handle new ab[] in this case so as to
				 * not overflow the cur hash too much.
				 */
				goto fast;
			}
			uri_hash_lru[new] = uri_hash_ab[new];
		}
	}
	/*
	 * Hashed against current hash so migrate any current hash chain
	 * members, if any.
	 *
	 * Note, the hash chain list can be checked for a non empty list
	 * outside of the hash chain list lock as the hash chain struct
	 * can't be destroyed while in the uri_hash_access rwlock, worst
	 * case is that a non empty list is found and after acquiring the
	 * lock another thread beats us to it (i.e. migrated the list).
	 */
	hp = &uri_hash_ab[cur][hix];
	if (hp->list != NULL) {
		URI_HASH_MIGRATE(cur, hp, new);
	}
	/*
	 * If new hash has overflowed before current hash has been
	 * completely migrated then walk all current hash chains and
	 * migrate list members now.
	 */
	if (atomic_add_32_nv(&uri_hash_cnt[new], 1) >= uri_hash_overflow[new]) {
		for (hix = 0; hix < uri_hash_sz[cur]; hix++) {
			hp = &uri_hash_ab[cur][hix];
			if (hp->list != NULL) {
				URI_HASH_MIGRATE(cur, hp, new);
			}
		}
	}
	/*
	 * Add URI to new hash.
	 */
	hix = uri->hvalue;
	URI_HASH_IX(hix, new);
	hp = &uri_hash_ab[new][hix];
	mutex_enter(&hp->lock);
	uri->hash = hp->list;
	hp->list = uri;
	mutex_exit(&hp->lock);
	/*
	 * Last, check to see if last cur hash chain has been
	 * migrated, if so free cur hash and make new hash cur.
	 */
	if (uri_hash_cnt[cur] == 0) {
		/*
		 * If we don't already hold the uri_hash_access rwlock for
		 * RW_WRITE try to upgrade to RW_WRITE and if successful
		 * check again and to see if still need to do the free.
		 */
		if ((rwlock == RW_WRITER || rw_tryupgrade(&uri_hash_access)) &&
		    uri_hash_cnt[cur] == 0 && uri_hash_ab[new] != 0) {
			kmem_free(uri_hash_ab[cur],
			    sizeof (uri_hash_t) * uri_hash_sz[cur]);
			uri_hash_ab[cur] = NULL;
			uri_hash_lru[cur] = NULL;
			uri_hash_which = new;
		}
	}
	rw_exit(&uri_hash_access);
}

/*
 * Lookup a uri_desc_t in the URI hash, if found free the request uri_desc_t
 * and return the found uri_desc_t with a REF_HOLD() placed on it. Else, if
 * add B_TRUE use the request URI to create a new hash entry. Else if add
 * B_FALSE ...
 */

static uri_desc_t *
uri_lookup(uri_desc_t *ruri, boolean_t add, boolean_t nonblocking)
{
	uint32_t	hix;
	uri_hash_t	*hp;
	uri_desc_t	*uri;
	uri_desc_t	*puri;
	uint32_t	cur;
	uint32_t	new;
	char		*rcp = ruri->path.cp;
	char		*rep = ruri->path.ep;

again:
	rw_enter(&uri_hash_access, RW_READER);
	cur = uri_hash_which;
	new = cur ? 0 : 1;
nexthash:
	puri = NULL;
	hix = ruri->hvalue;
	URI_HASH_IX(hix, cur);
	hp = &uri_hash_ab[cur][hix];
	mutex_enter(&hp->lock);
	for (uri = hp->list; uri != NULL; uri = uri->hash) {
		char	*ap = uri->path.cp;
		char	*bp = rcp;
		char	a, b;

		/* Compare paths (escaped "%HH" octets compared decoded) */
		while (bp < rep && ap < uri->path.ep) {
			if ((a = *ap) == '%') {
				/* Escaped hex multichar, convert it */
				H2A(ap, uri->path.ep, a);
			}
			if ((b = *bp) == '%') {
				/* Escaped hex multichar, convert it */
				H2A(bp, rep, b);
			}
			if (a != b) {
				/* Char's don't match */
				goto nexturi;
			}
			ap++;
			bp++;
		}
		if (bp != rep || ap != uri->path.ep) {
			/* Not same length */
			goto nexturi;
		}
		ap = uri->auth.cp;
		bp = ruri->auth.cp;
		if (ap != NULL) {
			if (bp == NULL) {
				/* URI has auth request URI doesn't */
				goto nexturi;
			}
			while (bp < ruri->auth.ep && ap < uri->auth.ep) {
				if ((a = *ap) == '%') {
					/*
					 * Escaped hex multichar, convert it.
					 * Note, the decode is bounded by the
					 * end of the auth string (previously
					 * the path end pointer was used here,
					 * allowing H2A() to consume chars
					 * past the end of the auth span).
					 */
					H2A(ap, uri->auth.ep, a);
				}
				if ((b = *bp) == '%') {
					/* As above, request auth bounded */
					H2A(bp, ruri->auth.ep, b);
				}
				if (a != b) {
					/* Char's don't match */
					goto nexturi;
				}
				ap++;
				bp++;
			}
			if (bp != ruri->auth.ep || ap != uri->auth.ep) {
				/* Not same length */
				goto nexturi;
			}
		} else if (bp != NULL) {
			/* URI doesn't have auth and request URI does */
			goto nexturi;
		}
		/*
		 * Have a path/auth match so before any other processing
		 * of requested URI, check for expire or request no cache
		 * purge.
		 */
		if (uri->expire >= 0 && uri->expire <= ddi_get_lbolt() ||
		    ruri->nocache) {
			/*
			 * URI has expired or request specified to not use
			 * the cached version, unlink the URI from the hash
			 * chain, release all locks, release the hash ref
			 * on the URI, and last look it up again.
			 *
			 * Note, this will cause all variants of the named
			 * URI to be purged.
			 */
			if (puri != NULL) {
				puri->hash = uri->hash;
			} else {
				hp->list = uri->hash;
			}
			mutex_exit(&hp->lock);
			atomic_add_32(&uri_hash_cnt[cur], -1);
			rw_exit(&uri_hash_access);
			if (ruri->nocache)
				nl7c_uri_purge++;
			else
				nl7c_uri_expire++;
			REF_RELE(uri);
			goto again;
		}
		if (uri->scheme != NULL) {
			/*
			 * URI has scheme private qualifier(s), if request
			 * URI doesn't or if no match skip this URI.
			 */
			if (ruri->scheme == NULL ||
			    ! nl7c_http_cmp(uri->scheme, ruri->scheme))
				goto nexturi;
		} else if (ruri->scheme != NULL) {
			/*
			 * URI doesn't have scheme private qualifiers but
			 * request URI does, no match, skip this URI.
			 */
			goto nexturi;
		}
		/*
		 * Have a match, ready URI for return, first put a reference
		 * hold on the URI, if this URI is currently being processed
		 * then have to wait for the processing to be completed and
		 * redo the lookup, else return it.
		 */
		REF_HOLD(uri);
		mutex_enter(&uri->proclock);
		if (uri->proc != NULL) {
			/* The URI is being processed, wait for completion */
			mutex_exit(&hp->lock);
			rw_exit(&uri_hash_access);
			if (! nonblocking &&
			    cv_wait_sig(&uri->waiting, &uri->proclock)) {
				/*
				 * URI has been processed but things may
				 * have changed while we were away so do
				 * most everything again.
				 */
				mutex_exit(&uri->proclock);
				REF_RELE(uri);
				goto again;
			} else {
				/*
				 * A nonblocking socket or an interrupted
				 * cv_wait_sig() in the first case can't
				 * block waiting for the processing of the
				 * uri hash hit uri to complete, in both
				 * cases just return failure to lookup.
				 */
				mutex_exit(&uri->proclock);
				REF_RELE(uri);
				return (NULL);
			}
		}
		mutex_exit(&uri->proclock);
		uri->hit++;
		mutex_exit(&hp->lock);
		rw_exit(&uri_hash_access);
		return (uri);
	nexturi:
		puri = uri;
	}
	mutex_exit(&hp->lock);
	if (cur != new && uri_hash_ab[new] != NULL) {
		/*
		 * Not found in current hash and have a new hash so
		 * check the new hash next.
		 */
		cur = new;
		goto nexthash;
	}
	if (! add) {
		/* Lookup only so return failure */
		rw_exit(&uri_hash_access);
		return (NULL);
	}
	/*
	 * URI not hashed, finish initialization of the
	 * request URI, add it to the hash, return it.
	 */
	ruri->hit = 0;
	ruri->expire = -1;
	ruri->response.sz = 0;
	ruri->proc = (struct sonode *)~NULL;
	cv_init(&ruri->waiting, NULL, CV_DEFAULT, NULL);
	mutex_init(&ruri->proclock, NULL, MUTEX_DEFAULT, NULL);
	uri_add(ruri, RW_READER, nonblocking);
	/* uri_add() has done rw_exit(&uri_hash_access) */
	return (ruri);
}

/*
 * Reclaim URIs until max cache size threshold has been reached.
 *
 * A CLOCK based reclaim modified with a history (hit counter) counter.
 */

static void
nl7c_uri_reclaim(void)
{
	uri_hash_t	*hp, *start, *pend;
	uri_desc_t	*uri;
	uri_desc_t	*puri;
	uint32_t	cur;
	uint32_t	new;

	nl7c_uri_reclaim_calls++;
again:
	rw_enter(&uri_hash_access, RW_WRITER);
	cur = uri_hash_which;
	new = cur ? 0 : 1;
next:
	/* Resume the CLOCK hand where the last pass left off */
	hp = uri_hash_lru[cur];
	start = hp;
	pend = &uri_hash_ab[cur][uri_hash_sz[cur]];
	while (nl7c_uri_bytes > nl7c_uri_max) {
		puri = NULL;
		for (uri = hp->list; uri != NULL; uri = uri->hash) {
			if (uri->hit != 0) {
				/*
				 * Decrement URI activity counter and skip.
				 */
				uri->hit--;
				puri = uri;
				continue;
			}
			if (uri->proc != NULL) {
				/*
				 * Currently being processed by a socket,
				 * skip. Note, puri must still be advanced
				 * here: if it were left pointing at an
				 * earlier chain member, a subsequent
				 * URI_HASH_UNLINK() of a later member
				 * would splice this in-use URI out of the
				 * chain as well, leaking its hash ref.
				 */
				puri = uri;
				continue;
			}
			/*
			 * Found a candidate, no hit(s) since added or last
			 * reclaim pass, unlink from it's hash chain, update
			 * lru scan pointer, drop lock, ref release it.
			 */
			URI_HASH_UNLINK(cur, new, hp, puri, uri);
			if (cur == uri_hash_which) {
				/* UNLINK didn't retire the cur hash */
				if (++hp == pend) {
					/* Wrap pointer */
					hp = uri_hash_ab[cur];
				}
				uri_hash_lru[cur] = hp;
			}
			rw_exit(&uri_hash_access);
			REF_RELE(uri);
			nl7c_uri_reclaim_cnt++;
			goto again;
		}
		if (++hp == pend) {
			/* Wrap pointer */
			hp = uri_hash_ab[cur];
		}
		if (hp == start) {
			if (cur != new && uri_hash_ab[new] != NULL) {
				/*
				 * Done with the current hash and have a
				 * new hash so check the new hash next.
				 */
				cur = new;
				goto next;
			}
		}
	}
	rw_exit(&uri_hash_access);
}

/*
 * Called for a socket which is being freed prior to close, e.g. errored.
 */

/*
 * Called for a socket which is being freed prior to close (e.g.
 * errored): detach the socket's URI, wake any lookup waiters, and
 * release the socket's reference on it.
 */
void
nl7c_urifree(struct sonode *so)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	uri_desc_t	*udp = (uri_desc_t *)sti->sti_nl7c_uri;

	sti->sti_nl7c_uri = NULL;
	if (udp->hash == URI_TEMP) {
		/* No proclock as uri exclusively owned by so */
		udp->proc = NULL;
		nl7c_uri_temp_free++;
	} else {
		/* Hashed URI: remove it, then wake any lookup waiters */
		uri_delete(udp);
		mutex_enter(&udp->proclock);
		udp->proc = NULL;
		if (CV_HAS_WAITERS(&udp->waiting))
			cv_broadcast(&udp->waiting);
		mutex_exit(&udp->proclock);
		nl7c_uri_free++;
	}
	REF_RELE(udp);
}

/*
 * ...
 *
 *	< 0	need more data
 *
 *	  0	parse complete
 *
 *	> 0	parse error
 */

volatile uint64_t nl7c_resp_pfail = 0;
volatile uint64_t nl7c_resp_ntemp = 0;
volatile uint64_t nl7c_resp_pass = 0;

/*
 * Response parse the sunk data [data, data + sz) for the given URI.
 *
 * Returns:
 *	< 0	need more data
 *	  0	parse complete
 *	> 0	parse error (EINVAL parse fail, ENOTSUP pass)
 */
static int
nl7c_resp_parse(struct sonode *so, uri_desc_t *uri, char *data, int sz)
{
	boolean_t	need_more = B_FALSE;

	if (! nl7c_http_response(&data, &data[sz], uri, so)) {
		if (data == NULL) {
			/* Parse failure */
			nl7c_resp_pfail++;
			return (EINVAL);
		}
		/* Incomplete, more data needed */
		need_more = B_TRUE;
	} else if (data == NULL) {
		/* Parsed, but this response must be passed on */
		nl7c_resp_pass++;
		return (ENOTSUP);
	}
	if (uri->hash != URI_TEMP && uri->nocache) {
		/*
		 * After response parse now no cache,
		 * delete it from cache, wakeup any
		 * waiters on this URI, make URI_TEMP.
		 */
		uri_delete(uri);
		mutex_enter(&uri->proclock);
		if (CV_HAS_WAITERS(&uri->waiting))
			cv_broadcast(&uri->waiting);
		mutex_exit(&uri->proclock);
		uri->hash = URI_TEMP;
		nl7c_uri_temp_mk++;
	}
	return (need_more ? -1 : 0);
}

/*
 * Called to sink application response data, the processing of the data
 * is the same for a cached or temp URI (i.e. a URI for which we aren't
 * going to cache the URI but want to parse it for detecting response
 * data end such that for a persistent connection we can parse the next
 * request).
 *
 * On return 0 is returned for sink success, > 0 on error, and < 0 on
 * no so URI (note, data not sinked).
 */

int
nl7c_data(struct sonode *so, uio_t *uio)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	uri_desc_t	*uri = (uri_desc_t *)sti->sti_nl7c_uri;
	iovec_t		*iov;
	int		cnt;
	int		sz = uio->uio_resid;
	char		*data, *alloc;
	char		*bp;
	uri_rd_t	*rdp;
	boolean_t	first;
	int		error, perror;

	nl7c_uri_data++;

	if (uri == NULL) {
		/* Socket & NL7C out of sync, disable NL7C */
		sti->sti_nl7c_flags = 0;
		nl7c_uri_NULL1++;
		return (-1);
	}

	/* First response write since the request? */
	if (sti->sti_nl7c_flags & NL7C_WAITWRITE) {
		sti->sti_nl7c_flags &= ~NL7C_WAITWRITE;
		first = B_TRUE;
	} else {
		first = B_FALSE;
	}

	/* Private buffer to hold a copy of all of the caller's data */
	alloc = kmem_alloc(sz, KM_SLEEP);
	URI_RD_ADD(uri, rdp, sz, -1);
	if (rdp == NULL) {
		/* presumably URI_RD_ADD() can fail its allocation — the */
		/* macro isn't visible here; fail path frees alloc below */
		error = ENOMEM;
		goto fail;
	}

	/* Response grown too big to cache? Unhash and make it temp */
	if (uri->hash != URI_TEMP && uri->count > nca_max_cache_size) {
		uri_delete(uri);
		uri->hash = URI_TEMP;
	}
	/*
	 * Hand the buffer off to the uri_rd_t and clear alloc so the
	 * fail path below won't free it a second time; from here on
	 * the buffer is freed via the URI (nl7c_uri_inactive()).
	 */
	data = alloc;
	alloc = NULL;
	rdp->data.kmem = data;
	atomic_add_64(&nl7c_uri_bytes, sz);

	/* Copy in all of the caller's iovec(s) */
	bp = data;
	while (uio->uio_resid > 0) {
		iov = uio->uio_iov;
		if ((cnt = iov->iov_len) == 0) {
			goto next;
		}
		cnt = MIN(cnt, uio->uio_resid);
		error = xcopyin(iov->iov_base, bp, cnt);
		if (error)
			goto fail;

		iov->iov_base += cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_loffset += cnt;
		bp += cnt;
	next:
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	/* Successful sink of data, response parse the data */
	perror = nl7c_resp_parse(so, uri, data, sz);

	/* Send the data out the connection */
	error = uri_rd_response(so, uri, rdp, first);
	if (error)
		goto fail;

	/* Success */
	if (perror == 0 &&
	    ((uri->respclen == URI_LEN_NOVALUE &&
	    uri->resplen == URI_LEN_NOVALUE) ||
	    uri->count >= uri->resplen)) {
		/*
		 * No more data needed and no pending response
		 * data or current data count >= response length
		 * so close the URI processing for this so.
		 */
		nl7c_close(so);
		if (! (sti->sti_nl7c_flags & NL7C_SOPERSIST)) {
			/* Not a persistent connection */
			sti->sti_nl7c_flags = 0;
		}
	}

	return (0);

fail:
	if (alloc != NULL) {
		/* Buffer not yet owned by the uri_rd_t, free it here */
		kmem_free(alloc, sz);
	}
	/* Disable NL7C for this socket and tear down its URI */
	sti->sti_nl7c_flags = 0;
	nl7c_urifree(so);

	return (error);
}

/*
 * Called to read data from file "*fp" at offset "*off" of length "*len"
 * for a maximum of "max" bytes.
 *
 * On success a pointer to the kmem_alloc()ed file data is returned,
 * "*off" and "*len" are updated to reflect the bytes read and "*ret"
 * is set to 0.
 *
 * Else, "NULL" is returned and "*ret" holds the error code.
 */

static char *
nl7c_readfile(file_t *fp, u_offset_t *off, int *len, int max, int *ret)
{
	vnode_t	*vp = fp->f_vnode;
	int	rwflag = 0;
	size_t	nbytes = MIN(*len, max);
	char	*buf;
	int	error;
	uio_t	uio;
	iovec_t	iov;

	(void) VOP_RWLOCK(vp, rwflag, NULL);

	/* Reject a starting offset beyond what the VFS can address. */
	if (*off > MAXOFFSET_T) {
		VOP_RWUNLOCK(vp, rwflag, NULL);
		*ret = EFBIG;
		return (NULL);
	}

	/* Clamp the read so it can't run past MAXOFFSET_T. */
	if (*off + nbytes > MAXOFFSET_T)
		nbytes = (ssize32_t)(MAXOFFSET_T - *off);

	buf = kmem_alloc(nbytes, KM_SLEEP);

	/* Single-iovec kernel-space uio describing "buf". */
	iov.iov_base = buf;
	iov.iov_len = nbytes;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_loffset = *off;
	uio.uio_resid = nbytes;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_llimit = MAXOFFSET_T;
	uio.uio_fmode = fp->f_flag;

	error = VOP_READ(vp, &uio, fp->f_flag, fp->f_cred, NULL);
	VOP_RWUNLOCK(vp, rwflag, NULL);
	*ret = error;
	if (error != 0) {
		kmem_free(buf, nbytes);
		return (NULL);
	}
	/*
	 * NOTE(review): a short read (uio_resid > 0, e.g. at EOF) is not
	 * reflected here; "*len" reports the full clamped request size.
	 * This matches the historical behavior — confirm callers never
	 * ask past EOF before changing it.
	 */
	*len = nbytes;
	*off += nbytes;
	return (buf);
}

/*
 * Called to sink application response sendfilev, as with nl7c_data() above
 * all the data will be processed by NL7C unless there's an error.
 */

int
nl7c_sendfilev(struct sonode *so, u_offset_t *fileoff, sendfilevec_t *sfvp,
	int sfvc, ssize_t *xfer)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	uri_desc_t	*uri = (uri_desc_t *)sti->sti_nl7c_uri;
	file_t		*fp = NULL;
	vnode_t		*vp = NULL;
	char		*data = NULL;
	u_offset_t	off;
	int		len;
	int		cnt;
	int		total_count = 0;
	/*
	 * "alloc" must start out NULL: the first failure (e.g. EBADF from
	 * getf() below) can reach "fail:" before any allocation is made
	 * and "alloc" is tested there.
	 */
	char		*alloc = NULL;
	uri_rd_t	*rdp;
	int		max;
	/*
	 * "perror" must start out 0: if every sfv entry has a zero length
	 * the loop body never runs, yet "perror" is tested after the loop.
	 */
	int		perror = 0;
	int		error = 0;
	boolean_t	first = B_TRUE;

	nl7c_uri_sendfilev++;

	if (uri == NULL) {
		/* Socket & NL7C out of sync, disable NL7C */
		sti->sti_nl7c_flags = 0;
		nl7c_uri_NULL2++;
		return (0);
	}

	/* First write-side data for this URI ends the read-side wait */
	if (sti->sti_nl7c_flags & NL7C_WAITWRITE)
		sti->sti_nl7c_flags &= ~NL7C_WAITWRITE;

	while (sfvc-- > 0) {
		/*
		 * off - the current sfv read file offset or user address.
		 *
		 * len - the current sfv length in bytes.
		 *
		 * cnt - number of bytes kmem_alloc()ed.
		 *
		 * alloc - the kmem_alloc()ed buffer of size "cnt".
		 *
		 * data - copy of "alloc" used for post alloc references.
		 *
		 * fp - the current sfv file_t pointer.
		 *
		 * vp - the current "*vp" vnode_t pointer.
		 *
		 * Note, for "data" and "fp" and "vp" a NULL value is used
		 * when not allocated such that the common failure path "fail"
		 * is used.
		 */
		off = sfvp->sfv_off;
		len = sfvp->sfv_len;
		cnt = len;

		if (len == 0) {
			sfvp++;
			continue;
		}

		if (sfvp->sfv_fd == SFV_FD_SELF) {
			/*
			 * User memory, copyin() all the bytes.
			 */
			alloc = kmem_alloc(cnt, KM_SLEEP);
			error = xcopyin((caddr_t)(uintptr_t)off, alloc, cnt);
			if (error)
				goto fail;
		} else {
			/*
			 * File descriptor, prefetch some bytes.
			 */
			if ((fp = getf(sfvp->sfv_fd)) == NULL) {
				error = EBADF;
				goto fail;
			}
			if ((fp->f_flag & FREAD) == 0) {
				error = EACCES;
				goto fail;
			}
			vp = fp->f_vnode;
			if (vp->v_type != VREG) {
				error = EINVAL;
				goto fail;
			}
			VN_HOLD(vp);

			/* Read max bytes from file for prefetch */
			if (nl7c_use_kmem) {
				max = cnt;
			} else {
				max = MAXBSIZE * nl7c_file_prefetch;
			}
			/* On success "cnt" is updated to the bytes read */
			alloc = nl7c_readfile(fp, &off, &cnt, max, &error);
			if (alloc == NULL)
				goto fail;

			releasef(sfvp->sfv_fd);
			fp = NULL;
		}
		URI_RD_ADD(uri, rdp, cnt, -1);
		if (rdp == NULL) {
			error = ENOMEM;
			goto fail;
		}
		/*
		 * Ownership of the buffer passes to the URI here; from
		 * now on it is freed via the uri_rd_t, not via "alloc".
		 */
		data = alloc;
		alloc = NULL;
		rdp->data.kmem = data;
		total_count += cnt;
		if (uri->hash != URI_TEMP && total_count > nca_max_cache_size) {
			/* Too big to cache, demote to a temporary URI */
			uri_delete(uri);
			uri->hash = URI_TEMP;
		}

		/* Response parse */
		perror = nl7c_resp_parse(so, uri, data, len);

		/* Send kmem data out the connection */
		error = uri_rd_response(so, uri, rdp, first);

		if (error)
			goto fail;

		if (sfvp->sfv_fd != SFV_FD_SELF) {
			/*
			 * File descriptor, if any bytes left save vnode_t.
			 */
			if (len > cnt) {
				/* More file data so add it */
				URI_RD_ADD(uri, rdp, len - cnt, off);
				if (rdp == NULL) {
					error = ENOMEM;
					goto fail;
				}
				rdp->data.vnode = vp;

				/* Send vnode data out the connection */
				error = uri_rd_response(so, uri, rdp, first);
			} else {
				/* All file data fit in the prefetch */
				VN_RELE(vp);
			}
			*fileoff += len;
			vp = NULL;
		}
		*xfer += len;
		sfvp++;

		if (first)
			first = B_FALSE;
	}
	if (total_count > 0) {
		atomic_add_64(&nl7c_uri_bytes, total_count);
	}
	if (perror == 0 &&
	    ((uri->respclen == URI_LEN_NOVALUE &&
	    uri->resplen == URI_LEN_NOVALUE) ||
	    uri->count >= uri->resplen)) {
		/*
		 * No more data needed and no pending response
		 * data or current data count >= response length
		 * so close the URI processing for this so.
		 */
		nl7c_close(so);
		if (! (sti->sti_nl7c_flags & NL7C_SOPERSIST)) {
			/* Not a persistent connection */
			sti->sti_nl7c_flags = 0;
		}
	}

	return (0);

fail:
	if (error == EPIPE)
		tsignal(curthread, SIGPIPE);

	/*
	 * Free only a buffer not yet handed off to the URI (i.e. still in
	 * "alloc"), and free it with its allocation size "cnt". Buffers
	 * already linked via rdp->data.kmem are owned by the URI and will
	 * be freed by nl7c_urifree() below; freeing "data" here (as the
	 * code once did, and with size "len") was a double free with a
	 * possibly mismatched size.
	 */
	if (alloc != NULL)
		kmem_free(alloc, cnt);

	if (vp != NULL)
		VN_RELE(vp);

	if (fp != NULL)
		releasef(sfvp->sfv_fd);

	if (total_count > 0) {
		atomic_add_64(&nl7c_uri_bytes, total_count);
	}

	sti->sti_nl7c_flags = 0;
	nl7c_urifree(so);

	return (error);
}

/*
 * Called for a socket which is closing or when an application has
 * completed sending all the response data (i.e. for a persistent
 * connection called once for each completed application response).
 */

void
nl7c_close(struct sonode *so)
{
	sotpi_info_t	*sti = SOTOTPI(so);
	uri_desc_t 	*uri = (uri_desc_t *)sti->sti_nl7c_uri;

	if (uri == NULL) {
		/*
		 * No URI being processed so might be a listen()er
		 * if so do any cleanup, else nothing more to do.
		 */
		if (so->so_state & SS_ACCEPTCONN) {
			(void) nl7c_close_addr(so);
		}
		return;
	}
	/* Detach the URI from the socket before releasing it */
	sti->sti_nl7c_uri = NULL;
	if (uri->hash != URI_TEMP) {
		/*
		 * Hashed (shared) URI: clear the processing socket under
		 * proclock and wake any threads blocked in lookup waiting
		 * for this response to be filled.
		 */
		mutex_enter(&uri->proclock);
		uri->proc = NULL;
		if (CV_HAS_WAITERS(&uri->waiting)) {
			cv_broadcast(&uri->waiting);
		}
		mutex_exit(&uri->proclock);
		nl7c_uri_close++;
	} else {
		/* No proclock as uri exclusively owned by so */
		uri->proc = NULL;
		nl7c_uri_temp_close++;
	}
	/* Drop the socket's reference; may be the last (frees the URI) */
	REF_RELE(uri);
	/* Opportunistically reclaim if cached bytes exceed the cap */
	if (nl7c_uri_max > 0 && nl7c_uri_bytes > nl7c_uri_max) {
		nl7c_uri_reclaim();
	}
}

/*
 * The uri_segmap_t ref_t inactive function called on the last REF_RELE(),
 * release the segmap mapping. Note, the uri_segmap_t will be freed by
 * REF_RELE() on return.
 */

void
uri_segmap_inactive(uri_segmap_t *smp)
{
	/*
	 * Undo the F_SOFTLOCK taken in uri_segmap_map(); with kpm the
	 * unlock is presumably handled by segmap_release() itself —
	 * NOTE(review): confirm against the segmap implementation.
	 */
	if (!segmap_kpm) {
		(void) segmap_fault(kas.a_hat, segkmap, smp->base,
		    smp->len, F_SOFTUNLOCK, S_OTHER);
	}
	(void) segmap_release(segkmap, smp->base, SM_DONTNEED);
	/* Drop the vnode hold taken when the mapping was created */
	VN_RELE(smp->vp);
}

/*
 * The call-back for desballoc()ed mblk_t's, if a segmap mapped mblk_t
 * release the reference, one per desballoc() of a segmap page, if a rd_t
 * mapped mblk_t release the reference, one per desballoc() of a uri_desc_t,
 * last kmem free the uri_desb_t.
 */

static void
uri_desb_free(uri_desb_t *desb)
{
	/* Release the per-desballoc() segmap reference, if this was
	 * a segmap-mapped mblk_t (NULL for kmem-backed data). */
	if (desb->segmap != NULL) {
		REF_RELE(desb->segmap);
	}
	/* Release the per-desballoc() URI reference, then the desb */
	REF_RELE(desb->uri);
	kmem_cache_free(uri_desb_kmc, desb);
}

/*
 * Segmap map up to a page of a uri_rd_t file descriptor.
 */

uri_segmap_t *
uri_segmap_map(uri_rd_t *rdp, int bytes)
{
	uri_segmap_t	*segmap = kmem_cache_alloc(uri_segmap_kmc, KM_SLEEP);
	/* Map at most one MAXBSIZE chunk of the remaining rd_t data */
	int		len = MIN(rdp->sz, MAXBSIZE);

	/* And never more than the caller asked for */
	if (len > bytes)
		len = bytes;

	/* Last REF_RELE() will call uri_segmap_inactive() to unmap */
	REF_INIT(segmap, 1, uri_segmap_inactive, uri_segmap_kmc);
	segmap->len = len;
	/* Hold the vnode for the life of the mapping */
	VN_HOLD(rdp->data.vnode);
	segmap->vp = rdp->data.vnode;

	segmap->base = segmap_getmapflt(segkmap, segmap->vp, rdp->off, len,
	    segmap_kpm ? SM_FAULT : 0, S_READ);

	/*
	 * Lock the pages in so the desballoc()ed mblk_t's built over
	 * this mapping stay valid until the last buffer is freed.
	 */
	if (segmap_fault(kas.a_hat, segkmap, segmap->base, len,
	    F_SOFTLOCK, S_READ) != 0) {
		/* Fault failed; inactive function tears down the map */
		REF_RELE(segmap);
		return (NULL);
	}
	return (segmap);
}

/*
 * Chop up the kernel virtual memory area *data of size *sz bytes for
 * a maximum of *bytes bytes into a desballoc()ed mblk_t chain using
 * the given template uri_desb_t *temp of max_mblk bytes per.
 *
 * The values of *data, *sz, and *bytes are updated on return, the
 * mblk_t chain is returned.
 */

static mblk_t *
uri_desb_chop(
	char 		**data,
	size_t		*sz,
	int 		*bytes,
	uri_desb_t 	*temp,
	int		max_mblk,
	char		*eoh,
	mblk_t		*persist
)
{
	char		*ldata = *data;
	size_t		lsz = *sz;
	/* A NULL "bytes" means no byte budget: chop the whole area */
	int		lbytes = bytes ? *bytes : lsz;
	uri_desb_t	*desb;
	/*
	 * "mp" heads the chain; mp->b_next is (ab)used as a tail pointer
	 * while building and is cleared/consumed by the caller.
	 */
	mblk_t		*mp = NULL;
	mblk_t		*nmp, *pmp = NULL;
	int		msz;

	if (lbytes == 0 && lsz == 0)
		return (NULL);

	while (lbytes > 0 && lsz > 0) {
		/* Next fragment: bounded by budget, mblk max, and data left */
		msz = MIN(lbytes, max_mblk);
		msz = MIN(msz, lsz);
		if (persist && eoh >= ldata && eoh < &ldata[msz]) {
			/*
			 * End-of-header falls inside this fragment: truncate
			 * the fragment at eoh so the "persist" mblk_t (e.g.
			 * a Connection: header) can be inserted right after
			 * the headers.
			 */
			msz = (eoh - ldata);
			pmp = persist;
			persist = NULL;
			if (msz == 0) {
				/* Nothing before eoh; link persist directly */
				nmp = pmp;
				pmp = NULL;
				goto zero;
			}
		}
		/*
		 * One desb per mblk_t: each carries a reference on the URI
		 * (and segmap, if any) released in uri_desb_free().
		 */
		desb = kmem_cache_alloc(uri_desb_kmc, KM_SLEEP);
		REF_HOLD(temp->uri);
		if (temp->segmap) {
			REF_HOLD(temp->segmap);
		}
		bcopy(temp, desb, sizeof (*desb));
		desb->frtn.free_arg = (caddr_t)desb;
		nmp = desballoc((uchar_t *)ldata, msz, BPRI_HI, &desb->frtn);
		if (nmp == NULL) {
			/* Undo the holds taken for this fragment and bail */
			if (temp->segmap) {
				REF_RELE(temp->segmap);
			}
			REF_RELE(temp->uri);
			if (mp != NULL) {
				mp->b_next = NULL;
				freemsg(mp);
			}
			if (persist != NULL) {
				freeb(persist);
			}
			return (NULL);
		}
		nmp->b_wptr += msz;
	zero:
		/* Append nmp: mp->b_next is the current tail of the chain */
		if (mp != NULL) {
			mp->b_next->b_cont = nmp;
		} else {
			mp = nmp;
		}
		if (pmp != NULL) {
			/* Splice the persist mblk_t in after this fragment */
			nmp->b_cont = pmp;
			nmp = pmp;
			pmp = NULL;
		}
		/* New tail (persist block if it was just spliced in) */
		mp->b_next = nmp;
		ldata += msz;
		lsz -= msz;
		lbytes -= msz;
	}
	/* Report back how far we got */
	*data = ldata;
	*sz = lsz;
	if (bytes)
		*bytes = lbytes;
	return (mp);
}

/*
 * Experimental noqwait (i.e. no canput()/qwait() checks), just send
 * the entire mblk_t chain down without flow-control checks.
 */

static int
kstrwritempnoqwait(struct vnode *vp, mblk_t *mp)
{
	struct stdata	*stp;
	int		error;

	ASSERT(vp->v_stream != NULL);
	stp = vp->v_stream;

	/*
	 * Cheap unlocked peek at the flags; only take sd_lock when an
	 * error/hangup/unplumb condition may actually be present.
	 */
	if ((stp->sd_flag & (STWRERR|STRHUP|STPLEX)) != 0) {
		mutex_enter(&stp->sd_lock);
		error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
		mutex_exit(&stp->sd_lock);
		if (error != 0) {
			/* Map to EPIPE when SIGPIPE semantics apply */
			if (!(stp->sd_flag & STPLEX) &&
			    (stp->sd_wput_opt & SW_SIGPIPE)) {
				error = EPIPE;
			}
			return (error);
		}
	}

	/* No pending error; push the whole chain downstream */
	putnext(stp->sd_wrq, mp);
	return (0);
}

/*
 * Send the URI uri_desc_t *uri response uri_rd_t *rdp out the socket_t *so.
 */

static int
uri_rd_response(struct sonode *so,
    uri_desc_t *uri,
    uri_rd_t *rdp,
    boolean_t first)
{
	vnode_t		*vp = SOTOV(so);
	int		max_mblk = (int)vp->v_stream->sd_maxblk;
	int		wsz;
	mblk_t		*mp, *wmp, *persist;
	int		write_bytes;
	uri_rd_t	rd;
	uri_desb_t	desb;
	uri_segmap_t	*segmap = NULL;
	char		*segmap_data;
	size_t		segmap_sz;
	int		error;
	/* Honor the socket's non-blocking modes for the later writes */
	int		fflg = ((so->so_state & SS_NDELAY) ? FNDELAY : 0) |
	    ((so->so_state & SS_NONBLOCK) ? FNONBLOCK : 0);


	/* Initialize template uri_desb_t */
	desb.frtn.free_func = uri_desb_free;
	desb.frtn.free_arg = NULL;
	desb.uri = uri;

	/* Get a local copy of the rd_t */
	bcopy(rdp, &rd, sizeof (rd));
	/*
	 * NOTE(review): this loop assumes rd.sz > 0 on entry; if a zero
	 * sized rd_t were ever passed, "wmp" would stay NULL and the
	 * dereference after the inner loop would panic — callers appear
	 * to only add non-empty rd_t's, confirm before relying on it.
	 */
	do {
		if (first) {
			/*
			 * For first kstrwrite() enough data to get
			 * things going, note non blocking version of
			 * kstrwrite() will be used below.
			 */
			write_bytes = P2ROUNDUP((max_mblk * 4),
			    MAXBSIZE * nl7c_file_prefetch);
		} else {
			if ((write_bytes = so->so_sndbuf) == 0)
				write_bytes = vp->v_stream->sd_qn_maxpsz;
			ASSERT(write_bytes > 0);
			write_bytes = P2ROUNDUP(write_bytes, MAXBSIZE);
		}
		/*
		 * Chop up to a write_bytes worth of data.
		 */
		wmp = NULL;
		wsz = write_bytes;
		do {
			if (rd.sz == 0)
				break;
			/* An off of -1 means kmem-backed data, else file */
			if (rd.off == -1) {
				/*
				 * If the end-of-header lands in this kmem
				 * area a persistence header mblk_t may need
				 * to be spliced in by uri_desb_chop().
				 */
				if (uri->eoh >= rd.data.kmem &&
				    uri->eoh < &rd.data.kmem[rd.sz]) {
					persist = nl7c_http_persist(so);
				} else {
					persist = NULL;
				}
				desb.segmap = NULL;
				mp = uri_desb_chop(&rd.data.kmem, &rd.sz,
				    &wsz, &desb, max_mblk, uri->eoh, persist);
				if (mp == NULL) {
					error = ENOMEM;
					goto invalidate;
				}
			} else {
				/* Map the next window of the file as needed */
				if (segmap == NULL) {
					segmap = uri_segmap_map(&rd,
					    write_bytes);
					if (segmap == NULL) {
						error = ENOMEM;
						goto invalidate;
					}
					desb.segmap = segmap;
					segmap_data = segmap->base;
					segmap_sz = segmap->len;
				}
				mp = uri_desb_chop(&segmap_data, &segmap_sz,
				    &wsz, &desb, max_mblk, NULL, NULL);
				if (mp == NULL) {
					error = ENOMEM;
					goto invalidate;
				}
				/* Window fully consumed, advance the rd_t */
				if (segmap_sz == 0) {
					rd.sz -= segmap->len;
					rd.off += segmap->len;
					REF_RELE(segmap);
					segmap = NULL;
				}
			}
			/* Append mp; wmp->b_next tracks the chain tail */
			if (wmp == NULL) {
				wmp = mp;
			} else {
				wmp->b_next->b_cont = mp;
				wmp->b_next = mp->b_next;
				mp->b_next = NULL;
			}
		} while (wsz > 0 && rd.sz > 0);

		wmp->b_next = NULL;
		if (first) {
			/* First kstrwrite(), use noqwait */
			if ((error = kstrwritempnoqwait(vp, wmp)) != 0)
				goto invalidate;
			/*
			 * For the rest of the kstrwrite()s use SO_SNDBUF
			 * worth of data at a time, note these kstrwrite()s
			 * may (will) block one or more times.
			 */
			first = B_FALSE;
		} else {
			if ((error = kstrwritemp(vp, wmp, fflg)) != 0) {
				if (error == EAGAIN) {
					/* Would block: fall back to noqwait */
					nl7c_uri_rd_EAGAIN++;
					if ((error =
					    kstrwritempnoqwait(vp, wmp)) != 0)
						goto invalidate;
				} else
					goto invalidate;
			}
		}
	} while (rd.sz > 0);

	return (0);

invalidate:
	/* Note: wmp is consumed by the kstrwrite*() calls on success,
	 * so it is only freed here on the chop/map failure paths. */
	if (segmap) {
		REF_RELE(segmap);
	}
	if (wmp)
		freemsg(wmp);

	return (error);
}

/*
 * Send the URI uri_desc_t *uri response out the socket_t *so.
 */

static int
uri_response(struct sonode *so, uri_desc_t *uri)
{
	boolean_t	first = B_TRUE;
	uri_rd_t	*rdp;
	int		error;

	/*
	 * Walk the rd_t list, sending each piece of the cached response;
	 * the first write uses the fast non-blocking path.
	 */
	for (rdp = &uri->response; rdp != NULL; rdp = rdp->next) {
		error = uri_rd_response(so, uri, rdp, first);
		if (error != 0) {
			/* Send failed; a hashed URI is no longer usable */
			if (uri->hash != URI_TEMP)
				uri_delete(uri);
			return (error);
		}
		first = B_FALSE;
	}
	return (0);
}

/*
 * The pchars[] array is indexed by a char to determine if it's a
 * valid URI path component character where:
 *
 *    pchar       = unreserved | escaped |
 *                  ":" | "@" | "&" | "=" | "+" | "$" | ","
 *
 *    unreserved  = alphanum | mark
 *
 *    alphanum    = alpha | digit
 *
 *    alpha       = lowalpha | upalpha
 *
 *    lowalpha    = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" |
 *                  "i" | "j" | "k" | "l" | "m" | "n" | "o" | "p" |
 *                  "q" | "r" | "s" | "t" | "u" | "v" | "w" | "x" |
 *                  "y" | "z"
 *
 *    upalpha     = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" |
 *                  "I" | "J" | "K" | "L" | "M" | "N" | "O" | "P" |
 *                  "Q" | "R" | "S" | "T" | "U" | "V" | "W" | "X" |
 *                  "Y" | "Z"
 *
 *    digit       = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
 *                  "8" | "9"
 *
 *    mark        = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
 *
 *    escaped     = "%" hex hex
 *    hex         = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 *                  "a" | "b" | "c" | "d" | "e" | "f"
 */

/* 1 = valid URI pchar, 0 = terminates/invalidates the path component */
static char pchars[] = {
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x08 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,	/* 0x18 - 0x1F */
    0, 1, 0, 0, 1, 1, 1, 1,	/* 0x20 - 0x27 */
    0, 0, 1, 1, 1, 1, 1, 1,	/* 0x28 - 0x2F */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x30 - 0x37 */
    1, 1, 1, 0, 0, 1, 0, 0,	/* 0x38 - 0x3F */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x40 - 0x47 */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x48 - 0x4F */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x50 - 0x57 */
    1, 1, 1, 0, 0, 0, 0, 1,	/* 0x58 - 0x5F */
    0, 1, 1, 1, 1, 1, 1, 1,	/* 0x60 - 0x67 */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x68 - 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1,	/* 0x70 - 0x77 */
    1, 1, 1, 0, 0, 0, 1, 0	/* 0x78 - 0x7F */
};

/*
 * Index mask for the 128-entry table above; presumably callers index
 * with (c & PCHARS_MASK) so 8-bit chars alias into the ASCII range —
 * NOTE(review): users of this mask are outside this file section.
 */
#define	PCHARS_MASK 0x7F

/*
 * This is the main L7 request message parse; we are called each time
 * new data is available for a socket, each time a single buffer of the
 * entire message to date is given.
 *
 * Here we parse the request looking for the URI, parse it, and if a
 * supported scheme call the scheme parser to complete the parse of any
 * headers which may further qualify the identity of the requested object
 * then look it up in the URI hash.
 *
 * Return B_TRUE for more processing.
 *
 * Note, at this time the parser supports the generic message format as
 * specified in RFC 822 with potential limitations as specified in RFC
 * 2616 for HTTP messages.
 *
 * Note, the caller supports an mblk_t chain, for now the parser(s)
 * require the complete header in a single mblk_t. This is the common
 * case and certainly for high performance environments; if at a future
 * date mblk_t chains are important the parser can be revised to process
 * mblk_t chains.
 */

boolean_t
nl7c_parse(struct sonode *so, boolean_t nonblocking, boolean_t *ret)
{
	sotpi_info_t *sti = SOTOTPI(so);
	char	*cp = (char *)sti->sti_nl7c_rcv_mp->b_rptr;
	char	*ep = (char *)sti->sti_nl7c_rcv_mp->b_wptr;
	char	*get = "GET ";
	char	*post = "POST ";
	/*
	 * "c" must be initialized: if the URI path loop below never
	 * executes (the received data ends exactly after the method
	 * token) "c" is still examined at the "more data" check, and
	 * reading it uninitialized is undefined behavior. '\0' makes
	 * that check deterministically take the "more" path.
	 */
	char	c = '\0';
	char	*uris;
	uri_desc_t *uri = NULL;
	uri_desc_t *ruri = NULL;
	mblk_t	*reqmp;
	uint32_t hv = 0;

	if ((reqmp = dupb(sti->sti_nl7c_rcv_mp)) == NULL) {
		nl7c_uri_pass_dupbfail++;
		goto pass;
	}
	/*
	 * Allocate and initialize minimal state for the request
	 * uri_desc_t, in the cache hit case this uri_desc_t will
	 * be freed.
	 */
	uri = kmem_cache_alloc(nl7c_uri_kmc, KM_SLEEP);
	REF_INIT(uri, 1, nl7c_uri_inactive, nl7c_uri_kmc);
	uri->hash = NULL;
	uri->tail = NULL;
	uri->scheme = NULL;
	uri->count = 0;
	uri->reqmp = reqmp;

	/*
	 * Set request time to current time.
	 */
	sti->sti_nl7c_rtime = gethrestime_sec();

	/*
	 * Parse the Request-Line for the URI.
	 *
	 * For backwards HTTP version compatible reasons skip any leading
	 * CRLF (or CR or LF) line terminator(s) preceding Request-Line.
	 */
	while (cp < ep && (*cp == '\r' || *cp == '\n')) {
		cp++;
	}
	while (cp < ep && *get == *cp) {
		get++;
		cp++;
	}
	if (*get != 0) {
		/* Note a "GET", check for "POST" */
		while (cp < ep && *post == *cp) {
			post++;
			cp++;
		}
		if (*post != 0) {
			if (cp == ep) {
				nl7c_uri_more_get++;
				goto more;
			}
			/* Not a "GET" or a "POST", just pass */
			nl7c_uri_pass_method++;
			goto pass;
		}
		/* "POST", don't cache but still may want to parse */
		uri->hash = URI_TEMP;
	}
	/*
	 * Skip over URI path char(s) and save start and past end pointers.
	 */
	uris = cp;
	while (cp < ep && (c = *cp) != ' ' && c != '\r') {
		if (c == '?') {
			/* Don't cache but still may want to parse */
			uri->hash = URI_TEMP;
		}
		CHASH(hv, c);
		cp++;
	}
	/*
	 * Ran out of data before the URI terminator was seen ("c" is
	 * still '\0' here if no path chars were examined at all).
	 */
	if (c != '\r' && cp == ep) {
		nl7c_uri_more_eol++;
		goto more;
	}
	/*
	 * Request-Line URI parsed, pass the rest of the request on
	 * to the the http scheme parse.
	 */
	uri->path.cp = uris;
	uri->path.ep = cp;
	uri->hvalue = hv;
	if (! nl7c_http_request(&cp, ep, uri, so) || cp == NULL) {
		/*
		 * Parse not successful or pass on request, the pointer
		 * to the parse pointer "cp" is overloaded such that ! NULL
		 * for more data and NULL for bad parse of request or pass.
		 */
		if (cp != NULL) {
			nl7c_uri_more_http++;
			goto more;
		}
		nl7c_uri_pass_http++;
		goto pass;
	}
	if (uri->nocache) {
		uri->hash = URI_TEMP;
		(void) uri_lookup(uri, B_FALSE, nonblocking);
	} else if (uri->hash == URI_TEMP) {
		uri->nocache = B_TRUE;
		(void) uri_lookup(uri, B_FALSE, nonblocking);
	}

	if (uri->hash == URI_TEMP) {
		if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
			/* Temporary URI so skip hash processing */
			nl7c_uri_request++;
			nl7c_uri_temp++;
			goto temp;
		}
		/* Not persistent so not interested in the response */
		nl7c_uri_pass_temp++;
		goto pass;
	}
	/*
	 * Check the URI hash for a cached response, save the request
	 * uri in case we need it below.
	 */
	ruri = uri;
	if ((uri = uri_lookup(uri, B_TRUE, nonblocking)) == NULL) {
		/*
		 * Failed to lookup due to nonblocking wait required,
		 * interrupted cv_wait_sig(), KM_NOSLEEP memory alloc
		 * failure, ... Just pass on this request.
		 */
		nl7c_uri_pass_addfail++;
		goto pass;
	}
	nl7c_uri_request++;
	if (uri->response.sz > 0) {
		/*
		 * We have the response cached, update recv mblk rptr
		 * to reflect the data consumed in parse.
		 */
		mblk_t	*mp = sti->sti_nl7c_rcv_mp;

		if (cp == (char *)mp->b_wptr) {
			sti->sti_nl7c_rcv_mp = mp->b_cont;
			mp->b_cont = NULL;
			freeb(mp);
		} else {
			mp->b_rptr = (unsigned char *)cp;
		}
		nl7c_uri_hit++;
		/* If logging enabled log request */
		if (nl7c_logd_enabled) {
			ipaddr_t faddr;

			if (so->so_family == AF_INET) {
				/* Only support IPv4 addrs */
				faddr = ((struct sockaddr_in *)
				    sti->sti_faddr_sa) ->sin_addr.s_addr;
			} else {
				faddr = 0;
			}
			/* XXX need to pass response type, e.g. 200, 304 */
			nl7c_logd_log(ruri, uri, sti->sti_nl7c_rtime, faddr);
		}

		/* If conditional request check for substitute response */
		if (ruri->conditional) {
			uri = nl7c_http_cond(ruri, uri);
		}

		/*
		 * Release reference on request URI, send the response out
		 * the socket, release reference on response uri, set the
		 * *ret value to B_TRUE to indicate request was consumed
		 * then return B_FALSE to indicate no more data needed.
		 */
		REF_RELE(ruri);
		(void) uri_response(so, uri);
		REF_RELE(uri);
		*ret = B_TRUE;
		return (B_FALSE);
	}
	/*
	 * Miss the cache, the request URI is in the cache waiting for
	 * application write-side data to fill it.
	 */
	nl7c_uri_miss++;
temp:
	/*
	 * A miss or temp URI for which response data is needed, link
	 * uri to so and so to uri, set WAITWRITE in the so such that
	 * read-side processing is suspended (so the next read() gets
	 * the request data) until a write() is processed by NL7C.
	 *
	 * Note, sti->sti_nl7c_uri now owns the REF_INIT() ref.
	 */
	uri->proc = so;
	sti->sti_nl7c_uri = uri;
	sti->sti_nl7c_flags |= NL7C_WAITWRITE;
	*ret = B_FALSE;
	return (B_FALSE);

more:
	/* More data is needed, note fragmented recv not supported */
	nl7c_uri_more++;

pass:
	/* Pass on this request */
	nl7c_uri_pass++;
	nl7c_uri_request++;
	if (ruri != NULL) {
		REF_RELE(ruri);
	}
	if (uri) {
		REF_RELE(uri);
	}
	sti->sti_nl7c_flags = 0;
	*ret = B_FALSE;
	return (B_FALSE);
}