# HG changeset patch # User Jerry Jelinek # Date 1329162626 0 # Node ID 417f3f83e896151fa8a238aa3fc94978912f2ed1 # Parent b92428ed8d2a667a0f8c9755141013ecf9e2b4d2 3925 IP DCE does not scale Reviewed by: Keith M Wesolowski Reviewed by: Theo Schlossnagle Reviewed by: Sebastien Roy Approved by: Dan McDonald diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/ip.h --- a/usr/src/uts/common/inet/ip.h Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/ip.h Mon Feb 13 19:50:26 2012 +0000 @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 1990 Mentat Inc. */ @@ -2195,6 +2196,8 @@ */ ixa_notify_t ixa_notify; /* Registered upcall notify function */ void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ + + uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */ }; /* @@ -2266,6 +2269,14 @@ #define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ /* + * Trivial state machine used to synchronize IXA cleanup for TCP connections. + * See conn_ixa_cleanup(). + */ +#define IXATC_IDLE 0x00000000 +#define IXATC_INPROGRESS 0x00000001 +#define IXATC_COMPLETE 0x00000002 + +/* * Simplistic way to set the ixa_xmit_hint for locally generated traffic * and forwarded traffic. The shift amount are based on the size of the * structs to discard the low order bits which don't have much if any variation @@ -3030,6 +3041,7 @@ #define ips_ip_strict_src_multihoming ips_propinfo_tbl[80].prop_cur_uval #define ips_ipv6_strict_src_multihoming ips_propinfo_tbl[81].prop_cur_uval #define ips_ipv6_drop_inbound_icmpv6 ips_propinfo_tbl[82].prop_cur_bval +#define ips_ip_dce_reclaim_threshold ips_propinfo_tbl[83].prop_cur_uval extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/ip/ip_attr.c --- a/usr/src/uts/common/inet/ip/ip_attr.c Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/ip/ip_attr.c Mon Feb 13 19:50:26 2012 +0000 @@ -1176,6 +1176,59 @@ } } +static mblk_t * +tcp_ixa_cleanup_getmblk(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + int need_retry; + mblk_t *mp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + /* + * It's possible that someone else came in and started cleaning up + * another connection between the time we verified this one is not being + * cleaned up and the time we actually get the shared mblk. If that's + * the case, we've dropped the lock, and some other thread may have + * cleaned up this connection again, and is still waiting for + * notification of that cleanup's completion. Therefore we need to + * recheck. + */ + do { + need_retry = 0; + while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + need_retry = 1; + cv_wait(&tcps->tcps_ixa_cleanup_ready_cv, + &tcps->tcps_ixa_cleanup_lock); + } + } while (need_retry); + + /* + * We now have the lock and the mblk; now make sure that no one else can + * try to clean up this connection or enqueue it for cleanup, clear the + * mblk pointer for this stack, drop the lock, and return the mblk. + */ + ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock)); + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE); + ASSERT(tcps->tcps_ixa_cleanup_mp == mp); + ASSERT(mp != NULL); + + connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS; + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + return (mp); +} + /* * Used to run ixa_cleanup_stale inside the tcp squeue. * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp @@ -1195,11 +1248,39 @@ mutex_enter(&tcps->tcps_ixa_cleanup_lock); ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); + connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE; tcps->tcps_ixa_cleanup_mp = mp; - cv_signal(&tcps->tcps_ixa_cleanup_cv); + cv_signal(&tcps->tcps_ixa_cleanup_ready_cv); + /* + * It is possible for any number of threads to be waiting for cleanup of + * different connections. Absent a per-connection (or per-IXA) CV, we + * need to wake them all up even though only one can be waiting on this + * particular cleanup. + */ + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); mutex_exit(&tcps->tcps_ixa_cleanup_lock); } +static void +tcp_ixa_cleanup_wait_and_finish(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE); + + while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE); + connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE; + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); + + mutex_exit(&tcps->tcps_ixa_cleanup_lock); +} /* * ipcl_walk() function to help release any IRE, NCE, or DCEs that @@ -1214,21 +1295,8 @@ if (IPCL_IS_TCP(connp)) { mblk_t *mp; - tcp_stack_t *tcps; - tcps = connp->conn_netstack->netstack_tcp; - - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { - /* - * Multiple concurrent cleanups; need to have the last - * one run since it could be an unplumb. - */ - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - tcps->tcps_ixa_cleanup_mp = NULL; - mutex_exit(&tcps->tcps_ixa_cleanup_lock); + mp = tcp_ixa_cleanup_getmblk(connp); if (connp->conn_sqp->sq_run == curthread) { /* Already on squeue */ @@ -1237,15 +1305,8 @@ CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); - - /* Wait until tcp_ixa_cleanup has run */ - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while (tcps->tcps_ixa_cleanup_mp == NULL) { - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - mutex_exit(&tcps->tcps_ixa_cleanup_lock); } + tcp_ixa_cleanup_wait_and_finish(connp); } else if (IPCL_IS_SCTP(connp)) { sctp_t *sctp; sctp_faddr_t *fp; diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/ip/ip_dce.c --- a/usr/src/uts/common/inet/ip/ip_dce.c Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/ip/ip_dce.c Mon Feb 13 19:50:26 2012 +0000 @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ #include @@ -28,10 +29,12 @@ #include #include #include +#include #include #include #include #include +#include #define _SUN_TPI_VERSION 2 #include @@ -102,7 +105,19 @@ static void dce_make_condemned(dce_t *); static kmem_cache_t *dce_cache; +static kthread_t *dce_reclaim_thread; +static kmutex_t dce_reclaim_lock; +static kcondvar_t dce_reclaim_cv; +static int dce_reclaim_shutdown; +/* Global so it can be tuned in /etc/system. This must be a power of two. */ +uint_t ip_dce_hash_size = 1024; + +/* The time in seconds between executions of the IP DCE reclaim worker. */ +uint_t ip_dce_reclaim_interval = 60; + +/* The factor of the DCE threshold at which to start hard reclaims */ +uint_t ip_dce_reclaim_threshold_hard = 2; /* Operates on a uint64_t */ #define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) @@ -117,6 +132,11 @@ uint_t fraction_pmtu = fraction*4; uint_t hash; dce_t *dce, *nextdce; + hrtime_t seed = gethrtime(); + uint_t retained = 0; + uint_t max = ipst->ips_ip_dce_reclaim_threshold; + + max *= ip_dce_reclaim_threshold_hard; rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { @@ -132,13 +152,21 @@ } else { mutex_exit(&dce->dce_lock); } - hash = RANDOM_HASH((uint64_t)(uintptr_t)dce); - if (dce->dce_flags & DCEF_PMTU) { - if (hash % fraction_pmtu != 0) - continue; - } else { - if (hash % fraction != 0) - continue; + + if (max == 0 || retained < max) { + hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed)); + + if (dce->dce_flags & DCEF_PMTU) { + if (hash % fraction_pmtu != 0) { + retained++; + continue; + } + } else { + if (hash % fraction != 0) { + retained++; + continue; + } + } } IP_STAT(ipst, ip_dce_reclaim_deleted); @@ -175,17 +203,19 @@ } /* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. + * Called by dce_reclaim_worker() below, and no one else. Typically this will + * mean that the number of entries in the hash buckets has exceeded a tunable + * threshold. */ -/* ARGSUSED */ -void -ip_dce_reclaim(void *args) +static void +ip_dce_reclaim(void) { netstack_handle_t nh; netstack_t *ns; ip_stack_t *ipst; + ASSERT(curthread == dce_reclaim_thread); + netstack_next_init(&nh); while ((ns = netstack_next(&nh)) != NULL) { /* @@ -196,26 +226,75 @@ netstack_rele(ns); continue; } - ip_dce_reclaim_stack(ipst); + if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0) + ip_dce_reclaim_stack(ipst); netstack_rele(ns); } netstack_next_fini(&nh); } +/* ARGSUSED */ +static void +dce_reclaim_worker(void *arg) +{ + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr, + "dce_reclaim_worker"); + + mutex_enter(&dce_reclaim_lock); + while (!dce_reclaim_shutdown) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock, + ddi_get_lbolt() + ip_dce_reclaim_interval * hz); + CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock); + + if (dce_reclaim_shutdown) + break; + + mutex_exit(&dce_reclaim_lock); + ip_dce_reclaim(); + mutex_enter(&dce_reclaim_lock); + } + + ASSERT(MUTEX_HELD(&dce_reclaim_lock)); + dce_reclaim_thread = NULL; + dce_reclaim_shutdown = 0; + cv_broadcast(&dce_reclaim_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops the lock */ + + thread_exit(); +} + void dce_g_init(void) { dce_cache = kmem_cache_create("dce_cache", - sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0); + sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL); + + dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker, + NULL, 0, &p0, TS_RUN, minclsyspri); } void dce_g_destroy(void) { + mutex_enter(&dce_reclaim_lock); + dce_reclaim_shutdown = 1; + cv_signal(&dce_reclaim_cv); + while (dce_reclaim_thread != NULL) + cv_wait(&dce_reclaim_cv, &dce_reclaim_lock); + mutex_exit(&dce_reclaim_lock); + + cv_destroy(&dce_reclaim_cv); + mutex_destroy(&dce_reclaim_lock); + kmem_cache_destroy(dce_cache); } - /* * Allocate a default DCE and a hash table for per-IP address DCEs */ @@ -234,7 +313,7 @@ ipst->ips_dce_default->dce_ipst = ipst; /* This must be a power of two since we are using IRE_ADDR_HASH macro */ - ipst->ips_dce_hashsize = 256; + ipst->ips_dce_hashsize = ip_dce_hash_size; ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP); ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize * @@ -414,6 +493,12 @@ hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v4[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (dce->dce_v4addr == dst) { @@ -447,6 +532,7 @@ dce->dce_ptpn = &dcb->dcb_dce; dcb->dcb_dce = dce; dce->dce_bucket = dcb; + atomic_add_32(&dcb->dcb_cnt, 1); dce_refhold(dce); /* For the caller */ rw_exit(&dcb->dcb_lock); @@ -476,6 +562,12 @@ hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v6[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/ip/ip_tunables.c --- a/usr/src/uts/common/inet/ip/ip_tunables.c Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/ip/ip_tunables.c Mon Feb 13 19:50:26 2012 +0000 @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -909,6 +910,11 @@ #else { "", 0, NULL, NULL, {0}, {0} }, #endif + + { "_dce_reclaim_threshold", MOD_PROTO_IP, + mod_set_uint32, mod_get_uint32, + {1, 100000, 32}, {32} }, + { "mtu", MOD_PROTO_IPV4, NULL, ip_get_mtu, {0}, {0} }, { "mtu", MOD_PROTO_IPV6, NULL, ip_get_mtu, {0}, {0} }, diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/ip_stack.h --- a/usr/src/uts/common/inet/ip_stack.h Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/ip_stack.h Mon Feb 13 19:50:26 2012 +0000 @@ -264,6 +264,7 @@ uint_t ips_dce_hashsize; struct dcb_s *ips_dce_hash_v4; struct dcb_s *ips_dce_hash_v6; + uint_t ips_dce_reclaim_needed; /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/squeue.c --- a/usr/src/uts/common/inet/squeue.c Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/squeue.c Mon Feb 13 19:50:26 2012 +0000 @@ -546,6 +546,7 @@ ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT(sqp->sq_first != NULL); now = gethrtime(); + sqp->sq_run = curthread; sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns); /* diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/tcp/tcp.c --- a/usr/src/uts/common/inet/tcp/tcp.c Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/tcp/tcp.c Mon Feb 13 19:50:26 2012 +0000 @@ -3793,7 +3793,8 @@ ASSERT(error == 0); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); - cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3858,7 +3859,8 @@ freeb(tcps->tcps_ixa_cleanup_mp); tcps->tcps_ixa_cleanup_mp = NULL; - cv_destroy(&tcps->tcps_ixa_cleanup_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); mutex_destroy(&tcps->tcps_ixa_cleanup_lock); /* diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/common/inet/tcp_stack.h --- a/usr/src/uts/common/inet/tcp_stack.h Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/common/inet/tcp_stack.h Mon Feb 13 19:50:26 2012 +0000 @@ -101,7 +101,8 @@ /* Used to synchronize access when reclaiming memory */ mblk_t *tcps_ixa_cleanup_mp; kmutex_t tcps_ixa_cleanup_lock; - kcondvar_t tcps_ixa_cleanup_cv; + kcondvar_t tcps_ixa_cleanup_ready_cv; + kcondvar_t tcps_ixa_cleanup_done_cv; /* Variables for handling kmem reclaim call back. */ kmutex_t tcps_reclaim_lock; diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/intel/ip/ip.global-objs.debug64 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 Mon Feb 13 19:50:26 2012 +0000 @@ -21,6 +21,7 @@ # # Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2011 Nexenta Systems, Inc. All rights reserved +# Copyright 2012 Joyent, Inc. All rights reserved # arp_m_tbl @@ -49,6 +50,10 @@ cl_sctp_unlisten conn_drain_nthreads dce_cache +dce_reclaim_cv +dce_reclaim_lock +dce_reclaim_shutdown +dce_reclaim_thread default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -107,6 +112,9 @@ ip6opt_ls ip_cgtp_filter_rev ip_conn_cache +ip_dce_hash_size +ip_dce_reclaim_interval +ip_dce_reclaim_threshold_hard ip_debug ip_g_all_ones ip_helper_stream_info diff -r b92428ed8d2a -r 417f3f83e896 usr/src/uts/intel/ip/ip.global-objs.obj64 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 Tue Jul 03 12:49:09 2012 +0000 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 Mon Feb 13 19:50:26 2012 +0000 @@ -21,6 +21,7 @@ # # Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. # Copyright 2011 Nexenta Systems, Inc. All rights reserved +# Copyright 2012 Joyent, Inc. All rights reserved # arp_m_tbl @@ -49,6 +50,10 @@ cl_sctp_unlisten conn_drain_nthreads dce_cache +dce_reclaim_cv +dce_reclaim_lock +dce_reclaim_shutdown +dce_reclaim_thread default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -107,6 +112,9 @@ ip6opt_ls ip_cgtp_filter_rev ip_conn_cache +ip_dce_hash_size +ip_dce_reclaim_interval +ip_dce_reclaim_threshold_hard ip_debug ip_g_all_ones ip_helper_stream_info