From owner-svn-src-head@FreeBSD.ORG Thu Apr 10 18:15:36 2014 Return-Path: Delivered-To: svn-src-head@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 375985F1; Thu, 10 Apr 2014 18:15:36 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 24128125F; Thu, 10 Apr 2014 18:15:36 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s3AIFa48065545; Thu, 10 Apr 2014 18:15:36 GMT (envelope-from jhb@svn.freebsd.org) Received: (from jhb@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s3AIFZx3065541; Thu, 10 Apr 2014 18:15:35 GMT (envelope-from jhb@svn.freebsd.org) Message-Id: <201404101815.s3AIFZx3065541@svn.freebsd.org> From: John Baldwin Date: Thu, 10 Apr 2014 18:15:35 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r264321 - head/sys/netinet X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.17 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 10 Apr 2014 18:15:36 -0000 Author: jhb Date: Thu Apr 10 18:15:35 2014 New Revision: 264321 URL: http://svnweb.freebsd.org/changeset/base/264321 Log: Currently, the TCP slow timer can starve TCP input processing while it walks the list of connections in TIME_WAIT closing expired connections due to contention on the global TCP pcbinfo lock. To remediate, introduce a new global lock to protect the list of connections in TIME_WAIT. Only acquire the TCP pcbinfo lock when closing an expired connection. This limits the window of time when TCP input processing is stopped to the amount of time needed to close a single connection. Submitted by: Julien Charbon Reviewed by: rwatson, rrs, adrian MFC after: 2 months Modified: head/sys/netinet/tcp_timer.c head/sys/netinet/tcp_timer.h head/sys/netinet/tcp_timewait.c head/sys/netinet/tcp_var.h Modified: head/sys/netinet/tcp_timer.c ============================================================================== --- head/sys/netinet/tcp_timer.c Thu Apr 10 17:00:44 2014 (r264320) +++ head/sys/netinet/tcp_timer.c Thu Apr 10 18:15:35 2014 (r264321) @@ -144,9 +144,7 @@ tcp_slowtimo(void) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); - (void) tcp_tw_2msl_scan(0); - INP_INFO_WUNLOCK(&V_tcbinfo); + tcp_tw_2msl_scan(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); Modified: head/sys/netinet/tcp_timer.h ============================================================================== --- head/sys/netinet/tcp_timer.h Thu Apr 10 17:00:44 2014 (r264320) +++ head/sys/netinet/tcp_timer.h Thu Apr 10 18:15:35 2014 (r264321) @@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle; void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); struct tcptw * - tcp_tw_2msl_scan(int _reuse); /* XXX temporary */ + tcp_tw_2msl_reuse(void); /* XXX temporary? */ +void tcp_tw_2msl_scan(void); void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); Modified: head/sys/netinet/tcp_timewait.c ============================================================================== --- head/sys/netinet/tcp_timewait.c Thu Apr 10 17:00:44 2014 (r264320) +++ head/sys/netinet/tcp_timewait.c Thu Apr 10 18:15:35 2014 (r264321) @@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include @@ -99,13 +100,61 @@ static int maxtcptw; * The timed wait queue contains references to each of the TCP sessions * currently in the TIME_WAIT state. The queue pointers, including the * queue pointers in each tcptw structure, are protected using the global - * tcbinfo lock, which must be held over queue iteration and modification. + * timewait lock, which must be held over queue iteration and modification. */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) -static void tcp_tw_2msl_reset(struct tcptw *, int); -static void tcp_tw_2msl_stop(struct tcptw *); +/* Global timewait lock */ +static VNET_DEFINE(struct rwlock, tw_lock); +#define V_tw_lock VNET(tw_lock) + +#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) +#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) +#define TW_RLOCK(tw) rw_rlock(&(tw)) +#define TW_WLOCK(tw) rw_wlock(&(tw)) +#define TW_RUNLOCK(tw) rw_runlock(&(tw)) +#define TW_WUNLOCK(tw) rw_wunlock(&(tw)) +#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) +#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) +#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) +#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) + +/* + * tw_pcbref() bumps the reference count on an tw in order to maintain + * stability of an tw pointer despite the tw lock being released. + */ +static void +tw_pcbref(struct tcptw *tw) +{ + + KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__)); + refcount_acquire(&tw->tw_refcount); +} + +/* + * Drop a refcount on an tw elevated using tw_pcbref(). Return + * the tw lock released. + */ +static int +tw_pcbrele(struct tcptw *tw) +{ + + TW_WLOCK_ASSERT(V_tw_lock); + KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__)); + + if (!refcount_release(&tw->tw_refcount)) { + TW_WUNLOCK(V_tw_lock); + return (0); + } + + uma_zfree(V_tcptw_zone, tw); + TW_WUNLOCK(V_tw_lock); + return (1); +} + +static void tcp_tw_2msl_reset(struct tcptw *, int ream); +static void tcp_tw_2msl_stop(struct tcptw *, int reuse); static int tcptw_auto_size(void) @@ -172,6 +221,7 @@ tcp_tw_init(void) else uma_zone_set_max(V_tcptw_zone, maxtcptw); TAILQ_INIT(&V_twq_2msl); + TW_LOCK_INIT(V_tw_lock, "tcptw"); } #ifdef VIMAGE @@ -181,10 +231,11 @@ tcp_tw_destroy(void) struct tcptw *tw; INP_INFO_WLOCK(&V_tcbinfo); - while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) + while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); INP_INFO_WUNLOCK(&V_tcbinfo); + TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); } #endif @@ -205,7 +256,7 @@ tcp_twstart(struct tcpcb *tp) int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if (V_nolocaltimewait) { @@ -230,7 +281,7 @@ tcp_twstart(struct tcpcb *tp) tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); if (tw == NULL) { - tw = tcp_tw_2msl_scan(1); + tw = tcp_tw_2msl_reuse(); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) @@ -239,6 +290,7 @@ tcp_twstart(struct tcpcb *tp) } } tw->tw_inpcb = inp; + refcount_init(&tw->tw_refcount, 1); /* * Recover last window size sent. @@ -357,7 +409,6 @@ tcp_twcheck(struct inpcb *inp, struct tc int thflags; tcp_seq seq; - /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */ INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); @@ -459,11 +510,11 @@ tcp_twclose(struct tcptw *tw, int reuse) inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ INP_WLOCK_ASSERT(inp); tw->tw_inpcb = NULL; - tcp_tw_2msl_stop(tw); + tcp_tw_2msl_stop(tw, reuse); inp->inp_ppcb = NULL; in_pcbdrop(inp); @@ -495,11 +546,6 @@ tcp_twclose(struct tcptw *tw, int reuse) } else in_pcbfree(inp); TCPSTAT_INC(tcps_closed); - crfree(tw->tw_cred); - tw->tw_cred = NULL; - if (reuse) - return; - uma_zfree(V_tcptw_zone, tw); } int @@ -617,34 +663,88 @@ tcp_tw_2msl_reset(struct tcptw *tw, int INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tw->tw_inpcb); + + TW_WLOCK(V_tw_lock); if (rearm) TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); tw->tw_time = ticks + 2 * tcp_msl; TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); + TW_WUNLOCK(V_tw_lock); } static void -tcp_tw_2msl_stop(struct tcptw *tw) +tcp_tw_2msl_stop(struct tcptw *tw, int reuse) { INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + TW_WLOCK(V_tw_lock); TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); + crfree(tw->tw_cred); + tw->tw_cred = NULL; + + if (!reuse) { + tw_pcbrele(tw); + return; + } + + TW_WUNLOCK(V_tw_lock); } struct tcptw * -tcp_tw_2msl_scan(int reuse) +tcp_tw_2msl_reuse(void) { - struct tcptw *tw; INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + + struct tcptw *tw; + + TW_WLOCK(V_tw_lock); + tw = TAILQ_FIRST(&V_twq_2msl); + if (tw == NULL) { + TW_WUNLOCK(V_tw_lock); + return NULL; + } + TW_WUNLOCK(V_tw_lock); + + INP_WLOCK(tw->tw_inpcb); + tcp_twclose(tw, 1); + + return (tw); +} + +void +tcp_tw_2msl_scan(void) +{ + + struct tcptw *tw; for (;;) { + TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) + if (tw == NULL || ((tw->tw_time - ticks) > 0)) { + TW_RUNLOCK(V_tw_lock); + break; + } + tw_pcbref(tw); + TW_RUNLOCK(V_tw_lock); + + /* Close timewait state */ + if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) { + TW_WLOCK(V_tw_lock); + if (tw_pcbrele(tw)) + continue; + + KASSERT(tw->tw_inpcb != NULL, + ("%s: tw->tw_inpcb == NULL", __func__)); + + INP_WLOCK(tw->tw_inpcb); + tcp_twclose(tw, 0); + INP_INFO_WUNLOCK(&V_tcbinfo); + } else { + /* INP_INFO lock is busy, continue later */ + TW_WLOCK(V_tw_lock); + tw_pcbrele(tw); break; - INP_WLOCK(tw->tw_inpcb); - tcp_twclose(tw, reuse); - if (reuse) - return (tw); + } } - return (NULL); } Modified: head/sys/netinet/tcp_var.h ============================================================================== --- head/sys/netinet/tcp_var.h Thu Apr 10 17:00:44 2014 (r264320) +++ head/sys/netinet/tcp_var.h Thu Apr 10 18:15:35 2014 (r264321) @@ -353,8 +353,7 @@ struct tcptw { u_int t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; - void *tw_pspare; /* TCP_SIGNATURE */ - u_int *tw_spare; /* TCP_SIGNATURE */ + u_int tw_refcount; /* refcount */ }; #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)