Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 10 Apr 2014 18:15:35 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r264321 - head/sys/netinet
Message-ID:  <201404101815.s3AIFZx3065541@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Thu Apr 10 18:15:35 2014
New Revision: 264321
URL: http://svnweb.freebsd.org/changeset/base/264321

Log:
  Currently, the TCP slow timer can starve TCP input processing while it
  walks the list of connections in TIME_WAIT closing expired connections
  due to contention on the global TCP pcbinfo lock.
  
  To remediate, introduce a new global lock to protect the list of
  connections in TIME_WAIT.  Only acquire the TCP pcbinfo lock when
  closing an expired connection.  This limits the window of time when
  TCP input processing is stopped to the amount of time needed to close
  a single connection.
  
  Submitted by:	Julien Charbon <jcharbon@verisign.com>
  Reviewed by:	rwatson, rrs, adrian
  MFC after:	2 months

Modified:
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_timer.h
  head/sys/netinet/tcp_timewait.c
  head/sys/netinet/tcp_var.h

Modified: head/sys/netinet/tcp_timer.c
==============================================================================
--- head/sys/netinet/tcp_timer.c	Thu Apr 10 17:00:44 2014	(r264320)
+++ head/sys/netinet/tcp_timer.c	Thu Apr 10 18:15:35 2014	(r264321)
@@ -144,9 +144,7 @@ tcp_slowtimo(void)
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		INP_INFO_WLOCK(&V_tcbinfo);
-		(void) tcp_tw_2msl_scan(0);
-		INP_INFO_WUNLOCK(&V_tcbinfo);
+		tcp_tw_2msl_scan();
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();

Modified: head/sys/netinet/tcp_timer.h
==============================================================================
--- head/sys/netinet/tcp_timer.h	Thu Apr 10 17:00:44 2014	(r264320)
+++ head/sys/netinet/tcp_timer.h	Thu Apr 10 18:15:35 2014	(r264321)
@@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle;
 void	tcp_timer_init(void);
 void	tcp_timer_2msl(void *xtp);
 struct tcptw *
-	tcp_tw_2msl_scan(int _reuse);		/* XXX temporary */
+	tcp_tw_2msl_reuse(void);	/* XXX temporary? */
+void	tcp_tw_2msl_scan(void);
 void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);

Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c	Thu Apr 10 17:00:44 2014	(r264320)
+++ head/sys/netinet/tcp_timewait.c	Thu Apr 10 18:15:35 2014	(r264321)
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
+#include <sys/refcount.h>
 
 #include <vm/uma.h>
 
@@ -99,13 +100,61 @@ static int	maxtcptw;
  * The timed wait queue contains references to each of the TCP sessions
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define	V_twq_2msl			VNET(twq_2msl)
 
-static void	tcp_tw_2msl_reset(struct tcptw *, int);
-static void	tcp_tw_2msl_stop(struct tcptw *);
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define	V_tw_lock			VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d)	rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw)	rw_destroy(&(tw))
+#define TW_RLOCK(tw)		rw_rlock(&(tw))
+#define TW_WLOCK(tw)		rw_wlock(&(tw))
+#define TW_RUNLOCK(tw)		rw_runlock(&(tw))
+#define TW_WUNLOCK(tw)		rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw)	rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw)	rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw)	rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw)	rw_assert(&(tw), RA_UNLOCKED)
+
+/*
+ * tw_pcbref() bumps the reference count on an tw in order to maintain
+ * stability of an tw pointer despite the tw lock being released.
+ */
+static void
+tw_pcbref(struct tcptw *tw)
+{
+
+	KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+	refcount_acquire(&tw->tw_refcount);
+}
+
+/*
+ * Drop a refcount on an tw elevated using tw_pcbref().  Return
+ * the tw lock released.
+ */
+static int
+tw_pcbrele(struct tcptw *tw)
+{
+
+	TW_WLOCK_ASSERT(V_tw_lock);
+	KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+
+	if (!refcount_release(&tw->tw_refcount)) {
+		TW_WUNLOCK(V_tw_lock);
+		return (0);
+	}
+
+	uma_zfree(V_tcptw_zone, tw);
+	TW_WUNLOCK(V_tw_lock);
+	return (1);
+}
+
+static void	tcp_tw_2msl_reset(struct tcptw *, int ream);
+static void	tcp_tw_2msl_stop(struct tcptw *, int reuse);
 
 static int
 tcptw_auto_size(void)
@@ -172,6 +221,7 @@ tcp_tw_init(void)
 	else
 		uma_zone_set_max(V_tcptw_zone, maxtcptw);
 	TAILQ_INIT(&V_twq_2msl);
+	TW_LOCK_INIT(V_tw_lock, "tcptw");
 }
 
 #ifdef VIMAGE
@@ -181,10 +231,11 @@ tcp_tw_destroy(void)
 	struct tcptw *tw;
 
 	INP_INFO_WLOCK(&V_tcbinfo);
-	while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
+	while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
 		tcp_twclose(tw, 0);
 	INP_INFO_WUNLOCK(&V_tcbinfo);
 
+	TW_LOCK_DESTROY(V_tw_lock);
 	uma_zdestroy(V_tcptw_zone);
 }
 #endif
@@ -205,7 +256,7 @@ tcp_twstart(struct tcpcb *tp)
 	int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* tcp_tw_2msl_reset(). */
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (V_nolocaltimewait) {
@@ -230,7 +281,7 @@ tcp_twstart(struct tcpcb *tp)
 
 	tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
 	if (tw == NULL) {
-		tw = tcp_tw_2msl_scan(1);
+		tw = tcp_tw_2msl_reuse();
 		if (tw == NULL) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
@@ -239,6 +290,7 @@ tcp_twstart(struct tcpcb *tp)
 		}
 	}
 	tw->tw_inpcb = inp;
+	refcount_init(&tw->tw_refcount, 1);
 
 	/*
 	 * Recover last window size sent.
@@ -357,7 +409,6 @@ tcp_twcheck(struct inpcb *inp, struct tc
 	int thflags;
 	tcp_seq seq;
 
-	/* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
@@ -459,11 +510,11 @@ tcp_twclose(struct tcptw *tw, int reuse)
 	inp = tw->tw_inpcb;
 	KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
 	KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* tcp_tw_2msl_stop(). */
+	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* in_pcbfree() */
 	INP_WLOCK_ASSERT(inp);
 
 	tw->tw_inpcb = NULL;
-	tcp_tw_2msl_stop(tw);
+	tcp_tw_2msl_stop(tw, reuse);
 	inp->inp_ppcb = NULL;
 	in_pcbdrop(inp);
 
@@ -495,11 +546,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
 	} else
 		in_pcbfree(inp);
 	TCPSTAT_INC(tcps_closed);
-	crfree(tw->tw_cred);
-	tw->tw_cred = NULL;
-	if (reuse)
-		return;
-	uma_zfree(V_tcptw_zone, tw);
 }
 
 int
@@ -617,34 +663,88 @@ tcp_tw_2msl_reset(struct tcptw *tw, int 
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+	TW_WLOCK(V_tw_lock);
 	if (rearm)
 		TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
 	tw->tw_time = ticks + 2 * tcp_msl;
 	TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+	TW_WUNLOCK(V_tw_lock);
 }
 
 static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+	TW_WLOCK(V_tw_lock);
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+	crfree(tw->tw_cred);
+	tw->tw_cred = NULL;
+
+	if (!reuse) {
+		tw_pcbrele(tw);
+		return;
+	}
+
+	TW_WUNLOCK(V_tw_lock);
 }
 
 struct tcptw *
-tcp_tw_2msl_scan(int reuse)
+tcp_tw_2msl_reuse(void)
 {
-	struct tcptw *tw;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+	struct tcptw *tw;
+
+	TW_WLOCK(V_tw_lock);
+	tw = TAILQ_FIRST(&V_twq_2msl);
+	if (tw == NULL) {
+		TW_WUNLOCK(V_tw_lock);
+		return NULL;
+	}
+	TW_WUNLOCK(V_tw_lock);
+
+	INP_WLOCK(tw->tw_inpcb);
+	tcp_twclose(tw, 1);
+
+	return (tw);
+}
+
+void
+tcp_tw_2msl_scan(void)
+{
+
+	struct tcptw *tw;
 	for (;;) {
+		TW_RLOCK(V_tw_lock);
 		tw = TAILQ_FIRST(&V_twq_2msl);
-		if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+		if (tw == NULL || ((tw->tw_time - ticks) > 0)) {
+			TW_RUNLOCK(V_tw_lock);
+			break;
+		}
+		tw_pcbref(tw);
+		TW_RUNLOCK(V_tw_lock);
+
+		/* Close timewait state */
+		if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+			TW_WLOCK(V_tw_lock);
+			if (tw_pcbrele(tw))
+				continue;
+
+			KASSERT(tw->tw_inpcb != NULL,
+			    ("%s: tw->tw_inpcb == NULL", __func__));
+
+			INP_WLOCK(tw->tw_inpcb);
+			tcp_twclose(tw, 0);
+			INP_INFO_WUNLOCK(&V_tcbinfo);
+		} else {
+			/* INP_INFO lock is busy, continue later */
+			TW_WLOCK(V_tw_lock);
+			tw_pcbrele(tw);
 			break;
-		INP_WLOCK(tw->tw_inpcb);
-		tcp_twclose(tw, reuse);
-		if (reuse)
-			return (tw);
+		}
 	}
-	return (NULL);
 }

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Thu Apr 10 17:00:44 2014	(r264320)
+++ head/sys/netinet/tcp_var.h	Thu Apr 10 18:15:35 2014	(r264321)
@@ -353,8 +353,7 @@ struct tcptw {
 	u_int		t_starttime;
 	int		tw_time;
 	TAILQ_ENTRY(tcptw) tw_2msl;
-	void		*tw_pspare;	/* TCP_SIGNATURE */
-	u_int		*tw_spare;	/* TCP_SIGNATURE */
+	u_int		tw_refcount;	/* refcount */
 };
 
 #define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201404101815.s3AIFZx3065541>