Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 8 Dec 2008 20:27:00 +0000 (UTC)
From:      Robert Watson <rwatson@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r185775 - head/sys/netinet
Message-ID:  <200812082027.mB8KR0Vt069831@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rwatson
Date: Mon Dec  8 20:27:00 2008
New Revision: 185775
URL: http://svn.freebsd.org/changeset/base/185775

Log:
  Move from solely write-locking the global tcbinfo in tcp_input()
  to read-locking in the TCP input path, allowing greater TCP
  input parallelism where multiple ithreads or ithread and netisr
  are able to run in parallel.  Previously, most TCP input paths
  held a write lock on the global tcbinfo lock, effectively
  serializing TCP input.
  
  Before looking up the connection, acquire a write lock if a
  potentially state-changing flag is set on the TCP segment header
  (FIN, RST, SYN), and otherwise a read lock.  We may later have
  to upgrade to a write lock in certain cases (ACKs received by the
  syncache or during TIMEWAIT) in order to support global state
  transitions, but this is never required for steady-state packets.
  
  Upgrading from a read lock to a write lock must be done as a
  trylock operation to avoid deadlocks, and actually violates the
  lock order as the tcbinfo lock precedes the inpcb lock held at
  the time of upgrade.  If the trylock fails, we bump the refcount
  on the inpcb, drop both locks, and re-acquire in-order.  If
  another thread has freed the connection while the locks are
  dropped, we free the inpcb and repeat the lookup (this should
  hardly ever or never happen in practice).
  
  For now, maintain a number of new counters measuring how many
  times various cases execute, and in particular whether various
  optimistic assumptions about when read locks can be used, whether
  upgrades are done using the fast path, and whether connections
  close in practice in the above-described race, actually occur.
  
  MFC after:	6 weeks
  Discussed with:	kmacy
  Reviewed by:	bz, gnn, kmacy
  Tested by:	kmacy

Modified:
  head/sys/netinet/tcp_input.c

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Mon Dec  8 20:21:57 2008	(r185774)
+++ head/sys/netinet/tcp_input.c	Mon Dec  8 20:27:00 2008	(r185775)
@@ -166,6 +166,30 @@ SYSCTL_V_INT(V_NET, vnet_inet, _net_inet
     CTLFLAG_RW, tcp_autorcvbuf_max, 0,
     "Max size of automatic receive buffer");
 
+int	tcp_read_locking = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
+    &tcp_read_locking, 0, "Enable read locking strategy");
+
+int	tcp_rlock_atfirst;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, rlock_atfirst, CTLFLAG_RD,
+    &tcp_rlock_atfirst, 0, "");
+
+int	tcp_wlock_atfirst;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_wlock_atfirst, CTLFLAG_RD,
+    &tcp_wlock_atfirst, 0, "");
+
+int	tcp_wlock_upgraded;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_upgraded, CTLFLAG_RD,
+    &tcp_wlock_upgraded, 0, "");
+
+int	tcp_wlock_relocked;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_relocked, CTLFLAG_RD,
+    &tcp_wlock_relocked, 0, "");
+
+int	tcp_wlock_looped;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, wlock_looped, CTLFLAG_RD,
+    &tcp_wlock_looped, 0, "");
+
 #ifdef VIMAGE_GLOBALS
 struct inpcbhead tcb;
 struct inpcbinfo tcbinfo;
@@ -174,7 +198,8 @@ struct inpcbinfo tcbinfo;
 
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
-		     struct socket *, struct tcpcb *, int, int, uint8_t);
+		     struct socket *, struct tcpcb *, int, int, uint8_t,
+		     int);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
@@ -298,6 +323,10 @@ tcp_input(struct mbuf *m, int off0)
 #endif
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
+	int ti_locked;
+#define	TI_UNLOCKED	1
+#define	TI_RLOCKED	2
+#define	TI_WLOCKED	3
 
 #ifdef TCPDEBUG
 	/*
@@ -450,11 +479,34 @@ tcp_input(struct mbuf *m, int off0)
 	drop_hdrlen = off0 + off;
 
 	/*
-	 * Locate pcb for segment.
-	 */
-	INP_INFO_WLOCK(&V_tcbinfo);
+	 * Locate pcb for segment, which requires a lock on tcbinfo.
+	 * Optimistically acquire a global read lock unless header flags
+	 * necessarily imply a state change.  There are two cases where we
+	 * might discover later we need a write lock despite the flags: ACKs
+	 * moving a connection out of the syncache, and ACKs relating to a
+	 * connection in TIMEWAIT.
+	 */
+	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tcp_read_locking == 0) {
+		INP_INFO_WLOCK(&V_tcbinfo);
+		ti_locked = TI_WLOCKED;
+		tcp_wlock_atfirst++;
+	} else {
+		INP_INFO_RLOCK(&V_tcbinfo);
+		ti_locked = TI_RLOCKED;
+		tcp_rlock_atfirst++;
+	}
+
 findpcb:
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+#ifdef INVARIANTS
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	else
+		panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
+#endif
+
 #ifdef IPFIREWALL_FORWARD
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
@@ -561,12 +613,44 @@ findpcb:
 	}
 
 	/*
-	 * A previous connection in TIMEWAIT state is supposed to catch
-	 * stray or duplicate segments arriving late.  If this segment
-	 * was a legitimate new connection attempt the old INPCB gets
-	 * removed and we can try again to find a listening socket.
+	 * A previous connection in TIMEWAIT state is supposed to catch stray
+	 * or duplicate segments arriving late.  If this segment was a
+	 * legitimate new connection attempt the old INPCB gets removed and
+	 * we can try again to find a listening socket.
+	 *
+	 * At this point, due to earlier optimism, we may hold a read lock on
+	 * the inpcbinfo, rather than a write lock.  If so, we need to
+	 * upgrade, or if that fails, acquire a reference on the inpcb, drop
+	 * all locks, acquire a global write lock, and then re-acquire the
+	 * inpcb lock.  We may at that point discover that another thread has
+	 * tried to free the inpcb, in which case we need to loop back and
+	 * try to find a new inpcb to deliver to.
 	 */
 	if (inp->inp_vflag & INP_TIMEWAIT) {
+		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+		    ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
+
+		if (ti_locked == TI_RLOCKED) {
+			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
+				in_pcbref(inp);
+				INP_WUNLOCK(inp);
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				INP_INFO_WLOCK(&V_tcbinfo);
+				ti_locked = TI_WLOCKED;
+				INP_WLOCK(inp);
+				if (in_pcbrele(inp)) {
+					tcp_wlock_looped++;
+					inp = NULL;
+					goto findpcb;
+				}
+				tcp_wlock_relocked++;
+			} else {
+				ti_locked = TI_WLOCKED;
+				tcp_wlock_upgraded++;
+			}
+		}
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
@@ -588,6 +672,40 @@ findpcb:
 		goto dropwithreset;
 	}
 
+	/*
+	 * We've identified a valid inpcb, but it could be that we need an
+	 * inpcbinfo write lock and have only a read lock.  In this case,
+	 * attempt to upgrade/relock using the same strategy as the TIMEWAIT
+	 * case above.
+	 */
+	if (tp->t_state != TCPS_ESTABLISHED ||
+	    (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tcp_read_locking == 0) {
+		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+		    ("%s: upgrade check ti_locked %d", __func__, ti_locked));
+
+		if (ti_locked == TI_RLOCKED) {
+			if (rw_try_upgrade(&V_tcbinfo.ipi_lock) == 0) {
+				in_pcbref(inp);
+				INP_WUNLOCK(inp);
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				INP_INFO_WLOCK(&V_tcbinfo);
+				ti_locked = TI_WLOCKED;
+				INP_WLOCK(inp);
+				if (in_pcbrele(inp)) {
+					tcp_wlock_looped++;
+					inp = NULL;
+					goto findpcb;
+				}
+				tcp_wlock_relocked++;
+			} else {
+				ti_locked = TI_WLOCKED;
+				tcp_wlock_upgraded++;
+			}
+		}
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	}
+
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
@@ -700,7 +818,7 @@ findpcb:
 			 * the mbuf chain and unlocks the inpcb.
 			 */
 			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
-			    iptos);
+			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return;
 		}
@@ -900,13 +1018,18 @@ findpcb:
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
-	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
+	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return;
 
 dropwithreset:
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	else
+		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
+	ti_locked = TI_UNLOCKED;
 
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -917,10 +1040,16 @@ dropwithreset:
 	goto drop;
 
 dropunlock:
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	else
+		panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
+	ti_locked = TI_UNLOCKED;
+
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
-	INP_INFO_WUNLOCK(&V_tcbinfo);
 
 drop:
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
@@ -932,11 +1061,11 @@ drop:
 
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
-    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
+    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
+    int ti_locked)
 {
 	INIT_VNET_INET(tp->t_vnet);
 	int thflags, acked, ourfinisacked, needoutput = 0;
-	int headlocked = 1;
 	int rstreason, todrop, win;
 	u_long tiwin;
 	struct tcpopt to;
@@ -952,7 +1081,35 @@ tcp_do_segment(struct mbuf *m, struct tc
 #endif
 	thflags = th->th_flags;
 
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	/*
+	 * If this is either a state-changing packet or current state isn't
+	 * established, we require a write lock on tcbinfo.  Otherwise, we
+	 * allow either a read lock or a write lock, as we may have acquired
+	 * a write lock due to a race.
+	 *
+	 * Require a global write lock for SYN/FIN/RST segments or
+	 * non-established connections; otherwise accept either a read or
+	 * write lock, as we may have conservatively acquired a write lock in
+	 * certain cases in tcp_input() (is this still true?).  Currently we
+	 * will never enter with no lock, so we try to drop it quickly in the
+	 * common pure ack/pure data cases.
+	 */
+	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
+	    tp->t_state != TCPS_ESTABLISHED) {
+		KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
+		    "SYN/FIN/RST/!EST", __func__, ti_locked));
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+	} else {
+#ifdef INVARIANTS
+		if (ti_locked == TI_RLOCKED)
+			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		else if (ti_locked == TI_WLOCKED)
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+		else
+			panic("%s: ti_locked %d for EST", __func__,
+			    ti_locked);
+#endif
+	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
@@ -1106,14 +1263,20 @@ tcp_do_segment(struct mbuf *m, struct tc
 			      !IN_FASTRECOVERY(tp) &&
 			      (to.to_flags & TOF_SACK) == 0 &&
 			      TAILQ_EMPTY(&tp->snd_holes)))) {
-				KASSERT(headlocked,
-				    ("%s: headlocked", __func__));
-				INP_INFO_WUNLOCK(&V_tcbinfo);
-				headlocked = 0;
 				/*
 				 * This is a pure ack for outstanding data.
 				 */
+				if (ti_locked == TI_RLOCKED)
+					INP_INFO_RUNLOCK(&V_tcbinfo);
+				else if (ti_locked == TI_WLOCKED)
+					INP_INFO_WUNLOCK(&V_tcbinfo);
+				else
+					panic("%s: ti_locked %d on pure ACK",
+					    __func__, ti_locked);
+				ti_locked = TI_UNLOCKED;
+
 				++V_tcpstat.tcps_predack;
+
 				/*
 				 * "bad retransmit" recovery.
 				 */
@@ -1200,14 +1363,20 @@ tcp_do_segment(struct mbuf *m, struct tc
 		    tlen <= sbspace(&so->so_rcv)) {
 			int newsize = 0;	/* automatic sockbuf scaling */
 
-			KASSERT(headlocked, ("%s: headlocked", __func__));
-			INP_INFO_WUNLOCK(&V_tcbinfo);
-			headlocked = 0;
 			/*
-			 * This is a pure, in-sequence data packet
-			 * with nothing on the reassembly queue and
-			 * we have enough buffer space to take it.
-			 */
+			 * This is a pure, in-sequence data packet with
+			 * nothing on the reassembly queue and we have enough
+			 * buffer space to take it.
+			 */
+			if (ti_locked == TI_RLOCKED)
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+			else if (ti_locked == TI_WLOCKED)
+				INP_INFO_WUNLOCK(&V_tcbinfo);
+			else
+				panic("%s: ti_locked %d on pure data "
+				    "segment", __func__, ti_locked);
+			ti_locked = TI_UNLOCKED;
+
 			/* Clean receiver SACK report if present */
 			if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 				tcp_clean_sackreport(tp);
@@ -1434,8 +1603,9 @@ tcp_do_segment(struct mbuf *m, struct tc
 			tp->t_state = TCPS_SYN_RECEIVED;
 		}
 
-		KASSERT(headlocked, ("%s: trimthenstep6: head not locked",
-		    __func__));
+		KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
+		    "ti_locked %d", __func__, ti_locked));
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
@@ -1563,17 +1733,23 @@ tcp_do_segment(struct mbuf *m, struct tc
 			case TCPS_CLOSE_WAIT:
 				so->so_error = ECONNRESET;
 			close:
+				KASSERT(ti_locked == TI_WLOCKED,
+				    ("tcp_do_segment: TH_RST 1 ti_locked %d",
+				    ti_locked));
+				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
 				tp->t_state = TCPS_CLOSED;
 				V_tcpstat.tcps_drops++;
-				KASSERT(headlocked, ("%s: trimthenstep6: "
-				    "tcp_close: head not locked", __func__));
 				tp = tcp_close(tp);
 				break;
 
 			case TCPS_CLOSING:
 			case TCPS_LAST_ACK:
-				KASSERT(headlocked, ("%s: trimthenstep6: "
-				    "tcp_close.2: head not locked", __func__));
+				KASSERT(ti_locked == TI_WLOCKED,
+				    ("tcp_do_segment: TH_RST 2 ti_locked %d",
+				    ti_locked));
+				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
 				tp = tcp_close(tp);
 				break;
 			}
@@ -1678,8 +1854,10 @@ tcp_do_segment(struct mbuf *m, struct tc
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 		char *s;
 
-		KASSERT(headlocked, ("%s: trimthenstep6: tcp_close.3: head "
-		    "not locked", __func__));
+		KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
+		    "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
 		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
 			    "was closed, sending RST and removing tcpcb\n",
@@ -1751,8 +1929,10 @@ tcp_do_segment(struct mbuf *m, struct tc
 	 * error and we send an RST and drop the connection.
 	 */
 	if (thflags & TH_SYN) {
-		KASSERT(headlocked, ("%s: tcp_drop: trimthenstep6: "
-		    "head not locked", __func__));
+		KASSERT(ti_locked == TI_WLOCKED,
+		    ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
 		tp = tcp_drop(tp, ECONNRESET);
 		rstreason = BANDLIM_UNLIMITED;
 		goto drop;
@@ -2039,8 +2219,9 @@ tcp_do_segment(struct mbuf *m, struct tc
 		}
 
 process_ACK:
-		KASSERT(headlocked, ("%s: process_ACK: head not locked",
-		    __func__));
+		INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+		KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+		    ("tcp_input: process_ACK ti_locked %d", ti_locked));
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		acked = th->th_ack - tp->snd_una;
@@ -2197,11 +2378,9 @@ process_ACK:
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
-				KASSERT(headlocked, ("%s: process_ACK: "
-				    "head not locked", __func__));
+				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				INP_INFO_WUNLOCK(&V_tcbinfo);
-				headlocked = 0;
 				m_freem(m);
 				return;
 			}
@@ -2215,8 +2394,7 @@ process_ACK:
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
-				KASSERT(headlocked, ("%s: process_ACK: "
-				    "tcp_close: head not locked", __func__));
+				INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
@@ -2225,7 +2403,9 @@ process_ACK:
 	}
 
 step6:
-	KASSERT(headlocked, ("%s: step6: head not locked", __func__));
+	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+	    ("tcp_do_segment: step6 ti_locked %d", ti_locked));
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
@@ -2311,7 +2491,9 @@ step6:
 			tp->rcv_up = tp->rcv_nxt;
 	}
 dodata:							/* XXX */
-	KASSERT(headlocked, ("%s: dodata: head not locked", __func__));
+	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
+	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+	    ("tcp_do_segment: dodata ti_locked %d", ti_locked));
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
@@ -2430,15 +2612,25 @@ dodata:							/* XXX */
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
-			KASSERT(headlocked == 1, ("%s: dodata: "
-			    "TCP_FIN_WAIT_2: head not locked", __func__));
+			INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+			KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
+			    "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
+			    ti_locked));
+
 			tcp_twstart(tp);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
 			return;
 		}
 	}
-	INP_INFO_WUNLOCK(&V_tcbinfo);
-	headlocked = 0;
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	else
+		panic("%s: dodata epilogue ti_locked %d", __func__,
+		    ti_locked);
+	ti_locked = TI_UNLOCKED;
+
 #ifdef TCPDEBUG
 	if (so->so_options & SO_DEBUG)
 		tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
@@ -2452,10 +2644,11 @@ dodata:							/* XXX */
 		(void) tcp_output(tp);
 
 check_delack:
-	KASSERT(headlocked == 0, ("%s: check_delack: head locked",
-	    __func__));
+	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+	    __func__, ti_locked));
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
+
 	if (tp->t_flags & TF_DELACK) {
 		tp->t_flags &= ~TF_DELACK;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
@@ -2464,7 +2657,9 @@ check_delack:
 	return;
 
 dropafterack:
-	KASSERT(headlocked, ("%s: dropafterack: head not locked", __func__));
+	KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
+	    ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
+
 	/*
 	 * Generate an ACK dropping incoming segment if it occupies
 	 * sequence space, where the ACK reflects our state.
@@ -2491,8 +2686,15 @@ dropafterack:
 		tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 			  &tcp_savetcp, 0);
 #endif
-	KASSERT(headlocked, ("%s: headlocked should be 1", __func__));
-	INP_INFO_WUNLOCK(&V_tcbinfo);
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	else
+		panic("%s: dropafterack epilogue ti_locked %d", __func__,
+		    ti_locked);
+	ti_locked = TI_UNLOCKED;
+
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
@@ -2500,8 +2702,13 @@ dropafterack:
 	return;
 
 dropwithreset:
-	KASSERT(headlocked, ("%s: dropwithreset: head not locked", __func__));
-	INP_INFO_WUNLOCK(&V_tcbinfo);
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+	else
+		panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
+	ti_locked = TI_UNLOCKED;
 
 	if (tp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -2511,6 +2718,16 @@ dropwithreset:
 	return;
 
 drop:
+	if (ti_locked == TI_RLOCKED)
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	else if (ti_locked == TI_WLOCKED)
+		INP_INFO_WUNLOCK(&V_tcbinfo);
+#ifdef INVARIANTS
+	else
+		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
+	ti_locked = TI_UNLOCKED;
+
 	/*
 	 * Drop space held by incoming segment and return.
 	 */
@@ -2521,8 +2738,6 @@ drop:
 #endif
 	if (tp != NULL)
 		INP_WUNLOCK(tp->t_inpcb);
-	if (headlocked)
-		INP_INFO_WUNLOCK(&V_tcbinfo);
 	m_freem(m);
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200812082027.mB8KR0Vt069831>