Date:      Sat, 25 Jun 2016 10:41:24 -0400
From:      Randall Stewart <rrs@netflix.com>
To:        current@freebsd.org
Cc:        Julien Charbon <jch@FreeBSD.org>, Gleb Smirnoff <glebius@FreeBSD.org>, hselasky@FreeBSD.org, net@FreeBSD.org
Subject:   Re: panic with tcp timers
Message-ID:  <6E52CA6A-2153-4EF9-A3E1-97CB0D07EB28@freebsd.org>
In-Reply-To: <18D94615-810E-4E79-A889-4B0CC70F9E45@netflix.com>
References:  <20160617045319.GE1076@FreeBSD.org> <1f28844b-b4ea-b544-3892-811f2be327b9@freebsd.org> <20160620073917.GI1076@FreeBSD.org> <1d18d0e2-3e42-cb26-928c-2989d0751884@freebsd.org> <20160620095822.GJ1076@FreeBSD.org> <74bb31b7-a9f5-3d0c-eea0-681872e6f09b@freebsd.org> <18D94615-810E-4E79-A889-4B0CC70F9E45@netflix.com>


[-- Attachment #1 --]
Ok

Let's try this again with my source changed to my @freebsd.net address :-)

Now I am also attaching a patch for you, Gleb. It will take some poking to get into
your NF-head, since it incorporates some changes we made earlier.

I think this will fix the problem, i.e. dealing with two locks in the callout system (which it
was never meant to do).
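
In short, the common timer path now only takes the inpcb lock; the INFO (pcbinfo) lock is
picked up via a ref/unlock/relock exchange only in the rare case where we actually drop the
connection. Roughly, the timer paths in the attached patch use the new helpers like this
(the drop condition below is just a placeholder for the real per-timer test):

	if (connection_should_be_dropped) {	/* placeholder condition */
		if (tcp_inp_lock_exchange(tp)) {
			/* inp went TIMEWAIT/DROPPED while we swapped locks */
			tcp_inp_unlock_exchange(tp);
			goto out;
		}
		tp = tcp_drop(tp, ETIMEDOUT);
		tcp_inp_unlock_exchange(tp);
		goto out;
	}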

Note that we probably can move the code to use the callout lock init now, but let's see if this works
on your setup on c096, and if so we can think about doing that.
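
By "callout lock init" I mean something along these lines -- an untested sketch only, not
part of the attached patch -- handing the inpcb lock to the callout system so it takes and
drops the lock around the handler itself:

	/*
	 * Untested sketch, not in the attached patch: assumes inp->inp_lock
	 * is the rwlock behind INP_WLOCK().  The callout code then acquires
	 * the inpcb lock before running the handler; CALLOUT_RETURNUNLOCKED
	 * lets the handler release it on its own.
	 */
	callout_init_rw(&tp->t_timers->tt_rexmt, &inp->inp_lock,
	    CALLOUT_RETURNUNLOCKED);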

R


[-- Attachment #2 --]
Index: tcp_timer.c
===================================================================
--- tcp_timer.c	(revision 302198)
+++ tcp_timer.c	(working copy)
@@ -306,6 +306,33 @@ tcp_timer_delack(void *xtp)
 	CURVNET_RESTORE();
 }
 
+static inline int
+tcp_inp_lock_exchange(struct tcpcb *tp)
+{
+	struct inpcb *inp;
+
+
+	inp = tp->t_inpcb;
+	in_pcbref(inp);
+	INP_WUNLOCK(inp);
+	INP_INFO_RLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+		return (1);
+	}
+	return (0);
+
+}
+
+static inline void
+tcp_inp_unlock_exchange(struct tcpcb *tp)
+{
+	INP_INFO_RUNLOCK(&V_tcbinfo);
+	if (tp && in_pcbrele_wlocked(tp->t_inpcb) == 0) {
+		INP_WUNLOCK(tp->t_inpcb);
+	}
+}
+
 void
 tcp_timer_2msl(void *xtp)
 {
@@ -317,7 +344,6 @@ tcp_timer_2msl(void *xtp)
 
 	ostate = tp->t_state;
 #endif
-	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
@@ -325,7 +351,6 @@ tcp_timer_2msl(void *xtp)
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
 	    !callout_active(&tp->t_timers->tt_2msl)) {
 		INP_WUNLOCK(tp->t_inpcb);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -332,7 +357,6 @@ tcp_timer_2msl(void *xtp)
 	callout_deactivate(&tp->t_timers->tt_2msl);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -355,7 +379,6 @@ tcp_timer_2msl(void *xtp)
 	 */
 	if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -363,7 +386,13 @@ tcp_timer_2msl(void *xtp)
 	    tp->t_inpcb && tp->t_inpcb->inp_socket && 
 	    (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
 		TCPSTAT_INC(tcps_finwait2_drops);
+		if (tcp_inp_lock_exchange(tp)) {
+			tcp_inp_unlock_exchange(tp);
+			goto out;
+		}
 		tp = tcp_close(tp);             
+		tcp_inp_unlock_exchange(tp);
+		goto out;
 	} else {
 		if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
 			if (!callout_reset(&tp->t_timers->tt_2msl,
@@ -370,8 +399,15 @@ tcp_timer_2msl(void *xtp)
 			   TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
 				tp->t_timers->tt_flags &= ~TT_2MSL_RST;
 			}
-		} else
-		       tp = tcp_close(tp);
+		} else {
+			if (tcp_inp_lock_exchange(tp)) {
+				tcp_inp_unlock_exchange(tp);
+				goto out;
+			}
+			tp = tcp_close(tp);
+			tcp_inp_unlock_exchange(tp);
+			goto out;
+		}
        }
 
 #ifdef TCPDEBUG
@@ -383,7 +419,7 @@ tcp_timer_2msl(void *xtp)
 
 	if (tp != NULL)
 		INP_WUNLOCK(inp);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+out:
 	CURVNET_RESTORE();
 }
 
@@ -399,7 +435,6 @@ tcp_timer_keep(void *xtp)
 
 	ostate = tp->t_state;
 #endif
-	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
@@ -406,7 +441,6 @@ tcp_timer_keep(void *xtp)
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -413,7 +447,6 @@ tcp_timer_keep(void *xtp)
 	callout_deactivate(&tp->t_timers->tt_keep);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -468,12 +501,16 @@ tcp_timer_keep(void *xtp)
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
 	INP_WUNLOCK(inp);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
 	CURVNET_RESTORE();
 	return;
 
 dropit:
 	TCPSTAT_INC(tcps_keepdrops);
+
+	if (tcp_inp_lock_exchange(tp)) {
+		tcp_inp_unlock_exchange(tp);
+		goto out;
+	}
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 #ifdef TCPDEBUG
@@ -482,9 +519,8 @@ dropit:
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	if (tp != NULL)
-		INP_WUNLOCK(tp->t_inpcb);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+	tcp_inp_unlock_exchange(tp);
+out:
 	CURVNET_RESTORE();
 }
 
@@ -499,7 +535,6 @@ tcp_timer_persist(void *xtp)
 
 	ostate = tp->t_state;
 #endif
-	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
@@ -506,7 +541,6 @@ tcp_timer_persist(void *xtp)
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -513,7 +547,6 @@ tcp_timer_persist(void *xtp)
 	callout_deactivate(&tp->t_timers->tt_persist);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -537,7 +570,12 @@ tcp_timer_persist(void *xtp)
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		TCPSTAT_INC(tcps_persistdrop);
+		if (tcp_inp_lock_exchange(tp)) {
+			tcp_inp_unlock_exchange(tp);
+			goto out;
+		}
 		tp = tcp_drop(tp, ETIMEDOUT);
+		tcp_inp_unlock_exchange(tp);
 		goto out;
 	}
 	/*
@@ -547,7 +585,12 @@ tcp_timer_persist(void *xtp)
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		TCPSTAT_INC(tcps_persistdrop);
+		if (tcp_inp_lock_exchange(tp)) {
+			tcp_inp_unlock_exchange(tp);
+			goto out;
+		}
 		tp = tcp_drop(tp, ETIMEDOUT);
+		tcp_inp_unlock_exchange(tp);
 		goto out;
 	}
 	tcp_setpersist(tp);
@@ -555,15 +598,13 @@ tcp_timer_persist(void *xtp)
 	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->t_flags &= ~TF_FORCEDATA;
 
-out:
 #ifdef TCPDEBUG
 	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
 		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	if (tp != NULL)
-		INP_WUNLOCK(inp);
-	INP_INFO_RUNLOCK(&V_tcbinfo);
+	INP_WUNLOCK(inp);
+out:
 	CURVNET_RESTORE();
 }
 
@@ -573,7 +614,6 @@ tcp_timer_rexmt(void * xtp)
 	struct tcpcb *tp = xtp;
 	CURVNET_SET(tp->t_vnet);
 	int rexmt;
-	int headlocked;
 	struct inpcb *inp;
 #ifdef TCPDEBUG
 	int ostate;
@@ -580,8 +620,6 @@ tcp_timer_rexmt(void * xtp)
 
 	ostate = tp->t_state;
 #endif
-
-	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
 	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
@@ -588,7 +626,6 @@ tcp_timer_rexmt(void * xtp)
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -595,7 +632,6 @@ tcp_timer_rexmt(void * xtp)
 	callout_deactivate(&tp->t_timers->tt_rexmt);
 	if ((inp->inp_flags & INP_DROPPED) != 0) {
 		INP_WUNLOCK(inp);
-		INP_INFO_RUNLOCK(&V_tcbinfo);
 		CURVNET_RESTORE();
 		return;
 	}
@@ -616,14 +652,15 @@ tcp_timer_rexmt(void * xtp)
 	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		TCPSTAT_INC(tcps_timeoutdrop);
-
+		if (tcp_inp_lock_exchange(tp)) {
+			tcp_inp_unlock_exchange(tp);
+			goto out;
+		}
 		tp = tcp_drop(tp, tp->t_softerror ?
 			      tp->t_softerror : ETIMEDOUT);
-		headlocked = 1;
+		tcp_inp_unlock_exchange(tp);
 		goto out;
 	}
-	INP_INFO_RUNLOCK(&V_tcbinfo);
-	headlocked = 0;
 	if (tp->t_state == TCPS_SYN_SENT) {
 		/*
 		 * If the SYN was retransmitted, indicate CWND to be
@@ -811,7 +848,6 @@ tcp_timer_rexmt(void * xtp)
 
 	(void) tp->t_fb->tfb_tcp_output(tp);
 
-out:
 #ifdef TCPDEBUG
 	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
@@ -818,10 +854,8 @@ tcp_timer_rexmt(void * xtp)
 			  PRU_SLOWTIMO);
 #endif
 	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
-	if (tp != NULL)
-		INP_WUNLOCK(inp);
-	if (headlocked)
-		INP_INFO_RUNLOCK(&V_tcbinfo);
+	INP_WUNLOCK(inp);
+out:
 	CURVNET_RESTORE();
 }
 
@@ -832,7 +866,6 @@ tcp_timer_activate(struct tcpcb *tp, uint32_t time
 	timeout_t *f_callout;
 	struct inpcb *inp = tp->t_inpcb;
 	int cpu = inp_to_cpuid(inp);
-	uint32_t f_reset;
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
@@ -846,27 +879,22 @@ tcp_timer_activate(struct tcpcb *tp, uint32_t time
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
 			f_callout = tcp_timer_delack;
-			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
 			f_callout = tcp_timer_rexmt;
-			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
 			f_callout = tcp_timer_persist;
-			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
 			f_callout = tcp_timer_keep;
-			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
 			f_callout = tcp_timer_2msl;
-			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			if (tp->t_fb->tfb_tcp_timer_activate) {
@@ -876,24 +904,9 @@ tcp_timer_activate(struct tcpcb *tp, uint32_t time
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 	if (delta == 0) {
-		if ((tp->t_timers->tt_flags & timer_type) &&
-		    (callout_stop(t_callout) > 0) &&
-		    (tp->t_timers->tt_flags & f_reset)) {
-			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
-		}
+		callout_stop(t_callout);
 	} else {
-		if ((tp->t_timers->tt_flags & timer_type) == 0) {
-			tp->t_timers->tt_flags |= (timer_type | f_reset);
-			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
-		} else {
-			/* Reset already running callout on the same CPU. */
-			if (!callout_reset(t_callout, delta, f_callout, tp)) {
-				/*
-				 * Callout not cancelled, consider it as not
-				 * properly restarted. */
-				tp->t_timers->tt_flags &= ~f_reset;
-			}
-		}
+		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
 	}
 }
 
@@ -931,30 +944,23 @@ void
 tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
-	uint32_t f_reset;
 
 	tp->t_timers->tt_flags |= TT_STOPPED;
-
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
-			f_reset = TT_DELACK_RST;
 			break;
 		case TT_REXMT:
 			t_callout = &tp->t_timers->tt_rexmt;
-			f_reset = TT_REXMT_RST;
 			break;
 		case TT_PERSIST:
 			t_callout = &tp->t_timers->tt_persist;
-			f_reset = TT_PERSIST_RST;
 			break;
 		case TT_KEEP:
 			t_callout = &tp->t_timers->tt_keep;
-			f_reset = TT_KEEP_RST;
 			break;
 		case TT_2MSL:
 			t_callout = &tp->t_timers->tt_2msl;
-			f_reset = TT_2MSL_RST;
 			break;
 		default:
 			if (tp->t_fb->tfb_tcp_timer_stop) {
@@ -968,15 +974,13 @@ tcp_timer_stop(struct tcpcb *tp, uint32_t timer_ty
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 
-	if (tp->t_timers->tt_flags & timer_type) {
-		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
-			/*
-			 * Can't stop the callout, defer tcpcb actual deletion
-			 * to the last one. We do this using the async drain
-			 * function and incrementing the count in 
-			 */
-			tp->t_timers->tt_draincnt++;
-		}
+	if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
+		/*
+		 * Can't stop the callout, defer tcpcb actual deletion
+		 * to the last one. We do this using the async drain
+		 * function and incrementing the count in 
+		 */
+		tp->t_timers->tt_draincnt++;
 	}
 }
 

[-- Attachment #3 --]

> On Jun 25, 2016, at 4:48 AM, Randall Stewart <rrs@netflix.com> wrote:
> 
> So
> 
> All of our timers in TCP do something like
> ---------------------
> INFO-LOCK
> INP_WLOCK
> 
> if (inp needs to be dropped) {
>    drop-it
> }
> do other work
> 
> UNLOCK-INP
> UNLOCK-INFO
> ------------------
> 
> And generally the path “inp needs to be dropped” is rarely taken.
> 
> So why don’t we change the procedure to something like:
> 
> INP_WLOCK
> if (inp needs to be dropped) {
>     inp_ref()
>     INP_WUNLOCK()
>     INFO_LOCK()
>     INP_WLOCK()
>     if (inp_release_ref) {
> 	   /* victory its dead */
>           INFO_UNLOCK
>           return
>      }
>      do-the-drop
>  }
> 
> This way we would only grab the INFO lock in those rarer cases
> when we *do* actually want to kill the tcb, and at the same time
> it would make the current callout system work correctly... which,
> as many have pointed out, would be much better if we could just let
> the callout system take the lock itself. Hmm, maybe with this scheme
> we could even do that...
> 
> R
> 
> 
>> On Jun 20, 2016, at 1:45 PM, Julien Charbon <jch@FreeBSD.org> wrote:
>> 
>> 
>> Hi,
>> 
>> On 6/20/16 11:58 AM, Gleb Smirnoff wrote:
>>> On Mon, Jun 20, 2016 at 11:55:55AM +0200, Julien Charbon wrote:
>>> J> > On Fri, Jun 17, 2016 at 11:27:39AM +0200, Julien Charbon wrote:
>>> J> > J> > Comparing stable/10 and head, I see two changes that could
>>> J> > J> > affect that:
>>> J> > J> > 
>>> J> > J> > - callout_async_drain
>>> J> > J> > - switch to READ lock for inp info in tcp timers
>>> J> > J> > 
>>> J> > J> > That's why you are in To, Julien and Hans :)
>>> J> > J> > 
>>> J> > J> > We continue investigating, and I will keep you updated.
>>> J> > J> > However, any help is welcome. I can share cores.
>>> J> > 
>>> J> > Now, spending some time with cores and adding a bunch of
>>> J> > extra CTRs, I have a sequence of events that lead to the
>>> J> > panic. In short, the bug is in the callout system. It seems
>>> J> > to be not relevant to the callout_async_drain, at least for
>>> J> > now. The transition to READ lock unmasked the problem, that's
>>> J> > why NetflixBSD 10 doesn't panic.
>>> J> > 
>>> J> > The panic requires heavy contention on the TCP info lock.
>>> J> > 
>>> J> > [CPU 1] the callout fires, tcp_timer_keep entered
>>> J> > [CPU 1] blocks on INP_INFO_RLOCK(&V_tcbinfo);
>>> J> > [CPU 2] schedules the callout
>>> J> > [CPU 2] tcp_discardcb called
>>> J> > [CPU 2] callout successfully canceled
>>> J> > [CPU 2] tcpcb freed
>>> J> > [CPU 1] unblocks... panic
>>> J> > 
>>> J> > When the lock was WLOCK, all contenders were resumed in a
>>> J> > sequence they came to the lock. Now, that they are readers,
>>> J> > once the lock is released, readers are resumed in a "random"
>>> J> > order, and this allows tcp_discardcb to go before the old
>>> J> > running callout, and this unmasks the panic.
>>> J> 
>>> J>  Highly interesting.  I should be able to reproduce that (will be useful
>>> J> for testing the corresponding fix).
>>> J> 
>>> J>  Fix proposal:  If callout_async_drain() returns 0 (fail) (instead of 1
>>> J> (success) here) when the callout cancellation is a success _but_ the
>>> J> callout is currently running, that should fix it.
>>> J> 
>>> J>  For the history:  It comes back to my old callout question:
>>> J> 
>>> J>  Is _callout_stop_safe() allowed to return 1 (success) even if the
>>> J> callout is still currently running?  In other words, successfully
>>> J> cancelling a callout does not mean the callout is not currently running.
>>> J> 
>>> J>  We did propose a patch to make _callout_stop_safe() returns 0 (fail)
>>> J> when the callout is currently running:
>>> J> 
>>> J> callout_stop() should return 0 when the callout is currently being
>>> J> serviced and indeed unstoppable
>>> J> https://reviews.freebsd.org/differential/changeset/?ref=62513&whitespace=ignore-most
>>> J> 
>>> J>  But this change impacted too many old code paths and was interesting
>>> J> only for TCP timers and thus was abandoned.
>>> 
>>> The fix I am working on now is doing exactly that. callout_reset must
>>> return 0 if the callout is currently running.
>>> 
>>> What are the old paths impacted?
>> 
>> Actually all the paths that check the callout_stop() return value AND
>> call both callout_reset() and callout_stop() AND use an MPSAFE callout.
>> And for each path, we would have to check whether our patch is OK (or not).
>> 
>> Thus, if you only do the change in callout_async_drain() context, you
>> don't impact the "old" callout_stop() behavior and get the desired
>> behavior for the TCP timers.  Might be a good trade-off...
>> 
>> My 2 cents.
>> 
>> --
>> Julien
> 
> --------
> Randall Stewart
> rrs@netflix.com
> 803-317-4952
> 
> 
> 
> 
> 

--------
Randall Stewart
rrs@netflix.com
803-317-4952





