Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 14 Jun 2021 20:59:19 GMT
From:      Michael Tuexen <tuexen@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: 8ecbecdcfdb0 - stable/13 - tcp: Mbuf leak while holding a socket buffer lock.
Message-ID:  <202106142059.15EKxJLb014122@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch stable/13 has been updated by tuexen:

URL: https://cgit.FreeBSD.org/src/commit/?id=8ecbecdcfdb082dc4056a9d627d5de2ed7eceda4

commit 8ecbecdcfdb082dc4056a9d627d5de2ed7eceda4
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2021-06-10 12:33:57 +0000
Commit:     Michael Tuexen <tuexen@FreeBSD.org>
CommitDate: 2021-06-14 20:51:42 +0000

    tcp: Mbuf leak while holding a socket buffer lock.
    
    When running at NF the current Rack and BBR changes with the recent
    commits from Richard that cause the socket buffer lock to be held over
    the ip_output() call and then finally culminating in a call to tcp_handle_wakeup()
    we get a lot of leaked mbufs. I don't think that this leak is actually caused
    by holding the lock or what Richard has done, but is exposing some other
    bug that has probably been lying dormant for a long time. I will continue to
    look (using his changes) at what is going on to try to root cause out the issue.
    
    In the meantime I can't leave the leaks out for everyone else. So this commit
    will revert all of Richards changes and move both Rack and BBR back to just
    doing the old sorwakeup_locked() calls after messing with the so_rcv buffer.
    
    We may want to look at adding back in Richards changes after I have pinpointed
    the root cause of the mbuf leak and fixed it.
    
    Reviewed by: mtuexen,rscheff
    Sponsored by: Netflix Inc
    Differential Revision:  https://reviews.freebsd.org/D30704
    
    (cherry picked from commit 67e892819b26c198e4232c7586ead7f854f848c5)
---
 sys/netinet/tcp_output.c      |  2 ++
 sys/netinet/tcp_stacks/bbr.c  | 47 +++++++++++++++++------------
 sys/netinet/tcp_stacks/rack.c | 69 ++++++++++++++++++++++---------------------
 sys/netinet/tcp_var.h         | 18 +++++++++++
 4 files changed, 84 insertions(+), 52 deletions(-)

diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 2a91570acdad..44333d772482 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1551,6 +1551,8 @@ send:
 #endif /* INET */
 
 out:
+	if (error == 0)
+		tcp_account_for_send(tp, len, (tp->snd_nxt != tp->snd_max), 0);
 	/*
 	 * In transmit state, time the transmission and arrange for
 	 * the retransmit.  In persist state, just set snd_max.
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index 7d709e33f0d7..05db7180e7b2 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -486,7 +486,7 @@ static void
 			 uint32_t line, uint8_t is_start, uint16_t set);
 
 static struct bbr_sendmap *
-            bbr_find_lowest_rsm(struct tcp_bbr *bbr);
+	    bbr_find_lowest_rsm(struct tcp_bbr *bbr);
 static __inline uint32_t
 bbr_get_rtt(struct tcp_bbr *bbr, int32_t rtt_type);
 static void
@@ -1620,7 +1620,7 @@ bbr_init_sysctls(void)
 	    &bbr_drop_limit, 0,
 	    "Number of segments limit for drop (0=use min_cwnd w/flight)?");
 
-        /* Timeout controls */
+	/* Timeout controls */
 	bbr_timeout = SYSCTL_ADD_NODE(&bbr_sysctl_ctx,
 	    SYSCTL_CHILDREN(bbr_sysctl_root),
 	    OID_AUTO,
@@ -5750,7 +5750,7 @@ tcp_bbr_tso_size_check(struct tcp_bbr *bbr, uint32_t cts)
 	 *         seg = goal_tso / mss
 	 *         tso = seg * mss
 	 *     else
-         *         tso = mss
+	 *         tso = mss
 	 *     if (tso > per-tcb-max)
 	 *         tso = per-tcb-max
 	 *  else if ( bw > 512Mbps)
@@ -6736,7 +6736,7 @@ bbr_update_bbr_info(struct tcp_bbr *bbr, struct bbr_sendmap *rsm, uint32_t rtt,
 	else
 		bbr->rc_ack_is_cumack = 0;
 	old_rttprop = bbr_get_rtt(bbr, BBR_RTT_PROP);
-        /*
+	/*
 	 * Note the following code differs to the original
 	 * BBR spec. It calls for <= not <. However after a
 	 * long discussion in email with Neal, he acknowledged
@@ -8306,12 +8306,14 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 				appended =
 #endif
 					sbappendstream_locked(&so->so_rcv, m, 0);
-			tp->t_flags |= TF_WAKESOR;
+			/* NB: sorwakeup_locked() does an implicit unlock. */
+			sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 			if (so->so_rcv.sb_shlim && appended != mcnt)
 				counter_fo_release(so->so_rcv.sb_shlim,
 				    mcnt - appended);
 #endif
+
 		} else {
 			/*
 			 * XXX: Due to the header drop above "th" is
@@ -8323,6 +8325,11 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
+			if (tp->t_flags & TF_WAKESOR) {
+				tp->t_flags &= ~TF_WAKESOR;
+				/* NB: sorwakeup_locked() does an implicit unlock. */
+				sorwakeup_locked(so);
+			}
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
@@ -8357,7 +8364,6 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 				    save_start + tlen);
 			}
 		}
-		tcp_handle_wakeup(tp, so);
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
@@ -9164,7 +9170,11 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (tlen == 0 && (thflags & TH_FIN) == 0) {
 		(void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 			(struct mbuf *)0);
-		tcp_handle_wakeup(tp, so);
+		if (tp->t_flags & TF_WAKESOR) {
+			tp->t_flags &= ~TF_WAKESOR;
+			/* NB: sorwakeup_locked() does an implicit unlock. */
+			sorwakeup_locked(so);
+		}
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	if (bbr_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
@@ -11565,18 +11575,18 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
 		retval = 0;
 		m_freem(m);
-                goto done_with_input;
-        }
-        /*
-         * If a segment with the ACK-bit set arrives in the SYN-SENT state
-         * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
-         */
-        if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
-            (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
+		goto done_with_input;
+	}
+	/*
+	 * If a segment with the ACK-bit set arrives in the SYN-SENT state
+	 * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
+	 */
+	if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
+	    (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
 		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
-                return (1);
-        }
+		return (1);
+	}
 	in_recovery = IN_RECOVERY(tp->t_flags);
 	if (tiwin > bbr->r_ctl.rc_high_rwnd)
 		bbr->r_ctl.rc_high_rwnd = tiwin;
@@ -11786,8 +11796,6 @@ bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap
 			 * own bin
 			 */
 #ifdef NETFLIX_STATS
-			tp->t_sndtlppack++;
-			tp->t_sndtlpbyte += len;
 			KMOD_TCPSTAT_INC(tcps_tlpresends);
 			KMOD_TCPSTAT_ADD(tcps_tlpresend_bytes, len);
 #endif
@@ -13741,6 +13749,7 @@ out:
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
+		tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp);
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    (tp->t_flags & TF_SACK_PERMIT) &&
 		    tp->rcv_numsacks > 0)
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index bcf3dcc0e38a..f9ba67088f7a 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -10219,7 +10219,8 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 					sbappendstream_locked(&so->so_rcv, m, 0);
 
 			rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
-			tp->t_flags |= TF_WAKESOR;
+			/* NB: sorwakeup_locked() does an implicit unlock. */
+			sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 			if (so->so_rcv.sb_shlim && appended != mcnt)
 				counter_fo_release(so->so_rcv.sb_shlim,
@@ -10236,7 +10237,11 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 
 			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
-
+			if (tp->t_flags & TF_WAKESOR) {
+				tp->t_flags &= ~TF_WAKESOR;
+				/* NB: sorwakeup_locked() does an implicit unlock. */
+				sorwakeup_locked(so);
+			}
 		}
 		if ((tp->t_flags & TF_SACK_PERMIT) &&
 		    (save_tlen > 0) &&
@@ -10276,7 +10281,6 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
 				    save_start + tlen);
 			}
 		}
-		tcp_handle_wakeup(tp, so);
 	} else {
 		m_freem(m);
 		thflags &= ~TH_FIN;
@@ -10482,7 +10486,8 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		ctf_calc_rwin(so, tp);
 	}
 	rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1);
-	tp->t_flags |= TF_WAKESOR;
+	/* NB: sorwakeup_locked() does an implicit unlock. */
+	sorwakeup_locked(so);
 #ifdef NETFLIX_SB_LIMITS
 	if (so->so_rcv.sb_shlim && mcnt != appended)
 		counter_fo_release(so->so_rcv.sb_shlim, mcnt - appended);
@@ -10490,7 +10495,6 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	rack_handle_delayed_ack(tp, rack, tlen, 0);
 	if (tp->snd_una == tp->snd_max)
 		sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una);
-	tcp_handle_wakeup(tp, so);
 	return (1);
 }
 
@@ -11083,7 +11087,11 @@ rack_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	if (tlen == 0 && (thflags & TH_FIN) == 0) {
 		(void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
 		    (struct mbuf *)0);
-		tcp_handle_wakeup(tp, so);
+		if (tp->t_flags & TF_WAKESOR) {
+			tp->t_flags &= ~TF_WAKESOR;
+			/* NB: sorwakeup_locked() does an implicit unlock. */
+			sorwakeup_locked(so);
+		}
 	}
 	tp->snd_wl1 = th->th_seq - 1;
 	/* For syn-recv we need to possibly update the rtt */
@@ -12331,29 +12339,23 @@ rack_fini(struct tcpcb *tp, int32_t tcb_is_purged)
 		rack = (struct tcp_rack *)tp->t_fb_ptr;
 		if (tp->t_in_pkt) {
 			/*
-			 * Since we are switching we need to process any
-			 * inbound packets in case a compressed ack is
-			 * in queue or the new stack does not support
-			 * mbuf queuing. These packets in theory should
-			 * have been handled by the old stack anyway.
+			 * It is unsafe to process the packets since a
+			 * reset may be lurking in them (its rare but it
+			 * can occur). If we were to find a RST, then we
+			 * would end up dropping the connection and the
+			 * INP lock, so when we return the caller (tcp_usrreq)
+			 * will blow up when it trys to unlock the inp.
 			 */
-			if ((rack->rc_inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) ||
-			    (rack->rc_inp->inp_flags2 & INP_FREED)) {
-				/* Kill all the packets */
-				struct mbuf *save, *m;
-
-				m = tp->t_in_pkt;
-				tp->t_in_pkt = NULL;
-				tp->t_tail_pkt = NULL;
-				while (m) {
-					save = m->m_nextpkt;
-					m->m_nextpkt = NULL;
-					m_freem(m);
-					m = save;
-				}
-			} else {
-				/* Process all the packets */
-				ctf_do_queued_segments(rack->rc_inp->inp_socket, rack->rc_tp, 0);
+			struct mbuf *save, *m;
+
+			m = tp->t_in_pkt;
+			tp->t_in_pkt = NULL;
+			tp->t_tail_pkt = NULL;
+			while (m) {
+				save = m->m_nextpkt;
+				m->m_nextpkt = NULL;
+				m_freem(m);
+				m = save;
 			}
 			if ((tp->t_inpcb) &&
 			    (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP))
@@ -13995,7 +13997,6 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	}
 	if (rack_do_segment_nounlock(m, th, so, tp,
 				     drop_hdrlen, tlen, iptos, 0, &tv) == 0) {
-		tcp_handle_wakeup(tp, so);
 		INP_WUNLOCK(tp->t_inpcb);
 	}
 }
@@ -15402,6 +15403,8 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
 		rack->rc_tlp_in_progress = 1;
 		rack->r_ctl.rc_tlp_cnt_out++;
 	}
+	if (error == 0)
+		tcp_account_for_send(tp, len, 1, doing_tlp);
 	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 	rack->forced_ack = 0;	/* If we send something zap the FA flag */
 	if (IN_FASTRECOVERY(tp->t_flags) && rsm)
@@ -15880,6 +15883,9 @@ again:
 		rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__);
 		tp->t_acktime = ticks;
 	}
+	if (error == 0)
+		tcp_account_for_send(tp, len, 0, 0);
+
 	rack->forced_ack = 0;	/* If we send something zap the FA flag */
 	tot_len += len;
 	if ((tp->t_flags & TF_GPUTINPROG) == 0)
@@ -16320,8 +16326,6 @@ again:
 		tlen = rsm->r_end - rsm->r_start;
 		if (tlen > segsiz)
 			tlen = segsiz;
-		tp->t_sndtlppack++;
-		tp->t_sndtlpbyte += tlen;
 		KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
 			("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
 			 __func__, __LINE__,
@@ -18107,6 +18111,7 @@ out:
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
+		tcp_account_for_send(tp, len, (rsm != NULL), doing_tlp);
 		rack->forced_ack = 0;	/* If we send something zap the FA flag */
 		if (rsm && (doing_tlp == 0)) {
 			/* Set we retransmitted */
@@ -18151,8 +18156,6 @@ out:
 	if (doing_tlp && (rsm == NULL)) {
 		/* New send doing a TLP */
 		add_flag |= RACK_TLP;
-		tp->t_sndtlppack++;
-		tp->t_sndtlpbyte += len;
 	}
 	rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error,
 			rack_to_usec_ts(&tv),
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 1858d0146ce5..367dd9765534 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -261,6 +261,8 @@ struct tcpcb {
 	uint32_t t_maxpeakrate;		/* max peak rate set by user, in bytes/s */
 	uint32_t t_sndtlppack;		/* tail loss probe packets sent */
 	uint64_t t_sndtlpbyte;		/* total tail loss probe bytes sent */
+	uint64_t t_sndbytes;		/* total bytes sent */
+	uint64_t t_snd_rxt_bytes;	/* total bytes retransmitted */
 
 	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
 	uint32_t t_end_info_status;	/* Status flag of end info */
@@ -1128,6 +1130,22 @@ tcp_fields_to_net(struct tcphdr *th)
 	th->th_win = htons(th->th_win);
 	th->th_urp = htons(th->th_urp);
 }
+
+static inline void
+tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp)
+{
+	if (is_tlp) {
+		tp->t_sndtlppack++;
+		tp->t_sndtlpbyte += len;
+	}
+	/* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */
+	if (is_rxt)
+		tp->t_snd_rxt_bytes += len;
+	else
+		tp->t_sndbytes += len;
+
+}
+
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202106142059.15EKxJLb014122>