Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 23 Sep 2021 14:57:56 GMT
From:      Randall Stewart <rrs@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: fd69939e79a6 - main - tcp: Two bugs in rack one of which can lead to a panic.
Message-ID:  <202109231457.18NEvuYN077701@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=fd69939e79a65d0dea766ac05e4d8b7335819ae1

commit fd69939e79a65d0dea766ac05e4d8b7335819ae1
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2021-09-23 14:54:23 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2021-09-23 14:54:23 +0000

    tcp: Two bugs in rack one of which can lead to a panic.
    
    In extensive testing in NF we have found two issues inside
    the rack stack.
    
    1) An incorrect offset is being generated by the fast send path when a fast send is initiated on
       the end of the socket buffer and before the fast send runs, the sb_compress macro adds data to the trailing socket.
       This fools the fast send code into thinking the sb offset changed and it miscalculates a "updated offset".
       It should only do that when the mbuf in question got smaller.. i.e. an ack was processed. This can lead to
       a panic deref'ing a NULL mbuf if that packet is ever retransmitted. At the best case it leads to invalid data being
       sent to the client which usually terminates the connection. The fix is to have the proper logic (that is in the rsm fast path)
       to make sure we only update the offset when the mbuf shrinks.
    2) The other issue is more bothersome. The timestamp check in rack needs to use the msec timestamp when
       comparing the timestamp echo to now. It was using a microsecond timestamp which ends up giving error
       prone results but causes only small harm in trying to identify which send to use in RTT calculations if its a retransmit.
    
    Reviewed by: tuexen
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D32062
---
 sys/netinet/tcp_stacks/rack.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 947f9f619929..23b1ff1fc584 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -9176,6 +9176,7 @@ more:
 		if ((rsm->r_flags & RACK_TO_REXT) &&
 		    (tp->t_flags & TF_RCVD_TSTMP) &&
 		    (to->to_flags & TOF_TS) &&
+		    (to->to_tsecr != 0) &&
 		    (tp->t_flags & TF_PREVVALID)) {
 			/*
 			 * We can use the timestamp to see
@@ -13321,7 +13322,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 	struct timespec ts;
 	struct tcp_rack *rack;
 	struct tcp_ackent *ae;
-	uint32_t tiwin, us_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
+	uint32_t tiwin, ms_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack;
 	int cnt, i, did_out, ourfinisacked = 0;
 	int win_up_req = 0;
 	struct tcpopt to_holder, *to = NULL;
@@ -13381,13 +13382,14 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 	the_win = tp->snd_wnd;
 	win_seq = tp->snd_wl1;
 	win_upd_ack = tp->snd_wl2;
-	cts = us_cts = tcp_tv_to_usectick(tv);
+	cts = tcp_tv_to_usectick(tv);
+	ms_cts = tcp_tv_to_mssectick(tv);
 	segsiz = ctf_fixed_maxseg(tp);
 	if ((rack->rc_gp_dyn_mul) &&
 	    (rack->use_fixed_rate == 0) &&
 	    (rack->rc_always_pace)) {
 		/* Check in on probertt */
-		rack_check_probe_rtt(rack, us_cts);
+		rack_check_probe_rtt(rack, cts);
 	}
 	for (i = 0; i < cnt; i++) {
 #ifdef TCP_ACCOUNTING
@@ -13424,8 +13426,8 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 			 * non RFC1323 RTT calculation.  Normalize timestamp if syncookies
 			 * were used when this connection was established.
 			 */
-			if (TSTMP_GT(ae->ts_echo, cts))
-				ae->ts_echo = 0;
+			if (TSTMP_GT(ae->ts_echo, ms_cts))
+				to->to_tsecr = 0;
 			if (tp->ts_recent &&
 			    TSTMP_LT(ae->ts_value, tp->ts_recent)) {
 				if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) {
@@ -13524,7 +13526,6 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 			the_win = tiwin;
 		} else {
 			/* Case A */
-
 			if (SEQ_GT(ae->ack, tp->snd_max)) {
 				/*
 				 * We just send an ack since the incoming
@@ -13564,7 +13565,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 				}
 				rack_process_to_cumack(tp, rack, ae->ack, cts, to);
 				if (rack->rc_dsack_round_seen) {
-					/* Is the dsack roound over? */
+					/* Is the dsack round over? */
 					if (SEQ_GEQ(ae->ack, rack->r_ctl.dsack_round_end)) {
 						/* Yes it is */
 						rack->rc_dsack_round_seen = 0;
@@ -13687,7 +13688,7 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
 
 			rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery);
 			SOCKBUF_LOCK(&so->so_snd);
-			mfree = sbcut_locked(&so->so_snd, acked);
+			mfree = sbcut_locked(&so->so_snd, acked_amount);
 			tp->snd_una = high_seq;
 			/* Note we want to hold the sb lock through the sendmap adjust */
 			rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una);
@@ -13962,7 +13963,12 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 #endif
 	int32_t thflags, retval, did_out = 0;
 	int32_t way_out = 0;
-	uint32_t cts;
+	/*
+	 * cts - is the current time from tv (caller gets ts) in microseconds.
+	 * ms_cts - is the current time from tv in milliseconds.
+	 * us_cts - is the time that LRO or hardware actually got the packet in microseconds.
+	 */
+	uint32_t cts, us_cts, ms_cts;
 	uint32_t tiwin;
 	struct timespec ts;
 	struct tcpopt to;
@@ -13973,19 +13979,19 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	int ack_val_set = 0xf;
 #endif
 	int nsegs;
-	uint32_t us_cts;
 	/*
 	 * tv passed from common code is from either M_TSTMP_LRO or
 	 * tcp_get_usecs() if no LRO m_pkthdr timestamp is present.
 	 */
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
-	cts = tcp_tv_to_usectick(tv);
 	if (m->m_flags & M_ACKCMP) {
 		return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv));
 	}
 	if (m->m_flags & M_ACKCMP) {
 		panic("Impossible reach m has ackcmp? m:%p tp:%p", m, tp);
 	}
+	cts = tcp_tv_to_usectick(tv);
+	ms_cts =  tcp_tv_to_mssectick(tv);
 	nsegs = m->m_pkthdr.lro_nsegs;
 	counter_u64_add(rack_proc_non_comp_ack, 1);
 	thflags = th->th_flags;
@@ -14238,7 +14244,7 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 */
 	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 		to.to_tsecr -= tp->ts_offset;
-		if (TSTMP_GT(to.to_tsecr, cts))
+		if (TSTMP_GT(to.to_tsecr, ms_cts))
 			to.to_tsecr = 0;
 	}
 
@@ -15571,7 +15577,7 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
 
 	soff = rack->r_ctl.fsb.off;
 	m = rack->r_ctl.fsb.m;
-	if (rack->r_ctl.fsb.o_m_len != m->m_len) {
+	if (rack->r_ctl.fsb.o_m_len > m->m_len) {
 		/*
 		 * The mbuf had the front of it chopped off by an ack
 		 * we need to adjust the soff/off by that difference.
@@ -15580,6 +15586,12 @@ rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen,
 
 		delta = rack->r_ctl.fsb.o_m_len - m->m_len;
 		soff -= delta;
+	} else if (rack->r_ctl.fsb.o_m_len < m->m_len) {
+		/*
+		 * The mbuf was expanded probably by
+		 * a m_compress. Just update o_m_len.
+		 */
+		rack->r_ctl.fsb.o_m_len = m->m_len;
 	}
 	KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff));
 	KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen));



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202109231457.18NEvuYN077701>