Date:      Mon, 4 May 2020 20:28:54 +0000 (UTC)
From:      Randall Stewart <rrs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r360639 - head/sys/netinet/tcp_stacks
Message-ID:  <202005042028.044KSsat057898@repo.freebsd.org>

Author: rrs
Date: Mon May  4 20:28:53 2020
New Revision: 360639
URL: https://svnweb.freebsd.org/changeset/base/360639

Log:
  This commit brings things into sync with the advancements that
  have been made in rack and adds a few fixes to BBR. It also
  removes any possibility of incorrectly handling OOB (urgent)
  data, which these stacks do not support. This should fix the
  syzkaller crashes seen in the past. Still to fix is the BBR
  issue just reported this weekend with the SYN and with sending
  a RST. Note that this version of rack can now do pacing as well.
  
  Sponsored by:	Netflix Inc
  Differential Revision:	https://reviews.freebsd.org/D24576
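
For illustration, a userland sketch of the OOB behavior this commit
locks in (not part of the commit itself; the TCP_FUNCTION_BLK stack
switch and the EOPNOTSUPP propagation through the new tfb_pru_options
hook are assumptions based on the diff below):

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	/* 'fd' is a connected TCP socket. */
	static void
	try_oob(int fd)
	{
		struct tcp_function_set tfs;

		memset(&tfs, 0, sizeof(tfs));
		strncpy(tfs.function_set_name, "rack",
		    sizeof(tfs.function_set_name) - 1);
		/* Move this connection onto the rack stack (module must be loaded). */
		if (setsockopt(fd, IPPROTO_TCP, TCP_FUNCTION_BLK, &tfs,
		    sizeof(tfs)) == -1)
			perror("TCP_FUNCTION_BLK");

		/* Urgent data is unsupported; rack/bbr now refuse it outright. */
		if (send(fd, "x", 1, MSG_OOB) == -1 && errno == EOPNOTSUPP)
			printf("OOB send rejected as expected\n");
	}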

Modified:
  head/sys/netinet/tcp_stacks/bbr.c
  head/sys/netinet/tcp_stacks/rack.c
  head/sys/netinet/tcp_stacks/rack_bbr_common.c
  head/sys/netinet/tcp_stacks/rack_bbr_common.h
  head/sys/netinet/tcp_stacks/tcp_bbr.h
  head/sys/netinet/tcp_stacks/tcp_rack.h

Modified: head/sys/netinet/tcp_stacks/bbr.c
==============================================================================
--- head/sys/netinet/tcp_stacks/bbr.c	Mon May  4 20:19:57 2020	(r360638)
+++ head/sys/netinet/tcp_stacks/bbr.c	Mon May  4 20:28:53 2020	(r360639)
@@ -1,7 +1,5 @@
 /*-
- * Copyright (c) 2016-9
- *	Netflix Inc.
- *      All rights reserved.
+ * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -72,6 +70,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/tim_filter.h>
 #include <sys/time.h>
+#include <sys/protosw.h>
 #include <vm/uma.h>
 #include <sys/kern_prefetch.h>
 
@@ -1853,28 +1852,6 @@ bbr_init_sysctls(void)
 	    &bbr_clear_lost, 0, sysctl_bbr_clear_lost, "IU", "Clear lost counters");
 }
 
-static inline int32_t
-bbr_progress_timeout_check(struct tcp_bbr *bbr)
-{
-	if (bbr->rc_tp->t_maxunacktime && bbr->rc_tp->t_acktime &&
-	    TSTMP_GT(ticks, bbr->rc_tp->t_acktime)) {
-		if ((((uint32_t)ticks - bbr->rc_tp->t_acktime)) >= bbr->rc_tp->t_maxunacktime) {
-			/*
-			 * There is an assumption here that the caller will
-			 * drop the connection, so we increment the
-			 * statistics.
-			 */
-			bbr_log_progress_event(bbr, bbr->rc_tp, ticks, PROGRESS_DROP, __LINE__);
-			BBR_STAT_INC(bbr_progress_drops);
-#ifdef NETFLIX_STATS
-			KMOD_TCPSTAT_INC(tcps_progdrops);
-#endif
-			return (1);
-		}
-	}
-	return (0);
-}
-
 static void
 bbr_counter_destroy(void)
 {
@@ -1884,6 +1861,8 @@ bbr_counter_destroy(void)
 	COUNTER_ARRAY_FREE(bbr_state_lost, BBR_MAX_STAT);
 	COUNTER_ARRAY_FREE(bbr_state_time, BBR_MAX_STAT);
 	COUNTER_ARRAY_FREE(bbr_state_resend, BBR_MAX_STAT);
+	counter_u64_free(bbr_nohdwr_pacing_enobuf);
+	counter_u64_free(bbr_hdwr_pacing_enobuf);
 	counter_u64_free(bbr_flows_whdwr_pacing);
 	counter_u64_free(bbr_flows_nohdwr_pacing);
 
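
The helper that replaces the function removed above lives in
rack_bbr_common.c; here is a sketch reconstructed from the removed
body (the committed version may differ). The caller now logs its own
stack-specific progress event before dropping the connection:

	static inline int
	ctf_progress_timeout_check(struct tcpcb *tp, bool log)
	{
		if (tp->t_maxunacktime && tp->t_acktime &&
		    TSTMP_GT(ticks, tp->t_acktime)) {
			if (((uint32_t)ticks - tp->t_acktime) >= tp->t_maxunacktime) {
				/* Caller is assumed to drop the connection. */
				if (log)
					KMOD_TCPSTAT_INC(tcps_progdrops);
				return (1);
			}
		}
		return (0);
	}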
@@ -4643,7 +4622,8 @@ bbr_timeout_tlp(struct tcpcb *tp, struct tcp_bbr *bbr,
 		/* Its not time yet */
 		return (0);
 	}
-	if (bbr_progress_timeout_check(bbr)) {
+	if (ctf_progress_timeout_check(tp, true)) {
+		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 		return (1);
 	}
@@ -4815,9 +4795,8 @@ bbr_timeout_delack(struct tcpcb *tp, struct tcp_bbr *b
 }
 
 /*
- * Persists timer, here we simply need to setup the
- * FORCE-DATA flag the output routine will send
- * the one byte send.
+ * Here we send a KEEP-ALIVE like probe to the
+ * peer; we do not send any data.
  *
  * We only return 1, saying don't proceed, if all timers
  * are stopped (destroyed PCB?).
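
The probe the new comment describes works like the classic keep-alive
probe: a zero-length segment sequenced one byte before snd_una forces
the peer to ACK and re-advertise its window. An illustrative sketch
using the stock template helpers (the committed bbr code actually
sends the probe through its own output path):

	static void
	send_window_probe(struct tcpcb *tp)
	{
		struct tcptemp *t;

		t = tcpip_maketemplate(tp->t_inpcb);
		if (t == NULL)
			return;
		/* seq = snd_una - 1 names an already-ACKed byte, so the
		 * peer must reply with a pure ACK carrying its current
		 * window; no data (and no TF_FORCEDATA) is needed. */
		tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
		    tp->rcv_nxt, tp->snd_una - 1, 0);
		free(t, M_TEMP);
	}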
@@ -4845,7 +4824,8 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *
 	/*
 	 * Have we exceeded the user specified progress time?
 	 */
-	if (bbr_progress_timeout_check(bbr)) {
+	if (ctf_progress_timeout_check(tp, true)) {
+		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 		goto out;
 	}
@@ -4859,6 +4839,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *
 	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
 	    ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
+		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 		goto out;
 	}
@@ -4875,6 +4856,7 @@ bbr_timeout_persist(struct tcpcb *tp, struct tcp_bbr *
 	if (tp->t_state > TCPS_CLOSE_WAIT &&
 	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
 		KMOD_TCPSTAT_INC(tcps_persistdrop);
+		tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
 		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 		goto out;
 	}
@@ -4947,6 +4929,7 @@ bbr_timeout_keepalive(struct tcpcb *tp, struct tcp_bbr
 	return (1);
 dropit:
 	KMOD_TCPSTAT_INC(tcps_keepdrops);
+	tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
 	tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 	return (1);
 }
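
The tcp_log_end_status() calls added in this and the surrounding hunks
tag the connection with the reason it died before the teardown call
destroys the inpcb the logger hangs off of. A self-contained model of
that tagging (an assumption about the real tcp_log_buf implementation,
which may differ):

	/* Model only: record up to a few one-byte end-of-connection
	 * reasons; the earliest tags win once the slots fill up. */
	#define END_INFO_SLOTS	4

	struct conn_end_info {
		unsigned char	reasons[END_INFO_SLOTS];
	};

	static void
	log_end_status(struct conn_end_info *ei, unsigned char status)
	{
		int i;

		for (i = 0; i < END_INFO_SLOTS; i++) {
			if (ei->reasons[i] == 0) {
				ei->reasons[i] = status; /* first free slot */
				return;
			}
		}
		/* Full: keep the earliest reasons, drop this one. */
	}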
@@ -5058,8 +5041,9 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr,
 	 * retransmit interval.  Back off to a longer retransmit interval
 	 * and retransmit one segment.
 	 */
-	if (bbr_progress_timeout_check(bbr)) {
+	if (ctf_progress_timeout_check(tp, true)) {
 		retval = 1;
+		bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 		tcp_set_inp_to_drop(bbr->rc_inp, ETIMEDOUT);
 		goto out;
 	}
@@ -5078,6 +5062,7 @@ bbr_timeout_rxt(struct tcpcb *tp, struct tcp_bbr *bbr,
 		tp->t_rxtshift = TCP_MAXRXTSHIFT;
 		KMOD_TCPSTAT_INC(tcps_timeoutdrop);
 		retval = 1;
+		tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
 		tcp_set_inp_to_drop(bbr->rc_inp,
 		    (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT));
 		goto out;
@@ -8050,6 +8035,9 @@ nothing_left:
 			 * to reset him.
 			 */
 			*ret_val = 1;
+			tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
+			/* tcp_close will kill the inp; pre-log the reset */
+			tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 			tp = tcp_close(tp);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen);
 			BBR_STAT_INC(bbr_dropped_af_data);
@@ -8132,7 +8120,6 @@ bbr_exit_persist(struct tcpcb *tp, struct tcp_bbr *bbr
 	idle_time = bbr_calc_time(cts, bbr->r_ctl.rc_went_idle_time);
 	bbr->rc_in_persist = 0;
 	bbr->rc_hit_state_1 = 0;
-	tp->t_flags &= ~TF_FORCEDATA;
 	bbr->r_ctl.rc_del_time = cts;
 	/*
 	 * We invalidate the last ack here since we
@@ -8390,66 +8377,12 @@ bbr_process_data(struct mbuf *m, struct tcphdr *th, st
 		return (0);
 	}
 	/*
-	 * Process segments with URG.
+	 * We don't support urgent data, but we
+	 * drag along the urgent pointer so that
+	 * nobody is surprised if there is ever
+	 * a stack switch.
 	 */
-	if ((thflags & TH_URG) && th->th_urp &&
-	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
-		/*
-		 * This is a kludge, but if we receive and accept random
-		 * urgent pointers, we'll crash in soreceive.  It's hard to
-		 * imagine someone actually wanting to send this much urgent
-		 * data.
-		 */
-		SOCKBUF_LOCK(&so->so_rcv);
-		if (th->th_urp + sbavail(&so->so_rcv) > sb_max) {
-			th->th_urp = 0;	/* XXX */
-			thflags &= ~TH_URG;	/* XXX */
-			SOCKBUF_UNLOCK(&so->so_rcv);	/* XXX */
-			goto dodata;	/* XXX */
-		}
-		/*
-		 * If this segment advances the known urgent pointer, then
-		 * mark the data stream.  This should not happen in
-		 * CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since a
-		 * FIN has been received from the remote side. In these
-		 * states we ignore the URG.
-		 *
-		 * According to RFC961 (Assigned Protocols), the urgent
-		 * pointer points to the last octet of urgent data.  We
-		 * continue, however, to consider it to indicate the first
-		 * octet of data past the urgent section as the original
-		 * spec states (in one of two places).
-		 */
-		if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
-			tp->rcv_up = th->th_seq + th->th_urp;
-			so->so_oobmark = sbavail(&so->so_rcv) +
-			    (tp->rcv_up - tp->rcv_nxt) - 1;
-			if (so->so_oobmark == 0)
-				so->so_rcv.sb_state |= SBS_RCVATMARK;
-			sohasoutofband(so);
-			tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
-		}
-		SOCKBUF_UNLOCK(&so->so_rcv);
-		/*
-		 * Remove out of band data so doesn't get presented to user.
-		 * This can happen independent of advancing the URG pointer,
-		 * but if two URG's are pending at once, some out-of-band
-		 * data may creep in... ick.
-		 */
-		if (th->th_urp <= (uint32_t)tlen &&
-		    !(so->so_options & SO_OOBINLINE)) {
-			/* hdr drop is delayed */
-			tcp_pulloutofband(so, th, m, drop_hdrlen);
-		}
-	} else {
-		/*
-		 * If no out of band data is expected, pull receive urgent
-		 * pointer along with the receive window.
-		 */
-		if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
-			tp->rcv_up = tp->rcv_nxt;
-	}
-dodata:				/* XXX */
+	tp->rcv_up = tp->rcv_nxt;
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/*
@@ -8792,7 +8725,7 @@ bbr_do_fastnewdata(struct mbuf *m, struct tcphdr *th, 
 static int
 bbr_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t acked;
 	uint16_t nsegs;
@@ -8987,7 +8920,7 @@ bbr_fastack(struct mbuf *m, struct tcphdr *th, struct 
 static int
 bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t todrop;
 	int32_t ourfinisacked = 0;
@@ -9010,6 +8943,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, str
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->iss) ||
 	    SEQ_GT(th->th_ack, tp->snd_max))) {
+		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
@@ -9196,7 +9130,7 @@ bbr_do_syn_sent(struct mbuf *m, struct tcphdr *th, str
 static int
 bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, struct socket *so,
 		struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-		uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+		uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
@@ -9207,6 +9141,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, str
 	if ((thflags & TH_ACK) &&
 	    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 	     SEQ_GT(th->th_ack, tp->snd_max))) {
+		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
@@ -9218,6 +9153,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, str
 		 * data), a valid ACK, a FIN, or a RST.
 		 */
 		if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
+			tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 			ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		} else if (thflags & TH_SYN) {
@@ -9253,6 +9189,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, str
 	 * "LAND" DoS attack.
 	 */
 	if (SEQ_LT(th->th_seq, tp->irs)) {
+		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 		return (1);
 	}
@@ -9405,7 +9342,7 @@ bbr_do_syn_recv(struct mbuf *m, struct tcphdr *th, str
 static int
 bbr_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr;
 	int32_t ret_val;
@@ -9439,7 +9376,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, 
 	    __predict_true(th->th_seq == tp->rcv_nxt)) {
 		if (tlen == 0) {
 			if (bbr_fastack(m, th, so, tp, to, drop_hdrlen, tlen,
-			    tiwin, nxt_pkt)) {
+			    tiwin, nxt_pkt, iptos)) {
 				return (0);
 			}
 		} else {
@@ -9521,7 +9458,8 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, 
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -9539,7 +9477,7 @@ bbr_do_established(struct mbuf *m, struct tcphdr *th, 
 static int
 bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	struct tcp_bbr *bbr;
 	int32_t ret_val;
@@ -9616,7 +9554,8 @@ bbr_do_close_wait(struct mbuf *m, struct tcphdr *th, s
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -9632,6 +9571,9 @@ bbr_check_data_after_close(struct mbuf *m, struct tcp_
 
 	if (bbr->rc_allow_data_af_clo == 0) {
 close_now:
+		tcp_log_end_status(tp, TCP_EI_STATUS_DATA_A_CLOSE);
+		/* tcp_close will kill the inp; pre-log the reset */
+		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 		tp = tcp_close(tp);
 		KMOD_TCPSTAT_INC(tcps_rcvafterclose);
 		ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen));
@@ -9655,7 +9597,7 @@ close_now:
 static int
 bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
@@ -9764,7 +9706,8 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, s
 		tcp_state_change(tp, TCPS_FIN_WAIT_2);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -9781,7 +9724,7 @@ bbr_do_fin_wait_1(struct mbuf *m, struct tcphdr *th, s
 static int
 bbr_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
@@ -9876,7 +9819,8 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, stru
 		return (1);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -9893,7 +9837,7 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, stru
 static int
 bbr_do_lastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
@@ -9988,7 +9932,8 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, stru
 		return (1);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -10006,7 +9951,7 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, stru
 static int
 bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen,
-    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt)
+    uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos)
 {
 	int32_t ourfinisacked = 0;
 	int32_t ret_val;
@@ -10104,7 +10049,8 @@ bbr_do_fin_wait_2(struct mbuf *m, struct tcphdr *th, s
 		return (ret_val);
 	}
 	if (sbavail(&so->so_snd)) {
-		if (bbr_progress_timeout_check(bbr)) {
+		if (ctf_progress_timeout_check(tp, true)) {
+			bbr_log_progress_event(bbr, tp, tick, PROGRESS_DROP, __LINE__);
 			ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
 			return (1);
 		}
@@ -11702,6 +11648,8 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr 
 	 * always. All other times (timers etc) we must have a rack-state
 	 * set (so we assure we have done the checks above for SACK).
 	 */
+	if (thflags & TH_FIN)
+		tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN);
 	if (bbr->r_state != tp->t_state)
 		bbr_set_state(tp, bbr, tiwin);
 
@@ -11740,6 +11688,7 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr 
          */
         if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
             (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
+		tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT);
 		ctf_do_dropwithreset_conn(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
                 return (1);
         }
@@ -11765,7 +11714,7 @@ bbr_do_segment_nounlock(struct mbuf *m, struct tcphdr 
 	}
 	retval = (*bbr->r_substate) (m, th, so,
 	    tp, &to, drop_hdrlen,
-	    tlen, tiwin, thflags, nxt_pkt);
+	    tlen, tiwin, thflags, nxt_pkt, iptos);
 #ifdef BBR_INVARIANTS
 	if ((retval == 0) &&
 	    (tp->t_inpcb == NULL)) {
@@ -11969,14 +11918,7 @@ bbr_do_send_accounting(struct tcpcb *tp, struct tcp_bb
 		bbr_do_error_accounting(tp, bbr, rsm, len, error);
 		return;
 	}
-	if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
-		/* Window probe */
-		KMOD_TCPSTAT_INC(tcps_sndprobe);
-#ifdef STATS
-		stats_voi_update_abs_u32(tp->t_stats,
-		    VOI_TCP_RETXPB, len);
-#endif
-	} else if (rsm) {
+	if (rsm) {
 		if (rsm->r_flags & BBR_TLP) {
 			/*
 			 * TLP should not count in retran count, but in its
@@ -12241,7 +12183,6 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeva
 	}
 	/* Mark that we have called bbr_output(). */
 	if ((bbr->r_timer_override) ||
-	    (tp->t_flags & TF_FORCEDATA) ||
 	    (tp->t_state < TCPS_ESTABLISHED)) {
 		/* Timeouts or early states are exempt */
 		if (inp->inp_in_hpts)
@@ -12578,47 +12519,6 @@ recheck_resend:
 	}
 	SOCKBUF_LOCK(sb);
 	/*
-	 * If in persist timeout with window of 0, send 1 byte. Otherwise,
-	 * if window is small but nonzero and time TF_SENTFIN expired, we
-	 * will send what we can and go to transmit state.
-	 */
-	if (tp->t_flags & TF_FORCEDATA) {
-		if ((sendwin == 0) || (sendwin <= (tp->snd_max - tp->snd_una))) {
-			/*
-			 * If we still have some data to send, then clear
-			 * the FIN bit.  Usually this would happen below
-			 * when it realizes that we aren't sending all the
-			 * data.  However, if we have exactly 1 byte of
-			 * unsent data, then it won't clear the FIN bit
-			 * below, and if we are in persist state, we wind up
-			 * sending the packet without recording that we sent
-			 * the FIN bit.
-			 *
-			 * We can't just blindly clear the FIN bit, because
-			 * if we don't have any more data to send then the
-			 * probe will be the FIN itself.
-			 */
-			if (sb_offset < sbused(sb))
-				flags &= ~TH_FIN;
-			sendwin = 1;
-		} else {
-			if ((bbr->rc_in_persist != 0) &&
- 			    (tp->snd_wnd >= min((bbr->r_ctl.rc_high_rwnd/2),
-					       bbr_minseg(bbr)))) {
-				/* Exit persists if there is space */
-				bbr_exit_persist(tp, bbr, cts, __LINE__);
-			}
-			if (rsm == NULL) {
-				/*
-				 * If we are dropping persist mode then we
-				 * need to correct sb_offset if not a
-				 * retransmit.
-				 */
-				sb_offset = tp->snd_max - tp->snd_una;
-			}
-		}
-	}
-	/*
 	 * If snd_nxt == snd_max and we have transmitted a FIN, the
 	 * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a
 	 * negative length.  This can also occur when TCP opens up its
@@ -12674,7 +12574,7 @@ recheck_resend:
 				 */
 				len = 0;
 			}
-			if ((tp->t_flags & TF_FORCEDATA) && (bbr->rc_in_persist)) {
+			if (bbr->rc_in_persist) {
 				/*
 				 * We are in persists, figure out if
 				 * a retransmit is available (maybe the previous
@@ -12970,9 +12870,6 @@ recheck_resend:
 		if ((tp->snd_una == tp->snd_max) && len) {	/* Nothing outstanding */
 			goto send;
 		}
-		if (tp->t_flags & TF_FORCEDATA) {	/* typ. timeout case */
-			goto send;
-		}
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) {
 			goto send;
 		}
@@ -13013,7 +12910,7 @@ recheck_resend:
 			goto send;
 	}
 	/*
-	 * Send if we owe the peer an ACK, RST, SYN, or urgent data.  ACKNOW
+	 * Send if we owe the peer an ACK, RST, or SYN.  ACKNOW
 	 * is also a catch-all for the retransmit timer timeout case.
 	 */
 	if (tp->t_flags & TF_ACKNOW) {
@@ -13022,9 +12919,6 @@ recheck_resend:
 	if (((flags & TH_SYN) && (tp->t_flags & TF_NEEDSYN) == 0)) {
 		goto send;
 	}
-	if (SEQ_GT(tp->snd_up, tp->snd_una)) {
-		goto send;
-	}
 	/*
 	 * If our state indicates that FIN should be sent and we have not
 	 * yet done so, then we need to send.
@@ -13089,7 +12983,6 @@ just_return_nolock:
 	}
 	if (tot_len == 0)
 		counter_u64_add(bbr_out_size[TCP_MSS_ACCT_JUSTRET], 1);
-	tp->t_flags &= ~TF_FORCEDATA;
 	/* Dont update the time if we did not send */
 	bbr->r_ctl.rc_last_delay_val = 0;
 	bbr->rc_output_starts_timer = 1;
@@ -13586,8 +13479,6 @@ send:
 			KMOD_TCPSTAT_INC(tcps_sndacks);
 		else if (flags & (TH_SYN | TH_FIN | TH_RST))
 			KMOD_TCPSTAT_INC(tcps_sndctrl);
-		else if (SEQ_GT(tp->snd_up, tp->snd_una))
-			KMOD_TCPSTAT_INC(tcps_sndurg);
 		else
 			KMOD_TCPSTAT_INC(tcps_sndwinup);
 
@@ -13774,17 +13665,11 @@ send:
 		tp->t_flags |= TF_RXWIN0SENT;
 	} else
 		tp->t_flags &= ~TF_RXWIN0SENT;
-	if (SEQ_GT(tp->snd_up, tp->snd_max)) {
-		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_max));
-		th->th_flags |= TH_URG;
-	} else
-		/*
-		 * If no urgent pointer to send, then we pull the urgent
-		 * pointer to the left edge of the send window so that it
-		 * doesn't drift into the send window on sequence number
-		 * wraparound.
-		 */
-		tp->snd_up = tp->snd_una;	/* drag it along */
+	/*
+	 * We don't support urgent data, but drag along
+	 * the pointer in case of a stack switch.
+	 */
+	tp->snd_up = tp->snd_una;
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
@@ -14125,8 +14010,7 @@ out:
 		 */
 		return (0);
 	}
-	if (((tp->t_flags & TF_FORCEDATA) == 0) ||
-	    (bbr->rc_in_persist == 0)) {
+	if (bbr->rc_in_persist == 0) {
 		/*
 		 * Advance snd_nxt over sequence space of this segment.
 		 */
@@ -14254,7 +14138,6 @@ nomore:
 					tp->t_maxseg = old_maxseg - 40;
 					bbr_log_msgsize_fail(bbr, tp, len, maxseg, mtu, 0, tso, cts);
 				}
-				tp->t_flags &= ~TF_FORCEDATA;
 				/*
 				 * Nuke all other things that can interfere
 				 * with slot
@@ -14284,7 +14167,6 @@ nomore:
 			}
 			/* FALLTHROUGH */
 		default:
-			tp->t_flags &= ~TF_FORCEDATA;
 			slot = (bbr_error_base_paceout + 3) << bbr->oerror_cnt;
 			bbr->rc_output_starts_timer = 1;
 			bbr_start_hpts_timer(bbr, tp, cts, 11, slot, 0);
@@ -14399,7 +14281,6 @@ nomore:
 	    ((flags & TH_RST) == 0) &&
 	    (IN_RECOVERY(tp->t_flags) == 0) &&
 	    (bbr->rc_in_persist == 0) &&
-	    ((tp->t_flags & TF_FORCEDATA) == 0) &&
 	    (tot_len < bbr->r_ctl.rc_pace_max_segs)) {
 		/*
 		 * For non-tso we need to goto again until we have sent out
@@ -14416,10 +14297,14 @@ nomore:
 		}
 		rsm = NULL;
 		sack_rxmit = 0;
-		tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
+		tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 		goto again;
 	}
 skip_again:
+	if ((error == 0) && (flags & TH_FIN))
+		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_FIN);
+	if ((error == 0) && (flags & TH_RST))
+		tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST);
 	if (((flags & (TH_RST | TH_SYN | TH_FIN)) == 0) && tot_len) {
 		/*
 		 * Calculate/Re-Calculate the hptsi slot in usecs based on
@@ -14429,7 +14314,7 @@ skip_again:
 		if (bbr->rc_no_pacing)
 			slot = 0;
 	}
-	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK | TF_FORCEDATA);
+	tp->t_flags &= ~(TF_ACKNOW | TF_DELACK);
 enobufs:
 	if (bbr->rc_use_google == 0)
 		bbr_check_bbr_for_state(bbr, cts, __LINE__, 0);
@@ -15095,6 +14980,13 @@ out:
 	return (error);
 }
 
+static int
+bbr_pru_options(struct tcpcb *tp, int flags)
+{
+	if (flags & PRUS_OOB)
+		return (EOPNOTSUPP);
+	return (0);
+}
 
 struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_block_name = __XSTRING(STACKNAME),
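
The function block now exports the hook; the generic user-request path
is what consults it. A minimal model of that dispatch (an assumption;
the committed tcp_usrreq.c code is not shown in this diff):

	#include <errno.h>
	#include <stddef.h>

	#define PRUS_OOB	0x1

	struct tcpcb;			/* opaque here */
	struct tcp_function_block_model {
		int	(*tfb_pru_options)(struct tcpcb *, int);
	};

	/* Before queueing data, let the stack veto option-style requests. */
	static int
	usr_send_model(struct tcpcb *tp,
	    const struct tcp_function_block_model *fb, int flags)
	{
		if ((flags & PRUS_OOB) && fb->tfb_pru_options != NULL)
			return (fb->tfb_pru_options(tp, PRUS_OOB));
		return (0);	/* proceed with the normal send path */
	}

With bbr_pru_options plugged in, any PRUS_OOB request returns
EOPNOTSUPP before touching the socket buffer.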
@@ -15111,7 +15003,8 @@ struct tcp_function_block __tcp_bbr = {
 	.tfb_tcp_timer_stop = bbr_timer_stop,
 	.tfb_tcp_rexmit_tmr = bbr_remxt_tmr,
 	.tfb_tcp_handoff_ok = bbr_handoff_ok,
-	.tfb_tcp_mtu_chg = bbr_mtu_chg
+	.tfb_tcp_mtu_chg = bbr_mtu_chg,
+	.tfb_pru_options = bbr_pru_options,
 };
 
 static const char *bbr_stack_names[] = {

Modified: head/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack.c	Mon May  4 20:19:57 2020	(r360638)
+++ head/sys/netinet/tcp_stacks/rack.c	Mon May  4 20:28:53 2020	(r360639)
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2016-9 Netflix, Inc.
+ * Copyright (c) 2016-2020 Netflix, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -57,13 +57,16 @@ __FBSDID("$FreeBSD$");
 #include <sys/qmath.h>
 #include <sys/tree.h>
 #include <sys/stats.h> /* Must come after qmath.h and tree.h */
+#else
+#include <sys/tree.h>
 #endif
 #include <sys/refcount.h>
-#include <sys/tree.h>
 #include <sys/queue.h>
+#include <sys/tim_filter.h>
 #include <sys/smp.h>
 #include <sys/kthread.h>
 #include <sys/kern_prefetch.h>
+#include <sys/protosw.h>
 
 #include <vm/uma.h>
 
@@ -91,10 +94,14 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_hpts.h>
+#include <netinet/tcp_ratelimit.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #include <netinet/tcp_lro.h>
+#ifdef NETFLIX_SHARED_CWND
+#include <netinet/tcp_shared_cwnd.h>
+#endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif				/* TCPDEBUG */
@@ -169,6 +176,8 @@ struct sysctl_oid *rack_sysctl_root;
  *
  */
 static int32_t rack_tlp_thresh = 1;
+static int32_t rack_tlp_limit = 2;	/* No more than 2 TLPs w-out new data */
+static int32_t rack_tlp_use_greater = 1;
 static int32_t rack_reorder_thresh = 2;
 static int32_t rack_reorder_fade = 60000;	/* 0 - never fade, def 60,000
 						 * - 60 seconds */
@@ -177,24 +186,34 @@ static uint32_t rack_highest_sack_thresh_seen = 0;
 static uint32_t rack_highest_move_thresh_seen = 0;
 
 static int32_t rack_pkt_delay = 1;
-static int32_t rack_min_pace_time = 0;
 static int32_t rack_early_recovery = 1;
 static int32_t rack_send_a_lot_in_prr = 1;
 static int32_t rack_min_to = 1;	/* Number of ms minimum timeout */
 static int32_t rack_verbose_logging = 0;
 static int32_t rack_ignore_data_after_close = 1;
-static int32_t use_rack_cheat = 1;
+static int32_t rack_enable_shared_cwnd = 0;
+static int32_t rack_limits_scwnd = 1;
+static int32_t rack_enable_mqueue_for_nonpaced = 0;
+static int32_t rack_disable_prr = 0;
+static int32_t use_rack_rr = 1;
+static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */
 static int32_t rack_persist_min = 250;	/* 250ms */
-static int32_t rack_persist_max = 1000;	/* 1 Second */
+static int32_t rack_persist_max = 2000;	/* 2 seconds */
 static int32_t rack_sack_not_required = 0;	/* set to one to allow non-sack to use rack */
-static int32_t rack_hw_tls_max_seg = 0; /* 0 means use hw-tls single segment */
-
+static int32_t rack_hw_tls_max_seg = 3; /* Max number of full hw-tls records to send at once */
+static int32_t rack_default_init_window = 0; 	/* Use system default */
+static int32_t rack_limit_time_with_srtt = 0;
+static int32_t rack_hw_pace_adjust = 0;
 /*
  * Currently regular tcp has a rto_min of 30ms
  * the backoff goes 12 times so that ends up
  * being a total of 122.850 seconds before a
  * connection is killed.
  */
+static uint32_t rack_def_data_window = 20;
+static uint32_t rack_goal_bdp = 2;
+static uint32_t rack_min_srtts = 1;
+static uint32_t rack_min_measure_usec = 0;
 static int32_t rack_tlp_min = 10;
 static int32_t rack_rto_min = 30;	/* 30ms same as main freebsd */
 static int32_t rack_rto_max = 4000;	/* 4 seconds */
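
From userland the new tunables are reachable with sysctl(3); the OID
prefix net.inet.tcp.rack and the hw_tlsmax leaf below are inferred
from rack_sysctl_root in this diff, so treat the exact paths as
assumptions (verify with `sysctl -a`):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		int32_t val;
		size_t len = sizeof(val);

		/* Read one tunable registered under rack's sysctl root. */
		if (sysctlbyname("net.inet.tcp.rack.hw_tlsmax", &val, &len,
		    NULL, 0) == 0)
			printf("hw_tlsmax = %d\n", val);
		else
			perror("sysctlbyname");
		return (0);
	}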
@@ -204,16 +223,78 @@ static int32_t rack_rate_sample_method = USE_RTT_LOW;
 static int32_t rack_pace_every_seg = 0;
 static int32_t rack_delayed_ack_time = 200;	/* 200ms */
 static int32_t rack_slot_reduction = 4;
+static int32_t rack_wma_divisor = 8;		/* For WMA calculation */
+static int32_t rack_cwnd_block_ends_measure = 0;
+static int32_t rack_rwnd_block_ends_measure = 0;
+
 static int32_t rack_lower_cwnd_at_tlp = 0;
 static int32_t rack_use_proportional_reduce = 0;
 static int32_t rack_proportional_rate = 10;
 static int32_t rack_tlp_max_resend = 2;
 static int32_t rack_limited_retran = 0;
 static int32_t rack_always_send_oldest = 0;
-static int32_t rack_use_sack_filter = 1;
 static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
-static int32_t rack_per_of_gp = 50;
 
+static uint16_t rack_per_of_gp_ss = 250;	/* 250 % slow-start */
+static uint16_t rack_per_of_gp_ca = 200;	/* 200 % congestion-avoidance */
+static uint16_t rack_per_of_gp_rec = 200;	/* 200 % of bw */
+
+/* Probertt */
+static uint16_t rack_per_of_gp_probertt = 60;	/* 60% of bw */
+static uint16_t rack_per_of_gp_lowthresh = 40;	/* 40% is bottom */
+static uint16_t rack_per_of_gp_probertt_reduce = 10; /* 10% reduction */
+static uint16_t rack_atexit_prtt_hbp = 130;	/* Clamp to 130% on exit prtt if highly buffered path */
+static uint16_t rack_atexit_prtt = 130;	/* Clamp to 130% on exit prtt if non highly buffered path */
+
+static uint32_t rack_max_drain_wait = 2;	/* How many gp srtt's before we give up draining */
+static uint32_t rack_must_drain = 1;		/* How many GP srtt's we *must* wait */
+static uint32_t rack_probertt_use_min_rtt_entry = 1;	/* Use the min to calculate the goal else gp_srtt */
+static uint32_t rack_probertt_use_min_rtt_exit = 0;
+static uint32_t rack_probe_rtt_sets_cwnd = 0;
+static uint32_t rack_probe_rtt_safety_val = 2000000;	/* No more than 2 sec in probe-rtt */
+static uint32_t rack_time_between_probertt = 9600000;	/* 9.6 sec in us */
+static uint32_t rack_probertt_gpsrtt_cnt_mul = 0;	/* How many srtt periods probe-rtt lasts (top of fraction) */
+static uint32_t rack_probertt_gpsrtt_cnt_div = 0;	/* How many srtt periods probe-rtt lasts (bottom of fraction) */
+static uint32_t rack_min_probertt_hold = 200000;	/* Equal to delayed ack time */
+static uint32_t rack_probertt_filter_life = 10000000;
+static uint32_t rack_probertt_lower_within = 10;
+static uint32_t rack_min_rtt_movement = 250;	/* Must move at least 250 useconds to count as a lowering */
+static int32_t rack_pace_one_seg = 0;		/* Pace 1MSS at a time if the rate is under ~1.4Meg? */
+static int32_t rack_probertt_clear_is = 1;
+static int32_t rack_max_drain_hbp = 1;		/* Extra drain times gpsrtt for highly buffered paths */
+static int32_t rack_hbp_thresh = 3;		/* What divisor of max_rtt/min_rtt decides a hbp */
+
+
+/* Part of pacing */
+static int32_t rack_max_per_above = 30;		/* When incrementing, stop if above (100 + this)% */
+
+/* Timely information */
+/* Combine these two gives the range of 'no change' to bw */
+/* ie the up/down provide the upper and lower bound  */
+static int32_t rack_gp_per_bw_mul_up = 2;	/* 2% */
+static int32_t rack_gp_per_bw_mul_down = 4;	/* 4% */
+static int32_t rack_gp_rtt_maxmul = 3;		/* 3 x maxmin */
+static int32_t rack_gp_rtt_minmul = 1;		/* minrtt + (minrtt/mindiv) is lower rtt */
+static int32_t rack_gp_rtt_mindiv = 4;		/* minrtt + (minrtt * minmul/mindiv) is lower rtt */
+static int32_t rack_gp_decrease_per = 20;	/* 20% decrease in multiplier */
+static int32_t rack_gp_increase_per = 2;	/* 2% increase in multiplier */
+static int32_t rack_per_lower_bound = 50;	/* Don't allow to drop below this multiplier */
+static int32_t rack_per_upper_bound_ss = 0;	/* Don't allow SS to grow above this */
+static int32_t rack_per_upper_bound_ca = 0;	/* Don't allow CA to grow above this */
+static int32_t rack_do_dyn_mul = 0;		/* Are the rack gp multipliers dynamic */
+static int32_t rack_gp_no_rec_chg = 1;		/* Prohibit recovery from reducing its multiplier */
+static int32_t rack_timely_dec_clear = 6;	/* Do we clear decrement count at a value (6)? */
+static int32_t rack_timely_max_push_rise = 3;	/* One round of pushing */
+static int32_t rack_timely_max_push_drop = 3;	/* Three rounds of pushing */
+static int32_t rack_timely_min_segs = 4;	/* 4 segment minimum */
+static int32_t rack_use_max_for_nobackoff = 0;
+static int32_t rack_timely_int_timely_only = 0;	/* do interim timely's only use the timely algo (no b/w changes)? */
+static int32_t rack_timely_no_stopping = 0;
+static int32_t rack_down_raise_thresh = 100;
+static int32_t rack_req_segs = 1;
+
+/* Weird delayed ack mode */
+static int32_t rack_use_imac_dack = 0;
 /* Rack specific counters */
 counter_u64_t rack_badfr;
 counter_u64_t rack_badfr_bytes;
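
The rack_gp_per_bw_mul_up/down pair above bounds a "no change" band
around the last goodput estimate; a hedged sketch of that test (the
committed timely code is not in this diff and may compute it
differently):

	#include <stdint.h>

	/* Defaults from above: +2% up, -4% down. */
	#define GP_BW_MUL_UP	2
	#define GP_BW_MUL_DOWN	4

	/* Non-zero if new_bw is inside the band, i.e. timely should
	 * leave the pacing multipliers alone. */
	static int
	timely_bw_in_band(uint64_t old_bw, uint64_t new_bw)
	{
		uint64_t up = old_bw + (old_bw * GP_BW_MUL_UP) / 100;
		uint64_t down = old_bw - (old_bw * GP_BW_MUL_DOWN) / 100;

		return (new_bw >= down && new_bw <= up);
	}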
@@ -266,6 +347,7 @@ counter_u64_t rack_enter_tlp_calc;
 counter_u64_t rack_input_idle_reduces;
 counter_u64_t rack_collapsed_win;
 counter_u64_t rack_tlp_does_nada;
+counter_u64_t rack_try_scwnd;
 
 /* Counters for HW TLS */
 counter_u64_t rack_tls_rwnd;
@@ -312,6 +394,8 @@ rack_ctloutput(struct socket *so, struct sockopt *sopt
     struct inpcb *inp, struct tcpcb *tp);
 static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how);
 static void
+rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line);
+static void
 rack_do_segment(struct mbuf *m, struct tcphdr *th,
     struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen,
     uint8_t iptos);
@@ -319,6 +403,15 @@ static void rack_dtor(void *mem, int32_t size, void *a
 static void
 rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm,
     uint32_t t, uint32_t cts);
+static void
+rack_log_alt_to_to_cancel(struct tcp_rack *rack,
+    uint32_t flex1, uint32_t flex2,
+    uint32_t flex3, uint32_t flex4,
+    uint32_t flex5, uint32_t flex6,
+    uint16_t flex7, uint8_t mod);
+static void
+rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot,
+   uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm);
 static struct rack_sendmap *
 rack_find_high_nonack(struct tcp_rack *rack,
     struct rack_sendmap *rsm);
@@ -328,6 +421,11 @@ static void rack_fini(struct tcpcb *tp, int32_t tcb_is
 static int
 rack_get_sockopt(struct socket *so, struct sockopt *sopt,
     struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack);
+static void
+rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack,
+			    tcp_seq th_ack, int line);
+static uint32_t
+rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
 static int32_t rack_handoff_ok(struct tcpcb *tp);
 static int32_t rack_init(struct tcpcb *tp);
 static void rack_init_sysctls(void);
@@ -337,11 +435,11 @@ rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
 static void
 rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len,
     uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts,
-    uint8_t pass, struct rack_sendmap *hintrsm);
+    uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts);
 static void
 rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack,
     struct rack_sendmap *rsm);
-static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num);
+static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm);
 static int32_t rack_output(struct tcpcb *tp);
 
 static uint32_t
@@ -369,7 +467,7 @@ rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rac
     struct rack_sendmap *rsm, uint32_t ts);
 static int
 rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack,
-    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type);
+    struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack);
 static int32_t tcp_addrack(module_t mod, int32_t type, void *data);
 static int
 rack_do_close_wait(struct mbuf *m, struct tcphdr *th,
@@ -410,7 +508,8 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th,
 struct rack_sendmap *
 tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack,
     uint32_t tsused);
-static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt);
+static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt,
+    uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt);
 static void
      tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th);
 
@@ -488,6 +587,7 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
 		counter_u64_zero(rack_enter_tlp_calc);
 		counter_u64_zero(rack_progress_drops);
 		counter_u64_zero(rack_tlp_does_nada);
+		counter_u64_zero(rack_try_scwnd);
 		counter_u64_zero(rack_collapsed_win);
 
 	}
@@ -502,7 +602,26 @@ rack_init_sysctls(void)
 {
 	struct sysctl_oid *rack_counters;
 	struct sysctl_oid *rack_attack;
+	struct sysctl_oid *rack_pacing;
+	struct sysctl_oid *rack_timely;
+	struct sysctl_oid *rack_timers;
+	struct sysctl_oid *rack_tlp;
+	struct sysctl_oid *rack_misc;
+	struct sysctl_oid *rack_measure;
+	struct sysctl_oid *rack_probertt;
 
+	rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO,
+	    "sack_attack",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+	    "Rack Sack Attack Counters and Controls");
+	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO,
+	    "stats",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+	    "Rack Counters");
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "rate_sample_method", CTLFLAG_RW,
@@ -511,166 +630,586 @@ rack_init_sysctls(void)
 	SYSCTL_ADD_S32(&rack_sysctl_ctx,
 	    SYSCTL_CHILDREN(rack_sysctl_root),
 	    OID_AUTO, "hw_tlsmax", CTLFLAG_RW,
-	    &rack_hw_tls_max_seg , 0,
-	    "Do we have a multplier of TLS records we can send as a max (0=1 TLS record)? ");
+	    &rack_hw_tls_max_seg , 3,
+	    "What is the maximum number of full TLS records that will be sent at once");
+	/* Probe rtt related controls */
+	rack_probertt = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_sysctl_root),
+	    OID_AUTO,
+	    "probertt",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+	    "ProbeRTT related Controls");
+	SYSCTL_ADD_U16(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),
+	    OID_AUTO, "exit_per_hpb", CTLFLAG_RW,
+	    &rack_atexit_prtt_hbp, 130,
+	    "What percentage above goodput do we clamp CA/SS to at exit on high-BDP path 130%");
+	SYSCTL_ADD_U16(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),
+	    OID_AUTO, "exit_per_nonhpb", CTLFLAG_RW,
+	    &rack_atexit_prtt, 130,
+	    "What percentage above goodput do we clamp CA/SS to at exit on a non high-BDP path 130%");
+	SYSCTL_ADD_U16(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),
+	    OID_AUTO, "gp_per_mul", CTLFLAG_RW,
+	    &rack_per_of_gp_probertt, 60,
+	    "What percentage of goodput do we pace at in probertt");
+	SYSCTL_ADD_U16(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),
+	    OID_AUTO, "gp_per_reduce", CTLFLAG_RW,
+	    &rack_per_of_gp_probertt_reduce, 10,
+	    "What percentage of goodput do we reduce every gp_srtt");
+	SYSCTL_ADD_U16(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),
+	    OID_AUTO, "gp_per_low", CTLFLAG_RW,
+	    &rack_per_of_gp_lowthresh, 40,
+	    "What percentage of goodput do we allow the multiplier to fall to");
+	SYSCTL_ADD_U32(&rack_sysctl_ctx,
+	    SYSCTL_CHILDREN(rack_probertt),

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


