Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 7 Sep 2019 19:58:06 +0000 (UTC)
From:      Michael Tuexen <tuexen@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject:   svn commit: r352022 - in stable/12/sys/netinet: . tcp_stacks
Message-ID:  <201909071958.x87Jw6J9041161@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: tuexen
Date: Sat Sep  7 19:58:06 2019
New Revision: 352022
URL: https://svnweb.freebsd.org/changeset/base/352022

Log:
  Improve DSACK support:
  
  MFC This is the second in a number of patches needed to
  get BBRv1 into the tree. This fixes the DSACK bug but
  is also needed by BBR. We have yet to go two more
  one will be for the pacing code (tcp_ratelimit.c) and
  the second will be for the new updated LRO code that
  allows a transport to know the arrival times of packets
  and (tcp_lro.c). After that we should finally be able
  Improve DSACK support.
  
  MFC r349987 from rrs@:
  
  This is the second in a number of patches needed to
  get BBRv1 into the tree. This fixes the DSACK bug but
  is also needed by BBR. We have yet to go two more
  one will be for the pacing code (tcp_ratelimit.c) and
  the second will be for the new updated LRO code that
  allows a transport to know the arrival times of packets
  and (tcp_lro.c). After that we should finally be able
  to get BBRv1 into head.
  
  This required manual tweaking to address merge conflicts.
  
  MFC r351725:
  
  This patch improves the DSACK handling to conform with RFC 2883.
  The lowest SACK block is used when multiple Blocks would be elegible as
  DSACK blocks ACK blocks get reordered - while maintaining the ordering of
  SACK blocks not relevant in the DSACK context is maintained.
  
  This required manual tweaking to address merge conflicts.
  
  MFC r351801:
  
  Fix the SACK block generation in the base TCP stack by bringing it in
  sync with the RACK stack.

Modified:
  stable/12/sys/netinet/tcp_input.c
  stable/12/sys/netinet/tcp_output.c
  stable/12/sys/netinet/tcp_sack.c
  stable/12/sys/netinet/tcp_stacks/rack.c
  stable/12/sys/netinet/tcp_var.h
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/netinet/tcp_input.c
==============================================================================
--- stable/12/sys/netinet/tcp_input.c	Sat Sep  7 19:25:45 2019	(r352021)
+++ stable/12/sys/netinet/tcp_input.c	Sat Sep  7 19:58:06 2019	(r352022)
@@ -1506,7 +1506,6 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, stru
 	} else {
 		tp->rfbuf_cnt += tlen;	/* add up */
 	}
-
 	return (newsize);
 }
 
@@ -2285,7 +2284,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 		 * DSACK - add SACK block for dropped range
 		 */
 		if (tp->t_flags & TF_SACK_PERMIT) {
-			tcp_update_sack_list(tp, th->th_seq, th->th_seq+tlen);
+			tcp_update_sack_list(tp, th->th_seq,
+			    th->th_seq + todrop);
 			/*
 			 * ACK now, as the next in-sequence segment
 			 * will clear the DSACK block again
@@ -3072,21 +3072,29 @@ dodata:							/* XXX */
 				 * DSACK actually handled in the fastpath
 				 * above.
 				 */
-				tcp_update_sack_list(tp, save_start, save_start + save_tlen);
-			} else
-			if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
+				tcp_update_sack_list(tp, save_start,
+				    save_start + save_tlen);
+			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 				/*
 				 * Cleaning sackblks by using zero length
 				 * update.
 				 */
-				tcp_update_sack_list(tp, save_start, save_start);
-			} else
-			if ((tlen > 0) && (tlen >= save_tlen)) {
+				if ((tp->rcv_numsacks >= 1) &&
+				    (tp->sackblks[0].end == save_start)) {
+					/* partial overlap, recorded at todrop above */
+					tcp_update_sack_list(tp, tp->sackblks[0].start,
+					    tp->sackblks[0].end);
+				} else {
+					tcp_update_dsack_list(tp, save_start,
+					    save_start + save_tlen);
+				}
+			} else if ((tlen > 0) && (tlen >= save_tlen)) {
 				/* Update of sackblks. */
-				tcp_update_sack_list(tp, save_start, save_start + save_tlen);
-			} else
-			if (tlen > 0) {
-				tcp_update_sack_list(tp, save_start, save_start+tlen);
+				tcp_update_dsack_list(tp, save_start,
+				    save_start + save_tlen);
+			} else if (tlen > 0) {
+				tcp_update_dsack_list(tp, save_start,
+				    save_start + tlen);
 			}
 		}
 #if 0

Modified: stable/12/sys/netinet/tcp_output.c
==============================================================================
--- stable/12/sys/netinet/tcp_output.c	Sat Sep  7 19:25:45 2019	(r352021)
+++ stable/12/sys/netinet/tcp_output.c	Sat Sep  7 19:58:06 2019	(r352022)
@@ -1507,7 +1507,13 @@ timer:
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + xlen;
 	}
-
+	if ((error == 0) &&
+	    (TCPS_HAVEESTABLISHED(tp->t_state) &&
+	     (tp->t_flags & TF_SACK_PERMIT) &&
+	     tp->rcv_numsacks > 0)) {
+		    /* Clean up any DSACK's sent */
+		    tcp_clean_dsack_blocks(tp);
+	}
 	if (error) {
 		/* Record the error. */
 		TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT,

Modified: stable/12/sys/netinet/tcp_sack.c
==============================================================================
--- stable/12/sys/netinet/tcp_sack.c	Sat Sep  7 19:25:45 2019	(r352021)
+++ stable/12/sys/netinet/tcp_sack.c	Sat Sep  7 19:58:06 2019	(r352022)
@@ -149,7 +149,109 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, 
     &VNET_NAME(tcp_sack_globalholes), 0,
     "Global number of TCP SACK holes currently allocated");
 
+
 /*
+ * This function will find overlaps with the currently stored sackblocks
+ * and add any overlap as a dsack block upfront
+ */
+void
+tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
+{
+	struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS];
+	int i, j, n, identical;
+	tcp_seq start, end;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+
+	if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) {
+		log(LOG_DEBUG, "\nDSACK update: %d..%d, rcv_nxt: %u\n",
+		rcv_start, rcv_end, tp->rcv_nxt);
+	}
+
+	if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
+	    ((rcv_end == tp->rcv_nxt) &&
+	     (tp->rcv_numsacks > 0 ) &&
+	     (tp->sackblks[0].end == tp->rcv_nxt))) {
+		saved_blks[0].start = rcv_start;
+		saved_blks[0].end = rcv_end;
+	} else {
+		saved_blks[0].start = saved_blks[0].end = 0;
+	}
+
+	head_blk.start = head_blk.end = 0;
+	mid_blk.start = rcv_start;
+	mid_blk.end = rcv_end;
+	identical = 0;
+
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		start = tp->sackblks[i].start;
+		end = tp->sackblks[i].end;
+		if (SEQ_LT(rcv_end, start)) {
+			/* pkt left to sack blk */
+			continue;
+		}
+		if (SEQ_GT(rcv_start, end)) {
+			/* pkt right to sack blk */
+			continue;
+		}
+		if (SEQ_GT(tp->rcv_nxt, end)) {
+			if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
+			    (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
+			    (head_blk.start == head_blk.end))) {
+				head_blk.start = SEQ_MAX(rcv_start, start);
+				head_blk.end = SEQ_MIN(rcv_end, end);
+			}
+			continue;
+		}
+		if (((head_blk.start == head_blk.end) ||
+		     SEQ_LT(start, head_blk.start)) &&
+		     (SEQ_GT(end, rcv_start) &&
+		      SEQ_LEQ(start, rcv_end))) {
+			head_blk.start = start;
+			head_blk.end = end;
+		}
+		mid_blk.start = SEQ_MIN(mid_blk.start, start);
+		mid_blk.end = SEQ_MAX(mid_blk.end, end);
+		if ((mid_blk.start == start) &&
+		    (mid_blk.end == end))
+			identical = 1;
+	}
+	if (SEQ_LT(head_blk.start, head_blk.end)) {
+		/* store overlapping range */
+		saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
+		saved_blks[0].end   = SEQ_MIN(rcv_end, head_blk.end);
+	}
+	n = 1;
+	/*
+	 * Second, if not ACKed, store the SACK block that
+	 * overlaps with the DSACK block unless it is identical
+	 */
+	if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
+	    !((mid_blk.start == saved_blks[0].start) &&
+	    (mid_blk.end == saved_blks[0].end))) ||
+	    identical == 1) {
+		saved_blks[n].start = mid_blk.start;
+		saved_blks[n++].end = mid_blk.end;
+	}
+	for (j = 0; (j < tp->rcv_numsacks) && (j < MAX_SACK_BLKS-1); j++) {
+		if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
+		      SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
+		    (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
+		saved_blks[n++] = tp->sackblks[j];
+	}
+	j = 0;
+	for (i = 0; i < n; i++) {
+		/* we can end up with a stale inital entry */
+		if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
+			tp->sackblks[j++] = saved_blks[i];
+		}
+	}
+	tp->rcv_numsacks = j;
+}
+
+/*
  * This function is called upon receipt of new valid data (while not in
  * header prediction mode), and it updates the ordered list of sacks.
  */
@@ -170,9 +272,16 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 	/* Check arguments. */
 	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
 
-	/* SACK block for the received segment. */
-	head_blk.start = rcv_start;
-	head_blk.end = rcv_end;
+	if ((rcv_start == rcv_end) &&
+	    (tp->rcv_numsacks >= 1) &&
+	    (rcv_end == tp->sackblks[0].end)) {
+		/* retaining DSACK block below rcv_nxt (todrop) */
+		head_blk = tp->sackblks[0];
+	} else {
+		/* SACK block for the received segment. */
+		head_blk.start = rcv_start;
+		head_blk.end = rcv_end;
+	}
 
 	/*
 	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
@@ -267,6 +376,10 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 		if (num_saved >= MAX_SACK_BLKS)
 			num_saved--;
 	}
+	if ((rcv_start == rcv_end) &&
+	    (rcv_start == tp->sackblks[0].end)) {
+		num_head = 1;
+	}
 	if (num_saved > 0) {
 		/*
 		 * Copy the saved SACK blocks back.
@@ -277,6 +390,45 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 
 	/* Save the number of SACK blocks. */
 	tp->rcv_numsacks = num_head + num_saved;
+}
+
+void
+tcp_clean_dsack_blocks(struct tcpcb *tp)
+{
+	struct sackblk saved_blks[MAX_SACK_BLKS];
+	int num_saved, i;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	/*
+	 * Clean up any DSACK blocks that
+	 * are in our queue of sack blocks.
+	 * 
+	 */
+	num_saved = 0;
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		tcp_seq start = tp->sackblks[i].start;
+		tcp_seq end = tp->sackblks[i].end;
+		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
+			/*
+			 * Discard this D-SACK block.
+			 */
+			continue;
+		}
+		/*
+		 * Save this SACK block.
+		 */
+		saved_blks[num_saved].start = start;
+		saved_blks[num_saved].end = end;
+		num_saved++;
+	}
+	if (num_saved > 0) {
+		/*
+		 * Copy the saved SACK blocks back.
+		 */
+		bcopy(saved_blks, &tp->sackblks[0],
+		      sizeof(struct sackblk) * num_saved);
+	}
+	tp->rcv_numsacks = num_saved;
 }
 
 /*

Modified: stable/12/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- stable/12/sys/netinet/tcp_stacks/rack.c	Sat Sep  7 19:25:45 2019	(r352021)
+++ stable/12/sys/netinet/tcp_stacks/rack.c	Sat Sep  7 19:58:06 2019	(r352022)
@@ -1794,7 +1794,8 @@ rack_drop_checks(struct tcpopt *to, struct mbuf *m, st
 		 * DSACK - add SACK block for dropped range
 		 */
 		if (tp->t_flags & TF_SACK_PERMIT) {
-			tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
+			tcp_update_sack_list(tp, th->th_seq,
+			    th->th_seq + todrop);
 			/*
 			 * ACK now, as the next in-sequence segment
 			 * will clear the DSACK block again
@@ -4853,7 +4854,8 @@ dodata:				/* XXX */
 		    (TCPS_HAVEESTABLISHED(tp->t_state) ||
 		    tfo_syn)) {
 			if (DELAY_ACK(tp, tlen) || tfo_syn) {
-				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+				rack_timer_cancel(tp, rack,
+				    rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				rack->r_wanted_output++;
@@ -4887,18 +4889,29 @@ dodata:				/* XXX */
 			 * DSACK actually handled in the fastpath
 			 * above.
 			 */
-			tcp_update_sack_list(tp, save_start, save_start + save_tlen);
+			tcp_update_sack_list(tp, save_start,
+			    save_start + save_tlen);
 		} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
 			/*
 			 * Cleaning sackblks by using zero length
 			 * update.
 			 */
-			tcp_update_sack_list(tp, save_start, save_start);
+			if ((tp->rcv_numsacks >= 1) &&
+			    (tp->sackblks[0].end == save_start)) {
+				/* partial overlap, recorded at todrop above */
+				tcp_update_sack_list(tp, tp->sackblks[0].start,
+				    tp->sackblks[0].end);
+			} else {
+				tcp_update_dsack_list(tp, save_start,
+				    save_start + save_tlen);
+			}
 		} else if ((tlen > 0) && (tlen >= save_tlen)) {
 			/* Update of sackblks. */
-			tcp_update_sack_list(tp, save_start, save_start + save_tlen);
+			tcp_update_dsack_list(tp, save_start,
+			    save_start + save_tlen);
 		} else if (tlen > 0) {
-			tcp_update_sack_list(tp, save_start, save_start+tlen);
+			tcp_update_dsack_list(tp, save_start,
+			    save_start + tlen);
 		}
 	} else {
 		m_freem(m);
@@ -4920,7 +4933,8 @@ dodata:				/* XXX */
 			 * now.
 			 */
 			if (tp->t_flags & TF_NEEDSYN) {
-				rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+				rack_timer_cancel(tp, rack,
+				    rack->r_ctl.rc_rcvtime, __LINE__);
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
@@ -4937,7 +4951,8 @@ dodata:				/* XXX */
 			tp->t_starttime = ticks;
 			/* FALLTHROUGH */
 		case TCPS_ESTABLISHED:
-			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+			rack_timer_cancel(tp, rack,
+			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSE_WAIT);
 			break;
 
@@ -4946,7 +4961,8 @@ dodata:				/* XXX */
 			 * acked so enter the CLOSING state.
 			 */
 		case TCPS_FIN_WAIT_1:
-			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+			rack_timer_cancel(tp, rack,
+			    rack->r_ctl.rc_rcvtime, __LINE__);
 			tcp_state_change(tp, TCPS_CLOSING);
 			break;
 
@@ -4956,7 +4972,8 @@ dodata:				/* XXX */
 			 * other standard timers.
 			 */
 		case TCPS_FIN_WAIT_2:
-			rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__);
+			rack_timer_cancel(tp, rack,
+			    rack->r_ctl.rc_rcvtime, __LINE__);
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			tcp_twstart(tp);
 			return (1);
@@ -4965,7 +4982,8 @@ dodata:				/* XXX */
 	/*
 	 * Return any desired output.
 	 */
-	if ((tp->t_flags & TF_ACKNOW) || (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
+	if ((tp->t_flags & TF_ACKNOW) ||
+	    (sbavail(&so->so_snd) > (tp->snd_max - tp->snd_una))) {
 		rack->r_wanted_output++;
 	}
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -5038,6 +5056,7 @@ rack_do_fastnewdata(struct mbuf *m, struct tcphdr *th,
 	/* Clean receiver SACK report if present */
 	if (tp->rcv_numsacks)
 		tcp_clean_sackreport(tp);
+
 	TCPSTAT_INC(tcps_preddat);
 	tp->rcv_nxt += tlen;
 	/*
@@ -8571,6 +8590,10 @@ out:
 	 * retransmit.  In persist state, just set snd_max.
 	 */
 	if (error == 0) {
+		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+		    (tp->t_flags & TF_SACK_PERMIT) &&
+		    tp->rcv_numsacks > 0)
+		    tcp_clean_dsack_blocks(tp);
 		if (len == 0)
 			counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
 		else if (len == 1) {

Modified: stable/12/sys/netinet/tcp_var.h
==============================================================================
--- stable/12/sys/netinet/tcp_var.h	Sat Sep  7 19:25:45 2019	(r352021)
+++ stable/12/sys/netinet/tcp_var.h	Sat Sep  7 19:58:06 2019	(r352022)
@@ -935,7 +935,9 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *);
 tcp_seq	 tcp_new_isn(struct in_conninfo *);
 
 int	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+void	 tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
+void	 tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201909071958.x87Jw6J9041161>