Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 24 Apr 2020 21:30:31 +0000 (UTC)
From:      Michael Tuexen <tuexen@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r360282 - stable/11/sys/netinet
Message-ID:  <202004242130.03OLUVoS091937@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: tuexen
Date: Fri Apr 24 21:30:31 2020
New Revision: 360282
URL: https://svnweb.freebsd.org/changeset/base/360282

Log:
  Improve the TCP SACK generation by reporting DSACKs
  
  MFC r347382:
  Receiver side DSACK implemenation.
  This adds initial support for RFC 2883.
  
  MFC r347407:
  Don't use C++ style comments.
  
  MFC r349987:
  This is the second in a number of patches needed to
  get BBRv1 into the tree. This fixes the DSACK bug but
  is also needed by BBR. We have yet to go two more
  one will be for the pacing code (tcp_ratelimit.c) and
  the second will be for the new updated LRO code that
  allows a transport to know the arrival times of packets
  and (tcp_lro.c). After that we should finally be able
  to get BBRv1 into head.
  
  MFC r351725:
  This patch improves the DSACK handling to conform with RFC 2883.
  The lowest SACK block is used when multiple Blocks would be elegible as
  DSACK blocks ACK blocks get reordered - while maintaining the ordering of
  SACK blocks not relevant in the DSACK context is maintained.
  
  MFC r351801:
  Fix the SACK block generation in the base TCP stack by bringing it in
  sync with the RACK stack.
  
  MFC r352072:
  Only update SACK/DSACK lists when a non-empty segment was received.
  This fixes hitting a KASSERT with a valid packet exchange.
  
  MFC r352386
  Don't write to memory outside of the allocated array for SACK blocks.
  
  MFC r356796:
  Remove debug code not needed anymore.
  
  MFC r357100:
  The server side of TCP fast open relies on the delayed ACK timer to allow
  including user data in the SYN-ACK. When DSACK support was added in
  r347382, an immediate ACK was sent even for the received SYN with
  user data. This patch fixes that and allows again to send user data with
  the SYN-ACK.
  
  This is joint work of rrs, rscheff, and tuexen.
  
  Differential Revision:	https://reviews.freebsd.org/D19334
  Differential Revision:  https://reviews.freebsd.org/D20908
  Differential Revision:  https://reviews.freebsd.org/D21038
  Differential Revision:  https://reviews.freebsd.org/D21513
  Differential Revision:  https://reviews.freebsd.org/D21567
  Differential Revision:  https://reviews.freebsd.org/D23208
  Differential Revision:  https://reviews.freebsd.org/D23212

Modified:
  stable/11/sys/netinet/tcp_input.c
  stable/11/sys/netinet/tcp_output.c
  stable/11/sys/netinet/tcp_sack.c
  stable/11/sys/netinet/tcp_var.h
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/netinet/tcp_input.c
==============================================================================
--- stable/11/sys/netinet/tcp_input.c	Fri Apr 24 21:21:49 2020	(r360281)
+++ stable/11/sys/netinet/tcp_input.c	Fri Apr 24 21:30:31 2020	(r360282)
@@ -1539,7 +1539,6 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, stru
 	} else {
 		tp->rfbuf_cnt += tlen;	/* add up */
 	}
-
 	return (newsize);
 }
 
@@ -2307,6 +2306,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 			TCPSTAT_INC(tcps_rcvpartduppack);
 			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 		}
+		/*
+		 * DSACK - add SACK block for dropped range
+		 */
+		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
+			tcp_update_sack_list(tp, th->th_seq,
+			    th->th_seq + todrop);
+			/*
+			 * ACK now, as the next in-sequence segment
+			 * will clear the DSACK block again
+			 */
+			tp->t_flags |= TF_ACKNOW;
+		}
 		drop_hdrlen += todrop;	/* drop from the top afterwards */
 		th->th_seq += todrop;
 		tlen -= todrop;
@@ -3035,6 +3046,8 @@ dodata:							/* XXX */
 	if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
 	    TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 		tcp_seq save_start = th->th_seq;
+		tcp_seq save_rnxt  = tp->rcv_nxt;
+		int     save_tlen  = tlen;
 		m_adj(m, drop_hdrlen);	/* delayed header drop */
 		/*
 		 * Insert segment which includes th into TCP reassembly queue
@@ -3074,11 +3087,41 @@ dodata:							/* XXX */
 			 * m_adj() doesn't actually frees any mbufs
 			 * when trimming from the head.
 			 */
-			thflags = tcp_reass(tp, th, &save_start, &tlen, m);
+			tcp_seq temp = save_start;
+			thflags = tcp_reass(tp, th, &temp, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
-		if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
-			tcp_update_sack_list(tp, save_start, save_start + tlen);
+		if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0)) {
+			if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) {
+				/*
+				 * DSACK actually handled in the fastpath
+				 * above.
+				 */
+				tcp_update_sack_list(tp, save_start,
+				    save_start + save_tlen);
+			} else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) {
+				if ((tp->rcv_numsacks >= 1) &&
+				    (tp->sackblks[0].end == save_start)) {
+					/*
+					 * Partial overlap, recorded at todrop
+					 * above.
+					 */
+					tcp_update_sack_list(tp,
+					    tp->sackblks[0].start,
+					    tp->sackblks[0].end);
+				} else {
+					tcp_update_dsack_list(tp, save_start,
+					    save_start + save_tlen);
+				}
+			} else if (tlen >= save_tlen) {
+				/* Update of sackblks. */
+				tcp_update_dsack_list(tp, save_start,
+				    save_start + save_tlen);
+			} else if (tlen > 0) {
+				tcp_update_dsack_list(tp, save_start,
+				    save_start + tlen);
+			}
+		}
 #if 0
 		/*
 		 * Note the amount of data that peer has sent into

Modified: stable/11/sys/netinet/tcp_output.c
==============================================================================
--- stable/11/sys/netinet/tcp_output.c	Fri Apr 24 21:21:49 2020	(r360281)
+++ stable/11/sys/netinet/tcp_output.c	Fri Apr 24 21:30:31 2020	(r360282)
@@ -1548,7 +1548,13 @@ timer:
 		if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max))
 			tp->snd_max = tp->snd_nxt + xlen;
 	}
-
+	if ((error == 0) &&
+	    (TCPS_HAVEESTABLISHED(tp->t_state) &&
+	     (tp->t_flags & TF_SACK_PERMIT) &&
+	     tp->rcv_numsacks > 0)) {
+		    /* Clean up any DSACK's sent */
+		    tcp_clean_dsack_blocks(tp);
+	}
 	if (error) {
 
 		/*

Modified: stable/11/sys/netinet/tcp_sack.c
==============================================================================
--- stable/11/sys/netinet/tcp_sack.c	Fri Apr 24 21:21:49 2020	(r360281)
+++ stable/11/sys/netinet/tcp_sack.c	Fri Apr 24 21:30:31 2020	(r360282)
@@ -150,7 +150,104 @@ SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, 
     &VNET_NAME(tcp_sack_globalholes), 0,
     "Global number of TCP SACK holes currently allocated");
 
+
 /*
+ * This function will find overlaps with the currently stored sackblocks
+ * and add any overlap as a dsack block upfront
+ */
+void
+tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
+{
+	struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS];
+	int i, j, n, identical;
+	tcp_seq start, end;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+
+	if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
+	    ((rcv_end == tp->rcv_nxt) &&
+	     (tp->rcv_numsacks > 0 ) &&
+	     (tp->sackblks[0].end == tp->rcv_nxt))) {
+		saved_blks[0].start = rcv_start;
+		saved_blks[0].end = rcv_end;
+	} else {
+		saved_blks[0].start = saved_blks[0].end = 0;
+	}
+
+	head_blk.start = head_blk.end = 0;
+	mid_blk.start = rcv_start;
+	mid_blk.end = rcv_end;
+	identical = 0;
+
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		start = tp->sackblks[i].start;
+		end = tp->sackblks[i].end;
+		if (SEQ_LT(rcv_end, start)) {
+			/* pkt left to sack blk */
+			continue;
+		}
+		if (SEQ_GT(rcv_start, end)) {
+			/* pkt right to sack blk */
+			continue;
+		}
+		if (SEQ_GT(tp->rcv_nxt, end)) {
+			if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
+			    (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
+			    (head_blk.start == head_blk.end))) {
+				head_blk.start = SEQ_MAX(rcv_start, start);
+				head_blk.end = SEQ_MIN(rcv_end, end);
+			}
+			continue;
+		}
+		if (((head_blk.start == head_blk.end) ||
+		     SEQ_LT(start, head_blk.start)) &&
+		     (SEQ_GT(end, rcv_start) &&
+		      SEQ_LEQ(start, rcv_end))) {
+			head_blk.start = start;
+			head_blk.end = end;
+		}
+		mid_blk.start = SEQ_MIN(mid_blk.start, start);
+		mid_blk.end = SEQ_MAX(mid_blk.end, end);
+		if ((mid_blk.start == start) &&
+		    (mid_blk.end == end))
+			identical = 1;
+	}
+	if (SEQ_LT(head_blk.start, head_blk.end)) {
+		/* store overlapping range */
+		saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
+		saved_blks[0].end   = SEQ_MIN(rcv_end, head_blk.end);
+	}
+	n = 1;
+	/*
+	 * Second, if not ACKed, store the SACK block that
+	 * overlaps with the DSACK block unless it is identical
+	 */
+	if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
+	    !((mid_blk.start == saved_blks[0].start) &&
+	    (mid_blk.end == saved_blks[0].end))) ||
+	    identical == 1) {
+		saved_blks[n].start = mid_blk.start;
+		saved_blks[n++].end = mid_blk.end;
+	}
+	for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) {
+		if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
+		      SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
+		    (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
+		saved_blks[n++] = tp->sackblks[j];
+	}
+	j = 0;
+	for (i = 0; i < n; i++) {
+		/* we can end up with a stale inital entry */
+		if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
+			tp->sackblks[j++] = saved_blks[i];
+		}
+	}
+	tp->rcv_numsacks = j;
+}
+
+/*
  * This function is called upon receipt of new valid data (while not in
  * header prediction mode), and it updates the ordered list of sacks.
  */
@@ -169,11 +266,18 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	/* Check arguments. */
-	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));
+	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));
 
-	/* SACK block for the received segment. */
-	head_blk.start = rcv_start;
-	head_blk.end = rcv_end;
+	if ((rcv_start == rcv_end) &&
+	    (tp->rcv_numsacks >= 1) &&
+	    (rcv_end == tp->sackblks[0].end)) {
+		/* retaining DSACK block below rcv_nxt (todrop) */
+		head_blk = tp->sackblks[0];
+	} else {
+		/* SACK block for the received segment. */
+		head_blk.start = rcv_start;
+		head_blk.end = rcv_end;
+	}
 
 	/*
 	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
@@ -194,12 +298,54 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 			 * Merge this SACK block into head_blk.  This SACK
 			 * block itself will be discarded.
 			 */
-			if (SEQ_GT(head_blk.start, start))
+			/*
+			 * |-|
+			 *   |---|  merge
+			 *
+			 *     |-|
+			 * |---|    merge
+			 *
+			 * |-----|
+			 *   |-|    DSACK smaller
+			 *
+			 *   |-|
+			 * |-----|  DSACK smaller
+			 */
+			if (head_blk.start == end)
 				head_blk.start = start;
-			if (SEQ_LT(head_blk.end, end))
+			else if (head_blk.end == start)
 				head_blk.end = end;
+			else {
+				if (SEQ_LT(head_blk.start, start)) {
+					tcp_seq temp = start;
+					start = head_blk.start;
+					head_blk.start = temp;
+				}
+				if (SEQ_GT(head_blk.end, end)) {
+					tcp_seq temp = end;
+					end = head_blk.end;
+					head_blk.end = temp;
+				}
+				if ((head_blk.start != start) ||
+				    (head_blk.end != end)) {
+					if ((num_saved >= 1) &&
+					   SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
+					   SEQ_LEQ(saved_blks[num_saved-1].end, end))
+						num_saved--;
+					saved_blks[num_saved].start = start;
+					saved_blks[num_saved].end = end;
+					num_saved++;
+				}
+			}
 		} else {
 			/*
+			 * This block supercedes the prior block
+			 */
+			if ((num_saved >= 1) &&
+			   SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
+			   SEQ_LEQ(saved_blks[num_saved-1].end, end))
+				num_saved--;
+			/*
 			 * Save this SACK block.
 			 */
 			saved_blks[num_saved].start = start;
@@ -212,7 +358,7 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 	 * Update SACK list in tp->sackblks[].
 	 */
 	num_head = 0;
-	if (SEQ_GT(head_blk.start, tp->rcv_nxt)) {
+	if (SEQ_LT(rcv_start, rcv_end)) {
 		/*
 		 * The received data segment is an out-of-order segment.  Put
 		 * head_blk at the top of SACK list.
@@ -226,6 +372,10 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 		if (num_saved >= MAX_SACK_BLKS)
 			num_saved--;
 	}
+	if ((rcv_start == rcv_end) &&
+	    (rcv_start == tp->sackblks[0].end)) {
+		num_head = 1;
+	}
 	if (num_saved > 0) {
 		/*
 		 * Copy the saved SACK blocks back.
@@ -236,6 +386,45 @@ tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_sta
 
 	/* Save the number of SACK blocks. */
 	tp->rcv_numsacks = num_head + num_saved;
+}
+
+void
+tcp_clean_dsack_blocks(struct tcpcb *tp)
+{
+	struct sackblk saved_blks[MAX_SACK_BLKS];
+	int num_saved, i;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	/*
+	 * Clean up any DSACK blocks that
+	 * are in our queue of sack blocks.
+	 * 
+	 */
+	num_saved = 0;
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		tcp_seq start = tp->sackblks[i].start;
+		tcp_seq end = tp->sackblks[i].end;
+		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
+			/*
+			 * Discard this D-SACK block.
+			 */
+			continue;
+		}
+		/*
+		 * Save this SACK block.
+		 */
+		saved_blks[num_saved].start = start;
+		saved_blks[num_saved].end = end;
+		num_saved++;
+	}
+	if (num_saved > 0) {
+		/*
+		 * Copy the saved SACK blocks back.
+		 */
+		bcopy(saved_blks, &tp->sackblks[0],
+		      sizeof(struct sackblk) * num_saved);
+	}
+	tp->rcv_numsacks = num_saved;
 }
 
 /*

Modified: stable/11/sys/netinet/tcp_var.h
==============================================================================
--- stable/11/sys/netinet/tcp_var.h	Fri Apr 24 21:21:49 2020	(r360281)
+++ stable/11/sys/netinet/tcp_var.h	Fri Apr 24 21:30:31 2020	(r360282)
@@ -831,7 +831,9 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *);
 tcp_seq	 tcp_new_isn(struct in_conninfo *);
 
 int	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
+void	 tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
+void	 tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202004242130.03OLUVoS091937>