Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 22 Aug 2001 17:15:18 -0700 (PDT)
From:      Dave Zarzycki <zarzycki@freebsd.org>
To:        <freebsd-net@freebsd.org>
Subject:   RFC: SACK/FACK patch port to Current
Message-ID:  <Pine.LNX.4.33.0108221713240.16888-200000@bonk.apple.com>

next in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Attached and tested.

I'd like to merge this in unless anybody objects.

davez

-- 
Dave Zarzycki
Darwin & Mac OS X
Apple Computer, Inc.

[-- Attachment #2 --]
Index: tcp.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp.h,v
retrieving revision 1.16
diff -u -d -b -w -u -d -r1.16 tcp.h
--- tcp.h	2001/01/09 18:26:17	1.16
+++ tcp.h	2001/08/22 23:11:21
@@ -85,12 +85,22 @@
 #define TCPOPT_SACK_PERMITTED	4		/* Experimental */
 #define    TCPOLEN_SACK_PERMITTED	2
 #define TCPOPT_SACK		5		/* Experimental */
+#define    TCPOLEN_SACK		8	/* 2*sizeof(tcp_seq): length of one SACK block */
 #define TCPOPT_TIMESTAMP	8
 #define    TCPOLEN_TIMESTAMP		10
 #define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
 #define    TCPOPT_TSTAMP_HDR		\
     (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
 
+#define TCPOPT_SACK_PERMIT_HDR  \
+ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR          (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+
+/* Miscellaneous constants */
+#define MAX_SACK_BLKS	6	/* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK	3	/* MAX # SACKs sent in any segment */
+
+
 #define	TCPOPT_CC		11		/* CC options: RFC-1644 */
 #define TCPOPT_CCNEW		12
 #define TCPOPT_CCECHO		13
@@ -133,5 +143,6 @@
 #define	TCP_MAXSEG	0x02	/* set maximum segment size */
 #define TCP_NOPUSH	0x04	/* don't push last block of write */
 #define TCP_NOOPT	0x08	/* don't use TCP options */
+#define	TCP_SACK_DISABLE 0x300  /* disable SACKs (if enabled by def.) */
 
 #endif
Index: tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.137
diff -u -d -b -w -u -d -r1.137 tcp_input.c
--- tcp_input.c	2001/08/22 00:58:16	1.137
+++ tcp_input.c	2001/08/22 23:11:21
@@ -100,7 +100,7 @@
 
 MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
 
-static int	tcprexmtthresh = 3;
+int	tcprexmtthresh = 3;
 tcp_cc	tcp_ccgen;
 
 struct	tcpstat tcpstat;
@@ -870,6 +870,10 @@
 	tp->t_rcvtime = ticks;
 	if (TCPS_HAVEESTABLISHED(tp->t_state))
 		callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+#ifdef TCP_SACK
+	if (!tp->sack_disable)
+		tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
+#endif /* TCP_SACK */
 
 	/*
 	 * Process options if not in LISTEN state,
@@ -878,6 +882,12 @@
 	if (tp->t_state != TCPS_LISTEN)
 		tcp_dooptions(tp, optp, optlen, th, &to);
 
+#ifdef TCP_SACK
+	if (!tp->sack_disable) {
+		tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+		tp->rcv_lastend = th->th_seq + tlen;
+	}
+#endif /* TCP_SACK */
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
@@ -954,6 +964,19 @@
 				tcpstat.tcps_rcvackbyte += acked;
 				sbdrop(&so->so_snd, acked);
 				tp->snd_una = th->th_ack;
+
+#if defined(TCP_SACK)
+				/* 
+				 * We want snd_last to track snd_una so
+				 * as to avoid sequence wraparound problems
+				 * for very large transfers.
+				 */
+				tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				tp->snd_fack = tp->snd_una;
+				tp->retran_data = 0;
+#endif /* TCP_FACK */
 				m_freem(m);
 				ND6_HINT(tp); /* some progress has been done */
 
@@ -986,6 +1009,12 @@
 			 * with nothing on the reassembly queue and
 			 * we have enough buffer space to take it.
 			 */
+
+#ifdef TCP_SACK
+			/* Clean receiver SACK report if present */
+			if (!tp->sack_disable && tp->rcv_numsacks)
+				tcp_clean_sackreport(tp);
+#endif /* TCP_SACK */
 			++tcpstat.tcps_preddat;
 			tp->rcv_nxt += tlen;
 			tcpstat.tcps_rcvpack++;
@@ -1131,6 +1160,17 @@
 			bzero(taop, sizeof(*taop));
 		}
 		tcp_dooptions(tp, optp, optlen, th, &to);
+
+#ifdef TCP_SACK
+		/*
+		 * If peer did not send a SACK_PERMITTED option (i.e., if
+		 * tcp_dooptions() did not set TF_SACK_PERMIT), set 
+                 * sack_disable to 1 if it is currently 0.
+                 */
+                if (!tp->sack_disable)
+                        if ((tp->t_flags & TF_SACK_PERMIT) == 0) 
+                                tp->sack_disable = 1;
+#endif
 		if (iss)
 			tp->iss = iss;
 		else {
@@ -1138,6 +1178,14 @@
  		}
 		tp->irs = th->th_seq;
 		tcp_sendseqinit(tp);
+#if defined (TCP_SACK)
+		tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+		tp->snd_fack = tp->snd_una;
+		tp->retran_data = 0;
+		tp->snd_awnd = 0;
+#endif /* TCP_FACK */
 		tcp_rcvseqinit(tp);
 		tp->snd_recover = tp->snd_una;
 		/*
@@ -1309,6 +1357,17 @@
 				}
 			} else
 				tp->t_flags &= ~TF_RCVD_CC;
+#ifdef TCP_SACK
+                /*
+                 * If we've sent a SACK_PERMITTED option, and the peer
+                 * also replied with one, then TF_SACK_PERMIT should have
+                 * been set in tcp_dooptions().  If it was not, disable SACKs.
+                 */
+                if (!tp->sack_disable)
+                        if ((tp->t_flags & TF_SACK_PERMIT) == 0) 
+                                tp->sack_disable = 1;
+#endif
+	
 			tcpstat.tcps_connects++;
 			soisconnected(so);
 			/* Do window scaling on this connection? */
@@ -1820,14 +1879,36 @@
 				 * to keep a constant cwnd packets in the
 				 * network.
 				 */
+
 				if (!callout_active(tp->tt_rexmt) ||
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				/* 
+				 * In FACK, can enter fast rec. if the receiver
+				 * reports a reass. queue longer than 3 segs.
+				 */
+				else if (++tp->t_dupacks == tcprexmtthresh ||
+					 ((SEQ_GT(tp->snd_fack, tcprexmtthresh * 
+						  tp->t_maxseg + tp->snd_una)) &&
+					  SEQ_GT(tp->snd_una, tp->snd_last))) {
+#else
 				else if (++tp->t_dupacks == tcprexmtthresh) {
+#endif /* TCP_FACK */
 					tcp_seq onxt = tp->snd_nxt;
 					u_int win =
 					    min(tp->snd_wnd, tp->snd_cwnd) / 2 /
 						tp->t_maxseg;
+#if defined(TCP_SACK)
+				  if (SEQ_LT(th->th_ack, tp->snd_last)){
+				    /* 
+				     * False fast retx after 
+				     * timeout.  Do not cut window.
+				     */
+				    tp->t_dupacks = 0;
+				    goto drop;
+				  }
+#else
 					if (tcp_do_newreno && SEQ_LT(th->th_ack,
 					    tp->snd_recover)) {
 						/* False retransmit, should not
@@ -1838,21 +1919,62 @@
 						(void) tcp_output(tp);
 						goto drop;
 					}
+#endif
 					if (win < 2)
 						win = 2;
 					tp->snd_ssthresh = win * tp->t_maxseg;
+#if defined(TCP_SACK)
+				  tp->snd_last = tp->snd_max;
+#else
 					tp->snd_recover = tp->snd_max;
+#endif
+#ifdef TCP_SACK
+				    if (!tp->sack_disable) {
 					callout_stop(tp->tt_rexmt);
 					tp->t_rtttime = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK) 
+				      tp->t_dupacks = tcprexmtthresh;
+				      (void) tcp_output(tp);
+				      /*
+				       * During FR, snd_cwnd is held
+				       * constant for FACK.
+				       */
+				      tp->snd_cwnd = tp->snd_ssthresh;
+#else
+				      /* 
+				       * tcp_output() will send
+				       * oldest SACK-eligible rtx.
+				       */
+				      (void) tcp_output(tp);
+				      tp->snd_cwnd = tp->snd_ssthresh+
+				      tp->t_maxseg * tp->t_dupacks;
+#endif /* TCP_FACK */
+				      goto drop;
+				    }
+#endif /* TCP_SACK */
+				  callout_stop(tp->tt_rexmt);
+				  tp->t_rtttime = 0;
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					(void) tcp_output(tp);
+				  
 					tp->snd_cwnd = tp->snd_ssthresh +
 					       tp->t_maxseg * tp->t_dupacks;
 					if (SEQ_GT(onxt, tp->snd_nxt))
 						tp->snd_nxt = onxt;
 					goto drop;
 				} else if (tp->t_dupacks > tcprexmtthresh) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				  /* 
+				   * while (awnd < cwnd) 
+				   *         sendsomething(); 
+				   */
+				  if (!tp->sack_disable) {
+				    if (tp->snd_awnd < tp->snd_cwnd)
+				      tcp_output(tp);
+				    goto drop;
+				  }
+#endif /* TCP_FACK */
 					tp->snd_cwnd += tp->t_maxseg;
 					(void) tcp_output(tp);
 					goto drop;
@@ -1861,10 +1983,57 @@
 				tp->t_dupacks = 0;
 			break;
 		}
+			
 		/*
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
+			 * If the congestion window was inflated to account
+			 * for the other side's cached packets, retract it.
 		 */
+#if defined(TCP_SACK)
+			if (!tp->sack_disable) {
+			  if (tp->t_dupacks >= tcprexmtthresh) {
+				/* Check for a partial ACK */
+			    if (tcp_sack_partialack(tp, th)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+			      /* Force call to tcp_output */
+			      if (tp->snd_awnd < tp->snd_cwnd) 
+				needoutput = 1;
+#else
+			      tp->snd_cwnd += tp->t_maxseg;
+			      needoutput = 1;
+#endif /* TCP_FACK */
+			    } else {
+			      /* Out of fast recovery */
+			      tp->snd_cwnd = tp->snd_ssthresh;
+			      if (tcp_seq_subtract(tp->snd_max, 
+						   th->th_ack) < tp->snd_ssthresh)
+				tp->snd_cwnd = 
+				  tcp_seq_subtract(tp->snd_max,
+					           th->th_ack);
+			      tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+			      if (SEQ_GT(th->th_ack, tp->snd_fack))
+				tp->snd_fack = th->th_ack;
+#endif /* TCP_FACK */
+			    }
+			  } 
+			} else {
+			  if (tp->t_dupacks >= tcprexmtthresh && 
+			      !tcp_newreno(tp, th)) {
+				/* Out of fast recovery */
+			    tp->snd_cwnd = tp->snd_ssthresh;
+			    if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
+				tp->snd_ssthresh)
+			      tp->snd_cwnd = 
+				tcp_seq_subtract(tp->snd_max,
+						 th->th_ack);
+			    tp->t_dupacks = 0;
+			  }
+			}
+			if (tp->t_dupacks < tcprexmtthresh)
+			  tp->t_dupacks = 0;
+#else /* else no TCP_SACK */
 		if (tcp_do_newreno == 0) {
                         if (tp->t_dupacks >= tcprexmtthresh &&
                                 tp->snd_cwnd > tp->snd_ssthresh)
@@ -1885,6 +2054,8 @@
                         	tp->snd_cwnd = tp->snd_ssthresh;
                         tp->t_dupacks = 0;
                 }
+#endif
+			
 		if (SEQ_GT(th->th_ack, tp->snd_max)) {
 			tcpstat.tcps_rcvacktoomuch++;
 			goto dropafterack;
@@ -1982,8 +2153,14 @@
 		 * in NewReno fast recovery mode, so we leave the congestion
 		 * window alone.
 		 */
+
+#if defined (TCP_SACK)
+		if (tp->t_dupacks < tcprexmtthresh)
+		  tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+#else	
 		if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
 			tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+#endif
 		}
 		if (acked > so->so_snd.sb_cc) {
 			tp->snd_wnd -= so->so_snd.sb_cc;
@@ -1998,6 +2175,16 @@
 		tp->snd_una = th->th_ack;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
+#if defined (TCP_SACK) && defined (TCP_FACK)
+		if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
+		  tp->snd_fack = tp->snd_una;
+		  /* Update snd_awnd for partial ACK
+		   * without any SACK blocks.
+		   */
+		  tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
+						  tp->snd_fack) + tp->retran_data;
+		}
+#endif
 
 		switch (tp->t_state) {
 
@@ -2196,7 +2383,10 @@
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
-
+#ifdef TCP_SACK
+		if (!tp->sack_disable)
+			tcp_update_sack_list(tp); 
+#endif 
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
@@ -2208,6 +2398,7 @@
 		thflags &= ~TH_FIN;
 	}
 
+
 	/*
 	 * If FIN is received ACK the FIN and let the user know
 	 * that the connection is closing.
@@ -2498,13 +2689,439 @@
 			    (char *)&to->to_ccecho, sizeof(to->to_ccecho));
 			NTOHL(to->to_ccecho);
 			break;
+#ifdef TCP_SACK 
+		case TCPOPT_SACK_PERMITTED:
+			if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
+				continue;
+			if (th->th_flags & TH_SYN)
+				/* MUST only be set on SYN */
+				tp->t_flags |= TF_SACK_PERMIT;
+			break;
+		case TCPOPT_SACK:
+			if (tcp_sack_option(tp, th, cp, optlen))
+				continue;
+			break;
+#endif /* TCP_SACK */
 		}
 	}
 	if (th->th_flags & TH_SYN)
 		tcp_mss(tp, mss);	/* sets t_maxseg */
 }
 
+#if defined(TCP_SACK)
+u_long 
+tcp_seq_subtract(a, b)
+	u_long a, b;
+{ 
+	return ((long)(a - b)); 
+}
+#endif
+
+
+
+#ifdef TCP_SACK 
+/*
+ * This function is called upon receipt of new valid data (while not in header
+ * prediction mode), and it updates the ordered list of sacks. 
+ */
+void 
+tcp_update_sack_list(tp)
+	struct tcpcb *tp; 
+{    
+	/* 
+	 * First reported block MUST be the most recent one.  Subsequent
+	 * blocks SHOULD be in the order in which they arrived at the
+	 * receiver.  These two conditions make the implementation fully
+	 * compliant with RFC 2018.
+	 */     
+	int i, j = 0, count = 0, lastpos = -1;
+	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+    
+	/* First clean up current list of sacks */
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		sack = tp->sackblks[i];
+		if (sack.start == 0 && sack.end == 0) {
+			count++; /* count = number of blocks to be discarded */
+			continue;
+		}
+		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+			count++;
+		} else { 
+			temp[j].start = tp->sackblks[i].start;
+			temp[j++].end = tp->sackblks[i].end;
+		}
+	}   
+	tp->rcv_numsacks -= count;
+	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+		tcp_clean_sackreport(tp);
+		if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
+			/* ==> need first sack block */
+			tp->sackblks[0].start = tp->rcv_laststart;
+			tp->sackblks[0].end = tp->rcv_lastend;
+			tp->rcv_numsacks = 1;
+		}
+		return;
+	}
+	/* Otherwise, sack blocks are already present. */
+	for (i = 0; i < tp->rcv_numsacks; i++)
+		tp->sackblks[i] = temp[i]; /* first copy back sack list */
+	if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend)) 
+		return;     /* sack list remains unchanged */
+	/* 
+	 * From here, segment just received should be (part of) the 1st sack.
+	 * Go through list, possibly coalescing sack block entries.
+	 */
+	firstsack.start = tp->rcv_laststart;
+	firstsack.end = tp->rcv_lastend;
+	for (i = 0; i < tp->rcv_numsacks; i++) {
+		sack = tp->sackblks[i];
+		if (SEQ_LT(sack.end, firstsack.start) ||
+		    SEQ_GT(sack.start, firstsack.end))
+			continue; /* no overlap */
+		if (sack.start == firstsack.start && sack.end == firstsack.end){
+			/* 
+			 * identical block; delete it here since we will
+			 * move it to the front of the list.
+			 */
+			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+			lastpos = i;    /* last posn with a zero entry */
+			continue;
+		}
+		if (SEQ_LEQ(sack.start, firstsack.start))
+			firstsack.start = sack.start; /* merge blocks */
+		if (SEQ_GEQ(sack.end, firstsack.end))
+			firstsack.end = sack.end;     /* merge blocks */
+		tp->sackblks[i].start = tp->sackblks[i].end = 0;
+		lastpos = i;    /* last posn with a zero entry */
+	}
+	if (lastpos != -1) {    /* at least one merge */
+		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+			sack = tp->sackblks[i];
+			if (sack.start == 0 && sack.end == 0)
+				continue;
+			temp[j++] = sack;
+		}
+		tp->rcv_numsacks = j; /* including first blk (added later) */
+		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+			tp->sackblks[i] = temp[i];
+	} else {        /* no merges -- shift sacks by 1 */
+		if (tp->rcv_numsacks < MAX_SACK_BLKS)
+			tp->rcv_numsacks++;
+		for (i = tp->rcv_numsacks-1; i > 0; i--)
+			tp->sackblks[i] = tp->sackblks[i-1];
+	}
+	tp->sackblks[0] = firstsack;
+	return;
+}  
+
 /*
+ * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should continue,
+ * and 0 otherwise, if the option was fine.  tp->snd_holes is an ordered list
+ * of holes (oldest to newest, in terms of the sequence space).  
+ */             
+int
+tcp_sack_option(tp, th, cp, optlen)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+	u_char *cp;
+	int    optlen;
+{       
+	int tmp_olen;
+	u_char *tmp_cp;
+	struct sackhole *cur, *p, *temp;
+   
+	if (tp->sack_disable)
+		return 1;
+           
+	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+		return 1;
+	tmp_cp = cp + 2;
+	tmp_olen = optlen - 2;
+	if (tp->snd_numholes < 0)
+		tp->snd_numholes = 0;
+	if (tp->t_maxseg == 0)
+		panic("tcp_sack_option"); /* Should never happen */
+	while (tmp_olen > 0) {
+		struct sackblk sack;
+            
+		bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+		NTOHL(sack.start); 
+		bcopy((char *) tmp_cp + sizeof(tcp_seq),
+		    (char *) &(sack.end), sizeof(tcp_seq));
+		NTOHL(sack.end);
+		tmp_olen -= TCPOLEN_SACK;
+		tmp_cp += TCPOLEN_SACK;
+		if (SEQ_LEQ(sack.end, sack.start))
+			continue; /* bad SACK fields */
+		if (SEQ_LEQ(sack.end, tp->snd_una)) 
+			continue; /* old block */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+		/* Updates snd_fack.  */
+		if (SEQ_GEQ(sack.end, tp->snd_fack))
+			tp->snd_fack = sack.end;
+#endif /* TCP_FACK */
+		if (SEQ_GT(th->th_ack, tp->snd_una)) {
+			if (SEQ_LT(sack.start, th->th_ack))
+				continue;
+		} else {
+			if (SEQ_LT(sack.start, tp->snd_una))
+				continue;
+		}
+		if (SEQ_GT(sack.end, tp->snd_max))
+			continue;
+		if (tp->snd_holes == 0) { /* first hole */
+			tp->snd_holes = (struct sackhole *)
+			    malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
+			if (tp->snd_holes == NULL) {
+				/* ENOBUFS, so ignore SACKed block for now*/
+				continue;  
+			}
+			cur = tp->snd_holes;
+			cur->start = th->th_ack;
+			cur->end = sack.start;
+			cur->rxmit = cur->start;
+			cur->next = 0;
+			tp->snd_numholes = 1;
+			tp->rcv_lastsack = sack.end;
+			/* 
+			 * dups is at least one.  If more data has been 
+			 * SACKed, it can be greater than one.
+			 */
+			cur->dups = min(tcprexmtthresh, 
+			    ((sack.end - cur->end)/tp->t_maxseg));
+			if (cur->dups < 1)
+				cur->dups = 1;
+			continue; /* with next sack block */
+		}
+		/* Go thru list of holes:  p = previous,  cur = current */
+		p = cur = tp->snd_holes;
+		while (cur) {
+			if (SEQ_LEQ(sack.end, cur->start)) 
+				/* SACKs data before the current hole */ 
+				break; /* no use going through more holes */
+			if (SEQ_GEQ(sack.start, cur->end)) {
+				/* SACKs data beyond the current hole */
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			if (SEQ_LEQ(sack.start, cur->start)) {
+				/* Data acks at least the beginning of hole */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				if (SEQ_GT(sack.end, cur->rxmit))
+					tp->retran_data -= 
+				    	    tcp_seq_subtract(cur->rxmit, 
+					    cur->start);
+				else
+					tp->retran_data -=
+					    tcp_seq_subtract(sack.end, 
+					    cur->start);
+#endif /* TCP_FACK */
+				if (SEQ_GEQ(sack.end,cur->end)){
+					/* Acks entire hole, so delete hole */
+					if (p != cur) {
+						p->next = cur->next;
+						free(cur, M_PCB);
+						cur = p->next;
+					} else {
+						cur=cur->next;
+						free(p, M_PCB);
+						p = cur;
+						tp->snd_holes = p;
+					}
+					tp->snd_numholes--;
+					continue;
+				}
+				/* otherwise, move start of hole forward */
+				cur->start = sack.end;
+				cur->rxmit = max (cur->rxmit, cur->start);
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			/* move end of hole backward */
+			if (SEQ_GEQ(sack.end, cur->end)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				if (SEQ_GT(cur->rxmit, sack.start)) 
+					tp->retran_data -= 
+					    tcp_seq_subtract(cur->rxmit, 
+					    sack.start);
+#endif /* TCP_FACK */
+				cur->end = sack.start;
+				cur->rxmit = min (cur->rxmit, cur->end);
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				p = cur;
+				cur = cur->next;
+				continue;
+			}
+			if (SEQ_LT(cur->start, sack.start) &&
+			    SEQ_GT(cur->end, sack.end)) {
+				/* 
+				 * ACKs some data in middle of a hole; need to 
+				 * split current hole
+				 */
+				temp = (struct sackhole *)malloc(sizeof(*temp),
+				    M_PCB,M_NOWAIT);
+				if (temp == NULL) 
+					continue; /* ENOBUFS */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+				if (SEQ_GT(cur->rxmit, sack.end)) 
+					tp->retran_data -= 
+					    tcp_seq_subtract(sack.end, 
+					    sack.start);
+				else if (SEQ_GT(cur->rxmit, sack.start))
+					tp->retran_data -= 
+					    tcp_seq_subtract(cur->rxmit, 
+					    sack.start);
+#endif /* TCP_FACK */
+				temp->next = cur->next;
+				temp->start = sack.end;
+				temp->end = cur->end;
+				temp->dups = cur->dups;
+				temp->rxmit = max (cur->rxmit, temp->start);
+				cur->end = sack.start;
+				cur->rxmit = min (cur->rxmit, cur->end);
+				cur->dups++;
+				if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+					tcprexmtthresh)
+					cur->dups = tcprexmtthresh;
+				cur->next = temp;
+				p = temp;
+				cur = p->next;
+				tp->snd_numholes++;
+			}
+		}
+		/* At this point, p points to the last hole on the list */
+		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+			/*
+			 * Need to append new hole at end.
+			 * Last hole is p (and it's not NULL).
+			 */
+			temp = (struct sackhole *) malloc(sizeof(*temp),
+			    M_PCB, M_NOWAIT);
+			if (temp == NULL) 
+				continue; /* ENOBUFS */
+			temp->start = tp->rcv_lastsack;
+			temp->end = sack.start;
+			temp->dups = min(tcprexmtthresh, 
+			    ((sack.end - sack.start)/tp->t_maxseg));
+			if (temp->dups < 1)
+				temp->dups = 1;
+			temp->rxmit = temp->start;
+			temp->next = 0;
+			p->next = temp;
+			tp->rcv_lastsack = sack.end;
+			tp->snd_numholes++;
+		}
+	}
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/* 
+	 * Update retran_data and snd_awnd.  Go through the list of 
+	 * holes.   Increment retran_data by (hole->rxmit - hole->start).
+	 */
+	tp->retran_data = 0;
+	cur = tp->snd_holes;
+	while (cur) {
+		tp->retran_data += cur->rxmit - cur->start;
+		cur = cur->next;
+	}
+	tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) + 
+	    tp->retran_data;
+#endif /* TCP_FACK */
+
+	return 0;
+}   
+
+/*
+ * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
+ * it is completely acked; otherwise, tcp_sack_option(), called from 
+ * tcp_dooptions(), will fix up the hole.
+ */
+void
+tcp_del_sackholes(tp, th)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+{
+	if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
+		/* max because this could be an older ack just arrived */
+		tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
+			th->th_ack : tp->snd_una;
+		struct sackhole *cur = tp->snd_holes;
+		struct sackhole *prev = cur;
+		while (cur)
+			if (SEQ_LEQ(cur->end, lastack)) {
+				cur = cur->next;
+				free(prev, M_PCB);
+				prev = cur;
+				tp->snd_numholes--;
+			} else if (SEQ_LT(cur->start, lastack)) {
+				cur->start = lastack;
+				if (SEQ_LT(cur->rxmit, cur->start))
+					cur->rxmit = cur->start;
+				break;
+			} else
+				break;
+		tp->snd_holes = cur;
+	}
+}
+
+/* 
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(tp)
+	struct tcpcb *tp;
+{
+	int i;
+
+	tp->rcv_numsacks = 0;
+	for (i = 0; i < MAX_SACK_BLKS; i++)
+		tp->sackblks[i].start = tp->sackblks[i].end=0;
+
+}
+
+/* 
+ * Checks for partial ack.  If partial ack arrives, turn off retransmission
+ * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
+ * If the ack advances at least to tp->snd_last, return 0.
+ */
+int
+tcp_sack_partialack(tp, th)
+	struct tcpcb *tp;
+	struct tcphdr *th;
+{
+	if (SEQ_LT(th->th_ack, tp->snd_last)) {
+		/* Turn off retx. timer (will start again next segment) */
+              callout_stop(tp->tt_rexmt);
+	      tp->t_rtttime = 0;
+#ifndef TCP_FACK
+		/* 
+		 * Partial window deflation.  This statement relies on the 
+		 * fact that tp->snd_una has not been updated yet.  In FACK
+		 * hold snd_cwnd constant during fast recovery.
+		 */
+		if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
+			tp->snd_cwnd -= th->th_ack - tp->snd_una;
+			tp->snd_cwnd += tp->t_maxseg;
+		} else
+			tp->snd_cwnd = tp->t_maxseg;
+#endif
+		return 1;
+	}
+	return 0;
+}
+#endif /* TCP_SACK */
+
+/*
  * Pull out of band byte out of a segment so
  * it doesn't appear in the user's data queue.
  * It is still reflected in the segment length for
@@ -2909,7 +3526,12 @@
 	struct tcpcb *tp;
 	struct tcphdr *th;
 {
+
+#if defined (TCP_SACK)
+if (SEQ_LT(th->th_ack, tp->snd_last)) {
+#else
 	if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+#endif
 		tcp_seq onxt = tp->snd_nxt;
 		u_long  ocwnd = tp->snd_cwnd;
 
Index: tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.52
diff -u -d -b -w -u -d -r1.52 tcp_output.c
--- tcp_output.c	2001/06/23 03:21:46	1.52
+++ tcp_output.c	2001/08/22 23:11:21
@@ -98,6 +98,106 @@
 int     tcp_do_newreno = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
         0, "Enable NewReno Algorithms");
+
+#ifdef TCP_SACK
+extern int tcprexmtthresh;
+#endif
+
+#ifdef TCP_SACK
+#ifdef TCP_SACK_DEBUG
+void
+tcp_print_holes(tp)
+struct tcpcb *tp;
+{
+	struct sackhole *p = tp->snd_holes;
+	if (p == 0)
+		return;
+	printf("Hole report: start--end dups rxmit\n");
+	while (p) {
+		printf("%d--%d d %d r %d\n",  p->start, p->end, p->dups,
+                    p->rxmit);
+		p = p->next;
+	}
+	printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Returns pointer to a sackhole if there are any pending retransmissions;
+ * NULL otherwise.
+ */
+struct sackhole *
+tcp_sack_output(tp)
+register struct tcpcb *tp;
+{
+	struct sackhole *p;
+	if (tp->sack_disable)
+		return 0;
+	p = tp->snd_holes;
+	while (p) {
+#ifndef TCP_FACK
+		if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
+#else
+		/* In FACK, if p->dups is less than tcprexmtthresh, but
+		 * snd_fack advances more than tcprextmtthresh * tp->t_maxseg,
+		 * tcp_input() will try fast retransmit. This forces output.
+		 */
+		if ((p->dups >= tcprexmtthresh ||
+		     tp->t_dupacks == tcprexmtthresh) &&
+		    SEQ_LT(p->rxmit, p->end)) {
+#endif /* TCP_FACK */
+			if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+				p = p->next;
+				continue;
+			}
+#ifdef TCP_SACK_DEBUG
+			if (p)
+				tcp_print_holes(tp);
+#endif
+			return p;
+		}
+        	p = p->next;
+	}
+	return 0;
+}
+
+/*
+ * After a timeout, the SACK list may be rebuilt.  This SACK information
+ * should be used to avoid retransmitting SACKed data.  This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(tp)
+	struct tcpcb *tp;
+{
+	struct sackhole *cur = tp->snd_holes;
+	if (cur == 0)
+		return; /* No holes */
+	if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+		return; /* We're already beyond any SACKed blocks */
+	/* 
+	 * Two cases for which we want to advance snd_nxt:  
+	 * i) snd_nxt lies between end of one hole and beginning of another
+	 * ii) snd_nxt lies between end of last hole and rcv_lastsack
+	 */
+	while (cur->next) {
+		if (SEQ_LT(tp->snd_nxt, cur->end))
+			return;
+		if (SEQ_GEQ(tp->snd_nxt, cur->next->start)) 
+			cur = cur->next;
+		else {
+			tp->snd_nxt = cur->next->start;
+			return;
+		}
+	}
+	if (SEQ_LT(tp->snd_nxt, cur->end))
+		return;
+	tp->snd_nxt = tp->rcv_lastsack;
+	return;
+}
+#endif /* TCP_SACK */
+
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -118,6 +218,10 @@
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 	int idle, sendalot;
+#ifdef TCP_SACK
+	int i, sack_rxmit = 0;
+	struct sackhole *p;
+#endif
 	int maxburst = TCP_MAXBURST;
 	struct rmxp_tao *taop;
 	struct rmxp_tao tao_noncached;
@@ -161,10 +265,30 @@
 	}
 again:
 	sendalot = 0;
+#ifdef TCP_SACK
+	/*
+	 * If we've recently taken a timeout, snd_max will be greater than
+	 * snd_nxt.  There may be SACK information that allows us to avoid
+	 * resending already delivered data.  Adjust snd_nxt accordingly.
+	 */
+	if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+		tcp_sack_adjust(tp);
+#endif
 	off = tp->snd_nxt - tp->snd_una;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/* Normally, sendable data is limited by off < tp->snd_cwnd.
+	 * But in FACK, sendable data is limited by snd_awnd < snd_cwnd,
+	 * regardless of offset.
+	 */
+	if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh))
+	  win = tp->snd_wnd;
+	else
+#endif
 	win = min(tp->snd_wnd, tp->snd_cwnd);
 
 	flags = tcp_outflags[tp->t_state];
+	
+	
 	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
@@ -173,7 +297,33 @@
 		flags |= TH_FIN;
 	if (tp->t_flags & TF_NEEDSYN)
 		flags |= TH_SYN;
+#ifdef TCP_SACK
+	/* 
+	 * Send any SACK-generated retransmissions.  If we're explicitly trying
+	 * to send out new data (when sendalot is 1), bypass this function.
+	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+	 * we're replacing a (future) new transmission with a retransmission 
+	 * now, and we previously incremented snd_cwnd in tcp_input().
+	 */
+	if (!tp->sack_disable && !sendalot) {
+	  if (tp->t_dupacks >= tcprexmtthresh &&
+	      (p = tcp_sack_output(tp))) {
+	    off = p->rxmit - tp->snd_una;
+	    sack_rxmit = 1;
+#if 0
+	    /* Coalesce holes into a single retransmission */
+#endif
+	    len = min(tp->t_maxseg, p->end - p->rxmit);
+#ifndef TCP_FACK
+	    /* in FACK, hold snd_cwnd constant during recovery */
+	    if (SEQ_LT(tp->snd_una, tp->snd_last))
+	      tp->snd_cwnd -= tp->t_maxseg;
+#endif
+	  }
+	}
+#endif /* TCP_SACK */
 
+	sendalot = 0;
 	/*
 	 * If in persist timeout with window of 0, send 1 byte.
 	 * Otherwise, if window is small but nonzero
@@ -207,8 +357,26 @@
 		}
 	}
 
+#ifdef TCP_SACK
+	if (!sack_rxmit) {
+#endif
 	len = (long)ulmin(so->so_snd.sb_cc, win) - off;
 
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/* 
+	 * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and 
+	 * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
+	 * do not send data (like zero window conditions)
+	 */
+	if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) && 
+	    (tp->snd_awnd >= tp->snd_cwnd)) 
+		len = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_SACK
+	}
+#endif
+
+
 	if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
 		taop = &tao_noncached;
 		bzero(taop, sizeof(*taop));
@@ -293,6 +461,10 @@
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
 			goto send;
+#ifdef TCP_SACK
+		if (sack_rxmit)
+		  goto send;
+#endif
 	}
 
 	/*
@@ -335,6 +507,20 @@
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
+#ifdef TCP_SACK
+	/*
+	 * In SACK, it is possible for tcp_output to fail to send a segment 
+	 * after the retransmission timer has been turned off.  Make sure
+	 * that the retransmission timer is set.
+	 */
+	if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+	    !callout_active(tp->tt_rexmt) &&
+	    !callout_active(tp->tt_persist)){
+	  callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+			tcp_timer_rexmt, tp);
+	  return (0);
+	}
+#endif /* TCP_SACK */
 
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
@@ -395,7 +581,22 @@
 			mss = htons((u_short) tcp_mssopt(tp));
 			(void)memcpy(opt + 2, &mss, sizeof(mss));
 			optlen = TCPOLEN_MAXSEG;
+#ifdef TCP_SACK
+			/* 
+			 * If this is the first SYN of connection (not a SYN 
+			 * ACK), include SACK_PERMIT_HDR option.  If this is a 
+			 * SYN ACK, include SACK_PERMIT_HDR option if peer has 
+			 * already done so.
+			 */
+			if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
+			    (tp->t_flags & TF_SACK_PERMIT))) {
+				*((u_int32_t *) (opt + optlen)) =
+				    htonl(TCPOPT_SACK_PERMIT_HDR);
+				optlen += 4;
+			}
+#endif
 
+
 			if ((tp->t_flags & TF_REQ_SCALE) &&
 			    ((flags & TH_ACK) == 0 ||
 			    (tp->t_flags & TF_RCVD_SCALE))) {
@@ -426,6 +627,33 @@
  		*lp   = htonl(tp->ts_recent);
  		optlen += TCPOLEN_TSTAMP_APPA;
  	}
+#ifdef TCP_SACK
+	/*
+	 * Send SACKs if necessary.  This should be the last option processed.
+	 * Only as many SACKs are sent as are permitted by the maximum options
+	 * size.  No more than three SACKs are sent.
+	 */
+	if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+	    tp->rcv_numsacks) {
+		u_int32_t *lp = (u_int32_t *)(opt + optlen);
+		u_int32_t *olp = lp++;
+		int count = 0;  /* actual number of SACKs inserted */
+		int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK;
+
+		maxsack = min(maxsack, TCP_MAX_SACK);
+		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+			struct sackblk sack = tp->sackblks[i];
+			if (sack.start == 0 && sack.end == 0)
+				continue;
+			*lp++ = htonl(sack.start);
+			*lp++ = htonl(sack.end);
+			count++;
+		}
+		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+	}
+#endif /* TCP_SACK */
 
  	/*
 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
@@ -673,6 +901,23 @@
 		th->th_seq = htonl(tp->snd_nxt);
 	else
 		th->th_seq = htonl(tp->snd_max);
+#ifdef TCP_SACK
+	if (sack_rxmit) {
+	  /*
+	   * If sendalot was turned on (due to option stuffing), turn it
+	   * off.  Properly set the th_seq field.  Advance the SACK
+	   * retransmit pointer (p->rxmit) by len.
+	   */
+	  if (sendalot)
+	    sendalot = 0;
+	  th->th_seq = htonl(p->rxmit);
+	  p->rxmit += len;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	  tp->retran_data += len;
+#endif /* TCP_FACK */
+	}
+#endif /* TCP_SACK */
+
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
@@ -747,6 +992,14 @@
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
+#ifdef TCP_SACK
+		if (!tp->sack_disable) {
+			if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
+				goto timer;
+			}
+		}
+#endif
+
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
@@ -769,6 +1022,19 @@
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
+#ifdef TCP_SACK
+ timer:
+		if (!tp->sack_disable && sack_rxmit &&
+		    !callout_active(tp->tt_rexmt) &&
+		    tp->snd_nxt != tp->snd_una) {
+			if (callout_active(tp->tt_persist)) {
+				callout_stop(tp->tt_persist);
+				tp->t_rxtshift = 0;
+			}
+			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+				      tcp_timer_rexmt, tp);
+		}
+#endif
 		if (!callout_active(tp->tt_rexmt) &&
 		    tp->snd_nxt != tp->snd_una) {
 			if (callout_active(tp->tt_persist)) {
@@ -859,6 +1125,12 @@
 	error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
 	    (so->so_options & SO_DONTROUTE), 0);
     }
+	
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	/* Update snd_awnd to reflect the new data that was sent.  */
+	tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
+	  tp->retran_data;                
+#endif /* defined(TCP_SACK) && defined(TCP_FACK) */
 	if (error) {
 
 		/*
Index: tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.113
diff -u -d -b -w -u -d -r1.113 tcp_subr.c
--- tcp_subr.c	2001/08/22 00:58:16	1.113
+++ tcp_subr.c	2001/08/22 23:11:21
@@ -148,6 +148,12 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
     &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
 
+#ifdef TCP_SACK
+static int 	tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_do_sack, CTLFLAG_RW, &tcp_do_sack, 0, 
+    "Experimental Sack");
+#endif
+
 static void	tcp_cleartaocache __P((void));
 static void	tcp_notify __P((struct inpcb *, int));
 
@@ -161,6 +167,14 @@
 #define TCBHASHSIZE	512
 #endif
 
+#ifndef TCP_DO_SACK
+#ifdef TCP_SACK
+#define TCP_DO_SACK	1
+#else
+#define TCP_DO_SACK	0
+#endif
+#endif
+
 /*
  * This is the actual shape of what we allocate using the zone
  * allocator.  Doing it this way allows us to protect both structures
@@ -527,6 +541,9 @@
 	callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
 	callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
 
+#ifdef TCP_SACK
+	tp->sack_disable = tcp_do_sack ? 0 : 1;
+#endif
 	if (tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (tcp_do_rfc1644)
@@ -591,6 +608,9 @@
 	register struct tseg_qent *q;
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
+#ifdef TCP_SACK
+	struct sackhole *p, *q_sack;
+#endif
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
@@ -729,6 +749,15 @@
 		m_freem(q->tqe_m);
 		FREE(q, M_TSEGQ);
 	}
+#ifdef TCP_SACK
+	/* Free SACK holes. */
+	q_sack = p = tp->snd_holes;
+	while (p != 0) {
+		q_sack = p->next;
+		free(p, M_PCB);
+		p = q_sack;
+	}
+#endif
 	inp->inp_ppcb = NULL;
 	soisdisconnected(so);
 #ifdef INET6
Index: tcp_timer.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.47
diff -u -d -b -w -u -d -r1.47 tcp_timer.c
--- tcp_timer.c	2001/08/22 00:58:16	1.47
+++ tcp_timer.c	2001/08/22 23:11:21
@@ -40,6 +40,8 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
@@ -186,6 +188,9 @@
 {
 	struct tcpcb *tp = xtp;
 	int s;
+#ifdef TCP_SACK
+	struct sackhole *p, *q;
+#endif
 #ifdef TCPDEBUG
 	int ostate;
 
@@ -197,7 +202,25 @@
 		return;
 	}
 	callout_deactivate(tp->tt_2msl);
+	
+#ifdef TCP_SACK
 	/*
+	 * Free SACK holes for 2MSL and REXMT timers.
+	 */
+	q = p = tp->snd_holes;
+	while (p != 0) {
+	  q = p->next;
+	  free(p, M_PCB);
+	  p = q;
+	}
+	tp->snd_holes = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	tp->snd_fack = tp->snd_una;
+	tp->retran_data = 0;
+	tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */	
+	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
 	 * too long, or if 2MSL time is up from TIME_WAIT, delete connection
@@ -349,6 +372,9 @@
 	struct tcpcb *tp = xtp;
 	int s;
 	int rexmt;
+#ifdef TCP_SACK
+	struct sackhole *p, *q;
+#endif
 #ifdef TCPDEBUG
 	int ostate;
 
@@ -360,7 +386,25 @@
 		return;
 	}
 	callout_deactivate(tp->tt_rexmt);
+#ifdef TCP_SACK
 	/*
+	 * Free SACK holes for 2MSL and REXMT timers.
+	 */
+	q = p = tp->snd_holes;
+	while (p != 0) {
+	  q = p->next;
+	  free(p, M_PCB);
+	  p = q;
+	}
+	tp->snd_holes = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	tp->snd_fack = tp->snd_una;
+	tp->retran_data = 0;
+	tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+
+	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off
 	 * to a longer retransmit interval and retransmit one segment.
@@ -421,11 +465,20 @@
 		tp->t_srtt = 0;
 	}
 	tp->snd_nxt = tp->snd_una;
+#if defined(TCP_SACK)
 	/*
+	 * Note:  We overload snd_last to function also as the
+	 * snd_last variable described in RFC 2582
+	 */
+	tp->snd_last = tp->snd_max;
+#else
+	/*
 	 * Note:  We overload snd_recover to function also as the
 	 * snd_last variable described in RFC 2582
 	 */
+
 	tp->snd_recover = tp->snd_max;
+#endif /* TCP_SACK */
 	/*
 	 * Force a segment to be sent.
 	 */
Index: tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.66
diff -u -d -b -w -u -d -r1.66 tcp_usrreq.c
--- tcp_usrreq.c	2001/08/22 00:58:16	1.66
+++ tcp_usrreq.c	2001/08/22 23:11:21
@@ -761,6 +761,16 @@
 	tp->iss = tcp_new_isn(tp);
 	tcp_sendseqinit(tp);
 
+#if defined(TCP_SACK)
+		tp->snd_last = tp->snd_una;
+#endif
+#if defined(TCP_SACK) && defined(TCP_FACK)
+		tp->snd_fack = tp->snd_una;
+		tp->retran_data = 0;
+		tp->snd_awnd = 0;
+#endif
+
+
 	/*
 	 * Generate a CC value for this connection and
 	 * check whether CC or CCnew should be used.
@@ -978,6 +988,11 @@
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
 			break;
+#ifdef TCP_SACK
+		case TCP_SACK_DISABLE:
+			optval = tp->sack_disable;
+			break;
+#endif			
 		default:
 			error = ENOPROTOOPT;
 			break;
Index: tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.71
diff -u -d -b -w -u -d -r1.71 tcp_var.h
--- tcp_var.h	2001/08/22 00:58:16	1.71
+++ tcp_var.h	2001/08/22 23:11:21
@@ -36,6 +36,21 @@
 
 #ifndef _NETINET_TCP_VAR_H_
 #define _NETINET_TCP_VAR_H_
+
+struct sackblk {
+	tcp_seq start;		/* start seq no. of sack block */
+	tcp_seq end; 		/* end seq no. */
+};  
+
+struct sackhole {
+	tcp_seq start;		/* start seq no. of hole */ 
+	tcp_seq end;		/* end seq no. */
+	int	dups;		/* number of dup(s)acks for this hole */
+	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
+	struct sackhole *next;	/* next in list */
+};
+
+
 /*
  * Kernel variables for tcp.
  */
@@ -114,6 +129,31 @@
 	u_long	rcv_wnd;		/* receive window */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
 
+#ifdef TCP_SACK
+	int	sack_disable;		/* disable SACK for this connection */
+	int	snd_numholes;		/* number of holes seen by sender */
+	struct sackhole *snd_holes;	/* linked list of holes (sorted) */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+	tcp_seq snd_fack;		/* for FACK congestion control */
+	u_long	snd_awnd;		/* snd_nxt - snd_fack + */
+					/* retransmitted data */
+	int retran_data;		/* amount of outstanding retx. data  */
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+#if defined(TCP_SACK)
+	tcp_seq snd_last;		/* for use in fast recovery */
+#endif
+
+#ifdef TCP_SACK
+	tcp_seq rcv_laststart;		/* start of last segment recd. */
+	tcp_seq rcv_lastend;		/* end of ... */
+	tcp_seq rcv_lastsack;		/* last seq number(+1) sack'd by rcv'r*/
+	int	rcv_numsacks;		/* # distinct sack blks present */
+	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
+#endif
+
+
+
 	u_long	snd_wnd;		/* send window */
 	u_long	snd_cwnd;		/* congestion-controlled window */
 	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
@@ -338,8 +378,12 @@
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
+#ifdef  TCP_SACK
+#define TCPCTL_SACK             14      /* needs to be revisited */
+#define	TCPCTL_MAXID		15
+#else
 #define	TCPCTL_MAXID		14
-
+#endif
 #define TCPCTL_NAMES { \
 	{ 0, 0 }, \
 	{ "rfc1323", CTLTYPE_INT }, \
@@ -355,8 +399,11 @@
 	{ "pcblist", CTLTYPE_STRUCT }, \
 	{ "delacktime", CTLTYPE_INT }, \
 	{ "v6mssdflt", CTLTYPE_INT }, \
+        { "sack",      CTLTYPE_INT}, \
 }
-
+/*
+#define TCP_SACK_DEBUG
+*/
 
 #ifdef _KERNEL
 #ifdef SYSCTL_DECL
@@ -367,6 +414,9 @@
 extern	struct inpcbinfo tcbinfo;
 extern	struct tcpstat tcpstat;	/* tcp statistics */
 extern	int tcp_mssdflt;	/* XXX */
+#ifdef TCP_SACK
+extern	int tcp_do_sack;	/* SACK enabled/disabled */
+#endif
 extern	int tcp_delack_enabled;
 extern	int tcp_do_newreno;
 extern	int ss_fltsz;
@@ -406,7 +456,20 @@
 	 tcp_timers __P((struct tcpcb *, int));
 void	 tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *,
 			int));
+#ifdef TCP_SACK
+int	 tcp_sack_option __P((struct tcpcb *,struct tcphdr *,u_char *,int));
+void	 tcp_update_sack_list __P((struct tcpcb *tp));
+void	 tcp_del_sackholes __P((struct tcpcb *, struct tcphdr *));
+void	 tcp_clean_sackreport __P((struct tcpcb *tp));
+void	 tcp_sack_adjust __P((struct tcpcb *tp));
+struct sackhole *
+	 tcp_sack_output __P((struct tcpcb *tp));
+int	 tcp_sack_partialack __P((struct tcpcb *, struct tcphdr *));
 
+#endif /* TCP_SACK */
+#if defined(TCP_SACK)
+u_long	 tcp_seq_subtract  __P((u_long, u_long )); 
+#endif /* TCP_SACK */
 extern	struct pr_usrreqs tcp_usrreqs;
 extern	u_long tcp_sendspace;
 extern	u_long tcp_recvspace;

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?Pine.LNX.4.33.0108221713240.16888-200000>