Skip site navigation (1) | Skip section navigation (2)
Date:      Mon, 7 Jun 2004 23:58:12 GMT
From:      Paul Saab <ps@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 54368 for review
Message-ID:  <200406072358.i57NwCsK065762@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=54368

Change 54368 by ps@butter.corp on 2004/06/07 23:58:02

	Commit the SACK work done at Yahoo! on RELENG_4 and ported
	to -current.
	
	The scoreboarding code was obtained from OpenBSD, and many
	of the remaining changes were inspired by OpenBSD, but not
	taken directly from there.

Affected files ...

.. //depot/projects/sack/conf/files#2 edit
.. //depot/projects/sack/conf/options#2 edit
.. //depot/projects/sack/netinet/tcp.h#2 edit
.. //depot/projects/sack/netinet/tcp_input.c#2 edit
.. //depot/projects/sack/netinet/tcp_output.c#2 edit
.. //depot/projects/sack/netinet/tcp_sack.c#1 add
.. //depot/projects/sack/netinet/tcp_seq.h#2 edit
.. //depot/projects/sack/netinet/tcp_subr.c#2 edit
.. //depot/projects/sack/netinet/tcp_syncache.c#2 edit
.. //depot/projects/sack/netinet/tcp_timer.c#2 edit
.. //depot/projects/sack/netinet/tcp_var.h#2 edit

Differences ...

==== //depot/projects/sack/conf/files#2 (text+ko) ====

@@ -1450,6 +1450,7 @@
 netinet/tcp_hostcache.c	optional inet
 netinet/tcp_input.c	optional inet
 netinet/tcp_output.c	optional inet
+netinet/tcp_sack.c	optional inet
 netinet/tcp_subr.c	optional inet
 netinet/tcp_syncache.c	optional inet
 netinet/tcp_timer.c	optional inet

==== //depot/projects/sack/conf/options#2 (text+ko) ====

@@ -346,6 +346,7 @@
 SLIP_IFF_OPTS		opt_slip.h
 TCPDEBUG
 TCP_SIGNATURE		opt_inet.h
+TCP_SACK_DEBUG		opt_tcp_sack.h 
 TCP_DROP_SYNFIN		opt_tcp_input.h
 XBONEHACK
 

==== //depot/projects/sack/netinet/tcp.h#2 (text+ko) ====

@@ -85,14 +85,17 @@
 #define TCPOPT_SACK_PERMITTED	4		/* Experimental */
 #define    TCPOLEN_SACK_PERMITTED	2
 #define TCPOPT_SACK		5		/* Experimental */
+#define	   TCPOLEN_SACK			8	/* 2*sizeof(tcp_seq) */
 #define TCPOPT_TIMESTAMP	8
 #define    TCPOLEN_TIMESTAMP		10
 #define    TCPOLEN_TSTAMP_APPA		(TCPOLEN_TIMESTAMP+2) /* appendix A */
 #define    TCPOPT_TSTAMP_HDR		\
     (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
 
+#define	MAX_TCPOPTLEN		40	/* Absolute maximum TCP options len */
+
 #define	TCPOPT_CC		11		/* CC options: RFC-1644 */
-#define TCPOPT_CCNEW		12
+#define	TCPOPT_CCNEW		12
 #define TCPOPT_CCECHO		13
 #define	   TCPOLEN_CC			6
 #define	   TCPOLEN_CC_APPA		(TCPOLEN_CC+2)
@@ -101,6 +104,15 @@
 #define	TCPOPT_SIGNATURE		19	/* Keyed MD5: RFC 2385 */
 #define	   TCPOLEN_SIGNATURE		18
 
+/* Option definitions */
+#define TCPOPT_SACK_PERMIT_HDR	\
+(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define	TCPOPT_SACK_HDR		(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+/* Miscellaneous constants */
+#define	MAX_SACK_BLKS	6	/* Max # SACK blocks stored at sender side */
+#define	TCP_MAX_SACK	3	/* MAX # SACKs sent in any segment */
+
+
 /*
  * Default maximum segment size for TCP.
  * With an IP MTU of 576, this is 536,

==== //depot/projects/sack/netinet/tcp_input.c#2 (text+ko) ====

@@ -37,6 +37,7 @@
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
 #include "opt_tcp_input.h"
+#include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -159,7 +160,9 @@
 struct inpcbinfo tcbinfo;
 struct mtx	*tcbinfo_mtx;
 
-static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void	 tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
+		     int, int, struct tcphdr *);
+
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@@ -721,7 +724,7 @@
 		 * present in a SYN segment.  See tcp_timewait().
 		 */
 		if (thflags & TH_SYN)
-			tcp_dooptions(&to, optp, optlen, 1);
+			tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
 		if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
 		    &to, th, m, tlen))
 			goto findpcb;
@@ -934,7 +937,7 @@
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-			tcp_dooptions(&to, optp, optlen, 1);
+			tcp_dooptions(tp, &to, optp, optlen, 1, th);
 			if (!syncache_add(&inc, &to, th, &so, m))
 				goto drop;
 			if (so == NULL) {
@@ -1050,7 +1053,7 @@
 	 * for incoming connections is handled in tcp_syncache.
 	 * XXX this is traditional behavior, may need to be cleaned up.
 	 */
-	tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+	tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
 	if (thflags & TH_SYN) {
 		if (to.to_flags & TOF_SCALE) {
 			tp->t_flags |= TF_RCVD_SCALE;
@@ -1065,8 +1068,22 @@
 			tp->t_flags |= TF_RCVD_CC;
 		if (to.to_flags & TOF_MSS)
 			tcp_mss(tp, to.to_mss);
+		if (tp->sack_enable) {
+			if (!(to.to_flags & TOF_SACK))
+				tp->sack_enable = 0;
+			else
+				tp->t_flags |= TF_SACK_PERMIT;
+		}
+
 	}
 
+	if (tp->sack_enable) {
+		/* Delete stale (cumulatively acked) SACK holes */
+		tcp_del_sackholes(tp, th);
+		tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+		tp->rcv_lastend = th->th_seq + tlen;
+	}
+
 	/*
 	 * Header prediction: check for the two common cases
 	 * of a uni-directional data xfer.  If the packet has
@@ -1116,9 +1133,10 @@
 			if (SEQ_GT(th->th_ack, tp->snd_una) &&
 			    SEQ_LEQ(th->th_ack, tp->snd_max) &&
 			    tp->snd_cwnd >= tp->snd_wnd &&
-			    ((!tcp_do_newreno &&
+			    ((!tcp_do_newreno && !tp->sack_enable &&
 			      tp->t_dupacks < tcprexmtthresh) ||
-			     (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
+			     ((tcp_do_newreno || tp->sack_enable) && 
+			      !IN_FASTRECOVERY(tp)))) {
 				KASSERT(headlocked, ("headlocked"));
 				INP_INFO_WUNLOCK(&tcbinfo);
 				/*
@@ -1214,6 +1232,9 @@
 			 * with nothing on the reassembly queue and
 			 * we have enough buffer space to take it.
 			 */
+			/* Clean receiver SACK report if present */
+			if (tp->sack_enable && tp->rcv_numsacks)
+				tcp_clean_sackreport(tp);
 			++tcpstat.tcps_preddat;
 			tp->rcv_nxt += tlen;
 			/*
@@ -1892,7 +1913,7 @@
 				    th->th_ack != tp->snd_una)
 					tp->t_dupacks = 0;
 				else if (++tp->t_dupacks > tcprexmtthresh ||
-					 (tcp_do_newreno &&
+					 ((tcp_do_newreno || tp->sack_enable) &&
 					  IN_FASTRECOVERY(tp))) {
 					tp->snd_cwnd += tp->t_maxseg;
 					(void) tcp_output(tp);
@@ -1900,7 +1921,8 @@
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
 					u_int win;
-					if (tcp_do_newreno &&
+					if ((tcp_do_newreno || 
+					    tp->sack_enable) &&
 					    SEQ_LEQ(th->th_ack,
 					            tp->snd_recover)) {
 						tp->t_dupacks = 0;
@@ -1915,6 +1937,17 @@
 					tp->snd_recover = tp->snd_max;
 					callout_stop(tp->tt_rexmt);
 					tp->t_rtttime = 0;
+					if (tp->sack_enable) {
+						tcpstat.tcps_sack_recovery_episode++;
+						tp->snd_cwnd = 
+						    tp->t_maxseg * 
+						    tp->t_dupacks;
+						(void) tcp_output(tp);
+						tp->snd_cwnd = 
+						    tp->snd_ssthresh;
+						goto drop;
+					}
+
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
 					(void) tcp_output(tp);
@@ -1965,12 +1998,16 @@
 		 * If the congestion window was inflated to account
 		 * for the other side's cached packets, retract it.
 		 */
-		if (tcp_do_newreno) {
+		if (tcp_do_newreno || tp->sack_enable) {
 			if (IN_FASTRECOVERY(tp)) {
 				if (SEQ_LT(th->th_ack, tp->snd_recover)) {
-					tcp_newreno_partial_ack(tp, th);
+					if (tp->sack_enable)
+						tcp_sack_partialack(tp, th);
+					else
+						tcp_newreno_partial_ack(tp, th);
 				} else {
 					/*
+					 * Out of fast recovery.
 					 * Window inflation should have left us
 					 * with approximately snd_ssthresh
 					 * outstanding data.
@@ -2092,7 +2129,8 @@
 		 * Otherwise open linearly: maxseg per window
 		 * (maxseg^2 / cwnd per packet).
 		 */
-		if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
+		if ((!tcp_do_newreno && !tp->sack_enable) || 
+		    !IN_FASTRECOVERY(tp)) {
 			register u_int cw = tp->snd_cwnd;
 			register u_int incr = tp->t_maxseg;
 			if (cw > tp->snd_ssthresh)
@@ -2110,14 +2148,20 @@
 		}
 		sowwakeup(so);
 		/* detect una wraparound */
-		if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
+		if ((tcp_do_newreno || tp->sack_enable) && 
+		    !IN_FASTRECOVERY(tp) &&
 		    SEQ_GT(tp->snd_una, tp->snd_recover) &&
 		    SEQ_LEQ(th->th_ack, tp->snd_recover))
 			tp->snd_recover = th->th_ack - 1;
-		if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
+		if ((tcp_do_newreno || tp->sack_enable) && 
+		    IN_FASTRECOVERY(tp) &&
 		    SEQ_GEQ(th->th_ack, tp->snd_recover))
 			EXIT_FASTRECOVERY(tp);
 		tp->snd_una = th->th_ack;
+		if (tp->sack_enable) {
+			if (SEQ_GT(tp->snd_una, tp->snd_recover))
+				tp->snd_recover = tp->snd_una;
+		} 
 		if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 			tp->snd_nxt = tp->snd_una;
 
@@ -2318,7 +2362,8 @@
 			thflags = tcp_reass(tp, th, &tlen, m);
 			tp->t_flags |= TF_ACKNOW;
 		}
-
+			if (tp->sack_enable)
+				tcp_update_sack_list(tp);
 		/*
 		 * Note the amount of data that peer has sent into
 		 * our window, in order to estimate the sender's
@@ -2521,11 +2566,13 @@
  * Parse TCP options and place in tcpopt.
  */
 static void
-tcp_dooptions(to, cp, cnt, is_syn)
+tcp_dooptions(tp, to, cp, cnt, is_syn, th)
+	struct tcpcb *tp;
 	struct tcpopt *to;
-	u_char *cp;
+	u_char *cp;   
 	int cnt;
 	int is_syn;
+	struct tcphdr *th;
 {
 	int opt, optlen;
 
@@ -2614,6 +2661,20 @@
 			to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
 			break;
 #endif
+		case TCPOPT_SACK_PERMITTED:
+			if (!tcp_do_sack ||
+			    optlen != TCPOLEN_SACK_PERMITTED)
+				continue;
+			if (is_syn) {
+				/* MUST only be set on SYN */   
+				to->to_flags |= TOF_SACK;
+			}
+			break;
+
+		case TCPOPT_SACK:
+			if (!tp || tcp_sack_option(tp, th, cp, optlen))
+				continue;
+			break;
 		default:
 			continue;
 		}

==== //depot/projects/sack/netinet/tcp_output.c#2 (text+ko) ====

@@ -35,6 +35,7 @@
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -122,6 +123,8 @@
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 	int idle, sendalot;
+	int i, sack_rxmit;
+	struct sackhole *p;
 #if 0
 	int maxburst = TCP_MAXBURST;
 #endif
@@ -171,6 +174,13 @@
 		}
 	}
 again:
+	/*
+	 * If we've recently taken a timeout, snd_max will be greater than
+	 * snd_nxt.  There may be SACK information that allows us to avoid
+	 * resending already delivered data.  Adjust snd_nxt accordingly.
+	 */
+	if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+		tcp_sack_adjust(tp);
 	sendalot = 0;
 	off = tp->snd_nxt - tp->snd_una;
 	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
@@ -178,6 +188,36 @@
 
 	flags = tcp_outflags[tp->t_state];
 	/*
+	 * Send any SACK-generated retransmissions.  If we're explicitly trying
+	 * to send out new data (when sendalot is 1), bypass this function.
+	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+	 * we're replacing a (future) new transmission with a retransmission
+	 * now, and we previously incremented snd_cwnd in tcp_input().
+	 */
+	/* 
+	 * Still in SACK recovery, reset rxmit flag to zero.
+	 */
+	sack_rxmit = 0;
+	len = 0;
+	p = NULL;
+	if (tp->sack_enable &&  IN_FASTRECOVERY(tp) &&
+	    (p = tcp_sack_output(tp))) {
+		sack_rxmit = 1;
+		sendalot = 1;
+		off = p->rxmit - tp->snd_una;
+		KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd));
+		/* Do not retransmit SACK segments beyond snd_recover */
+		if (SEQ_GT(p->end, tp->snd_recover))
+			len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit);
+		else
+			len = min(tp->snd_cwnd, p->end - p->rxmit);
+		if (len > 0) {
+			tcpstat.tcps_sack_rexmits++;
+			tcpstat.tcps_sack_rexmit_bytes += 
+				min(len, tp->t_maxseg);
+		}
+	}
+	/*
 	 * Get standard flags, and add SYN or FIN if requested by 'hidden'
 	 * state flags.
 	 */
@@ -230,9 +270,12 @@
 	 * In the normal retransmit-FIN-only case, however, snd_nxt will
 	 * be set to snd_una, the offset will be 0, and the length may
 	 * wind up 0.
+	 * 
+	 * If sack_rxmit is true we are retransmitting from the scoreboard
+	 * in which case len is already set. 
 	 */
-	len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off;
-
+	if (!sack_rxmit)
+		len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
 
 	/*
 	 * Lop off SYN bit if it has already been sent.  However, if this
@@ -331,6 +374,8 @@
 			goto send;
 		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
+		if (sack_rxmit)
+			goto send;
 	}
 
 	/*
@@ -374,7 +419,18 @@
 	if (flags & TH_FIN &&
 	    ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
 		goto send;
-
+	/*
+	 * In SACK, it is possible for tcp_output to fail to send a segment
+	 * after the retransmission timer has been turned off.  Make sure
+	 * that the retransmission timer is set.
+	 */
+	if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
+	    !callout_active(tp->tt_rexmt) && 
+	    !callout_active(tp->tt_persist)) {
+		callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+			      tcp_timer_rexmt, tp);
+                return (0);
+        }
 	/*
 	 * TCP window updates are not reliable, rather a polling protocol
 	 * using ``persist'' packets is used to insure receipt of window
@@ -435,6 +491,19 @@
 			(void)memcpy(opt + 2, &mss, sizeof(mss));
 			optlen = TCPOLEN_MAXSEG;
 
+                        /*
+                         * If this is the first SYN of connection (not a SYN
+                         * ACK), include SACK_PERMIT_HDR option.  If this is a
+                         * SYN ACK, include SACK_PERMIT_HDR option if peer has
+                         * already done so. This is only for active connect,
+			 * since the syncache takes care of the passive connect.
+                         */
+                        if (tp->sack_enable && ((flags & TH_ACK) == 0 || 
+			    (tp->t_flags & TF_SACK_PERMIT))) {
+                                *((u_int32_t *) (opt + optlen)) =
+					htonl(TCPOPT_SACK_PERMIT_HDR);
+                                optlen += 4;
+                        }
 			if ((tp->t_flags & TF_REQ_SCALE) &&
 			    ((flags & TH_ACK) == 0 ||
 			    (tp->t_flags & TF_RCVD_SCALE))) {
@@ -466,6 +535,32 @@
  		optlen += TCPOLEN_TSTAMP_APPA;
  	}
 
+	/*
+	 * Send SACKs if necessary.  This should be the last option processed.
+	 * Only as many SACKs are sent as are permitted by the maximum options
+	 * size.  No more than three SACKs are sent.
+	 */
+	if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
+	    (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+	    tp->rcv_numsacks) {
+		u_int32_t *lp = (u_int32_t *)(opt + optlen);
+		u_int32_t *olp = lp++;
+		int count = 0;  /* actual number of SACKs inserted */
+		int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
+
+		tcpstat.tcps_sack_send_blocks++;
+		maxsack = min(maxsack, TCP_MAX_SACK);
+		for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+			struct sackblk sack = tp->sackblks[i];
+			if (sack.start == 0 && sack.end == 0)
+				continue;
+			*lp++ = htonl(sack.start);
+			*lp++ = htonl(sack.end);
+			count++;
+		}
+		*olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+		optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+	}
  	/*
 	 * Send `CC-family' options if our side wants to use them (TF_REQ_CC),
 	 * options are allowed (!TF_NOOPT) and it's not a RST.
@@ -734,6 +829,10 @@
 		th->th_seq = htonl(tp->snd_nxt);
 	else
 		th->th_seq = htonl(tp->snd_max);
+        if (sack_rxmit) {
+                th->th_seq = htonl(p->rxmit);
+                p->rxmit += len;
+        }
 	th->th_ack = htonl(tp->rcv_nxt);
 	if (optlen) {
 		bcopy(opt, th + 1, optlen);
@@ -831,6 +930,8 @@
 				tp->t_flags |= TF_SENTFIN;
 			}
 		}
+		if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt))
+			goto timer;
 		tp->snd_nxt += len;
 		if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
 			tp->snd_max = tp->snd_nxt;
@@ -853,6 +954,17 @@
 		 * Initialize shift counter which is used for backoff
 		 * of retransmit time.
 		 */
+timer:
+		if (tp->sack_enable && sack_rxmit &&
+		    !callout_active(tp->tt_rexmt) &&
+		    tp->snd_nxt != tp->snd_max) {
+			callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+				      tcp_timer_rexmt, tp);
+			if (callout_active(tp->tt_persist)) {
+				callout_stop(tp->tt_persist);
+				tp->t_rxtshift = 0;
+			}
+		}
 		if (!callout_active(tp->tt_rexmt) &&
 		    tp->snd_nxt != tp->snd_una) {
 			if (callout_active(tp->tt_persist)) {

==== //depot/projects/sack/netinet/tcp_seq.h#2 (text+ko) ====

@@ -42,6 +42,9 @@
 #define	SEQ_GT(a,b)	((int)((a)-(b)) > 0)
 #define	SEQ_GEQ(a,b)	((int)((a)-(b)) >= 0)
 
+#define	SEQ_MIN(a, b)	((SEQ_LT(a, b)) ? (a) : (b))
+#define	SEQ_MAX(a, b)	((SEQ_GT(a, b)) ? (a) : (b))
+
 /* for modulo comparisons of timestamps */
 #define TSTMP_LT(a,b)	((int)((a)-(b)) < 0)
 #define TSTMP_GEQ(a,b)	((int)((a)-(b)) >= 0)

==== //depot/projects/sack/netinet/tcp_subr.c#2 (text+ko) ====

@@ -36,6 +36,7 @@
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -201,6 +202,17 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
     &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
 
+
+int tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
+    &tcp_do_sack, 0, "Enable/Disable TCP SACK support");
+
+int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
+	   &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
+
+uma_zone_t sack_hole_zone;
+
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static void	tcp_discardcb(struct tcpcb *);
 static void	tcp_isn_tick(void *);
@@ -292,6 +304,8 @@
 	tcp_isn_tick(NULL);
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
+	sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), 
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 }
 
 void
@@ -599,6 +613,7 @@
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (tcp_do_rfc1644)
 		tp->t_flags |= TF_REQ_CC;
+	tp->sack_enable = tcp_do_sack;
 	tp->t_inpcb = inp;	/* XXX */
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@@ -732,6 +747,7 @@
 		tp->t_segqlen--;
 		tcp_reass_qsize--;
 	}
+	tcp_free_sackholes(tp);
 	inp->inp_ppcb = NULL;
 	tp->t_inpcb = NULL;
 	uma_zfree(tcpcb_zone, tp);
@@ -752,7 +768,6 @@
 #ifdef INET6
 	struct socket *so = inp->inp_socket;
 #endif
-
 	tcp_discardcb(tp);
 #ifdef INET6
 	if (INP_CHECK_SOCKAF(so, AF_INET6))

==== //depot/projects/sack/netinet/tcp_syncache.c#2 (text+ko) ====

@@ -39,6 +39,7 @@
 #include "opt_ipsec.h"
 #include "opt_mac.h"
 #include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -702,7 +703,10 @@
 	if (sc->sc_flags & SCF_SIGNATURE)
 		tp->t_flags |= TF_SIGNATURE;
 #endif
-
+	if (sc->sc_flags & SCF_SACK) {
+		tp->sack_enable = 1;
+		tp->t_flags |= TF_SACK_PERMIT;
+	}
 	/*
 	 * Set up MSS and get cached values from tcp_hostcache.
 	 * This might overwrite some of the defaults we just set.
@@ -989,6 +993,9 @@
 		sc->sc_flags = SCF_SIGNATURE;
 #endif
 
+	if (to->to_flags & TOF_SACK)  
+		sc->sc_flags |= SCF_SACK;
+
 	/*
 	 * XXX
 	 * We have the option here of not doing TAO (even if the segment
@@ -1105,6 +1112,7 @@
 		optlen += (sc->sc_flags & SCF_SIGNATURE) ?
 		    TCPOLEN_SIGNATURE + 2 : 0;
 #endif
+		optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0);
 	}
 	tlen = hlen + sizeof(struct tcphdr) + optlen;
 
@@ -1242,6 +1250,11 @@
 			optp += TCPOLEN_SIGNATURE + 2;
 		}
 #endif /* TCP_SIGNATURE */
+
+	if (sc->sc_flags & SCF_SACK) {
+		*(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR);
+		optp += 4;
+	}
 	}
 
 #ifdef INET6

==== //depot/projects/sack/netinet/tcp_timer.c#2 (text+ko) ====

@@ -32,6 +32,7 @@
 
 #include "opt_inet6.h"
 #include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -217,6 +218,7 @@
 		return;
 	}
 	INP_LOCK(inp);
+	tcp_free_sackholes(tp);
 	if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
 		INP_UNLOCK(tp->t_inpcb);
 		INP_INFO_WUNLOCK(&tcbinfo);
@@ -497,6 +499,7 @@
 		return;
 	}
 	callout_deactivate(tp->tt_rexmt);
+	tcp_free_sackholes(tp);
 	/*
 	 * Retransmission timer went off.  Message has not
 	 * been acked within retransmit interval.  Back off

==== //depot/projects/sack/netinet/tcp_var.h#2 (text+ko) ====

@@ -52,6 +52,17 @@
 extern int	tcp_reass_qsize;
 extern struct uma_zone *tcp_reass_zone;
 
+struct sackblk {
+	tcp_seq start;		/* start seq no. of sack block */
+	tcp_seq end; 		/* end seq no. */
+};
+
+struct sackhole {
+	tcp_seq start;		/* start seq no. of hole */
+	tcp_seq end;		/* end seq no. */
+	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
+	struct sackhole *next;	/* next in list */
+};
 struct tcptemp {
 	u_char	tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
 	struct	tcphdr tt_t;
@@ -179,6 +190,16 @@
 	u_long	rcv_second;		/* start of interval second */
 	u_long	rcv_pps;		/* received packets per second */
 	u_long	rcv_byps;		/* received bytes per second */
+	/* SACK related state */
+	int	sack_enable;		/* enable SACK for this connection */
+	int	snd_numholes;		/* number of holes seen by sender */
+	struct sackhole *snd_holes;	/* linked list of holes (sorted) */
+
+	tcp_seq	rcv_laststart;		/* start of last segment recd. */
+	tcp_seq	rcv_lastend;		/* end of ... */
+	tcp_seq	rcv_lastsack;		/* last seq number(+1) sack'd by rcv'r*/
+	int	rcv_numsacks;		/* # distinct sack blks present */
+	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 };
 
 #define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)
@@ -216,6 +237,7 @@
 #define	TOF_SCALE	0x0020
 #define	TOF_SIGNATURE	0x0040		/* signature option present */
 #define	TOF_SIGLEN	0x0080		/* signature length valid (RFC2385) */
+#define	TOF_SACK	0x0100		/* Peer sent SACK option */
 	u_int32_t	to_tsval;
 	u_int32_t	to_tsecr;
 	tcp_cc		to_cc;		/* holds CC or CCnew */
@@ -249,6 +271,7 @@
 #define SCF_CC		0x08			/* negotiated CC */
 #define SCF_UNREACH	0x10			/* icmp unreachable received */
 #define SCF_SIGNATURE	0x20			/* send MD5 digests */
+#define SCF_SACK	0x80			/* send SACK option */
 	TAILQ_ENTRY(syncache)	sc_hash;
 	TAILQ_ENTRY(syncache)	sc_timerq;
 };
@@ -434,6 +457,13 @@
 
 	u_long	tcps_hc_added;		/* entry added to hostcache */
 	u_long	tcps_hc_bucketoverflow;	/* hostcache per bucket limit hit */
+
+	/* SACK related stats */
+	u_long	tcps_sack_recovery_episode; /* SACK recovery episodes */
+	u_long  tcps_sack_rexmits;	    /* SACK rexmit segments   */
+	u_long  tcps_sack_rexmit_bytes;	    /* SACK rexmit bytes      */	
+	u_long  tcps_sack_rcv_blocks;	    /* SACK blocks (options) received */
+	u_long  tcps_sack_send_blocks;	    /* SACK blocks (options) sent     */
 };
 
 /*
@@ -467,7 +497,8 @@
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
-#define	TCPCTL_MAXID		14
+#define	TCPCTL_SACK		14	/* Selective Acknowledgement,rfc 2018 */
+#define	TCPCTL_MAXID		15
 
 #define TCPCTL_NAMES { \
 	{ 0, 0 }, \
@@ -505,6 +536,8 @@
 extern	int ss_fltsz;
 extern	int ss_fltsz_local;
 
+extern	int tcp_do_sack;	/* SACK enabled/disabled */
+
 void	 tcp_canceltimers(struct tcpcb *);
 struct tcpcb *
 	 tcp_close(struct tcpcb *);
@@ -578,6 +611,23 @@
 extern	u_long tcp_recvspace;
 tcp_seq tcp_new_isn(struct tcpcb *);
 
+int	 tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int);
+void	 tcp_update_sack_list(struct tcpcb *tp);
+void	 tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
+void	 tcp_clean_sackreport(struct tcpcb *tp);
+void	 tcp_sack_adjust(struct tcpcb *tp);
+struct sackhole *tcp_sack_output(struct tcpcb *tp);
+void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
+void	 tcp_free_sackholes(struct tcpcb *tp);
+#ifdef DEBUG
+void	 tcp_print_holes(struct tcpcb *tp);
+#endif
+int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
+u_long	 tcp_seq_subtract(u_long, u_long );
+#ifdef TCP_SACK_DEBUG
+void 	 tcp_print_holes(struct tcpcb *tp);
+#endif /* TCP_SACK_DEBUG */
+
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200406072358.i57NwCsK065762>