Date: Wed, 22 Aug 2001 17:15:18 -0700 (PDT)
From: Dave Zarzycki <zarzycki@freebsd.org>
To: <freebsd-net@freebsd.org>
Subject: RFC: SACK/FACK patch port to Current
Message-ID: <Pine.LNX.4.33.0108221713240.16888-200000@bonk.apple.com>
[-- Attachment #1 --]
Attached and tested.
I'd like to merge this in unless anybody objects.
davez
--
Dave Zarzycki
Darwin & Mac OS X
Apple Computer, Inc.
[-- Attachment #2 --]
Index: tcp.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp.h,v
retrieving revision 1.16
diff -u -d -b -w -u -d -r1.16 tcp.h
--- tcp.h 2001/01/09 18:26:17 1.16
+++ tcp.h 2001/08/22 23:11:21
@@ -85,12 +85,22 @@
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_SACK 5 /* Experimental */
+#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq): len of SACK blk */
#define TCPOPT_TIMESTAMP 8
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_TSTAMP_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+#define TCPOPT_SACK_PERMIT_HDR \
+ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+
+/* Miscellaneous constants */
+#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */
+
+
#define TCPOPT_CC 11 /* CC options: RFC-1644 */
#define TCPOPT_CCNEW 12
#define TCPOPT_CCECHO 13
@@ -133,5 +143,6 @@
#define TCP_MAXSEG 0x02 /* set maximum segment size */
#define TCP_NOPUSH 0x04 /* don't push last block of write */
#define TCP_NOOPT 0x08 /* don't use TCP options */
+#define TCP_SACK_DISABLE 0x300 /* disable SACKs (if enabled by default) */
#endif
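
For reviewers: a quick way to sanity-check the new option words above without
building a kernel. The four option bytes (NOP, NOP, kind, length) are built as
a single host-order word and byte-swapped once on output. Stand-alone
user-space sketch, not part of the diff:

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

#define TCPOPT_NOP		1
#define TCPOPT_SACK_PERMITTED	4
#define TCPOLEN_SACK_PERMITTED	2
#define TCPOPT_SACK_PERMIT_HDR \
	(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)

int
main(void)
{
	u_int32_t word = htonl(TCPOPT_SACK_PERMIT_HDR);
	unsigned char wire[4];

	memcpy(wire, &word, sizeof(wire));
	/* prints "01 01 04 02": NOP, NOP, kind 4, length 2 */
	printf("%02x %02x %02x %02x\n", wire[0], wire[1], wire[2], wire[3]);
	return (0);
}
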
Index: tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.137
diff -u -d -b -w -u -d -r1.137 tcp_input.c
--- tcp_input.c 2001/08/22 00:58:16 1.137
+++ tcp_input.c 2001/08/22 23:11:21
@@ -100,7 +100,7 @@
MALLOC_DEFINE(M_TSEGQ, "tseg_qent", "TCP segment queue entry");
-static int tcprexmtthresh = 3;
+int tcprexmtthresh = 3;
tcp_cc tcp_ccgen;
struct tcpstat tcpstat;
@@ -870,6 +870,10 @@
tp->t_rcvtime = ticks;
if (TCPS_HAVEESTABLISHED(tp->t_state))
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
+#ifdef TCP_SACK
+ if (!tp->sack_disable)
+ tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
+#endif /* TCP_SACK */
/*
* Process options if not in LISTEN state,
@@ -878,6 +882,12 @@
if (tp->t_state != TCPS_LISTEN)
tcp_dooptions(tp, optp, optlen, th, &to);
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
+ tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+ tp->rcv_lastend = th->th_seq + tlen;
+ }
+#endif /* TCP_SACK */
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -954,6 +964,19 @@
tcpstat.tcps_rcvackbyte += acked;
sbdrop(&so->so_snd, acked);
tp->snd_una = th->th_ack;
+
+#if defined(TCP_SACK)
+ /*
+ * We want snd_last to track snd_una so
+ * as to avoid sequence wraparound problems
+ * for very large transfers.
+ */
+ tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+#endif /* TCP_FACK */
m_freem(m);
ND6_HINT(tp); /* some progress has been done */
@@ -986,6 +1009,12 @@
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
+
+#ifdef TCP_SACK
+ /* Clean receiver SACK report if present */
+ if (!tp->sack_disable && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+#endif /* TCP_SACK */
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
tcpstat.tcps_rcvpack++;
@@ -1131,6 +1160,17 @@
bzero(taop, sizeof(*taop));
}
tcp_dooptions(tp, optp, optlen, th, &to);
+
+#ifdef TCP_SACK
+ /*
+ * If peer did not send a SACK_PERMITTED option (i.e., if
+ * tcp_dooptions() did not set TF_SACK_PERMIT), set
+ * sack_disable to 1 if it is currently 0.
+ */
+ if (!tp->sack_disable)
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+ tp->sack_disable = 1;
+#endif
if (iss)
tp->iss = iss;
else {
@@ -1138,6 +1178,14 @@
}
tp->irs = th->th_seq;
tcp_sendseqinit(tp);
+#if defined (TCP_SACK)
+ tp->snd_last = tp->snd_una;
+#endif /* TCP_SACK */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif /* TCP_FACK */
tcp_rcvseqinit(tp);
tp->snd_recover = tp->snd_una;
/*
@@ -1309,6 +1357,17 @@
}
} else
tp->t_flags &= ~TF_RCVD_CC;
+#ifdef TCP_SACK
+ /*
+ * If we've sent a SACK_PERMITTED option, and the peer
+ * also replied with one, then TF_SACK_PERMIT should have
+ * been set in tcp_dooptions(). If it was not, disable SACKs.
+ */
+ if (!tp->sack_disable)
+ if ((tp->t_flags & TF_SACK_PERMIT) == 0)
+ tp->sack_disable = 1;
+#endif
+
tcpstat.tcps_connects++;
soisconnected(so);
/* Do window scaling on this connection? */
@@ -1820,14 +1879,36 @@
* to keep a constant cwnd packets in the
* network.
*/
+
if (!callout_active(tp->tt_rexmt) ||
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * In FACK, we can enter fast recovery if the receiver
+ * reports a reassembly queue longer than 3 segments.
+ */
+ else if (++tp->t_dupacks == tcprexmtthresh ||
+ ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
+ tp->t_maxseg + tp->snd_una)) &&
+ SEQ_GT(tp->snd_una, tp->snd_last))) {
+#else
else if (++tp->t_dupacks == tcprexmtthresh) {
+#endif /* TCP_FACK */
tcp_seq onxt = tp->snd_nxt;
u_int win =
min(tp->snd_wnd, tp->snd_cwnd) / 2 /
tp->t_maxseg;
+#if defined(TCP_SACK)
+ if (SEQ_LT(th->th_ack, tp->snd_last)){
+ /*
+ * False fast retx after
+ * timeout. Do not cut window.
+ */
+ tp->t_dupacks = 0;
+ goto drop;
+ }
+#else
if (tcp_do_newreno && SEQ_LT(th->th_ack,
tp->snd_recover)) {
/* False retransmit, should not
@@ -1838,21 +1919,62 @@
(void) tcp_output(tp);
goto drop;
}
+#endif
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_maxseg;
+#if defined(TCP_SACK)
+ tp->snd_last = tp->snd_max;
+#else
tp->snd_recover = tp->snd_max;
+#endif
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->t_dupacks = tcprexmtthresh;
+ (void) tcp_output(tp);
+ /*
+ * During FR, snd_cwnd is held
+ * constant for FACK.
+ */
+ tp->snd_cwnd = tp->snd_ssthresh;
+#else
+ /*
+ * tcp_output() will send
+ * oldest SACK-eligible rtx.
+ */
+ (void) tcp_output(tp);
+ tp->snd_cwnd = tp->snd_ssthresh+
+ tp->t_maxseg * tp->t_dupacks;
+#endif /* TCP_FACK */
+ goto drop;
+ }
+#endif /* TCP_SACK */
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
+
tp->snd_cwnd = tp->snd_ssthresh +
tp->t_maxseg * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
goto drop;
} else if (tp->t_dupacks > tcprexmtthresh) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * while (awnd < cwnd)
+ * sendsomething();
+ */
+ if (!tp->sack_disable) {
+ if (tp->snd_awnd < tp->snd_cwnd)
+ tcp_output(tp);
+ goto drop;
+ }
+#endif /* TCP_FACK */
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
goto drop;
@@ -1861,10 +1983,57 @@
tp->t_dupacks = 0;
break;
}
+
/*
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
+#if defined(TCP_SACK)
+ if (!tp->sack_disable) {
+ if (tp->t_dupacks >= tcprexmtthresh) {
+ /* Check for a partial ACK */
+ if (tcp_sack_partialack(tp, th)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Force call to tcp_output */
+ if (tp->snd_awnd < tp->snd_cwnd)
+ needoutput = 1;
+#else
+ tp->snd_cwnd += tp->t_maxseg;
+ needoutput = 1;
+#endif /* TCP_FACK */
+ } else {
+ /* Out of fast recovery */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ if (tcp_seq_subtract(tp->snd_max,
+ th->th_ack) < tp->snd_ssthresh)
+ tp->snd_cwnd =
+ tcp_seq_subtract(tp->snd_max,
+ th->th_ack);
+ tp->t_dupacks = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(th->th_ack, tp->snd_fack))
+ tp->snd_fack = th->th_ack;
+#endif /* TCP_FACK */
+ }
+ }
+ } else {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ !tcp_newreno(tp, th)) {
+ /* Out of fast recovery */
+ tp->snd_cwnd = tp->snd_ssthresh;
+ if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
+ tp->snd_ssthresh)
+ tp->snd_cwnd =
+ tcp_seq_subtract(tp->snd_max,
+ th->th_ack);
+ tp->t_dupacks = 0;
+ }
+ }
+ if (tp->t_dupacks < tcprexmtthresh)
+ tp->t_dupacks = 0;
+#else /* else no TCP_SACK */
if (tcp_do_newreno == 0) {
if (tp->t_dupacks >= tcprexmtthresh &&
tp->snd_cwnd > tp->snd_ssthresh)
@@ -1885,6 +2054,8 @@
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_dupacks = 0;
}
+#endif
+
if (SEQ_GT(th->th_ack, tp->snd_max)) {
tcpstat.tcps_rcvacktoomuch++;
goto dropafterack;
@@ -1982,8 +2153,14 @@
* in NewReno fast recovery mode, so we leave the congestion
* window alone.
*/
+
+#if defined (TCP_SACK)
+ if (tp->t_dupacks < tcprexmtthresh)
+ tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+#else
if (tcp_do_newreno == 0 || tp->t_dupacks == 0)
tp->snd_cwnd = min(cw + incr,TCP_MAXWIN<<tp->snd_scale);
+#endif
}
if (acked > so->so_snd.sb_cc) {
tp->snd_wnd -= so->so_snd.sb_cc;
@@ -1998,6 +2175,16 @@
tp->snd_una = th->th_ack;
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
+#if defined (TCP_SACK) && defined (TCP_FACK)
+ if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
+ tp->snd_fack = tp->snd_una;
+ /* Update snd_awnd for partial ACK
+ * without any SACK blocks.
+ */
+ tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
+ tp->snd_fack) + tp->retran_data;
+ }
+#endif
switch (tp->t_state) {
@@ -2196,7 +2383,10 @@
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
-
+#ifdef TCP_SACK
+ if (!tp->sack_disable)
+ tcp_update_sack_list(tp);
+#endif
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@@ -2208,6 +2398,7 @@
thflags &= ~TH_FIN;
}
+
/*
* If FIN is received ACK the FIN and let the user know
* that the connection is closing.
@@ -2498,13 +2689,439 @@
(char *)&to->to_ccecho, sizeof(to->to_ccecho));
NTOHL(to->to_ccecho);
break;
+#ifdef TCP_SACK
+ case TCPOPT_SACK_PERMITTED:
+ if (tp->sack_disable || optlen!=TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (th->th_flags & TH_SYN)
+ /* MUST only be set on SYN */
+ tp->t_flags |= TF_SACK_PERMIT;
+ break;
+ case TCPOPT_SACK:
+ if (tcp_sack_option(tp, th, cp, optlen))
+ continue;
+ break;
+#endif
}
}
if (th->th_flags & TH_SYN)
tcp_mss(tp, mss); /* sets t_maxseg */
}
+#if defined(TCP_SACK)
+u_long
+tcp_seq_subtract(a, b)
+ u_long a, b;
+{
+ return ((long)(a - b));
+}
+#endif
+
+
+
+#ifdef TCP_SACK
+/*
+ * This function is called upon receipt of new valid data (while not in header
+ * prediction mode), and it updates the ordered list of sacks.
+ */
+void
+tcp_update_sack_list(tp)
+ struct tcpcb *tp;
+{
+ /*
+ * First reported block MUST be the most recent one. Subsequent
+ * blocks SHOULD be in the order in which they arrived at the
+ * receiver. These two conditions make the implementation fully
+ * compliant with RFC 2018.
+ */
+ int i, j = 0, count = 0, lastpos = -1;
+ struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+
+ /* First clean up current list of sacks */
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0) {
+ count++; /* count = number of blocks to be discarded */
+ continue;
+ }
+ if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ count++;
+ } else {
+ temp[j].start = tp->sackblks[i].start;
+ temp[j++].end = tp->sackblks[i].end;
+ }
+ }
+ tp->rcv_numsacks -= count;
+ if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+ tcp_clean_sackreport(tp);
+ if (SEQ_LT(tp->rcv_nxt, tp->rcv_laststart)) {
+ /* ==> need first sack block */
+ tp->sackblks[0].start = tp->rcv_laststart;
+ tp->sackblks[0].end = tp->rcv_lastend;
+ tp->rcv_numsacks = 1;
+ }
+ return;
+ }
+ /* Otherwise, sack blocks are already present. */
+ for (i = 0; i < tp->rcv_numsacks; i++)
+ tp->sackblks[i] = temp[i]; /* first copy back sack list */
+ if (SEQ_GEQ(tp->rcv_nxt, tp->rcv_lastend))
+ return; /* sack list remains unchanged */
+ /*
+ * From here, segment just received should be (part of) the 1st sack.
+ * Go through list, possibly coalescing sack block entries.
+ */
+ firstsack.start = tp->rcv_laststart;
+ firstsack.end = tp->rcv_lastend;
+ for (i = 0; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (SEQ_LT(sack.end, firstsack.start) ||
+ SEQ_GT(sack.start, firstsack.end))
+ continue; /* no overlap */
+ if (sack.start == firstsack.start && sack.end == firstsack.end){
+ /*
+ * identical block; delete it here since we will
+ * move it to the front of the list.
+ */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, firstsack.start))
+ firstsack.start = sack.start; /* merge blocks */
+ if (SEQ_GEQ(sack.end, firstsack.end))
+ firstsack.end = sack.end; /* merge blocks */
+ tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ lastpos = i; /* last posn with a zero entry */
+ }
+ if (lastpos != -1) { /* at least one merge */
+ for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+ sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ temp[j++] = sack;
+ }
+ tp->rcv_numsacks = j; /* including first blk (added later) */
+ for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+ tp->sackblks[i] = temp[i];
+ } else { /* no merges -- shift sacks by 1 */
+ if (tp->rcv_numsacks < MAX_SACK_BLKS)
+ tp->rcv_numsacks++;
+ for (i = tp->rcv_numsacks-1; i > 0; i--)
+ tp->sackblks[i] = tp->sackblks[i-1];
+ }
+ tp->sackblks[0] = firstsack;
+ return;
+}
+
/*
+ * Process the TCP SACK option. Returns 1 if tcp_dooptions() should move on
+ * to the next option, and 0 if this one was processed successfully.
+ * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
+ * the sequence space).
+ */
+int
+tcp_sack_option(tp, th, cp, optlen)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+ u_char *cp;
+ int optlen;
+{
+ int tmp_olen;
+ u_char *tmp_cp;
+ struct sackhole *cur, *p, *temp;
+
+ if (tp->sack_disable)
+ return 1;
+
+ /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+ if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+ return 1;
+ tmp_cp = cp + 2;
+ tmp_olen = optlen - 2;
+ if (tp->snd_numholes < 0)
+ tp->snd_numholes = 0;
+ if (tp->t_maxseg == 0)
+ panic("tcp_sack_option"); /* Should never happen */
+ while (tmp_olen > 0) {
+ struct sackblk sack;
+
+ bcopy((char *) tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+ NTOHL(sack.start);
+ bcopy((char *) tmp_cp + sizeof(tcp_seq),
+ (char *) &(sack.end), sizeof(tcp_seq));
+ NTOHL(sack.end);
+ tmp_olen -= TCPOLEN_SACK;
+ tmp_cp += TCPOLEN_SACK;
+ if (SEQ_LEQ(sack.end, sack.start))
+ continue; /* bad SACK fields */
+ if (SEQ_LEQ(sack.end, tp->snd_una))
+ continue; /* old block */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Updates snd_fack. */
+ if (SEQ_GEQ(sack.end, tp->snd_fack))
+ tp->snd_fack = sack.end;
+#endif /* TCP_FACK */
+ if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ if (SEQ_LT(sack.start, th->th_ack))
+ continue;
+ } else {
+ if (SEQ_LT(sack.start, tp->snd_una))
+ continue;
+ }
+ if (SEQ_GT(sack.end, tp->snd_max))
+ continue;
+ if (tp->snd_holes == 0) { /* first hole */
+ tp->snd_holes = (struct sackhole *)
+ malloc(sizeof(struct sackhole), M_PCB, M_NOWAIT);
+ if (tp->snd_holes == NULL) {
+ /* ENOBUFS, so ignore SACKed block for now*/
+ continue;
+ }
+ cur = tp->snd_holes;
+ cur->start = th->th_ack;
+ cur->end = sack.start;
+ cur->rxmit = cur->start;
+ cur->next = 0;
+ tp->snd_numholes = 1;
+ tp->rcv_lastsack = sack.end;
+ /*
+ * dups is at least one. If more data has been
+ * SACKed, it can be greater than one.
+ */
+ cur->dups = min(tcprexmtthresh,
+ ((sack.end - cur->end)/tp->t_maxseg));
+ if (cur->dups < 1)
+ cur->dups = 1;
+ continue; /* with next sack block */
+ }
+ /* Go thru list of holes: p = previous, cur = current */
+ p = cur = tp->snd_holes;
+ while (cur) {
+ if (SEQ_LEQ(sack.end, cur->start))
+ /* SACKs data before the current hole */
+ break; /* no use going through more holes */
+ if (SEQ_GEQ(sack.start, cur->end)) {
+ /* SACKs data beyond the current hole */
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LEQ(sack.start, cur->start)) {
+ /* Data acks at least the beginning of hole */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(sack.end, cur->rxmit))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ cur->start);
+ else
+ tp->retran_data -=
+ tcp_seq_subtract(sack.end,
+ cur->start);
+#endif /* TCP_FACK */
+ if (SEQ_GEQ(sack.end,cur->end)){
+ /* Acks entire hole, so delete hole */
+ if (p != cur) {
+ p->next = cur->next;
+ free(cur, M_PCB);
+ cur = p->next;
+ } else {
+ cur=cur->next;
+ free(p, M_PCB);
+ p = cur;
+ tp->snd_holes = p;
+ }
+ tp->snd_numholes--;
+ continue;
+ }
+ /* otherwise, move start of hole forward */
+ cur->start = sack.end;
+ cur->rxmit = max (cur->rxmit, cur->start);
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ /* move end of hole backward */
+ if (SEQ_GEQ(sack.end, cur->end)) {
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(cur->rxmit, sack.start))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ sack.start);
+#endif /* TCP_FACK */
+ cur->end = sack.start;
+ cur->rxmit = min (cur->rxmit, cur->end);
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ p = cur;
+ cur = cur->next;
+ continue;
+ }
+ if (SEQ_LT(cur->start, sack.start) &&
+ SEQ_GT(cur->end, sack.end)) {
+ /*
+ * ACKs some data in middle of a hole; need to
+ * split current hole
+ */
+ temp = (struct sackhole *)malloc(sizeof(*temp),
+ M_PCB,M_NOWAIT);
+ if (temp == NULL)
+ continue; /* ENOBUFS */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ if (SEQ_GT(cur->rxmit, sack.end))
+ tp->retran_data -=
+ tcp_seq_subtract(sack.end,
+ sack.start);
+ else if (SEQ_GT(cur->rxmit, sack.start))
+ tp->retran_data -=
+ tcp_seq_subtract(cur->rxmit,
+ sack.start);
+#endif /* TCP_FACK */
+ temp->next = cur->next;
+ temp->start = sack.end;
+ temp->end = cur->end;
+ temp->dups = cur->dups;
+ temp->rxmit = max (cur->rxmit, temp->start);
+ cur->end = sack.start;
+ cur->rxmit = min (cur->rxmit, cur->end);
+ cur->dups++;
+ if ( ((sack.end - cur->end)/tp->t_maxseg) >=
+ tcprexmtthresh)
+ cur->dups = tcprexmtthresh;
+ cur->next = temp;
+ p = temp;
+ cur = p->next;
+ tp->snd_numholes++;
+ }
+ }
+ /* At this point, p points to the last hole on the list */
+ if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+ /*
+ * Need to append new hole at end.
+ * Last hole is p (and it's not NULL).
+ */
+ temp = (struct sackhole *) malloc(sizeof(*temp),
+ M_PCB, M_NOWAIT);
+ if (temp == NULL)
+ continue; /* ENOBUFS */
+ temp->start = tp->rcv_lastsack;
+ temp->end = sack.start;
+ temp->dups = min(tcprexmtthresh,
+ ((sack.end - sack.start)/tp->t_maxseg));
+ if (temp->dups < 1)
+ temp->dups = 1;
+ temp->rxmit = temp->start;
+ temp->next = 0;
+ p->next = temp;
+ tp->rcv_lastsack = sack.end;
+ tp->snd_numholes++;
+ }
+ }
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * Update retran_data and snd_awnd. Go through the list of
+ * holes. Increment retran_data by (hole->rxmit - hole->start).
+ */
+ tp->retran_data = 0;
+ cur = tp->snd_holes;
+ while (cur) {
+ tp->retran_data += cur->rxmit - cur->start;
+ cur = cur->next;
+ }
+ tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
+ tp->retran_data;
+#endif /* TCP_FACK */
+
+ return 0;
+}
+
+/*
+ * Delete stale (i.e., cumulatively ACKed) holes. A hole is deleted only if
+ * it is completely acked; otherwise, tcp_sack_option(), called from
+ * tcp_dooptions(), will fix up the hole.
+ */
+void
+tcp_del_sackholes(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ if (!tp->sack_disable && tp->t_state != TCPS_LISTEN) {
+ /* max because this could be an older ack just arrived */
+ tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
+ th->th_ack : tp->snd_una;
+ struct sackhole *cur = tp->snd_holes;
+ struct sackhole *prev = cur;
+ while (cur)
+ if (SEQ_LEQ(cur->end, lastack)) {
+ cur = cur->next;
+ free(prev, M_PCB);
+ prev = cur;
+ tp->snd_numholes--;
+ } else if (SEQ_LT(cur->start, lastack)) {
+ cur->start = lastack;
+ if (SEQ_LT(cur->rxmit, cur->start))
+ cur->rxmit = cur->start;
+ break;
+ } else
+ break;
+ tp->snd_holes = cur;
+ }
+}
+
+/*
+ * Delete all receiver-side SACK information.
+ */
+void
+tcp_clean_sackreport(tp)
+ struct tcpcb *tp;
+{
+ int i;
+
+ tp->rcv_numsacks = 0;
+ for (i = 0; i < MAX_SACK_BLKS; i++)
+ tp->sackblks[i].start = tp->sackblks[i].end=0;
+
+}
+
+/*
+ * Checks for partial ack. If partial ack arrives, turn off retransmission
+ * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
+ * If the ack advances at least to tp->snd_last, return 0.
+ */
+int
+tcp_sack_partialack(tp, th)
+ struct tcpcb *tp;
+ struct tcphdr *th;
+{
+ if (SEQ_LT(th->th_ack, tp->snd_last)) {
+ /* Turn off retx. timer (will start again next segment) */
+ callout_stop(tp->tt_rexmt);
+ tp->t_rtttime = 0;
+#ifndef TCP_FACK
+ /*
+ * Partial window deflation. This statement relies on the
+ * fact that tp->snd_una has not been updated yet. In FACK
+ * hold snd_cwnd constant during fast recovery.
+ */
+ if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
+ tp->snd_cwnd -= th->th_ack - tp->snd_una;
+ tp->snd_cwnd += tp->t_maxseg;
+ } else
+ tp->snd_cwnd = tp->t_maxseg;
+#endif
+ return 1;
+ }
+ return 0;
+}
+#endif /* TCP_SACK */
+
+/*
* Pull out of band byte out of a segment so
* it doesn't appear in the user's data queue.
* It is still reflected in the segment length for
@@ -2909,7 +3526,12 @@
struct tcpcb *tp;
struct tcphdr *th;
{
+
+#if defined (TCP_SACK)
+if (SEQ_LT(th->th_ack, tp->snd_last)) {
+#else
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+#endif
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
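
A note on the arithmetic used throughout the new code above: everything is
done in 32-bit sequence space, so distances must be computed modulo 2^32,
which is exactly what tcp_seq_subtract() does. User-space sketch, not part of
the diff; a fixed 32-bit type is used here because the arithmetic has to wrap
at 2^32:

#include <sys/types.h>
#include <stdio.h>

/* 32-bit modular subtraction, as in tcp_seq_subtract() above */
static u_int32_t
seq_subtract(u_int32_t a, u_int32_t b)
{
	return (a - b);
}

/* roughly SEQ_LT() from <netinet/tcp_seq.h>, shown for comparison */
#define SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)

int
main(void)
{
	u_int32_t snd_fack = 0xfffffff0;	/* just before a wrap */
	u_int32_t snd_nxt = 0x00000010;		/* just after the wrap */

	/* prints "32 1": 32 bytes in flight, and snd_fack < snd_nxt */
	printf("%u %d\n", seq_subtract(snd_nxt, snd_fack),
	    SEQ_LT(snd_fack, snd_nxt));
	return (0);
}
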
Index: tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.52
diff -u -d -b -w -u -d -r1.52 tcp_output.c
--- tcp_output.c 2001/06/23 03:21:46 1.52
+++ tcp_output.c 2001/08/22 23:11:21
@@ -98,6 +98,106 @@
int tcp_do_newreno = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, &tcp_do_newreno,
0, "Enable NewReno Algorithms");
+
+#ifdef TCP_SACK
+extern int tcprexmtthresh;
+#endif
+
+#ifdef TCP_SACK
+#ifdef TCP_SACK_DEBUG
+void
+tcp_print_holes(tp)
+struct tcpcb *tp;
+{
+ struct sackhole *p = tp->snd_holes;
+ if (p == 0)
+ return;
+ printf("Hole report: start--end dups rxmit\n");
+ while (p) {
+ printf("%d--%d d %d r %d\n", p->start, p->end, p->dups,
+ p->rxmit);
+ p = p->next;
+ }
+ printf("\n");
+}
+#endif /* TCP_SACK_DEBUG */
+
+/*
+ * Returns pointer to a sackhole if there are any pending retransmissions;
+ * NULL otherwise.
+ */
+struct sackhole *
+tcp_sack_output(tp)
+register struct tcpcb *tp;
+{
+ struct sackhole *p;
+ if (tp->sack_disable)
+ return 0;
+ p = tp->snd_holes;
+ while (p) {
+#ifndef TCP_FACK
+ if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
+#else
+ /* In FACK, if p->dups is less than tcprexmtthresh, but
+ * snd_fack advances more than tcprexmtthresh * tp->t_maxseg,
+ * tcp_input() will try fast retransmit. This forces output.
+ */
+ if ((p->dups >= tcprexmtthresh ||
+ tp->t_dupacks == tcprexmtthresh) &&
+ SEQ_LT(p->rxmit, p->end)) {
+#endif /* TCP_FACK */
+ if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
+ p = p->next;
+ continue;
+ }
+#ifdef TCP_SACK_DEBUG
+ if (p)
+ tcp_print_holes(tp);
+#endif
+ return p;
+ }
+ p = p->next;
+ }
+ return 0;
+}
+
+/*
+ * After a timeout, the SACK list may be rebuilt. This SACK information
+ * should be used to avoid retransmitting SACKed data. This function
+ * traverses the SACK list to see if snd_nxt should be moved forward.
+ */
+void
+tcp_sack_adjust(tp)
+ struct tcpcb *tp;
+{
+ struct sackhole *cur = tp->snd_holes;
+ if (cur == 0)
+ return; /* No holes */
+ if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+ return; /* We're already beyond any SACKed blocks */
+ /*
+ * Two cases for which we want to advance snd_nxt:
+ * i) snd_nxt lies between end of one hole and beginning of another
+ * ii) snd_nxt lies between end of last hole and rcv_lastsack
+ */
+ while (cur->next) {
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
+ cur = cur->next;
+ else {
+ tp->snd_nxt = cur->next->start;
+ return;
+ }
+ }
+ if (SEQ_LT(tp->snd_nxt, cur->end))
+ return;
+ tp->snd_nxt = tp->rcv_lastsack;
+ return;
+}
+#endif /* TCP_SACK */
+
+
/*
* Tcp output routine: figure out what should be sent and send it.
*/
@@ -118,6 +218,10 @@
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot;
+#ifdef TCP_SACK
+ int i, sack_rxmit = 0;
+ struct sackhole *p;
+#endif
int maxburst = TCP_MAXBURST;
struct rmxp_tao *taop;
struct rmxp_tao tao_noncached;
@@ -161,10 +265,30 @@
}
again:
sendalot = 0;
+#ifdef TCP_SACK
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ if (!tp->sack_disable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
+#endif
off = tp->snd_nxt - tp->snd_una;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Normally, sendable data is limited by off < tp->snd_cwnd.
+ * But in FACK, sendable data is limited by snd_awnd < snd_cwnd,
+ * regardless of offset.
+ */
+ if (!tp->sack_disable && (tp->t_dupacks > tcprexmtthresh))
+ win = tp->snd_wnd;
+ else
+#endif
win = min(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
+
+
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
@@ -173,7 +297,33 @@
flags |= TH_FIN;
if (tp->t_flags & TF_NEEDSYN)
flags |= TH_SYN;
+#ifdef TCP_SACK
+ /*
+ * Send any SACK-generated retransmissions. If we're explicitly trying
+ * to send out new data (when sendalot is 1), bypass this function.
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+ * we're replacing a (future) new transmission with a retransmission
+ * now, and we previously incremented snd_cwnd in tcp_input().
+ */
+ if (!tp->sack_disable && !sendalot) {
+ if (tp->t_dupacks >= tcprexmtthresh &&
+ (p = tcp_sack_output(tp))) {
+ off = p->rxmit - tp->snd_una;
+ sack_rxmit = 1;
+#if 0
+ /* Coalesce holes into a single retransmission */
+#endif
+ len = min(tp->t_maxseg, p->end - p->rxmit);
+#ifndef TCP_FACK
+ /* in FACK, hold snd_cwnd constant during recovery */
+ if (SEQ_LT(tp->snd_una, tp->snd_last))
+ tp->snd_cwnd -= tp->t_maxseg;
+#endif
+ }
+ }
+#endif /* TCP_SACK */
+ sendalot = 0;
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
@@ -207,8 +357,26 @@
}
}
+#ifdef TCP_SACK
+ if (!sack_rxmit) {
+#endif
len = (long)ulmin(so->so_snd.sb_cc, win) - off;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /*
+ * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
+ * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
+ * do not send data (like zero window conditions)
+ */
+ if (!tp->sack_disable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
+ (tp->snd_awnd >= tp->snd_cwnd))
+ len = 0;
+#endif /* TCP_FACK */
+#ifdef TCP_SACK
+ }
+#endif
+
+
if ((taop = tcp_gettaocache(tp->t_inpcb)) == NULL) {
taop = &tao_noncached;
bzero(taop, sizeof(*taop));
@@ -293,6 +461,10 @@
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
+#ifdef TCP_SACK
+ if (sack_rxmit)
+ goto send;
+#endif
}
/*
@@ -335,6 +507,20 @@
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
+#ifdef TCP_SACK
+ /*
+ * In SACK, it is possible for tcp_output to fail to send a segment
+ * after the retransmission timer has been turned off. Make sure
+ * that the retransmission timer is set.
+ */
+ if (SEQ_GT(tp->snd_max, tp->snd_una) &&
+ !callout_active(tp->tt_rexmt) &&
+ !callout_active(tp->tt_persist)){
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ return (0);
+ }
+#endif /* TCP_SACK */
/*
* TCP window updates are not reliable, rather a polling protocol
@@ -395,7 +581,22 @@
mss = htons((u_short) tcp_mssopt(tp));
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
+#ifdef TCP_SACK
+ /*
+ * If this is the first SYN of connection (not a SYN
+ * ACK), include SACK_PERMIT_HDR option. If this is a
+ * SYN ACK, include SACK_PERMIT_HDR option if peer has
+ * already done so.
+ */
+ if (!tp->sack_disable && ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ *((u_int32_t *) (opt + optlen)) =
+ htonl(TCPOPT_SACK_PERMIT_HDR);
+ optlen += 4;
+ }
+#endif
+
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
@@ -426,6 +627,33 @@
*lp = htonl(tp->ts_recent);
optlen += TCPOLEN_TSTAMP_APPA;
}
+#ifdef TCP_SACK
+ /*
+ * Send SACKs if necessary. This should be the last option processed.
+ * Only as many SACKs are sent as are permitted by the maximum options
+ * size. No more than three SACKs are sent.
+ */
+ if (!tp->sack_disable && tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+ tp->rcv_numsacks) {
+ u_int32_t *lp = (u_int32_t *)(opt + optlen);
+ u_int32_t *olp = lp++;
+ int count = 0; /* actual number of SACKs inserted */
+ int maxsack = (TCP_MAXOLEN - (optlen + 4))/TCPOLEN_SACK;
+
+ maxsack = min(maxsack, TCP_MAX_SACK);
+ for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+ struct sackblk sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ *lp++ = htonl(sack.start);
+ *lp++ = htonl(sack.end);
+ count++;
+ }
+ *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+ optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+ }
+#endif /* TCP_SACK */
/*
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
@@ -673,6 +901,23 @@
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
+#ifdef TCP_SACK
+ if (sack_rxmit) {
+ /*
+ * If sendalot was turned on (due to option stuffing), turn it
+ * off. Properly set the th_seq field. Advance the retransmission
+ * pointer by len.
+ */
+ if (sendalot)
+ sendalot = 0;
+ th->th_seq = htonl(p->rxmit);
+ p->rxmit += len;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->retran_data += len;
+#endif /* TCP_FACK */
+ }
+#endif /* TCP_SACK */
+
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
@@ -747,6 +992,14 @@
tp->t_flags |= TF_SENTFIN;
}
}
+#ifdef TCP_SACK
+ if (!tp->sack_disable) {
+ if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
+ goto timer;
+ }
+ }
+#endif
+
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
@@ -769,6 +1022,19 @@
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
+#ifdef TCP_SACK
+ timer:
+ if (!tp->sack_disable && sack_rxmit &&
+ !callout_active(tp->tt_rexmt) &&
+ tp->snd_nxt != tp->snd_una) {
+ if (callout_active(tp->tt_persist)) {
+ callout_stop(tp->tt_persist);
+ tp->t_rxtshift = 0;
+ }
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ }
+#endif
if (!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_una) {
if (callout_active(tp->tt_persist)) {
@@ -859,6 +1125,12 @@
error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
(so->so_options & SO_DONTROUTE), 0);
}
+
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ /* Update snd_awnd to reflect the new data that was sent. */
+ tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
+ tp->retran_data;
+#endif /* defined(TCP_SACK) && defined(TCP_FACK) */
if (error) {
/*
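
The subtlest part of the output changes is probably tcp_sack_adjust() above:
after a timeout, snd_nxt is walked forward so that data the receiver has
already SACKed is not retransmitted. User-space model, not part of the diff;
plain comparisons stand in for the SEQ_* macros, so wraparound is ignored
here:

#include <sys/types.h>
#include <stdio.h>

struct hole {
	u_int32_t start, end;
	struct hole *next;
};

static u_int32_t
sack_adjust(u_int32_t snd_nxt, u_int32_t rcv_lastsack, struct hole *cur)
{
	if (cur == NULL || snd_nxt >= rcv_lastsack)
		return (snd_nxt);	/* no holes, or past all SACKed data */
	while (cur->next) {
		if (snd_nxt < cur->end)
			return (snd_nxt);	/* inside a hole: send it */
		if (snd_nxt >= cur->next->start)
			cur = cur->next;
		else
			return (cur->next->start); /* skip the SACKed gap */
	}
	if (snd_nxt < cur->end)
		return (snd_nxt);
	return (rcv_lastsack);	/* past the last hole: jump to the frontier */
}

int
main(void)
{
	/* holes 100-200 and 400-500; 200-400 and 500-600 were SACKed */
	struct hole h2 = { 400, 500, NULL };
	struct hole h1 = { 100, 200, &h2 };

	printf("%u\n", sack_adjust(250, 600, &h1));	/* prints 400 */
	return (0);
}
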
Index: tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.113
diff -u -d -b -w -u -d -r1.113 tcp_subr.c
--- tcp_subr.c 2001/08/22 00:58:16 1.113
+++ tcp_subr.c 2001/08/22 23:11:21
@@ -148,6 +148,12 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+#ifdef TCP_SACK
+int tcp_do_sack = 1;		/* matches the extern in tcp_var.h */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_do_sack, CTLFLAG_RW, &tcp_do_sack, 0,
+ "Enable experimental TCP SACK support");
+#endif
+
static void tcp_cleartaocache __P((void));
static void tcp_notify __P((struct inpcb *, int));
@@ -161,6 +167,14 @@
#define TCBHASHSIZE 512
#endif
+#ifndef TCP_DO_SACK
+#ifdef TCP_SACK
+#define TCP_DO_SACK 1
+#else
+#define TCP_DO_SACK 0
+#endif
+#endif
+
/*
* This is the actual shape of what we allocate using the zone
* allocator. Doing it this way allows us to protect both structures
@@ -527,6 +541,9 @@
callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
+#ifdef TCP_SACK
+ tp->sack_disable = tcp_do_sack ? 0 : 1;
+#endif
if (tcp_do_rfc1323)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
@@ -591,6 +608,9 @@
register struct tseg_qent *q;
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
+#ifdef TCP_SACK
+ struct sackhole *p, *q_sack;
+#endif
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
@@ -729,6 +749,15 @@
m_freem(q->tqe_m);
FREE(q, M_TSEGQ);
}
+#ifdef TCP_SACK
+ /* Free SACK holes. */
+ q_sack = p = tp->snd_holes;
+ while (p != 0) {
+ q_sack = p->next;
+ free(p, M_PCB);
+ p = q_sack;
+ }
+#endif
inp->inp_ppcb = NULL;
soisdisconnected(so);
#ifdef INET6
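
The new knob can be flipped at runtime: `sysctl net.inet.tcp.tcp_do_sack' to
read it, `sysctl -w net.inet.tcp.tcp_do_sack=0' to disable SACK for new
connections. The same thing programmatically (user-space sketch, not part of
the diff; sysctlbyname(3) is standard):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("net.inet.tcp.tcp_do_sack", &val, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");	/* fails on a kernel without the patch */
		return (1);
	}
	printf("SACK is %s by default\n", val ? "enabled" : "disabled");
	return (0);
}
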
Index: tcp_timer.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.47
diff -u -d -b -w -u -d -r1.47 tcp_timer.c
--- tcp_timer.c 2001/08/22 00:58:16 1.47
+++ tcp_timer.c 2001/08/22 23:11:21
@@ -40,6 +40,8 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/sysctl.h>
@@ -186,6 +188,9 @@
{
struct tcpcb *tp = xtp;
int s;
+#ifdef TCP_SACK
+ struct sackhole *p, *q;
+#endif
#ifdef TCPDEBUG
int ostate;
@@ -197,7 +202,25 @@
return;
}
callout_deactivate(tp->tt_2msl);
+
+#ifdef TCP_SACK
/*
+ * Free SACK holes for 2MSL and REXMT timers.
+ */
+ q = p = tp->snd_holes;
+ while (p != 0) {
+ q = p->next;
+ free(p, M_PCB);
+ p = q;
+ }
+ tp->snd_holes = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+ /*
* 2 MSL timeout in shutdown went off. If we're closed but
* still waiting for peer to close and connection has been idle
* too long, or if 2MSL time is up from TIME_WAIT, delete connection
@@ -349,6 +372,9 @@
struct tcpcb *tp = xtp;
int s;
int rexmt;
+#ifdef TCP_SACK
+ struct sackhole *p, *q;
+#endif
#ifdef TCPDEBUG
int ostate;
@@ -360,7 +386,25 @@
return;
}
callout_deactivate(tp->tt_rexmt);
+#ifdef TCP_SACK
/*
+ * Free SACK holes for 2MSL and REXMT timers.
+ */
+ q = p = tp->snd_holes;
+ while (p != 0) {
+ q = p->next;
+ free(p, M_PCB);
+ p = q;
+ }
+ tp->snd_holes = 0;
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+
+ /*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
* to a longer retransmit interval and retransmit one segment.
@@ -421,11 +465,20 @@
tp->t_srtt = 0;
}
tp->snd_nxt = tp->snd_una;
+#if defined(TCP_SACK)
/*
+ * Note: We overload snd_last to function also as the
+ * snd_last variable described in RFC 2582
+ */
+ tp->snd_last = tp->snd_max;
+#else
+ /*
* Note: We overload snd_recover to function also as the
* snd_last variable described in RFC 2582
*/
+
tp->snd_recover = tp->snd_max;
+#endif /* TCP_SACK */
/*
* Force a segment to be sent.
*/
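
Both timer handlers now tear the hole list down with the same pattern. In
isolation it looks like this (user-space sketch, not part of the diff); note
the successor pointer is saved before the node is freed, so no freed memory
is ever touched:

#include <stdlib.h>

struct sackhole {
	struct sackhole *next;
};

static void
free_sackholes(struct sackhole **head)
{
	struct sackhole *p = *head, *q;

	while (p != NULL) {
		q = p->next;	/* grab the successor before freeing p */
		free(p);
		p = q;
	}
	*head = NULL;		/* mirrors tp->snd_holes = 0 above */
}

int
main(void)
{
	struct sackhole *h = NULL, *n;
	int i;

	for (i = 0; i < 3; i++) {	/* build a three-node list */
		if ((n = malloc(sizeof(*n))) == NULL)
			break;
		n->next = h;
		h = n;
	}
	free_sackholes(&h);
	return (0);
}
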
Index: tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.66
diff -u -d -b -w -u -d -r1.66 tcp_usrreq.c
--- tcp_usrreq.c 2001/08/22 00:58:16 1.66
+++ tcp_usrreq.c 2001/08/22 23:11:21
@@ -761,6 +761,16 @@
tp->iss = tcp_new_isn(tp);
tcp_sendseqinit(tp);
+#if defined(TCP_SACK)
+ tp->snd_last = tp->snd_una;
+#endif
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tp->snd_fack = tp->snd_una;
+ tp->retran_data = 0;
+ tp->snd_awnd = 0;
+#endif
+
+
/*
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
@@ -978,6 +988,11 @@
case TCP_NOPUSH:
optval = tp->t_flags & TF_NOPUSH;
break;
+#ifdef TCP_SACK
+ case TCP_SACK_DISABLE:
+ optval = tp->sack_disable;
+ break;
+#endif
default:
error = ENOPROTOOPT;
break;
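
Per-socket control uses the new TCP_SACK_DISABLE option from tcp.h; only the
getsockopt() side appears in the hunk above, so this sketch (user-space, not
part of the diff) only reads the setting:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>

#ifndef TCP_SACK_DISABLE
#define TCP_SACK_DISABLE 0x300	/* from the tcp.h hunk in this patch */
#endif

int
main(void)
{
	int s, optval;
	socklen_t optlen = sizeof(optval);

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
		perror("socket");
		return (1);
	}
	if (getsockopt(s, IPPROTO_TCP, TCP_SACK_DISABLE,
	    &optval, &optlen) == -1)
		perror("getsockopt");	/* expected on an unpatched kernel */
	else
		printf("SACK %s on this socket\n",
		    optval ? "disabled" : "enabled");
	return (0);
}
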
Index: tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.71
diff -u -d -b -w -u -d -r1.71 tcp_var.h
--- tcp_var.h 2001/08/22 00:58:16 1.71
+++ tcp_var.h 2001/08/22 23:11:21
@@ -36,6 +36,21 @@
#ifndef _NETINET_TCP_VAR_H_
#define _NETINET_TCP_VAR_H_
+
+struct sackblk {
+ tcp_seq start; /* start seq no. of sack block */
+ tcp_seq end; /* end seq no. */
+};
+
+struct sackhole {
+ tcp_seq start; /* start seq no. of hole */
+ tcp_seq end; /* end seq no. */
+ int dups; /* number of dup(s)acks for this hole */
+ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
+ struct sackhole *next; /* next in list */
+};
+
+
/*
* Kernel variables for tcp.
*/
@@ -114,6 +129,31 @@
u_long rcv_wnd; /* receive window */
tcp_seq rcv_up; /* receive urgent pointer */
+#ifdef TCP_SACK
+ int sack_disable; /* disable SACK for this connection */
+ int snd_numholes; /* number of holes seen by sender */
+ struct sackhole *snd_holes; /* linked list of holes (sorted) */
+#if defined(TCP_SACK) && defined(TCP_FACK)
+ tcp_seq snd_fack; /* for FACK congestion control */
+ u_long snd_awnd; /* snd_nxt - snd_fack + */
+ /* retransmitted data */
+ int retran_data; /* amount of outstanding retx. data */
+#endif /* TCP_FACK */
+#endif /* TCP_SACK */
+#if defined(TCP_SACK)
+ tcp_seq snd_last; /* for use in fast recovery */
+#endif
+
+#ifdef TCP_SACK
+ tcp_seq rcv_laststart; /* start of last segment recd. */
+ tcp_seq rcv_lastend; /* end of ... */
+ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
+ int rcv_numsacks; /* # distinct sack blks present */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
+#endif
+
+
+
u_long snd_wnd; /* send window */
u_long snd_cwnd; /* congestion-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold for
@@ -338,8 +378,12 @@
#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
+#ifdef TCP_SACK
+#define TCPCTL_SACK 14 /* needs revisiting */
+#define TCPCTL_MAXID 15
+#else
#define TCPCTL_MAXID 14
-
+#endif
#define TCPCTL_NAMES { \
{ 0, 0 }, \
{ "rfc1323", CTLTYPE_INT }, \
@@ -355,8 +399,11 @@
{ "pcblist", CTLTYPE_STRUCT }, \
{ "delacktime", CTLTYPE_INT }, \
{ "v6mssdflt", CTLTYPE_INT }, \
+ { "sack", CTLTYPE_INT}, \
}
-
+/*
+#define TCP_SACK_DEBUG
+*/
#ifdef _KERNEL
#ifdef SYSCTL_DECL
@@ -367,6 +414,9 @@
extern struct inpcbinfo tcbinfo;
extern struct tcpstat tcpstat; /* tcp statistics */
extern int tcp_mssdflt; /* XXX */
+#ifdef TCP_SACK
+extern int tcp_do_sack; /* SACK enabled/disabled */
+#endif
extern int tcp_delack_enabled;
extern int tcp_do_newreno;
extern int ss_fltsz;
@@ -406,7 +456,20 @@
tcp_timers __P((struct tcpcb *, int));
void tcp_trace __P((int, int, struct tcpcb *, void *, struct tcphdr *,
int));
+#ifdef TCP_SACK
+int tcp_sack_option __P((struct tcpcb *,struct tcphdr *,u_char *,int));
+void tcp_update_sack_list __P((struct tcpcb *tp));
+void tcp_del_sackholes __P((struct tcpcb *, struct tcphdr *));
+void tcp_clean_sackreport __P((struct tcpcb *tp));
+void tcp_sack_adjust __P((struct tcpcb *tp));
+struct sackhole *
+ tcp_sack_output __P((struct tcpcb *tp));
+int tcp_sack_partialack __P((struct tcpcb *, struct tcphdr *));
+#endif /* TCP_SACK */
+#if defined(TCP_SACK)
+u_long tcp_seq_subtract __P((u_long, u_long ));
+#endif /* TCP_SACK */
extern struct pr_usrreqs tcp_usrreqs;
extern u_long tcp_sendspace;
extern u_long tcp_recvspace;
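
To make the sackhole bookkeeping concrete: the middle-split case in
tcp_sack_option() turns one hole into two when a SACK block lands strictly
inside it. User-space model, not part of the diff; the dups accounting is
omitted:

#include <sys/types.h>
#include <stdio.h>
#include <stdlib.h>

struct hole {
	u_int32_t start, end, rxmit;
	struct hole *next;
};

/* split *cur around the SACKed block [s, e); 0 on ENOBUFS, 1 on success */
static int
split_hole(struct hole *cur, u_int32_t s, u_int32_t e)
{
	struct hole *temp;

	if ((temp = malloc(sizeof(*temp))) == NULL)
		return (0);
	temp->start = e;			/* right half: [e, old end) */
	temp->end = cur->end;
	temp->rxmit = cur->rxmit > temp->start ? cur->rxmit : temp->start;
	temp->next = cur->next;
	cur->end = s;				/* left half: [old start, s) */
	if (cur->rxmit > cur->end)
		cur->rxmit = cur->end;
	cur->next = temp;
	return (1);
}

int
main(void)
{
	struct hole h = { 100, 500, 100, NULL };

	if (split_hole(&h, 200, 300)) {	/* receiver SACKed [200, 300) */
		/* prints "holes: [100,200) and [300,500)" */
		printf("holes: [%u,%u) and [%u,%u)\n",
		    h.start, h.end, h.next->start, h.next->end);
		free(h.next);
	}
	return (0);
}
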
