From owner-p4-projects@FreeBSD.ORG Mon Jun 7 23:58:24 2004 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id B878B16A4D1; Mon, 7 Jun 2004 23:58:23 +0000 (GMT) Delivered-To: perforce@freebsd.org Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125]) by hub.freebsd.org (Postfix) with ESMTP id 789F616A4CE for ; Mon, 7 Jun 2004 23:58:23 +0000 (GMT) Received: from repoman.freebsd.org (repoman.freebsd.org [216.136.204.115]) by mx1.FreeBSD.org (Postfix) with ESMTP id 7038A43D2D for ; Mon, 7 Jun 2004 23:58:23 +0000 (GMT) (envelope-from ps@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.12.11/8.12.11) with ESMTP id i57NwCcD065765 for ; Mon, 7 Jun 2004 23:58:12 GMT (envelope-from ps@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.12.11/8.12.11/Submit) id i57NwCsK065762 for perforce@freebsd.org; Mon, 7 Jun 2004 23:58:12 GMT (envelope-from ps@freebsd.org) Date: Mon, 7 Jun 2004 23:58:12 GMT Message-Id: <200406072358.i57NwCsK065762@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to ps@freebsd.org using -f From: Paul Saab To: Perforce Change Reviews Subject: PERFORCE change 54368 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.1 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 07 Jun 2004 23:58:24 -0000 http://perforce.freebsd.org/chv.cgi?CH=54368 Change 54368 by ps@butter.corp on 2004/06/07 23:58:02 Commit the SACK work done at Yahoo! on RELENG_4 and ported to -current. The scoreboarding code was obtained from OpenBSD, and many of the remaining changes were inspired by OpenBSD, but not taken directly from there. Affected files ... .. //depot/projects/sack/conf/files#2 edit .. //depot/projects/sack/conf/options#2 edit .. //depot/projects/sack/netinet/tcp.h#2 edit .. 
//depot/projects/sack/netinet/tcp_input.c#2 edit .. //depot/projects/sack/netinet/tcp_output.c#2 edit .. //depot/projects/sack/netinet/tcp_sack.c#1 add .. //depot/projects/sack/netinet/tcp_seq.h#2 edit .. //depot/projects/sack/netinet/tcp_subr.c#2 edit .. //depot/projects/sack/netinet/tcp_syncache.c#2 edit .. //depot/projects/sack/netinet/tcp_timer.c#2 edit .. //depot/projects/sack/netinet/tcp_var.h#2 edit Differences ... ==== //depot/projects/sack/conf/files#2 (text+ko) ==== @@ -1450,6 +1450,7 @@ netinet/tcp_hostcache.c optional inet netinet/tcp_input.c optional inet netinet/tcp_output.c optional inet +netinet/tcp_sack.c optional inet netinet/tcp_subr.c optional inet netinet/tcp_syncache.c optional inet netinet/tcp_timer.c optional inet ==== //depot/projects/sack/conf/options#2 (text+ko) ==== @@ -346,6 +346,7 @@ SLIP_IFF_OPTS opt_slip.h TCPDEBUG TCP_SIGNATURE opt_inet.h +TCP_SACK_DEBUG opt_tcp_sack.h TCP_DROP_SYNFIN opt_tcp_input.h XBONEHACK ==== //depot/projects/sack/netinet/tcp.h#2 (text+ko) ==== @@ -85,14 +85,17 @@ #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) +#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */ + #define TCPOPT_CC 11 /* CC options: RFC-1644 */ -#define TCPOPT_CCNEW 12 +#define TCPOPT_CCNEW 12 #define TCPOPT_CCECHO 13 #define TCPOLEN_CC 6 #define TCPOLEN_CC_APPA (TCPOLEN_CC+2) @@ -101,6 +104,15 @@ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +/* Option definitions */ +#define TCPOPT_SACK_PERMIT_HDR \ +(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) +#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) +/* 
Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */ +#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */ + + /* * Default maximum segment size for TCP. * With an IP MTU of 576, this is 536, ==== //depot/projects/sack/netinet/tcp_input.c#2 (text+ko) ==== @@ -37,6 +37,7 @@ #include "opt_mac.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" +#include "opt_tcp_sack.h" #include #include @@ -159,7 +160,9 @@ struct inpcbinfo tcbinfo; struct mtx *tcbinfo_mtx; -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, + int, int, struct tcphdr *); + static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -721,7 +724,7 @@ * present in a SYN segment. See tcp_timewait(). */ if (thflags & TH_SYN) - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (tcp_timewait((struct tcptw *)inp->inp_ppcb, &to, th, m, tlen)) goto findpcb; @@ -934,7 +937,7 @@ tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions(tp, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) { @@ -1050,7 +1053,7 @@ * for incoming connections is handled in tcp_syncache. * XXX this is traditional behavior, may need to be cleaned up. 
*/ - tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; @@ -1065,8 +1068,22 @@ tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); + if (tp->sack_enable) { + if (!(to.to_flags & TOF_SACK)) + tp->sack_enable = 0; + else + tp->t_flags |= TF_SACK_PERMIT; + } + } + if (tp->sack_enable) { + /* Delete stale (cumulatively acked) SACK holes */ + tcp_del_sackholes(tp, th); + tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/ + tp->rcv_lastend = th->th_seq + tlen; + } + /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -1116,9 +1133,10 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || - (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) { + ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp)))) { KASSERT(headlocked, ("headlocked")); INP_INFO_WUNLOCK(&tcbinfo); /* @@ -1214,6 +1232,9 @@ * with nothing on the reassembly queue and * we have enough buffer space to take it. 
*/ + /* Clean receiver SACK report if present */ + if (tp->sack_enable && tp->rcv_numsacks) + tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; /* @@ -1892,7 +1913,7 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - (tcp_do_newreno && + ((tcp_do_newreno || tp->sack_enable) && IN_FASTRECOVERY(tp))) { tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); @@ -1900,7 +1921,8 @@ } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; - if (tcp_do_newreno && + if ((tcp_do_newreno || + tp->sack_enable) && SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; @@ -1915,6 +1937,17 @@ tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; + if (tp->sack_enable) { + tcpstat.tcps_sack_recovery_episode++; + tp->snd_cwnd = + tp->t_maxseg * + tp->t_dupacks; + (void) tcp_output(tp); + tp->snd_cwnd = + tp->snd_ssthresh; + goto drop; + } + tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1965,12 +1998,16 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno) { + if (tcp_do_newreno || tp->sack_enable) { if (IN_FASTRECOVERY(tp)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { - tcp_newreno_partial_ack(tp, th); + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); } else { /* + * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. @@ -2092,7 +2129,8 @@ * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). 
*/ - if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) { + if ((!tcp_do_newreno && !tp->sack_enable) || + !IN_FASTRECOVERY(tp)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) @@ -2110,14 +2148,20 @@ } sowwakeup(so); /* detect una wraparound */ - if (tcp_do_newreno && !IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + !IN_FASTRECOVERY(tp) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - if (tcp_do_newreno && IN_FASTRECOVERY(tp) && + if ((tcp_do_newreno || tp->sack_enable) && + IN_FASTRECOVERY(tp) && SEQ_GEQ(th->th_ack, tp->snd_recover)) EXIT_FASTRECOVERY(tp); tp->snd_una = th->th_ack; + if (tp->sack_enable) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -2318,7 +2362,8 @@ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } - + if (tp->sack_enable) + tcp_update_sack_list(tp); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -2521,11 +2566,13 @@ * Parse TCP options and place in tcpopt. 
*/ static void -tcp_dooptions(to, cp, cnt, is_syn) +tcp_dooptions(tp, to, cp, cnt, is_syn, th) + struct tcpcb *tp; struct tcpopt *to; - u_char *cp; + u_char *cp; int cnt; int is_syn; + struct tcphdr *th; { int opt, optlen; @@ -2614,6 +2661,20 @@ to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif + case TCPOPT_SACK_PERMITTED: + if (!tcp_do_sack || + optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (is_syn) { + /* MUST only be set on SYN */ + to->to_flags |= TOF_SACK; + } + break; + + case TCPOPT_SACK: + if (!tp || tcp_sack_option(tp, th, cp, optlen)) + continue; + break; default: continue; } ==== //depot/projects/sack/netinet/tcp_output.c#2 (text+ko) ==== @@ -35,6 +35,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -122,6 +123,8 @@ u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; + int i, sack_rxmit; + struct sackhole *p; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -171,6 +174,13 @@ } } again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); @@ -178,6 +188,36 @@ flags = tcp_outflags[tp->t_state]; /* + * Send any SACK-generated retransmissions. If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. + * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). + */ + /* + * Still in SACK recovery, reset rxmit flag to zero. 
+ */ + sack_rxmit = 0; + len = 0; + p = NULL; + if (tp->sack_enable && IN_FASTRECOVERY(tp) && + (p = tcp_sack_output(tp))) { + sack_rxmit = 1; + sendalot = 1; + off = p->rxmit - tp->snd_una; + KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd)); + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) + len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit); + else + len = min(tp->snd_cwnd, p->end - p->rxmit); + if (len > 0) { + tcpstat.tcps_sack_rexmits++; + tcpstat.tcps_sack_rexmit_bytes += + min(len, tp->t_maxseg); + } + } + /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ @@ -230,9 +270,12 @@ * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. */ - len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off; - + if (!sack_rxmit) + len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off); /* * Lop off SYN bit if it has already been sent. However, if this @@ -331,6 +374,8 @@ goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; + if (sack_rxmit) + goto send; } /* @@ -374,7 +419,18 @@ if (flags & TH_FIN && ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una)) goto send; - + /* + * In SACK, it is possible for tcp_output to fail to send a segment + * after the retransmission timer has been turned off. Make sure + * that the retransmission timer is set. 
+ */ + if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) && + !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + return (0); + } /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window @@ -435,6 +491,19 @@ (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; + /* + * If this is the first SYN of connection (not a SYN + * ACK), include SACK_PERMIT_HDR option. If this is a + * SYN ACK, include SACK_PERMIT_HDR option if peer has + * already done so. This is only for active connect, + * since the syncache takes care of the passive connect. + */ + if (tp->sack_enable && ((flags & TH_ACK) == 0 || + (tp->t_flags & TF_SACK_PERMIT))) { + *((u_int32_t *) (opt + optlen)) = + htonl(TCPOPT_SACK_PERMIT_HDR); + optlen += 4; + } if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 || (tp->t_flags & TF_RCVD_SCALE))) { @@ -466,6 +535,32 @@ optlen += TCPOLEN_TSTAMP_APPA; } + /* + * Send SACKs if necessary. This should be the last option processed. + * Only as many SACKs are sent as are permitted by the maximum options + * size. No more than three SACKs are sent. 
+ */ + if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT && + tp->rcv_numsacks) { + u_int32_t *lp = (u_int32_t *)(opt + optlen); + u_int32_t *olp = lp++; + int count = 0; /* actual number of SACKs inserted */ + int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK; + + tcpstat.tcps_sack_send_blocks++; + maxsack = min(maxsack, TCP_MAX_SACK); + for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) { + struct sackblk sack = tp->sackblks[i]; + if (sack.start == 0 && sack.end == 0) + continue; + *lp++ = htonl(sack.start); + *lp++ = htonl(sack.end); + count++; + } + *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2)); + optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */ + } /* * Send `CC-family' options if our side wants to use them (TF_REQ_CC), * options are allowed (!TF_NOOPT) and it's not a RST. @@ -734,6 +829,10 @@ th->th_seq = htonl(tp->snd_nxt); else th->th_seq = htonl(tp->snd_max); + if (sack_rxmit) { + th->th_seq = htonl(p->rxmit); + p->rxmit += len; + } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); @@ -831,6 +930,8 @@ tp->t_flags |= TF_SENTFIN; } } + if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt)) + goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; @@ -853,6 +954,17 @@ * Initialize shift counter which is used for backoff * of retransmit time. 
*/ +timer: + if (tp->sack_enable && sack_rxmit && + !callout_active(tp->tt_rexmt) && + tp->snd_nxt != tp->snd_max) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + if (callout_active(tp->tt_persist)) { + callout_stop(tp->tt_persist); + tp->t_rxtshift = 0; + } + } if (!callout_active(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) { if (callout_active(tp->tt_persist)) { ==== //depot/projects/sack/netinet/tcp_seq.h#2 (text+ko) ==== @@ -42,6 +42,9 @@ #define SEQ_GT(a,b) ((int)((a)-(b)) > 0) #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) + /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) ==== //depot/projects/sack/netinet/tcp_subr.c#2 (text+ko) ==== @@ -36,6 +36,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -201,6 +202,17 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); + +int tcp_do_sack = 1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW, + &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); + +int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */ +SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW, + &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements"); + +uma_zone_t sack_hole_zone; + static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_discardcb(struct tcpcb *); static void tcp_isn_tick(void *); @@ -292,6 +304,8 @@ tcp_isn_tick(NULL); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); + sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); } void @@ -599,6 +613,7 @@ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= 
TF_REQ_CC; + tp->sack_enable = tcp_do_sack; tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no @@ -732,6 +747,7 @@ tp->t_segqlen--; tcp_reass_qsize--; } + tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); @@ -752,7 +768,6 @@ #ifdef INET6 struct socket *so = inp->inp_socket; #endif - tcp_discardcb(tp); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) ==== //depot/projects/sack/netinet/tcp_syncache.c#2 (text+ko) ==== @@ -39,6 +39,7 @@ #include "opt_ipsec.h" #include "opt_mac.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -702,7 +703,10 @@ if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif - + if (sc->sc_flags & SCF_SACK) { + tp->sack_enable = 1; + tp->t_flags |= TF_SACK_PERMIT; + } /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. @@ -989,6 +993,9 @@ sc->sc_flags = SCF_SIGNATURE; #endif + if (to->to_flags & TOF_SACK) + sc->sc_flags |= SCF_SACK; + /* * XXX * We have the option here of not doing TAO (even if the segment @@ -1105,6 +1112,7 @@ optlen += (sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGNATURE + 2 : 0; #endif + optlen += ((sc->sc_flags & SCF_SACK) ? 
4 : 0); } tlen = hlen + sizeof(struct tcphdr) + optlen; @@ -1242,6 +1250,11 @@ optp += TCPOLEN_SIGNATURE + 2; } #endif /* TCP_SIGNATURE */ + + if (sc->sc_flags & SCF_SACK) { + *(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR); + optp += 4; + } } #ifdef INET6 ==== //depot/projects/sack/netinet/tcp_timer.c#2 (text+ko) ==== @@ -32,6 +32,7 @@ #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -217,6 +218,7 @@ return; } INP_LOCK(inp); + tcp_free_sackholes(tp); if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { INP_UNLOCK(tp->t_inpcb); INP_INFO_WUNLOCK(&tcbinfo); @@ -497,6 +499,7 @@ return; } callout_deactivate(tp->tt_rexmt); + tcp_free_sackholes(tp); /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off ==== //depot/projects/sack/netinet/tcp_var.h#2 (text+ko) ==== @@ -52,6 +52,17 @@ extern int tcp_reass_qsize; extern struct uma_zone *tcp_reass_zone; +struct sackblk { + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. */ +}; + +struct sackhole { + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + tcp_seq rxmit; /* next seq. no in hole to be retransmitted */ + struct sackhole *next; /* next in list */ +}; struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; @@ -179,6 +190,16 @@ u_long rcv_second; /* start of interval second */ u_long rcv_pps; /* received packets per second */ u_long rcv_byps; /* received bytes per second */ + /* SACK related state */ + int sack_enable; /* enable SACK for this connection */ + int snd_numholes; /* number of holes seen by sender */ + struct sackhole *snd_holes; /* linked list of holes (sorted) */ + + tcp_seq rcv_laststart; /* start of last segment recd. */ + tcp_seq rcv_lastend; /* end of ... 
*/ + tcp_seq rcv_lastsack; /* last seq number(+1) SACKed by receiver */ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */ }; #define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY) @@ -216,6 +237,7 @@ #define TOF_SCALE 0x0020 #define TOF_SIGNATURE 0x0040 /* signature option present */ #define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ +#define TOF_SACK 0x0100 /* Peer sent SACK option */ u_int32_t to_tsval; u_int32_t to_tsecr; tcp_cc to_cc; /* holds CC or CCnew */ @@ -249,6 +271,7 @@ #define SCF_CC 0x08 /* negotiated CC */ #define SCF_UNREACH 0x10 /* icmp unreachable received */ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ +#define SCF_SACK 0x80 /* send SACK option */ TAILQ_ENTRY(syncache) sc_hash; TAILQ_ENTRY(syncache) sc_timerq; }; @@ -434,6 +457,13 @@ u_long tcps_hc_added; /* entry added to hostcache */ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ + + /* SACK related stats */ + u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ + u_long tcps_sack_rexmits; /* SACK rexmit segments */ + u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ }; /* @@ -467,7 +497,8 @@ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ -#define TCPCTL_MAXID 14 +#define TCPCTL_SACK 14 /* Selective Acknowledgement, RFC 2018 */ +#define TCPCTL_MAXID 15 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -505,6 +536,8 @@ extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_do_sack; /* SACK enabled/disabled */ + void tcp_canceltimers(struct tcpcb *); struct tcpcb * tcp_close(struct tcpcb *); @@ -578,6 +611,23 @@ extern u_long tcp_recvspace; tcp_seq tcp_new_isn(struct tcpcb *); +int 
tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int); +void tcp_update_sack_list(struct tcpcb *tp); +void tcp_del_sackholes(struct tcpcb *, struct tcphdr *); +void tcp_clean_sackreport(struct tcpcb *tp); +void tcp_sack_adjust(struct tcpcb *tp); +struct sackhole *tcp_sack_output(struct tcpcb *tp); +void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); +void tcp_free_sackholes(struct tcpcb *tp); +#ifdef DEBUG +void tcp_print_holes(struct tcpcb *tp); +#endif +int tcp_newreno(struct tcpcb *, struct tcphdr *); +u_long tcp_seq_subtract(u_long, u_long ); +#ifdef TCP_SACK_DEBUG +void tcp_print_holes(struct tcpcb *tp); +#endif /* TCP_SACK_DEBUG */ + #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */