From owner-p4-projects@FreeBSD.ORG Mon Feb 16 16:53:44 2009 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id A6D4F1065677; Mon, 16 Feb 2009 16:53:43 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 49EBD1065670 for ; Mon, 16 Feb 2009 16:53:43 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 32B9F8FC1B for ; Mon, 16 Feb 2009 16:53:43 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.3/8.14.3) with ESMTP id n1GGrhPA085563 for ; Mon, 16 Feb 2009 16:53:43 GMT (envelope-from andre@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.3/8.14.3/Submit) id n1GGrhNM085561 for perforce@freebsd.org; Mon, 16 Feb 2009 16:53:43 GMT (envelope-from andre@freebsd.org) Date: Mon, 16 Feb 2009 16:53:43 GMT Message-Id: <200902161653.n1GGrhNM085561@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to andre@freebsd.org using -f From: Andre Oppermann To: Perforce Change Reviews Cc: Subject: PERFORCE change 157800 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 16 Feb 2009 16:53:46 -0000 http://perforce.freebsd.org/chv.cgi?CH=157800 Change 157800 by andre@andre_flirtbox on 2009/02/16 16:53:08 Checkpoint WIP. Affected files ... .. //depot/projects/tcp_new/netinet/tcp_input.c#7 edit .. //depot/projects/tcp_new/netinet/tcp_output.c#4 edit .. //depot/projects/tcp_new/netinet/tcp_var.h#3 edit Differences ... ==== //depot/projects/tcp_new/netinet/tcp_input.c#7 (text+ko) ==== @@ -179,20 +179,6 @@ #endif /* - * Indicate whether this ack should be delayed. We can delay the ack if - * - there is no delayed ack timer in progress and - * - our last ack wasn't a 0-sized window. We never want to delay - * the ack that opens up a 0-sized window and - * - delayed acks are enabled or - * - this is a half-synchronized T/TCP connection. - */ -#define DELAY_ACK(tp) \ - ((!tcp_timer_active(tp, TT_DELACK) && \ - (tp->t_flags & TF_RXWIN0SENT) == 0) && \ - (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) - - -/* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended * ip6_protox[] call format in ip6_input @@ -362,7 +348,7 @@ tcpstat.tcps_rcvbadoff++; goto drop; } - tlen -= off; /* tlen is used instead of ti->ti_len */ + tlen -= off; /* tlen is used instead of th->th_len */ if (off > sizeof (struct tcphdr)) { if (isipv6) { #ifdef INET6 @@ -932,9 +918,10 @@ * discouraged to shrink the window. * RFC793: section 3.7, page 42-44 * RFC1122: section 4.2.2.16 + * + * XXXAO: Fix up. rcv_wnd is an absolute pointer in seq space. */ - rwin = sbspace(&so->so_rcv); - rwin = imax(rwin, (int)(tp->rcv_advwin - tp->rcv_nxt)); + rwin = tp->rcv_wnd - tp->rcv_nxt; /* * Validation checks on any incoming segment. @@ -947,7 +934,7 @@ * into established state and initializations of the timers. */ case TCPS_SYN_RECEIVED: - tp->t_starttime = tcp_uptime(); + tp->t_starttime = time_uptime; TCPS_TRANS(tp, TCPS_ESTABLISHED); soisconnected(so); @@ -963,7 +950,7 @@ */ case TCPS_SYN_SENT: /* - * RST is handled separately below. + * RST is handled separatetly below. * RFC793: section 3.9, page 66-67, second check */ if (thflags & TH_RST) @@ -1029,11 +1016,14 @@ * RFC793: section 3.1, page 18-19 * RFC1122: section 4.2.2.6 * RFC1191: section 3.1 + * + * NB: MSS is computed twice. Once when we send the inital + * SYN and once when get back the SYN-ACK. */ if (to.to_flags & TOF_MSS) - tcp_mss(tp, to.to_mss); + tp->snd_mss = tcp_mss(tptoinpinc(tp), to.to_mss, 0); else - tcp_mss(tp, tcp_mssdflt); + tp->snd_mss = tcp_mss(tptoinpinc(tp), 0, 0); /* * Do window scaling on this connection? @@ -1129,7 +1119,7 @@ tp->snd_wu_ack = th->th_ack; th->th_seq++; /* SYN is acked */ - tp->t_starttime = tcp_uptime(); + tp->t_starttime = time_uptime; TCPS_TRANS(tp, TCPS_ESTABLISHED); #ifdef MAC SOCK_LOCK(so); @@ -1218,7 +1208,7 @@ * * We store the receive time as uptime with second * resolution. This makes us independent from the - * wrap-around after 2^32 / hz (24.8 days at 1ms hz). + * wrap-around after 2^32 / 2 / hz (24.8 days at 1ms hz). * * XXXAO: Linux says PAWS is broken. Analyze if true or not. * Retransmitted segments are not presented for further processing. @@ -1425,6 +1415,7 @@ case TCPS_SYN_SENT: /* * In TCPS_SYN_SENT the RST MUST carry the ACK flag. + * RFC793: section 3.4, page 37, Reset Processing * RFC793: section 3.9, page 66, first check */ if (!(thflags & TH_ACK)) { @@ -1434,12 +1425,17 @@ } /* - * The ACK must be within what we sent but does - * not have to ACK the SYN. + * The ACK must acknowledge the SYN and any data + * we may have sent with the original SYN. + * RFC793: section 3.4, page 37, Reset Processing * RFC793: section 3.9, page 66, first check + * + * NB: We accept ACKing the SYN w/o and with data + * as some implementations refuse to ACK data in + * a SYN. */ - if (SEQ_LT(th->th_ack, tp->snd_una) || - SEQ_GT(th->th_ack, th->snd_nxt)) { + if (th->th_ack != tp->snd_una || + th->th_ack != th->snd_nxt) { tcplog("RST does not match, segment ignored"); tcpstat.tcps_badrst++; goto drop; @@ -1735,6 +1731,8 @@ /* * Update send SACK information and tell us how much more * data has left the network (relative to last SACK we got). + * XXXAO: Determine if there was a duplicate ACK going on + * based on the changes of the SACK information. */ if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes)) sacked = tcp_sack_doack(tp, &to, th->th_ack); @@ -1759,7 +1757,7 @@ /* * Update congestion control information. */ - nudgeoutput = tcp_congest(tp, th, tiwin, acked, tlen, sacked); + nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked); /* * Drop acknowledged data from send socket buffer @@ -1783,12 +1781,10 @@ * data from the socket buffer. */ if (acked > so->so_snd.sb_cc) { - tp->snd_wnd -= so->so_snd.sb_cc; sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc); ourfinisacked = 1; } else { sbdrop_locked(&so->so_snd, acked); - tp->snd_wnd -= acked; ourfinisacked = 0; } @@ -1896,6 +1892,19 @@ * NB: Continue with segment. */ } + + /* + * Stop the retransmit timer if all data we sent + * has been acknowledged. Otherwise restart it + * if we still have outstanding data. + * + * XXXAO: Refine the test. The TF_NEEDFIN may not + * enough. + */ + if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN)) + tcp_timer_activate(TT_RXMIT, 0); + else + tcp_timer_activate(TT_RXMIT, tp->snd_rto); } /* @@ -1918,7 +1927,7 @@ */ if ((thflags & TH_URG) && th->th_urp > 0 && tlen > 0 && !TCPS_HAVERCVDFIN(tp->t_state)) { - tcp_do_urg(tp, th, tlen); + tcp_do_urg(tp, th, &tlen); } else if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) { tp->rcv_up = tp->rcv_nxt; } @@ -2090,6 +2099,11 @@ } /* + * Update size of receive window. + */ + tp->rcv_wnd = sbspace(so->so_rcv); + + /* * NB: sorwakeup_locked implicitly unlocks. */ sorwakeup_locked(so); @@ -2249,12 +2263,15 @@ * * XXXAO: Multi-delack? */ - if (nudgeoutput || (tp->t_flags & TF_ACKNOW)) + if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 || + nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) || + !tcp_delack_enabled) { (void) tcp_output(tp); - else if (tp->t_flags & TF_DELACK) { - tp->t_flags &= ~TF_DELACK; + } else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) { + tp->snd_delack++; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } + INP_UNLOCK(tp->t_inpcb); return; @@ -2374,7 +2391,7 @@ * XXXAO: Report violations of the options specs. */ static void -tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) +tcp_do_options(struct tcpopt *to, u_char *cp, int cnt, int flags) { int opt, optlen; @@ -2471,68 +2488,106 @@ * Finish this function and validate against all relevant RFCs. * Use bintime second part for t_rcvtime. * And a couple of other things. - * - * XXXAO: Linux talks about some problem with the RTO algorithm. - * Figure out what the problem is. - * - * XXXAO: The sliding window of eight measurements from RFC793 is - * way too little when using timestamps in fast networks. - * Average 10ms of measurements and integrate that into a 1000ms - * sliding window. The same for the variance. When using timestamps. */ static void tcp_do_time(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, int acked, int tlen, int sacked) { - int delta, rtt; + int rtt; int tick = tcp_ticks; + INP_LOCK_ASSERT(tp->t_inpcb); KASSERT(tp != NULL && th != NULL && to != NULL, - ("%s: ", __func__)); - INP_LOCK_ASSERT(tp->t_inpcb); + ("%s: insufficient parameters", __func__)); /* + * 1. We received a valid segment. + * * Make note of most recent segment received time. */ - tp->t_rcvtime = tcp_ticks; /* XXX: ticks64 */ + tp->t_rcvtime = tcp_uptime(); + tp->t_rcvticks = tick; /* + * 2. If timestamps are used decide which to reflect. + * * When using timestamps and delayed ACKs we should reply * with the TSval from the earliest unacknowledged segment. - * RFC1323: Section 3.4, Page 15, Case (A) + * RFC1323: section 3.4, Page 15, Case (A) * * On packet loss echo the TSval from the latest segment * that filled a hole. Only reflect timestamps that advance * the left edge of the window. - * RFC1323: Section 3.4, Page 15, Case (B & C) + * RFC1323: section 3.4, Page 15, Case (B & C) + * + * Corrected algorithm. + * Stevens Vol.2: section 26.6, page 870 + * Braden93 + * + * If SACK is enabled we should be able to reflect every + * timestamp as long as it GEQ than the one before. This + * way we avoid late out-of-order segments. Whenever more + * data was sacked advance reflected timestamp. * - * XXXAO: With SACK we could do better. - * if (sacked > 0) ... + * Does this give PAWS problems? */ if (to->to_flags & TOF_TS) { - if ((!(tp->t_flags & TF_DELACK) && th->th_seq == tp->rcv_nxt) || - (!TAILQ_EMPTY(tp->rcv_trq) && th->th_seq == tp->rcv_nxt)) +#ifdef TCP_RFC1323_BRADEN + if (TS_GEQ(to->to_tsval, tp->snd_tsecr) && + SEQ_LEQ(th->th_ack, tp->snd_lastack)) { +#endif +#ifdef TCP_RFC1323bis_plusSACK + if (TS_GT(to->to_tsval, tp->snd_tsecr) && + ((th->th_seq == tp->rcv_nxt && tp->snd_delack == 0) || + sacked > 0) { +#endif tp->snd_tsecr = to->to_tsval; - tp->snd_tsecrts = tcp_ticks; /* XXX: ticks64 */ + tp->snd_tsecrts = tcp_ticks; + } + KASSERT(!TS_GT(to->to_secr, tick), + ("%s: timestamp newer than our time", __func__)); /* * Remember highest most recent reflected TS. */ - if (to->to_tsecr > tp->ts_recent) + if (SEQ_LEQ(th->th_seq, tp->snd_lastack) && + TS_GT(to->to_tsecr > tp->ts_recent)) tp->ts_recent = to->to_tsecr; + tp->ts_recentts = tick; } /* + * 3. If timestamps are used calculate the current RTT. + */ + if (to->to_flags & TOF_TS) { + rtt = tick - to->to_tsecr; + } else if (acked > 0 && tp->snd_rtseq != 0 && + SEQ_GT(th->th_ack, tp->snd_rtseq) && + TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) { + rtt = tick - tp->snd_rtts; + tp->snd_rtseq = 0; + } else + return; + + /* + * 4. If no timestamps are used see whether new data was ack'ed + * and if so, calculate the current RTT. + * * We can only measure the RTT if new data was acknowledged. * That means we can only update the RTT estimates when we * are sending data. * * XXXAO: Not really true with timestamps and a steady receive * stream. + * + * Karns algorithm. Only update on non-retransmitted segments. + * Compute the time delta in ticks (1/hz). + * + * XXXAO: How to deal with retransmits when using timestamps? */ - if (acked == 0) - return; /* + * 5. Update at all? + * * If we haven't sent anything for more than one RTO ignore * the time measurement or our estimate will be way off. */ @@ -2543,35 +2598,85 @@ } /* - * Karns algorithm. Only update on non-retransmitted segments. + * Remember the lowest RTT we've ever seen. + * Must be at least 1 tick. + */ + if (tp->t_rttlowest > rtt) + tp->t_rttlowest = max(rtt, 1); + + /* + * Recompute the SRTT, RTTVAR and RTO. * - * XXXAO: How to deal with retransmits when using timestamps? + * XXXAO: Make it pluggable so that different algorithms + * can be tested. */ + tp->snd_rto = tcp_do_rto(tp, rtt); /* - * Compute the time delta in ticks (1/hz). + * We received an ack for a packet that wasn't retransmitted; + * it is probably safe to discard any error indications we've + * received recently. This isn't quite right, but close enough + * for now (a route might have failed after we sent a segment, + * and the return path might not be symmetrical). + * XXXAO: Doesn't belong here. */ - if (to->to_flags & TOF_TS) { - rtt = tick - to->to_tsecr; - } else if (tp->t_rtseq != 0 && SEQ_GT(th->th_ack, tp->t_rtseq) && - TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) { - rtt = tick - tp->t_rtseq; - tp->t_rtseq = 0; - } else - return; + tp->t_softerror = 0; /* - * Limit delta to some reasonable amount. + * Statistics. */ - rtt = min(60*hz, max(1, rtt)); + tp->t_rttupdated++; + tcpstat.tcps_rttupdated++; + + return; +} + +/* + * Compute the SRTT, RTTVAR and return the updated RTO. + * RFC1122: section 4.2.3.1 + * RFC2988: entire document + * + * External parameters that affect the RTO calculation: + * minimum RTO value (fixed sysctl) + * maximum RTO value (fixed sysctl) + * initial RTO value (fixed sysctl) + * + * XXXAO: Linux talks about some problem with the RTO algorithm. + * Figure out what the problem is. + * + * XXXAO: The sliding window of eight measurements from RFC793 is + * way too little when using timestamps in fast networks. + * Average 10ms of measurements and integrate that into a 1000ms + * sliding window. The same for the variance. When using timestamps. + * Or integrate over one RTO. + * + * XXXAO: We should use rttlowest as base and all deviations from it + * count as RTT variance. Use a squared algorithm to bias it to the + * upper level. Trying to calculate the actual RTT is futile and + * very volatile. rttlowest is a very good and fairly stable statistic + * baseline. One can't get better than speed of light in optical media. + * Everything faster than one tick doesn't concern us anyway. Having + * stable baseline simplifies and improves a number of statistical + * calculations and assumptions. Some magic has to be applied when + * a better lower baseline is measured though. + */ +static int +tcp_do_rto(struct tcpcb *tp, int rtt) +{ + int delta, rto; + INP_LOCK_ASSERT(tp->t_inpcb); + KASSERT(tp != NULL, + ("%s: insufficient parameters", __func__)); + /* - * Remember the lowest RTT we've ever seen. + * Limit delta to some reasonable amount. */ - if (tp->t_rttlowest > rtt) - tp->rttlowest = rtt; + rtt = min(60 * hz, max(1, rtt)); /* + * 6. Integrate new measurement. + * * Compute smoothed RTT and smoothed RTT variance. */ if (tp->t_srtt) { @@ -2595,7 +2700,7 @@ * rttvar is stored as fixed point with 4 bits after the * binary point (scaled by 16). The following is * equivalent to rfc793 smoothing with an alpha of .75 - * (rttvar = rttvar*3/4 + |delta| / 4). This replaces + * (rttvar = rttvar * 3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. */ if (delta < 0) @@ -2616,29 +2721,17 @@ tp->t_rxtshift = 0; /* + * 7. Recompute RTO timer. + * * The retransmit should happen at rtt + 4 * rttvar. * XXX: Backoff. * RFC2988, Section 2, Page 2-3, Cases 2.1 through 2.5 */ - tp->snd_rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) + - max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)), - tcp_rexmit_min); + rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) + + max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)), + tcp_rexmit_min); - /* - * We received an ack for a packet that wasn't retransmitted; - * it is probably safe to discard any error indications we've - * received recently. This isn't quite right, but close enough - * for now (a route might have failed after we sent a segment, - * and the return path might not be symmetrical). - * XXX: Doesn't belong here. - */ - tp->t_softerror = 0; - - /* - * Statistics. - */ - tp->t_rttupdated++; - tcpstat.tcps_rttupdated++; + return (rto); } /* @@ -2757,7 +2850,7 @@ */ int tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, - int tiwin, in acked, int tlen, int sacked) + int tiwin, int acked, int tlen, int sacked) { KASSERT(tp != NULL && th != NULL, @@ -2810,14 +2903,14 @@ * RFC793: section 3.7, page 42-44, "Managing the Window" * RFC1122: section 4.2.2.16 */ - if (SEQ_DELTA(tp->snd_nxt, tp->snd_una) + tiwin < tp->snd_wnd) + if (SEQ_DELTA(tp->snd_nxt, tp->snd_una + acked) + tiwin < tp->snd_wnd) tcplog("peer shrank the window"); /* * Update the window and keep track of this update. */ tp->snd_wnd = tiwin; - if (th->th_seq > tp->snd_wu_seq) + if (SEQ_GT(th->th_seq, tp->snd_wu_seq)) tp->snd_wu_seq = th->th_seq; if (tp->snd_wnd > tp->snd_maxwnd) tp->snd_maxwnd = tp->snd_wnd; @@ -2936,61 +3029,63 @@ } /* - * Determine a reasonable value for maxseg size. - * If the route is known, check route for mtu. - * If none, use an mss that can be handled on the outgoing - * interface without forcing IP to fragment. + * Determine a reasonable value for MSS size. If the route is known, + * check route for mtu. If none, use an MSS that can be handled on + * the outgoing interface without forcing IP to fragment. * If no route is found, route has no mtu, or the destination * isn't local, use a default, hopefully conservative size (usually * 512 or the default IP max size, but no more than the mtu of the * interface), as we can't discover anything about intervening * gateways or networks. - * We also initialize the congestion/slow start window to be a single - * segment if the destination isn't local. - * While looking at the routing entry, we also initialize other - * path-dependent parameters from pre-set or cached values in the - * routing entry. + * RFC793: section x * - * Also take into account the space needed for options that we - * send regularly. Make maxseg shorter by that amount to assure - * that we can send maxseg amount of data even when the options - * are present. Store the upper limit of the length of options plus - * data in maxopd. XXX: No longer needed. - * - * NOTE that this routine is only called when we process an incoming - * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). - * - * XXXAO: - * Split up and simplify this function. - * Move initialization of cached values into its own function. + * NB: If no offer received pass as zero. */ -void -tcp_mss(struct tcpcb *tp, int offer) +uint16_t +tcp_mss(struct in_conninfo *inc, int offer, int mtuflags) { - struct inpcb *inp = tp->t_inpcb; - struct socket *so = inp->inp_socket; - u_long bufsize; - u_long maxmtu; - int rtt, mss; - int origoffer = offer; - int mtuflags = 0; + uint16_t mss = 0; + uint32_t maxmtu = 0; + uint32_t thcmtu = 0; + int min_protoh; +#ifdef INET6 + int isipv6 = inc->inc_isipv6 ? 1 : 0; +#endif + + KASSERT(inc != NULL, + ("%s: NULL in_conninfo pointer", __func__)); + #ifdef INET6 - int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; + if (isipv6) { + mss = tcp_v6mssdflt; + maxmtu = tcp_maxmtu6(inc, mtuflags); + min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else #endif - struct hc_metrics_lite metrics; + { + mss = tcp_mssdflt; + maxmtu = tcp_maxmtu(inc, mtuflags); + min_protoh = sizeof(struct tcpiphdr); + } + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ /* - * Initialize. - * If there is no route to sender, - * we stay with the default mss. + * Determine MTU. */ - mss = tcp_mssopt(tcpcbtoinc(tp), &mtuflags); + if (maxmtu && thcmtu) + mss = min(maxmtu, thcmtu) - min_protoh; + else if (maxmtu || thcmtu) + mss = max(maxmtu, thcmtu) - min_protoh; + + if (offer == 0) + return (mss); /* * Prevent DoS attack with too small MSS. Round up * to at least minmss. */ offer = max(offer, tcp_minmss); + /* * Sanity check: make sure that maxopd will be large * enough to allow some data on segments even if the @@ -3000,14 +3095,61 @@ offer = max(offer, 64); /* - * maxopd stores the maximum length of data AND options - * in a segment; maxseg is the amount of data in a normal - * segment. We need to store this value (maxopd) apart - * from maxseg, because now every segment carries options - * and thus we normally have somewhat less data in segments. + * Use a symmetric MSS. It is very unlikely that we + * have a different MSS in on the way back. + * + * XXXAO: More comment */ - tp->snd_mss = mss = min(mss, offer); - tp->t_maxopd = mss; + mss = min(mss, offer); + + return (mss); +} + +/* + * Return the initial send window for a new connection or + * after an idle timeout. + * RFC3390: entire document + * + * min(4*MSS, max(2*MSS, 4380 bytes)) + * + * NB: MSS must already be initialized. + */ +int +tcp_init_cwnd(struct tcpcb *tp) +{ + int cwnd; + + if (tcp_do_rfc3390) + cwnd = min(4 * tp->snd_mss, max(2 * tp->snd_mss, 4380)); +#ifdef INET6 + else if (isipv6 && in6_localaddr(&inp->in6p_faddr)) + cwnd = tp->snd_mss * ss_fltsz_local; +#endif + else if (in_localaddr(inp->inp_faddr)) + cwnd = tp->snd_mss * ss_fltsz_local; + else + cwnd = tp->snd_mss * ss_fltsz; + + return (cwnd); +} + +/* + * Prime some TCP variables from cached values. + */ +static void +tcp_init_values(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + u_long bufsize; + u_long maxmtu; + int rtt, mss; + int origoffer = offer; + int mtuflags = 0; +#ifdef INET6 + int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; +#endif + struct hc_metrics_lite metrics; /* * rmx information is now retrieved from tcp_hostcache. @@ -3038,17 +3180,6 @@ min(tp->snd_wnd, so->so_snd.sb_hiwat))); else #endif - if (tcp_do_rfc3390) - tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); -#ifdef INET6 - else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || - (!isipv6 && in_localaddr(inp->inp_faddr))) -#else - else if (in_localaddr(inp->inp_faddr)) -#endif - tp->snd_cwnd = mss * ss_fltsz_local; - else - tp->snd_cwnd = mss * ss_fltsz; /* * If there's a pipesize, change the socket buffer to that size, @@ -3125,40 +3256,3 @@ tp->t_flags |= TF_TSO; } - -/* - * Determine the MSS option to send on an outgoing SYN. - */ -int -tcp_mssopt(struct in_conninfo *inc, int mtuflags) -{ - int mss = 0; - u_long maxmtu = 0; - u_long thcmtu = 0; - size_t min_protoh; -#ifdef INET6 - int isipv6 = inc->inc_isipv6 ? 1 : 0; -#endif - - KASSERT(inc != NULL, ("%s: NULL in_conninfo pointer", __func__)); - -#ifdef INET6 - if (isipv6) { - mss = tcp_v6mssdflt; - maxmtu = tcp_maxmtu6(inc, mtuflags); - min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - } else -#endif - { - mss = tcp_mssdflt; - maxmtu = tcp_maxmtu(inc, mtuflags); - min_protoh = sizeof(struct tcpiphdr); - } - thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ - if (maxmtu && thcmtu) - mss = min(maxmtu, thcmtu) - min_protoh; - else if (maxmtu || thcmtu) - mss = max(maxmtu, thcmtu) - min_protoh; - - return (mss); -} ==== //depot/projects/tcp_new/netinet/tcp_output.c#4 (text+ko) ==== @@ -27,11 +27,9 @@ * SUCH DAMAGE. * * @(#)tcp_output.c 8.4 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.139 2007/07/01 11:38:27 gnn Exp $ */ -#include -__FBSDID("$FreeBSD: src/sys/netinet/tcp_output.c,v 1.145 2007/11/30 23:46:51 bz Exp $"); - #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" @@ -74,85 +72,82 @@ #include #endif -#ifdef IPSEC +#ifdef FAST_IPSEC #include -#endif /*IPSEC*/ +#endif /*FAST_IPSEC*/ #include #include -#ifdef notyet -extern struct mbuf *m_copypack(); -#endif - -int path_mtu_discovery = 1; +int tcp_do_pmtud = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, - &path_mtu_discovery, 1, "Enable Path MTU Discovery"); - -int ss_fltsz = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz, 1, "Slow start flight size"); + &tcp_do_pmtud, 1, "Enable Path MTU Discovery"); -int ss_fltsz_local = 4; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW, - &ss_fltsz_local, 1, "Slow start flight size for local networks"); - -int tcp_do_newreno = 1; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW, - &tcp_do_newreno, 0, "Enable NewReno Algorithms"); - int tcp_do_tso = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW, - &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); + &tcp_do_tso, 0, "Enable TCP Segmentation Offload"); int tcp_do_autosndbuf = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW, - &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); + &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing"); int tcp_autosndbuf_inc = 8*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW, - &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); + &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer"); int tcp_autosndbuf_max = 256*1024; SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW, - &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); + &tcp_autosndbuf_max, 0, "Max size of automatic send buffer"); - /* * Tcp output routine: figure out what should be sent and send it. + * + * 1. How much to send, if any + * 1.1 subject to nagles algorithm (don't send small segments) + * 1.2 subject to send window + * 1.3 subject to congestion window + * 2. Send window probe (persist mode) + * 3. Send an outstanding ACK + * 3.1 subject to delayed ack + * 4. Send a window update + * 4.1 subject to silly window avoidance + * 4.2 subject to delayed ack + * 5. Send retransmit + * 6. Send urgent data + * 7. Send based on flags */ int tcp_output(struct tcpcb *tp) { - struct socket *so = tp->t_inpcb->inp_socket; - long len, recwin, sendwin; - int off, flags, error; - struct mbuf *m; - struct ip *ip = NULL; - struct ipovly *ipov = NULL; - struct tcphdr *th; + int off, flags, error, optlen; + tcp_win len, recwin, swin; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + struct tcphdr ths; + struct tcpopt to; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen; -#ifdef IPSEC - unsigned ipsec_optlen = 0; +#ifdef TCP_SIGNATURE + int sigoff = 0; #endif - int idle, sendalot; - int sack_rxmit, sack_bytes_rxmt; - struct sackhole *p; - int tso = 0; - struct tcpopt to; -#if 0 - int maxburst = TCP_MAXBURST; -#endif -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; - int isipv6; + INP_LOCK_ASSERT(tp->t_inpcb); + KASSERT(tp->t_state > TCPS_LISTEN, + ("%s: TCPS_LISTEN invalid", __func__)); + KASSERT(tp->t_state != TCPS_SYN_RECEIVED, + ("%s: TCPS_SYN_RECEIVED invalid", __func__)); + KASSERT(tp->t_state < TCPS_TIME_WAIT, + ("%s: TCPS_TIME_WAIT invalid", __func__)); - isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; -#endif + KASSERT(SEQ_GEQ(tp->snd_rxmit, tp->snd_una), + ("%s: snd_rxmit < snd_una", __func__)) + KASSERT(SEQ_LEQ(tp->snd_rxmit, tp->snd_nxt), + ("%s: snd_rxmit > snd_nxt", __func__)) - INP_LOCK_ASSERT(tp->t_inpcb); + /* + * Get standard flags. Removal of inappropriate flags for a + * specific segment is handled by the segmentation code. + */ + flags = tcp_outflags[tp->t_state]; /* * Determine length of data that should be transmitted, @@ -160,792 +155,610 @@ * If there is some data or critical controls (SYN, RST) * to send, then transmit; otherwise, investigate further. */ - idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) { - /* - * We have been idle for "a while" and no acks are - * expected to clock out any data we send -- - * slow start to get ack "clock" running again. - * - * Set the slow-start flight size depending on whether - * this is a local network or not. - */ - int ss = ss_fltsz; -#ifdef INET6 - if (isipv6) { - if (in6_localaddr(&tp->t_inpcb->in6p_faddr)) - ss = ss_fltsz_local; - } else -#endif /* INET6 */ - if (in_localaddr(tp->t_inpcb->inp_faddr)) - ss = ss_fltsz_local; - tp->snd_cwnd = tp->t_maxseg * ss; - } - tp->t_flags &= ~TF_LASTIDLE; - if (idle) { - if (tp->t_flags & TF_MORETOCOME) { - tp->t_flags |= TF_LASTIDLE; - idle = 0; - } - } -again: - /* - * If we've recently taken a timeout, snd_max will be greater than - * snd_nxt. There may be SACK information that allows us to avoid - * resending already delivered data. Adjust snd_nxt accordingly. - */ - if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) - tcp_sack_adjust(tp); - sendalot = 0; - off = tp->snd_nxt - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); + + - flags = tcp_outflags[tp->t_state]; /* - * Send any SACK-generated retransmissions. If we're explicitly trying - * to send out new data (when sendalot is 1), bypass this function. - * If we retransmit in fast recovery mode, decrement snd_cwnd, since - * we're replacing a (future) new transmission with a retransmission - * now, and we previously incremented snd_cwnd in tcp_input(). + * We have been idle for "a while" and no acks are + * expected to clock out any data we send -- + * slow start to get ack "clock" running again. + * + * Set the slow-start flight size depending on whether + * this is a local network or not. */ - /* - * Still in sack recovery , reset rxmit flag to zero. - */ - sack_rxmit = 0; - sack_bytes_rxmt = 0; - len = 0; - p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) && - (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { - long cwin; - - cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - /* Do not retransmit SACK segments beyond snd_recover */ - if (SEQ_GT(p->end, tp->snd_recover)) { - /* - * (At least) part of sack hole extends beyond - * snd_recover. Check to see if we can rexmit data - * for this hole. - */ >>> TRUNCATED FOR MAIL (1000 lines) <<<