Date: Sun, 22 Mar 2009 12:20:29 GMT
From: Andre Oppermann <andre@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 159606 for review
Message-ID: <200903221220.n2MCKT8b016089@repoman.freebsd.org>
index | next in thread | raw e-mail
http://perforce.freebsd.org/chv.cgi?CH=159606 Change 159606 by andre@andre_t61 on 2009/03/22 12:20:02 Checkpoint WIP. Affected files ... .. //depot/projects/tcp_new/netinet/tcp_input.c#8 edit .. //depot/projects/tcp_new/netinet/tcp_output.c#5 edit .. //depot/projects/tcp_new/netinet/tcp_timer.c#2 edit .. //depot/projects/tcp_new/netinet/tcp_var.h#4 edit Differences ... ==== //depot/projects/tcp_new/netinet/tcp_input.c#8 (text+ko) ==== @@ -1686,7 +1686,7 @@ ("%s: tlen < 0", __func__)); /* - * If new data is received on a connection after the + * <<If new data is received on a connection after the * socket is closed or the user process is gone, and * doesn't has a file descriptor reference anymore, * send an RST the other end. This is an artifact @@ -1697,7 +1697,7 @@ * won't be delivering it to an application. And we * can't just wait here and drop the data into a void * until the other side gives up as that could go on - * forever. + * forever.>> * Stevens Vol.2: section 28.8, page 957, lines 687-696 * * NB: Segments without any data but ack'ing our FIN are @@ -1731,8 +1731,6 @@ /* * Update send SACK information and tell us how much more * data has left the network (relative to last SACK we got). - * XXXAO: Determine if there was a duplicate ACK going on - * based on the changes of the SACK information. */ if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes)) sacked = tcp_sack_doack(tp, &to, th->th_ack); @@ -1755,13 +1753,20 @@ tcp_do_time(tp, th, &to, acked, tlen, sacked); /* + * Process the ACK to advance the unacknowledged pointer, + * or to detect duplicate ACKs. + */ + tcp_do_ack(tp, th, tiwin, acked, tlen, sacked); + + /* * Update congestion control information. 
*/ - nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked); + tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked); + KASSERT(tp->snd_cwnd > tp->snd_mss, + ("%s: cwnd < 1*mss after congestion control function", __func__)); /* - * Drop acknowledged data from send socket buffer - * and advance the unacknowledged pointer. + * Drop acknowledged data from send socket buffer. * RFC793: section 3.9, page 72, fifth check */ if (acked > 0) @@ -1789,11 +1794,6 @@ } /* - * Advance the unacknowledged pointer. - */ - tp->snd_una = th->th_ack; - - /* * Wake up and inform any writers on the socket. * * NB: sowwakeup_locked() does an implicit unlock. @@ -1811,6 +1811,10 @@ ("%s: got ack for FIN but haven't sent FIN yet", __func__)); + KASSERT(!tcp_timer_active(TT_RXMIT), + ("%s: ourfinisacked but RXMIT still active", + __func__); + /* * Handle ack'ed FIN according to previous state. */ @@ -1892,19 +1896,6 @@ * NB: Continue with segment. */ } - - /* - * Stop the retransmit timer if all data we sent - * has been acknowledged. Otherwise restart it - * if we still have outstanding data. - * - * XXXAO: Refine the test. The TF_NEEDFIN may not - * enough. - */ - if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN)) - tcp_timer_activate(TT_RXMIT, 0); - else - tcp_timer_activate(TT_RXMIT, tp->snd_rto); } /* @@ -1947,7 +1938,7 @@ * segment with urgent that got pulled and now is zero */ if (!TCPS_HAVERCVDFIN(tp->t_state) && - (tlen > 0 || (tp->rcv_trq != NULL && th->th_flags & TH_FIN))) { + (tlen > 0 || (tp->rcv_trq != NULL && (th->th_flags & TH_FIN)))) { int newsize = 0; /* Rcvbuf autoscaling. */ /* @@ -2214,6 +2205,7 @@ * the ACK for our FIN. */ tcp_twstart(tp); + tp = NULL; INP_INFO_WUNLOCK(&tcbinfo); goto done; @@ -2262,15 +2254,19 @@ * delayed ACK timer and be done. * * XXXAO: Multi-delack? + * XXXAO: Always call into tcp_output and have it decide what to do. 
*/ + (void)tcp_output(tp, TPO_TINPUT); +#if 0 if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 || nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) || !tcp_delack_enabled) { - (void) tcp_output(tp); + (void) tcp_output(tp, TPO_TINPUT); } else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) { tp->snd_delack++; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } +#endif INP_UNLOCK(tp->t_inpcb); return; @@ -2295,7 +2291,7 @@ */ tp->t_flags |= TF_ACKNOW; m_freem(m); - (void) tcp_output(tp); + (void) tcp_output(tp, TPO_TINPUT); INP_UNLOCK(tp->t_inpcb); return; @@ -2848,7 +2844,7 @@ * on segments without ACK. The SYN_RECEIVED case is completely handled * in syncache and the initialization is done there. */ -int +static int tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, int tiwin, int acked, int tlen, int sacked) { @@ -2912,8 +2908,8 @@ tp->snd_wnd = tiwin; if (SEQ_GT(th->th_seq, tp->snd_wu_seq)) tp->snd_wu_seq = th->th_seq; - if (tp->snd_wnd > tp->snd_maxwnd) - tp->snd_maxwnd = tp->snd_wnd; + if (tp->snd_maxwnd < tiwin) + tp->snd_maxwnd = tiwin; /* * Force a call to tcp_output only if we have data to send. @@ -2924,6 +2920,53 @@ return (0); } +static void +tcp_do_ack(tp, th, tiwin, acked, tlen, sacked) +{ + /* + * Without SACK detecting a duplicate ACK is based on an + * empty segment with the same ACK as we already know and + * the same advertised receive window. Otherwise we could + * mistake a simple window update for a duplicate ACK. + * + * With SACK is gets much simpler. Any increase in the + * sack'ed data equals to a duplicate ACK. + * + * Things become difficult when we have an ongoing two-way + * data exchange. Here the receiver seeing the loss has + * add new SACK information or to prevent the transmission + * of new data to make the ACK segment detectable as duplicate + * ACK. + * + * XXXAO: This is not entirely correct as it allows for other + * packets between the duplicate ACKs. 
+ */ + if (sacked > 0 || + (tlen == 0 && acked == 0 && SEQ_LT(tp->snd_una, tp->snd_nxt) && tp->snd_wnd == tiwin)) + tp->snd_dupack += 1; + else if (acked > 0 && tp->snd_dupack > 0) + tp->snd_dupack = 0; + + KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0, + ("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__)); + + /* + * Advance the unacknowledged pointer. + */ + tp->snd_una += acked; + + /* + * Stop the retransmit timer if all data we sent has been + * acknowledged. Otherwise restart it if we still have + * outstanding data. + */ + if (tp->snd_una == tp->snd_nxt) + tcp_timer_activate(TT_RXMIT, 0); + else if (acked > 0) + tcp_timer_activate(TT_RXMIT, tp->snd_rto); + +} + /* * Process urgent data in TCP segments. * ==== //depot/projects/tcp_new/netinet/tcp_output.c#5 (text+ko) ==== @@ -103,6 +103,13 @@ /* * Tcp output routine: figure out what should be sent and send it. * + * We get here through: + * 1. write/send/etc + * 2. tcp_input (not always) + * 3. read/recfrom + * 4. delayed ACK, retransmission or persistent timeout + * + * Our work is to find out: * 1. How much to send, if any * 1.1 subject to nagles algorithm (don't send small segments) * 1.2 subject to send window @@ -111,14 +118,14 @@ * 3. Send an outstanding ACK * 3.1 subject to delayed ack * 4. Send a window update - * 4.1 subject to silly window avoidance + * 4.1 subject to silly window avoidance (don't send small window updates) * 4.2 subject to delayed ack * 5. Send retransmit * 6. Send urgent data * 7. Send based on flags */ int -tcp_output(struct tcpcb *tp) +tcp_output(struct tcpcb *tp, int reason) { int off, flags, error, optlen; tcp_win len, recwin, swin; @@ -150,21 +157,23 @@ flags = tcp_outflags[tp->t_state]; /* - * Determine length of data that should be transmitted, - * and flags that will be used. - * If there is some data or critical controls (SYN, RST) - * to send, then transmit; otherwise, investigate further. + * Determine our current receive window. 
+ * This value is used for the window field in the TCP + * header and to determine whether we have to send a + * window update. + * + * NB: rwin is already scaled. */ - + rwin = tcp_rcv_wnd(tp, so); - /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- * slow start to get ack "clock" running again. + * RFC2581: Restart window. * - * Set the slow-start flight size depending on whether - * this is a local network or not. + * XXXAO: Use a decaying algorithm. It's not useful + * to have cwnd to drop of a cliff. */ if (tp->snd_nxt == tp->snd_una && (ticks - tp->t_rcvtime) >= max(tp->t_rxtcur, tcp_min_idle)) { @@ -172,12 +181,9 @@ } /* - * Compute our current receive window. - * XXXAO: Handle window updates. - */ - rwin = tcp_rcv_wnd(tp, so); - - /* + * Determine length of data that should be transmitted, if there + * is some data to send, then transmit; otherwise, investigate further. + * * First step: how much to send. * * Check out our send window. @@ -192,41 +198,45 @@ * c) how much data we have to send * d) the pacing algorithm (optional) * - * XXXAO: Add output pacing where one can limit the amount - * of data that is sent in a time period through a socket - * option. + * duna = unacknowledged data in flight + * swnd = remaining space in send window as advertised by remote end + * cwnd = congestion window, remaing amount of data that can be unacknowledged in flight + * dlen = remaing amount of data in send buffer available for sending + * len = amount of data we have *and* can send righ now + * + * <- duna -><- swnd -> + * <- cwnd -> + * <-dlen-> + * seq .......|+++++++++xxxxxxxx---z---------|....... 
+ * ^ ^ + * snd_una snd_nxt * - * XXXAO */ - swin = tp->snd_wnd - (tp->snd_nxt - tp->snd_una); - swin = tp->snd_wnd - tp->snd_inflight; /* XXXAO: Alternative, SACK */ + duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una); + swnd = imax(0, tp->snd_wnd - duna); + cwnd = imax(0, tp->snd_cwnd - duna); + dlen = so->so_snd.sb_cc - duna; + len = min(dlen, min(swnd, cwnd)); - len = min(swin, tp->snd_cwnd); - len = so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una); - len = tcp_snd_pace(tp, len); /* XXXAO: todo token bucket */ - - if (tp->t_flags & TF_REXMT) { - len = tcp_snd_rexmt(tp, len); - goto send; + if (len > 0 && (tp->t_flags & TF_PACE)) { + len = tcp_snd_pace(tp, len); /* XXXAO: todo token bucket, mss sized */ + if (len == 0) + return (0); /* next token is pending */ } - /* - * Second step: Do we send? - */ - if (tp->t_flags & TF_ACKNOW) - goto send; + inflight = duna - tp->snd_sacked; /* * Send out a SYN immediatly. */ - if (flags & TH_SYN) + if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN)) goto send; /* * If our state indicates that FIN should be sent * and we have not yet done so, then we need to send. */ - if (flags & TH_FIN) { + if ((flags & TH_FIN) && !(tp->t_flags & TF_SENTFIN)) { /* * All data is already sent and only the FIN is outstanding. */ @@ -238,11 +248,23 @@ * if the window is big enough. Do not care about nagle * and others. Otherwise things will go their normal way. */ - if (len <= snd_wnd) + if (len > 0) goto send; } /* + * Pending ACK? + */ + if (tp->t_flags & TF_ACKNOW) + goto send; + if (SEQ_LT(tp->snd_lastack, tp->snd_nxt) && !(tp->t_flags & TF_DELACK)) + goto send; + if (tp->t_flags & TF_DUPACK) { + len = 0; + goto send; + } + + /* * Sender silly window avoidance. 
We transmit under the following * conditions when len is non-zero: * @@ -254,11 +276,17 @@ * data (receiver may be limited the window size) * - We need to retransmit * + * The idea behind delayed ACK is twofold: + * a) aggregate multiple ACKs together + * b) aggregate the response from application with the ACK + * In both cases the events are probably very close together + * and thus the delayed ACK time should be very short. + * * a) Nagle algorithm: tinygram problem * b) silly window syndrome: buffer almost full * * Quoting Nagle: - * The concept behind delayed ACKs is to bet, when receiving some data from the net, + * <<The concept behind delayed ACKs is to bet, when receiving some data from the net, * that the local application will send a reply very soon. So there's no need to * send an ACK immediately; the ACK can be piggybacked on the next data going the * other way. If that doesn't happen, after a 500ms delay, an ACK is sent anyway. @@ -277,49 +305,119 @@ * an ACK is a bet that the local application will reply to the data just received. * Some apps, like character echo in Telnet servers, do respond every time. Others, * like X-Windows "clients" (really servers, but X is backwards about this), only reply - * some of the time. + * some of the time.>> * http://developers.slashdot.org/comments.pl?sid=174457&threshold=1&commentsort=0&mode=thread&cid=14515105 * * XXXAO: mss - options! */ if (len) { + /* + * Always send if there is no outstanding data in flight. + */ if (tp->snd_nxt == tp->snd_una) goto send; + + /* + * Always send if NODELAY is enabled. This gives at least + * one segment per application write no matter how small + * the amount of data. + */ if (tp->t_flags & TF_NODELAY) goto send; + + /* + * Always send if we have more than one MSS worth of data. + */ if (len >= tp->snd_mss) goto send; + + /* + * For small windows send if we have half a window worth + * of data. 
+ */ if (tp->snd_maxwnd > 0 && len >= tp->snd_maxwnd / 2) goto send; } /* - * Send window update? We only send them if the window opened - * up again either because the socket buffer was drained or + * Persistent mode. + * Send out probe byte if there is data available. + * RFC793: section 3.7, page 42-44 + * RFC1122: section 4.2.2.17 + */ + if (swnd == 0 && dlen > 0 && (tp->t_flags & TF_FORCEDATA)) { + len = 1; + goto send; + } + if (swnd == 0 && duna > tp->snd_wnd) { + /* + * Window shrank + * after we sent into it. If window shrank to 0, + * cancel pending retransmit, pull snd_nxt back + * to (closed) window, and set the persist timer + * if it isn't already going. If the window didn't + * close completely, just wait for an ACK. + */ + tcp_timer_activate(tp, TT_REXMT, 0); + tp->t_rxtshift = 0; + if (!tcp_timer_active(tp, TT_PERSIST)) + tcp_setpersist(tp); + } + + /* + * Send window update? + * + * The receive window informs the remote side about the + * remaining space in our receive buffer. We only send + * window updates if the socket buffer was drained or * enlarged. - * When the application reads data from the socket we get notified - * to potentially inform the remote end about more receive space. + * + * When the application reads (and by it removes) data + * from the receive buffer we get notified and have to + * decide whether the change justifies a window update + * segment. + * + * We must avoid to the silly window syndrome whereas + * every read from the receive buffer, no matter how + * small, causes a window update to be sent. + * + * To prevent this we employ a silly window avoidance + * algorithm which causes updates to the window only + * when the new window is enlarged by at least two MSS + * sized segments. This part is done by tcp_rcv_wnd() + * and already incorporated into the rwin value we got. * - * XXXAO: Do not send many small window updates if we are not - * expecting more data and there was enough space adversized - * the last time. 
+ * Our logic to determine whether to send an independent + * window update segment is more stringent. We only + * send window updates if the new space in the receive + * buffer is at least double the previous value. This + * prevents a flurry of independent window updates when + * the socket buffer has queued a lot of data and the + * application is doing small reads. This may leave + * some available space in the receive buffer not + * advertised to the remote side. As soon as it is + * sending data again our resulting ACKs will contain + * full value and no stalling will happen. * - * NB: Do not send window updates if the remote end won't send + * Independent window updates are not sent if a delayed + * ACK is pending. There we can simply piggy back the + * new window information on the pending ACK. Neither + * do we send window updates if we have received a FIN. + * It would be pointless as we are unable to receive * more data. + * + * RFC793: section 3.7, page 42-44 + * RFC1122: section 4.2.2.16 + * Stevens Vol.2: section 26.3, page 858-861, figure 26.8 */ - if (!TCPS_HAVERCVDFIN(tp->t_state) && rwin > tp->rcv_advwnd) { - delta = rwin - tp->rcv_advwnd; - - if (delta >= 2 * tp->snd_mss) - goto send; - if (2 * delta >= (long)so->so_rcv.sb_hiwat) + if (tp->rcv_advwin < rwin && !(tp->t_flags & TF_DELACK) && + !TCPS_HAVERCVDFIN(tp->t_state)) + if (rwin >= 2 * tp->rcv_advwin) goto send; - } /* * No reason to send a segment, just return. */ - SOCKBUF_UNLOCK(&so->so_snd); return (0); send: @@ -415,17 +513,20 @@ * According to RFC1323 the window field in a SYN (i.e., a <SYN> * or <SYN,ACK>) segment itself is never scaled. The <SYN,ACK> * case is handled in syncache. + * + * XXXAO: when sending dup-acks do not mix with window updates, otherwise + * the logic at the receiver may mistake the dup-ack + * XXXAO: rwin is already scaled. 
*/ if (flags & TH_SYN) th->th_win = (u_short)(min(rwin, TCP_MAXWIN)); + else if (tp->t_flags & TF_DUPACK) + th->th_win = (u_short)tp->rcv_advwin; else th->th_win = (u_short)(rwin >> tp->rcv_scale); /* * Fill in fields. - * - * XXXAO: remembering maximum advertised window for - * use in delaying messages about window sizes. */ if (tp->snd_nxt == tp->snd_rxmit) { th->th_seq = tp->snd_nxt; @@ -436,8 +537,7 @@ } /* - * If resending a FIN, be sure not to use a new sequence number. - * XXXAO: Resending SYN? + * If resending a SYN or FIN, be sure not to use a new sequence number. */ if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN)) th->th_seq--; @@ -453,7 +553,10 @@ SOCKBUF_UNLOCK(&so->so_snd); /* - * NB: len > 0 means we sent this much data w/o an error. + * NB: len > 0 means we sent this much data w/o error. + * error == 0 means we sent at least a single segment w/o error. + * + * XXXAO: Avoid unconditional writes to the tcpcb. */ if (len > 0) { /* @@ -463,35 +566,45 @@ tp->snd_nxt += len; else tp->snd_rxmit += len; + } + if (error == 0) { /* - * Data sent (as far as we can tell). - * If this advertises a larger window than any other segment, - * then remember the size of the advertised window. - * Any pending ACK has now been sent. + * Integrate FIN into sequence space. */ - if (rwin > 0 && SEQ_GT(tp->rcv_nxt + rwin, tp->rcv_adv)) - tp->rcv_adv = tp->rcv_nxt + rwin; + if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) { + tp->snd_nxt++; + tp->t_flags |= TF_SENTFIN; + } } - if (error == 0) { + if (len > 0 || error == 0) { /* - * Integrate SYN and FIN into sequence space. - * XXXAO: If we send data with SYN this breaks. + * Integrate SYN into sequence space. */ if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN)) { tp->snd_nxt++; tp->t_flags |= TF_SENTSYN; } - if ((flags & TH_FIN) && !(tp-t_flags & TF_SENTFIN)) { - tp->snd_nxt++; - tp->t_flags |= TF_SENTFIN; - } + + /* + * Any pending ACK has been sent. + * Clear related flags and disarm the delayed ACK timer. 
+ */ + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + if (SEQ_LT(tp->snd_lastack, tp->rcv_nxt) + tp->snd_lastack = tp->rcv_nxt; + if (tcp_timer_active(tp, TT_DELACK)) + tcp_timer_activate(tp, TT_DELACK, 0); /* * Remember last advertised receive window. + * We need this information to send proper + * duplicate ACKs and to know whether we + * have to send a window update later on. */ - tp->rcv_advwnd = rwin; + if (tp->rcv_advwin != rwin) + tp->rcv_advwin = rwin; /* * Adjust the RXWIN0SENT flag - indicate that we have advertised @@ -507,15 +620,6 @@ tp->t_flags &= ~TF_RXWIN0SENT; } - if (len > 0 || error == 0) { - if (SEQ_LT(tp->last_ack_sent, tp->rcv_nxt) - tp->last_ack_sent = tp->rcv_nxt; - - tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); - if (tcp_timer_active(tp, TT_DELACK)) - tcp_timer_activate(tp, TT_DELACK, 0); - } - if (len > 0 && error == 0) { if ((tp->t_flags & TF_FORCEDATA) == 0 || !tcp_timer_active(tp, TT_PERSIST)) @@ -639,7 +743,7 @@ } tcpstat.tcps_sndtotal++; - if (tp->t_flags & TF_ACKNOW) /* XXXAO: test whether we increased last_ack_sent */ + if (tp->t_flags & TF_ACKNOW) /* XXXAO: test whether we increased snd_lastack */ tcpstat.tcps_sndacks++; else if (flags & (TH_SYN|TH_FIN|TH_RST)) tcpstat.tcps_sndctrl++; @@ -651,131 +755,12 @@ return (0); } -int -tcp_junk() -{ - /* - * If in persist timeout with window of 0, send 1 byte. - * Otherwise, if window is small but nonzero - * and timer expired, we will send what we can - * and go to transmit state. - */ - if (tp->t_flags & TF_FORCEDATA) { - if (snd_wnd == 0) { - /* - * If we still have some data to send, then - * clear the FIN bit. Usually this would - * happen below when it realizes that we - * aren't sending all the data. However, - * if we have exactly 1 byte of unsent data, - * then it won't clear the FIN bit below, - * and if we are in persist state, we wind - * up sending the packet without recording - * that we sent the FIN bit. 
- * - * We can't just blindly clear the FIN bit, - * because if we don't have any more data - * to send then the probe will be the FIN - * itself. - */ - if (off < so->so_snd.sb_cc) - flags &= ~TH_FIN; - snd_wnd = 1; - } else { - tcp_timer_activate(tp, TT_PERSIST, 0); - tp->t_rxtshift = 0; - } - - if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ - goto send; - /* - * TCP window updates are not reliable, rather a polling protocol - * using ``persist'' packets is used to insure receipt of window - * updates. The three ``states'' for the output side are: - * idle not doing retransmits or persists - * persisting to move a small or zero window - * (re)transmitting and thereby not persisting - * - * If send window is too small, there is data to transmit, and no - * retransmit or persist is pending, then go to persist state. - * If nothing happens soon, send when timer expires: - * if window is nonzero, transmit what we can, - * otherwise force out a byte. - * XXX: We don't force anything here, only return!? - */ - if (len > 0 && !tcp_timer_active(tp, TT_REXMT) && - !tcp_timer_active(tp, TT_PERSIST)) { - tp->t_rxtshift = 0; - tcp_setpersist(tp); - } - } - - /* - * Urgent data pending. - */ - if (SEQ_GT(tp->snd_up, tp->snd_una)) - goto send; - - if (len < 0) { - /* - * If FIN has been sent but not acked, - * but we haven't been called to retransmit, - * len will be < 0. Otherwise, window shrank - * after we sent into it. If window shrank to 0, - * cancel pending retransmit, pull snd_nxt back - * to (closed) window, and set the persist timer - * if it isn't already going. If the window didn't - * close completely, just wait for an ACK. - */ - len = 0; - if (snd_wnd == 0) { - tcp_timer_activate(tp, TT_REXMT, 0); - tp->t_rxtshift = 0; - if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_setpersist(tp); - } - } - - /* - * TSO may only be used if we are in a pure bulk sending state. 
The - * presence of TCP-MD5, SACK retransmits, SACK advertizements and - * IP options prevent using TSO. With TSO the TCP header is the same - * (except for the sequence number) for all generated packets. This - * makes it impossible to transmit any options which vary per generated - * segment or packet. - */ - if ((tp->t_flags & TF_TSO) && tcp_do_tso && - ((tp->t_flags & TF_SIGNATURE) == 0) && - inp->inp_options == NULL && - inp->in6p_options == NULL && - inp->inp_sp == NULL) /* XXXAO: update */ - tso = 1; - -#if 0 - /* - * Urgent pointer handling. - */ - if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { - th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt)); - th->th_flags |= TH_URG; - } else { - /* - * If no urgent pointer to send, then we pull - * the urgent pointer to the left edge of the send window - * so that it doesn't drift into the send window on sequence - * number wraparound. - */ - tp->snd_up = tp->snd_una; /* drag it along */ - } -#endif -} - /* * Do a retransmit from snd_nxt or a later point. This is separate * from the normal transmit case as the logic is quite a bit different. */ static int -tcp_do_retransmit() +tcp_retransmit(struct tcpcb *tp, int len) { /* @@ -786,27 +771,12 @@ */ /* * We have the following mechanisms: - * 1. Fast retransmit: After we get three duplicate ACKs - * 2. NewReno Fast recovery RFC3782 + * 1. Fast recovery: After we get three duplicate ACKs RFC2581 + * 2. NewReno RFC3782 * 3. Limited transmit RFC3042 * 4. SACK tells us where to send how much data RFC3517 */ - /* - * XXXAO: remembering maximum advertised window for - * use in delaying messages about window sizes. - */ - if (tp->snd_nxt == tp->snd_rxmit) { - th->th_seq = tp->snd_nxt; - off = tp->snd_nxt - tp->snd_una; - } else { - th->th_seq = tp->snd_rxmit; - off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc); - } - /* - * Check if we have to remove FIN on SACK retransmits. 
- */ - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) - flags &= ~TH_FIN; + } /* @@ -843,6 +813,9 @@ #ifdef FAST_IPSEC /* * NB: This is an expensive operation and involves memory allocation. + * + * XXXAO: If the IPSEC header size doesn't change during a session + * lifetime we could compute the number at establishment time. */ linkhdr += (int)ipsec_hdrsiz_tcp(tp); #endif @@ -1006,6 +979,14 @@ ("%s: data beyond FIN", __func__); /* + * Set the PUSH bit to indicate that we have reached + * the end of the send buffer. + */ + if (off + slen == so->so_snd.sb_cc) { + th->th_flags =| TH_PSH; + } + + /* * If we're sending everything we've got, set PUSH. * This will keep happy those implementations which * only give data to the user when a buffer fills or @@ -1124,49 +1105,86 @@ /* - * Shall we send data or not? - * And what window shall we advertize? + * Calculate and update our current receive window. + * Return the scaled receive window. */ -static int +static u_int tcp_rcv_wnd(struct tcpcb *tp, struct socket *so) { - int delta; + int delta, rwin; KASSERT(SEQ_GEQ(tp->rcv_wnd, tp->rcv_nxt), - ("%s: ", __func__)); - - delta = sbspace(so->so_rcv) - SEQ_DELTA(tp->rcv_wnd, tp->rcv_nxt); + ("%s: receive window below rcv_nxt", __func__)); /* - * Determine if we should send window update. - * Silly window avoidance: Only send window update - * if we've got at least two segments of space. - * If the socket buffer was shrunk then delta is - * a negative value. + * Calculate the amount of space in the receive buffer relative + * to the current end of the receive window. If the receive + * buffer was shrunk delta becomes negative. + * + * <- sb_hiwat -> + * <- sb_cc -> + * seq .......|++++++++++------------------z-|....... + * ^ ^ + * rcv_nxt rcv_wnd + * + * XXXAO: To avoid the locking overhead tcp_usr_rcvd could update + * a rcv_read pointer. 
*/ + SB_LOCK(so->so_rcv); + if (so->so_rcv.sb_hiwat - so->so_rcv.sb_cc > 0) + delta = SEQ_DELTA(tp->rcv_wnd - so->so_rcv.sb_hiwat, + tp->rcv_nxt - so->so_rcv.sb_cc); + else + delta = so->so_rcv.sb_hiwat - so->so_rcv.sb_cc; + SB_UNLOCK(so->so_rcv); + /* - * - if socket buffer is less than 1/4 free, send many updates - * - piggy back window update on delayed ack - * - if socket buffer > 1/4 free send updates only from time to time - * - when sending dup-acks do not mix with window updates, otherwise - * the logic at the receiver may mistake the dup-ack - * - the new value must be larger than the minimal unscaled increment - * - if delta is more than 50% or we reach the full window + * Silly window avoidance: Only grow the window if we've + * got at least two segments of additional space available. + * Take into account the granularity of the window scale + * shift. + * + * NB: We do not shrink the window even if the receive + * buffer was shrunk on us. We won't re-open the window + * as more data comes in though. + * + * RFC793: section 3.7, page 42-44 + * RFC1122: section 4.2.2.16 + * Stevens Vol.2: section 26.3, page 858-861 */ if (delta > 0 && (delta >> tp->rcv_scale) > 0 && - delta >= 2 * tp->snd_mss) { + (tp->rcv_scale << (delta >> tp->rcv_scale)) >= 2 * tp->snd_mss) tp->rcv_wnd += delta; - } + + /* + * Report shrunk socket buffers. + */ if (delta < 0) tcp_log("our receive socket buffer was shrunk"); - rwin = (tp->rcv_wnd - tp->rcv_nxt) - so->so_rcv.sb_cc; + /* + * Our current open receive window to be advertized is + * the remaining space in the socket buffer. + */ + rwin = SEQ_DELTA(tp->rcv_wnd - tp->rcv_nxt); + + return (rwin >> tp->rcv_scale); +} - return (rwin); +/* + * Pace the segment stream by limiting the amount of data + * that is sent per time unit (tocken bucket). + * + * NB: Never go below one MSS per time unit. 
+ */ +static void +tcp_snd_pace(struct tcpcp *tp) +{ + return; +} -void -tcp_snd_autoscale(struct tcpcb *tp) +static void +tcp_snd_autoscale(struct tcpcb *tp, int swnd) { /* * Automatic sizing of send socket buffer. Often the send buffer @@ -1219,7 +1237,7 @@ if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) && so->so_snd.sb_cc < tcp_autosndbuf_max && - sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { + swin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) { if (!sbreserve_locked(&so->so_snd, min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc, tcp_autosndbuf_max), so, curthread))
>>> TRUNCATED FOR MAIL (1000 lines) <<<
help
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200903221220.n2MCKT8b016089>
