From owner-p4-projects@FreeBSD.ORG Sat Jul 18 20:34:32 2009 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id E06E71065693; Sat, 18 Jul 2009 20:34:31 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 850C01065691 for ; Sat, 18 Jul 2009 20:34:31 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 71B718FC08 for ; Sat, 18 Jul 2009 20:34:31 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.3/8.14.3) with ESMTP id n6IKYVjO013394 for ; Sat, 18 Jul 2009 20:34:31 GMT (envelope-from andre@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.3/8.14.3/Submit) id n6IKYVZ9013392 for perforce@freebsd.org; Sat, 18 Jul 2009 20:34:31 GMT (envelope-from andre@freebsd.org) Date: Sat, 18 Jul 2009 20:34:31 GMT Message-Id: <200907182034.n6IKYVZ9013392@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to andre@freebsd.org using -f From: Andre Oppermann To: Perforce Change Reviews Cc: Subject: PERFORCE change 166244 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 18 Jul 2009 20:34:33 -0000 http://perforce.freebsd.org/chv.cgi?CH=166244 Change 166244 by andre@andre_t61 on 2009/07/18 20:34:11 Update and enhance comments. More refined loss recovery handling. tcp_do_ack() now handles fast recovery entry and exit. Add pipe size tracking for SACK based loss recovery. Move limited transmit directly into tcp_output(). Add SACK based recovery to tcp_retransmit(). Affected files ... .. 
//depot/projects/tcp_new/netinet/tcp_input.c#14 edit .. //depot/projects/tcp_new/netinet/tcp_output.c#14 edit .. //depot/projects/tcp_new/netinet/tcp_sack.c#10 edit .. //depot/projects/tcp_new/netinet/tcp_var.h#14 edit Differences ... ==== //depot/projects/tcp_new/netinet/tcp_input.c#14 (text+ko) ==== @@ -1733,6 +1733,8 @@ /* * Update send SACK information and tell us how much more * data has left the network (relative to last SACK we got). + * RFC2018: section 5 + * RFC3517: section 4 Update(), section 5 first sentence and (B) */ if ((to.to_flags & TOF_SACK) || !RB_EMPTY(&tp->snd_sackblocks)) sacked = tcp_sack_doack(tp, &to, th->th_ack); @@ -1762,8 +1764,11 @@ /* * Update congestion control information. + * NB: The algorithm must not increase cwnd when acked is zero. */ - //tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked); + if (tp->t_phase < TP_LOSSRECOV) + tcp_cc_ack(tp, th, tiwin, acked, tlen, sacked); + KASSERT(tp->snd_cwnd > tp->snd_mss, ("%s: cwnd < 1*mss after congestion control function", __func__)); @@ -2953,19 +2958,45 @@ tp->snd_dupack++; else if (tp->snd_dupack > 0 && (acked > 0 || SEQ_GT(th->th_seq, tp->snd_una))) tp->snd_dupack = 0; - - if (tp->snd_dupack > 0 && tp->t_phase < TP_LOSSRECOV) + + /* Advance the unacknowledged pointer. */ + if (acked > 0) + tp->snd_una += acked; + + /* Exit loss recovery phase. */ + if (SEQ_GEQ(tp->snd_una, tp->snd_recover)) { + tp->snd_pipe = 0; + tp->t_phase = TP_SENDING; + tcp_cc_post_fr(tp); + } + + /* Enter loss recovery phase. */ + if (tp->snd_dupack == 3 && tp->t_phase < TP_LOSSRECOV) { + tcp_cc_pre_fr(tp); /* updates ssthresh */ tp->t_phase = TP_LOSSRECOV; + tp->snd_recover = tp->snd_nxt; + tp->snd_rxmit = tp->snd_una; + tp->snd_pipe = tcp_sack_pipe(tp); + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd += 3 * tp->snd_mss; + } + + /* In loss recovery phase. 
*/ + if (tp->t_phase == TP_LOSSRECOV) { + tp->snd_pipe -= acked; + tp->snd_pipe -= sacked; + tp->snd_cwnd += tp->snd_mss; + if (acked) { + tp->snd_rxmit = tp->snd_una; + tp->snd_cwnd -= acked; + if (acked > tp->snd_mss) + tp->snd_cwnd += tp->snd_mss; + } + } KASSERT(SEQ_LT(tp->snd_una, tp->snd_nxt) || tp->snd_dupack == 0, ("%s: snd_dupack > 0 but snd_una == snd_nxt", __func__)); - /* - * Advance the unacknowledged pointer. - */ - if (acked > 0) - tp->snd_una += acked; - KASSERT(tp->snd_una == tp->snd_nxt || tcp_timer_active(tp, TT_REXMT), ("%s: outstanding data but REXMT timer not active", __func__)); ==== //depot/projects/tcp_new/netinet/tcp_output.c#14 (text+ko) ==== @@ -107,7 +107,7 @@ int optlen, int rwin, int flags); static int tcp_retransmit(struct tcpcb *tp, struct socket *so, struct tcpopt *to, u_char *opt, int *len, - int optlen, int rwin, int dlen, int flags); + int optlen, int rwin, int dlen, int slen, int flags); static int tcp_send_segments(struct tcpcb *tp, struct tcphdr *ths, u_char *opt, int off, int *olen, int optlen); static u_int tcp_rcv_wnd(struct tcpcb *tp, struct socket *so); @@ -146,7 +146,7 @@ { int flags, error, optlen = 0; tcp_win len; - int duna, swnd, cwnd, dlen, inflight, rwin; + int duna, swnd, cwnd, dlen, slen, inflight, rwin; int tcp_min_idle = 1; /* XXXAO */ struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; @@ -196,6 +196,7 @@ * swnd = remaining space in send window as advertised by remote end * cwnd = congestion window, remaing amount of data that can be in flight unacknowledged * dlen = remaing amount of data in send buffer available for sending + * slen = available amount of data that fits into send window * len = amount of data we have *and* can send righ now * * <- duna -><- swnd -> @@ -206,23 +207,12 @@ * snd_una snd_nxt * */ - duna = SEQ_DELTA(tp->snd_nxt, tp->snd_una); + duna = SEQ_DELTA(tp->snd_una, tp->snd_nxt); swnd = imax(0, tp->snd_wnd - duna); cwnd = imax(0, tp->snd_cwnd - duna); - dlen = 
min(so->so_snd.sb_cc - duna, swnd); - len = min(dlen, cwnd); - - /* - * XXXAO: todo token bucket, mss sized - * Retransmits should not fall under pacing limit - * and neither ACKs, window updates, etc. if there - * is no data pending. - */ - if (len > 0 && (tp->t_flags & TF_PACE)) { - len = tcp_snd_pace(tp, len); - if (len == 0) - return (0); /* next token is pending */ - } + dlen = so->so_snd.sb_cc - duna; + slen = min(dlen, swnd); + len = min(slen, cwnd); /* * Conservative approximation of data still travelling in the network. @@ -259,12 +249,17 @@ } break; case TP_SENDING: + /* + * Limited transmit: transmit new data upon the arrival of the + * first two consecutive duplicate ACKs. + * RFC3042: section 2 + */ + if (tp->snd_dupack > 0 && dlen > len && cwnd < tp->snd_mss) + len = min(slen, tp->snd_mss); /* up to one mss above cwnd */ break; case TP_LOSSRECOV: case TP_REXMT: - error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, flags); - if (len == 0) - return (0); + error = tcp_retransmit(tp, so, &to, &opt[0], &len, optlen, rwin, dlen, slen, flags); break; case TP_PERSIST: /* @@ -279,8 +274,8 @@ } if (swnd == 0 && duna > tp->snd_wnd) { /* - * Window shrank - * after we sent into it. If window shrank to 0, + * Window shrank after we sent into it. + * If window shrank to 0, * cancel pending retransmit, pull snd_nxt back * to (closed) window, and set the persist timer * if it isn't already going. If the window didn't @@ -298,6 +293,18 @@ } /* + * XXXAO: todo token bucket, mss sized + * Retransmits should not fall under pacing limit + * and neither ACKs, window updates, etc. if there + * is no data pending. + */ + if (len > 0 && (tp->t_flags & TF_PACE)) { + len = tcp_snd_pace(tp, len); + if (len == 0) + return (0); /* next token is pending */ + } + + /* * Send out a SYN immediatly. 
*/ if ((flags & TH_SYN) && !(tp->t_flags & TF_SENTSYN)) @@ -766,9 +773,9 @@ */ static int tcp_retransmit(struct tcpcb *tp, struct socket *so, struct tcpopt *to, - u_char *opt, int *len, int optlen, int rwin, int dlen, int flags) + u_char *opt, int *len, int optlen, int rwin, int dlen, int slen, int flags) { - int error, off, rlen = 0; + int error = 0, off, rlen = 0, rxmit; struct tcphdr ths, *th = &ths; /* @@ -793,62 +800,73 @@ * 5. TCP congestion window validation RFC2861 */ - /* - * Limited transmit: transmit new data upon the arrival of the - * first two consecutive duplicate ACKs. - * RFC3042: section 2 - */ - if (tp->snd_dupack < tcp_dupthresh && dlen > *len) { - *len = min(dlen, tp->snd_mss); /* up to one mss above cwnd */ - return (0); - } + do { + /* Calculate amount of data we may inject into the pipe (C). */ + rxmit = imax(0, tp->snd_cwnd - tp->snd_pipe); - /* - * Remember the highest byte sent yet - * and set snd_rxmit to snd_una. - */ - if (tp->snd_dupack == tcp_dupthresh) { - tp->snd_recover = tp->snd_nxt; - tp->snd_rxmit = tp->snd_una; - rlen = tcp_sack_firsthole(tp, &rexmit); - } else { - rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, dlen); - } - - if (rlen == 0) - if (dlen) - *len = dlen; /* XXXAO: pipe! */ + if (!RB_EMPTY(&tp->snd_sackblocks)) { + /* + * Get the amount of consecutive data for retransmit. + * (C.1) modulo (C.3) + */ + if (tp->snd_rxmit == tp->snd_una) + rlen = tcp_sack_firsthole(tp, &rexmit); + else + rlen = tcp_sack_nextseg(tp, &tp->snd_rexmit, slen); + /* + * If we have nothing to retransmit, see if we can + * send some new data. + * (C.3) + */ + if (rlen == 0) + if (slen > 0 && (rxmit >= tp->snd_mss || + (rxmit >= slen && dlen == slen)) + *len = min(slen, rxmit); + else + *len = 0; + break; + } + /* + * Retransmit what we've got. 
+ * (C.1) + */ + if (rxmit >= rlen || (rlen > rxmit && rxmit > tp->snd_mss)) + rlen = min(rlen, pipe); + else + break; + } else if (tp->snd_rexmit == tp->snd_una) + rlen = min(tp->snd_mss, SEQ_DELTA(tp->snd_una, tp->snd_nxt)); else - *len = 0; - return (0); - } else - rlen = min(rlen, pipe); /* XXXAO: pipe! */ + break; + /* + * Fill in headers. + */ + th->th_win = (u_short)rwin; + th->th_seq = tp->snd_rxmit; + th->th_flags = flags; + th->th_ack = tp->rcv_nxt; - /* - * Fill in headers. - */ - th->th_win = (u_short)rwin; - th->th_seq = tp->snd_rxmit; - th->th_flags = flags; - th->th_ack = tp->rcv_nxt; + /* + * If resending a SYN or FIN, be sure NOT to use a new sequence number. + */ + if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN)) + th->th_seq--; + if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && + th->th_seq == tp->snd_nxt) + th->th_seq--; + + SOCKBUF_LOCK(&so->so_snd); + off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc); + error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen); + SOCKBUF_UNLOCK(&so->so_snd); - /* - * If resending a SYN or FIN, be sure NOT to use a new sequence number. - */ - if ((flags & TH_SYN) && (tp->t_flags & TF_SENTSYN)) - th->th_seq--; - if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) && - th->th_seq == tp->snd_nxt) - th->th_seq--; - - SOCKBUF_LOCK(&so->so_snd); - off = min(tp->snd_rxmit - tp->snd_una, so->so_snd.sb_cc); - error = tcp_send_segments(tp, &ths, opt, off, &rlen, optlen); - SOCKBUF_UNLOCK(&so->so_snd); + /* Start from here the next time. */ + tp->snd_rxmit += rlen; + /* Increase amount of data in the 'pipe' (C.4). */ + tp->snd_pipe += rlen; - /* Start from here the next time. */ - tp->snd_rxmit += rlen; + } while (error == 0); return (error); } ==== //depot/projects/tcp_new/netinet/tcp_sack.c#10 (text+ko) ==== @@ -254,9 +254,10 @@ /* * D-SACK, was a duplicate retransmit. * RFC2883: section 5 - * XXXAO: Adjust pipe. + * XXXAO: Adjust pipe for data that has left the network. 
*/ if (i == 0 && SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end) <= tp->snd_mss) { + tp->snd_pipe -= SEQ_DELTA(sack.tsb_blk.start, sack.tsb_blk.end); //TCPSTAT_INC(); } continue; @@ -391,6 +392,34 @@ return (len); } +/* + * Calculate the number of bytes assumed to be in the 'pipe'. + * Instead of counting all the bytes from snd_una up to snd_nxt + * we start from the highest sackblock and work our way down. + * The calculated number is only valid when snd_una == snd_rxmit. + */ +int +tcp_sack_pipe(struct tcpcb *tp) +{ + int pipe = 0, blocks = 0, sacked = 0; + tcp_seq prev; + struct tcp_sack_block *tsb; + + prev = tp->snd_nxt; + + RB_FOREACH_REVERSE(tsb, tcp_sackblocks, &tp->snd_sackblocks) { + pipe += SEQ_DELTA(tsb->tsb_blk.end, prev); + sacked += SEQ_DELTA(tsb->tsb_blk.start, tsb->tsb_blk.end); + if (sacked > 3 * tp->snd_mss) + break; + if (blocks++ > 2) + break; + prev = tsb->tsb_blk.start; + } + + return (pipe); +} + #ifdef DDB static void db_print_sackblocks(struct tcpcb *tp) ==== //depot/projects/tcp_new/netinet/tcp_var.h#14 (text+ko) ==== @@ -173,7 +173,6 @@ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_rxmit; /* from where to retransmit */ - tcp_seq snd_inflight; /* estimate of data currently in the network (~SACK) */ u_int snd_maxburst; /* maximum send burst length */ tcp_seq snd_up; /* send urgent pointer */ @@ -188,6 +187,7 @@ int snd_dupack; /* number of duplicate ACK's reveived */ tcp_seq snd_recover; /* fast retransmit recover */ + int snd_pipe; /* bytes assumed to be inflight in the pipe */ int snd_abcack; /* count the ack'ed data for ABC */ tcp_seq snd_rtseq; /* seq# of current RTT measurement */ @@ -647,6 +647,7 @@ void tcp_sack_flush(struct tcpcb *); void tcp_sack_init(void); int tcp_sack_nextseg(struct tcpcb *, tcp_seq *); +int tcp_sack_pipe(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); u_long tcp_seq_subtract(u_long, u_long );