Date: Fri, 16 May 2008 14:09:32 GMT From: Andre Oppermann <andre@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 141720 for review Message-ID: <200805161409.m4GE9WTs009000@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=141720 Change 141720 by andre@andre_flirtbox on 2008/05/16 14:08:46 Remove TCP inflight code. Anything like this belongs into the congestion control algrithms, not into a separate things. On top of it some assumptions of this code are not true anymore. Affected files ... .. //depot/projects/tcp_new/netinet/tcp.h#2 edit .. //depot/projects/tcp_new/netinet/tcp_input.c#4 edit .. //depot/projects/tcp_new/netinet/tcp_output.c#2 edit .. //depot/projects/tcp_new/netinet/tcp_subr.c#2 edit .. //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 edit .. //depot/projects/tcp_new/netinet/tcp_var.h#2 edit .. //depot/projects/tcp_reass/netinet/tcp_output.c#9 edit Differences ... ==== //depot/projects/tcp_new/netinet/tcp.h#2 (text+ko) ==== @@ -210,10 +210,9 @@ /* FreeBSD extensions to tcp_info. */ u_int32_t tcpi_snd_wnd; /* Advertised send window. */ - u_int32_t tcpi_snd_bwnd; /* Bandwidth send window. */ /* Padding to grow without breaking ABI. */ - u_int32_t __tcpi_pad[32]; /* Padding. */ + u_int32_t __tcpi_pad[36]; /* Padding. */ }; #endif ==== //depot/projects/tcp_new/netinet/tcp_input.c#4 (text+ko) ==== @@ -2525,8 +2525,6 @@ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); tcpstat.tcps_usedssthresh++; } - if (metrics.rmx_bandwidth) - tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Check the interface for TSO capabilities. ==== //depot/projects/tcp_new/netinet/tcp_output.c#2 (text+ko) ==== @@ -200,7 +200,6 @@ sendalot = 0; off = tp->snd_nxt - tp->snd_una; sendwin = min(tp->snd_wnd, tp->snd_cwnd); - sendwin = min(sendwin, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; /* ==== //depot/projects/tcp_new/netinet/tcp_subr.c#2 (text+ko) ==== @@ -161,39 +161,6 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); -/* - * TCP bandwidth limiting sysctls. Note that the default lower bound of - * 1024 exists only for debugging. A good production default would be - * something like 6100. - */ -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, - "TCP inflight data limiting"); - -static int tcp_inflight_enable = 1; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, - &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); - -static int tcp_inflight_debug = 0; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, - &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); - -static int tcp_inflight_rttthresh; -SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, - &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", - "RTT threshold below which inflight will deactivate itself"); - -static int tcp_inflight_min = 6144; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, - &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); - -static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, - &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); - -static int tcp_inflight_stab = 20; -SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, - &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); - uma_zone_t sack_hole_zone; static struct inpcb *tcp_notify(struct inpcb *, int); @@ -265,7 +232,6 @@ if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_rexmit_slop = TCPTV_CPU_VAR; - tcp_inflight_rttthresh = TCPTV_INFLIGHT_RTTTHRESH; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; INP_INFO_LOCK_INIT(&tcbinfo, "tcp"); @@ -623,7 +589,6 @@ tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; tp->t_bw_rtttime = ticks; @@ -734,7 +699,6 @@ metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; /* XXX: This wraps if the pipe is more than 4 Gbit per second */ - metrics.rmx_bandwidth = tp->snd_bandwidth; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; @@ -1696,153 +1660,6 @@ } #endif /* IPSEC */ -/* - * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING - * - * This code attempts to calculate the bandwidth-delay product as a - * means of determining the optimal window size to maximize bandwidth, - * minimize RTT, and avoid the over-allocation of buffers on interfaces and - * routers. This code also does a fairly good job keeping RTTs in check - * across slow links like modems. We implement an algorithm which is very - * similar (but not meant to be) TCP/Vegas. The code operates on the - * transmitter side of a TCP connection and so only effects the transmit - * side of the connection. - * - * BACKGROUND: TCP makes no provision for the management of buffer space - * at the end points or at the intermediate routers and switches. A TCP - * stream, whether using NewReno or not, will eventually buffer as - * many packets as it is able and the only reason this typically works is - * due to the fairly small default buffers made available for a connection - * (typicaly 16K or 32K). As machines use larger windows and/or window - * scaling it is now fairly easy for even a single TCP connection to blow-out - * all available buffer space not only on the local interface, but on - * intermediate routers and switches as well. NewReno makes a misguided - * attempt to 'solve' this problem by waiting for an actual failure to occur, - * then backing off, then steadily increasing the window again until another - * failure occurs, ad-infinitum. This results in terrible oscillation that - * is only made worse as network loads increase and the idea of intentionally - * blowing out network buffers is, frankly, a terrible way to manage network - * resources. - * - * It is far better to limit the transmit window prior to the failure - * condition being achieved. There are two general ways to do this: First - * you can 'scan' through different transmit window sizes and locate the - * point where the RTT stops increasing, indicating that you have filled the - * pipe, then scan backwards until you note that RTT stops decreasing, then - * repeat ad-infinitum. This method works in principle but has severe - * implementation issues due to RTT variances, timer granularity, and - * instability in the algorithm which can lead to many false positives and - * create oscillations as well as interact badly with other TCP streams - * implementing the same algorithm. - * - * The second method is to limit the window to the bandwidth delay product - * of the link. This is the method we implement. RTT variances and our - * own manipulation of the congestion window, bwnd, can potentially - * destabilize the algorithm. For this reason we have to stabilize the - * elements used to calculate the window. We do this by using the minimum - * observed RTT, the long term average of the observed bandwidth, and - * by adding two segments worth of slop. It isn't perfect but it is able - * to react to changing conditions and gives us a very stable basis on - * which to extend the algorithm. - */ -void -tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq) -{ - u_long bw; - u_long bwnd; - int save_ticks; - - INP_LOCK_ASSERT(tp->t_inpcb); - - /* - * If inflight_enable is disabled in the middle of a tcp connection, - * make sure snd_bwnd is effectively disabled. - */ - if (tcp_inflight_enable == 0 || tp->t_rttlow < tcp_inflight_rttthresh) { - tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->snd_bandwidth = 0; - return; - } - - /* - * Figure out the bandwidth. Due to the tick granularity this - * is a very rough number and it MUST be averaged over a fairly - * long period of time. XXX we need to take into account a link - * that is not using all available bandwidth, but for now our - * slop will ramp us up if this case occurs and the bandwidth later - * increases. - * - * Note: if ticks rollover 'bw' may wind up negative. We must - * effectively reset t_bw_rtttime for this case. - */ - save_ticks = ticks; - if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1) - return; - - bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz / - (save_ticks - tp->t_bw_rtttime); - tp->t_bw_rtttime = save_ticks; - tp->t_bw_rtseq = ack_seq; - if (tp->t_bw_rtttime == 0 || (int)bw < 0) - return; - bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4; - - tp->snd_bandwidth = bw; - - /* - * Calculate the semi-static bandwidth delay product, plus two maximal - * segments. The additional slop puts us squarely in the sweet - * spot and also handles the bandwidth run-up case and stabilization. - * Without the slop we could be locking ourselves into a lower - * bandwidth. - * - * Situations Handled: - * (1) Prevents over-queueing of packets on LANs, especially on - * high speed LANs, allowing larger TCP buffers to be - * specified, and also does a good job preventing - * over-queueing of packets over choke points like modems - * (at least for the transmit side). - * - * (2) Is able to handle changing network loads (bandwidth - * drops so bwnd drops, bandwidth increases so bwnd - * increases). - * - * (3) Theoretically should stabilize in the face of multiple - * connections implementing the same algorithm (this may need - * a little work). - * - * (4) Stability value (defaults to 20 = 2 maximal packets) can - * be adjusted with a sysctl but typically only needs to be - * on very slow connections. A value no smaller then 5 - * should be used, but only reduce this default if you have - * no other choice. - */ -#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2) - bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10; -#undef USERTT - - if (tcp_inflight_debug > 0) { - static int ltime; - if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) { - ltime = ticks; - printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n", - tp, - bw, - tp->t_rttbest, - tp->t_srtt, - bwnd - ); - } - } - if ((long)bwnd < tcp_inflight_min) - bwnd = tcp_inflight_min; - if (bwnd > tcp_inflight_max) - bwnd = tcp_inflight_max; - if ((long)bwnd < tp->t_maxseg * 2) - bwnd = tp->t_maxseg * 2; - tp->snd_bwnd = bwnd; -} - #ifdef TCP_SIGNATURE /* * Callback function invoked by m_apply() to digest TCP segment data ==== //depot/projects/tcp_new/netinet/tcp_usrreq.c#2 (text+ko) ==== @@ -1230,7 +1230,6 @@ */ ti->tcpi_rcv_space = tp->rcv_wnd; ti->tcpi_snd_wnd = tp->snd_wnd; - ti->tcpi_snd_bwnd = tp->snd_bwnd; } /* @@ -1814,13 +1813,12 @@ tp->rcv_adv, tp->rcv_wnd, tp->rcv_up); db_print_indent(indent); - db_printf("snd_wnd: %lu snd_cwnd: %lu snd_bwnd: %lu\n", - tp->snd_wnd, tp->snd_cwnd, tp->snd_bwnd); + db_printf("snd_wnd: %lu snd_cwnd: %lu, + tp->snd_wnd, tp->snd_cwnd); db_print_indent(indent); - db_printf("snd_ssthresh: %lu snd_bandwidth: %lu snd_recover: " - "0x%08x\n", tp->snd_ssthresh, tp->snd_bandwidth, - tp->snd_recover); + db_printf("snd_ssthresh: %lu snd_recover: " + "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); db_printf("t_maxopd: %u t_rcvtime: %lu t_startime: %lu\n", ==== //depot/projects/tcp_new/netinet/tcp_var.h#2 (text+ko) ==== @@ -144,12 +144,10 @@ u_long snd_wnd; /* send window */ u_long snd_cwnd; /* congestion-controlled window */ - u_long snd_bwnd; /* bandwidth-controlled window */ u_long snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - u_long snd_bandwidth; /* calculated bandwidth or 0 */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_maxopd; /* mss plus options */ @@ -561,7 +559,6 @@ void tcp_timer_activate(struct tcpcb *, int, u_int); int tcp_timer_active(struct tcpcb *, int); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); -void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq); /* * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) */ ==== //depot/projects/tcp_reass/netinet/tcp_output.c#9 (text+ko) ====
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200805161409.m4GE9WTs009000>