Date: Fri, 16 Aug 2002 19:33:42 -0700 (PDT)
From: Matthew Dillon <dillon@apollo.backplane.com>
To: freebsd-hackers@FreeBSD.ORG, freebsd-net@FreeBSD.ORG
Subject: Commit schedule for bandwidth delay product pipeline limiting for TCP
Message-ID: <200208170233.g7H2XgqG047569@apollo.backplane.com>
References: <200207200103.g6K135Ap081155@apollo.backplane.com> <3D3AB5AF.F2F637C3@pipeline.ch> <200207211747.g6LHlKHv003686@apollo.backplane.com>
next in thread | previous in thread | raw e-mail | index | archive | help
Well, I'm back from vacation. I see nobody in the general group has
commented much on my bandwidth delay product code. A couple of people
have corresponded with me in email and generally the response is
positive.
Since this code must be enabled via a sysctl I feel it is safe to
commit it to -current. I also intend to MFC it to -stable prior
to the freeze (MFC after: 1 week). I believe that we can eventually
enable the sysctl by default.
I intend to commit this code on Saturday (tomorrow). I've included the
patch set below for those who need a reminder of what this is. Generally
speaking this code is very similar to, though not intended to duplicate,
the algorithm described by the TCP Vegas paper. I will also commit
manual page updates to tcp(4) and tuning(7) to describe the effects
of the sysctls.
-Matt
Index: netinet/tcp_input.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.169
diff -u -r1.169 tcp_input.c
--- netinet/tcp_input.c 15 Aug 2002 18:51:26 -0000 1.169
+++ netinet/tcp_input.c 17 Aug 2002 02:24:01 -0000
@@ -1018,6 +1018,7 @@
else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ tcp_xmit_bandwidth_limit(tp, th->th_ack);
acked = th->th_ack - tp->snd_una;
tcpstat.tcps_rcvackpack++;
tcpstat.tcps_rcvackbyte += acked;
@@ -1819,6 +1820,7 @@
tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ tcp_xmit_bandwidth_limit(tp, th->th_ack);
/*
* If all outstanding data is acked, stop retransmit
@@ -2445,6 +2447,8 @@
delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
if ((tp->t_rttvar += delta) <= 0)
tp->t_rttvar = 1;
+ if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
} else {
/*
* No rtt measurement yet - use the unsmoothed rtt.
@@ -2453,6 +2457,7 @@
*/
tp->t_srtt = rtt << TCP_RTT_SHIFT;
tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
}
tp->t_rtttime = 0;
tp->t_rxtshift = 0;
@@ -2592,6 +2597,7 @@
if (rt->rt_rmx.rmx_locks & RTV_RTT)
tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
tcpstat.tcps_usedrtt++;
if (rt->rt_rmx.rmx_rttvar) {
tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
Index: netinet/tcp_output.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v
retrieving revision 1.67
diff -u -r1.67 tcp_output.c
--- netinet/tcp_output.c 12 Aug 2002 03:22:46 -0000 1.67
+++ netinet/tcp_output.c 17 Aug 2002 02:24:01 -0000
@@ -168,6 +168,7 @@
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
win = min(tp->snd_wnd, tp->snd_cwnd);
+ win = min(win, tp->snd_bwnd);
flags = tcp_outflags[tp->t_state];
/*
@@ -780,7 +781,7 @@
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
- * not currently timing anything.
+ * not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = ticks;
Index: netinet/tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.140
diff -u -r1.140 tcp_subr.c
--- netinet/tcp_subr.c 1 Aug 2002 03:54:43 -0000 1.140
+++ netinet/tcp_subr.c 17 Aug 2002 02:24:01 -0000
@@ -146,6 +146,32 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
&tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+static int tcp_inflight_enable = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
+ &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int tcp_inflight_debug = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
+ &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int tcp_inflight_min = 1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
+ &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
+ &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+#if 0
+static int tcp_inflight_attack = 20;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_attack, CTLFLAG_RW,
+ &tcp_inflight_attack, 0, "TCP inflight compensation attack rate (%)");
+
+static int tcp_inflight_shift = 0;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_shift, CTLFLAG_RW,
+ &tcp_inflight_shift, 0, "TCP inflight compensation shift (+/-100) ");
+#endif
+
static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -566,8 +592,10 @@
tp->t_rttmin = tcp_rexmit_min;
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
+ tp->t_bw_rtttime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -1531,3 +1559,101 @@
tcp_cleartaocache()
{
}
+
+/*
+ * This code attempts to calculate the bandwidth-delay product.
+ * The problem with calculating this product is that our manipulation
+ * of the congestion window modifies both the perceived bandwidth
+ * and the srtt. It is possible to get a fairly stable maximal
+ * bandwidth by increasing the congestion window. The bandwidth
+ * calculation will be fairly good even if bwnd is set very high.
+ * However, figuring out the minimal srtt is far more difficult
+ * because we do not want the TCP stream to suffer greatly and therefore
+ * cannot reduce the congestion window to something very small.
+ *
+ * What we do is first increase the congestion window to try to
+ * obtain a maximal (or at least a 'larger') bandwidth, then decrease
+ * the congestion window to try to obtain a minimal (or at least a 'smaller')
+ * rtt. We also have to detect the case where BWND is too high and
+ * neither increasing nor decreasing it has the desired effect on the
+ * calculation. By detecting this special case we can stabilize the
+ * algorithm and recalculate bwnd within a reasonable period of time.
+ */
+void
+tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
+{
+ u_long bw;
+ u_long bwnd;
+ int save_ticks;
+
+ /*
+ * If inflight_enable is disabled in the middle of a tcp connection,
+ * make sure snd_bwnd is effectively disabled.
+ */
+ if (tcp_inflight_enable == 0) {
+ tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->snd_bandwidth = 0;
+ return;
+ }
+
+ /*
+ * Figure out the bandwidth. Due to the tick granularity this
+ * is a very rough number and it MUST be averaged over a fairly
+ * long period of time.
+ */
+ save_ticks = ticks;
+ if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+ return;
+
+ bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
+ (save_ticks - tp->t_bw_rtttime);
+ tp->t_bw_rtttime = save_ticks;
+ tp->t_bw_rtseq = ack_seq;
+ if (tp->t_bw_rtttime == 0)
+ return;
+ bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
+
+ tp->snd_bandwidth = bw;
+
+ /*
+ * Calculate the semi-static bandwidth delay product, plus two maximal
+ * segments. The additional slop puts us squarely in the sweet
+ * spot and also handles the bandwidth run-up case. Without the
+ * slop we could be locking ourselves into a lower bandwidth.
+ *
+ * Situations Handled:
+ * (1) prevents over-queueing of packets on LANs, especially
+ * high speed LANs, allowing larger TCP buffers to be
+ * specified.
+ *
+ * (2) able to handle increased network loads (bandwidth drops
+ * so bwnd drops).
+ *
+ * (3) Randomly changes the window size in order to force
+ * bandwidth balancing between connections.
+ */
+#define USERTT ((tp->t_srtt + tp->t_rttbest) / 2)
+ bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + 2 * tp->t_maxseg;
+
+ if (tcp_inflight_debug > 0) {
+ static int ltime;
+ if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
+ ltime = ticks;
+ printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+ tp,
+ bw,
+ tp->t_rttbest,
+ tp->t_srtt,
+ bwnd
+ );
+ }
+ }
+ if ((long)bwnd < tcp_inflight_min)
+ bwnd = tcp_inflight_min;
+ if (bwnd > tcp_inflight_max)
+ bwnd = tcp_inflight_max;
+ if ((long)bwnd < tp->t_maxseg * 2)
+ bwnd = tp->t_maxseg * 2;
+ tp->snd_bwnd = bwnd;
+}
+
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.79
diff -u -r1.79 tcp_usrreq.c
--- netinet/tcp_usrreq.c 29 Jul 2002 09:01:39 -0000 1.79
+++ netinet/tcp_usrreq.c 17 Aug 2002 02:24:01 -0000
@@ -875,6 +875,7 @@
tp->t_state = TCPS_SYN_SENT;
callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
/*
@@ -961,6 +962,7 @@
tp->t_state = TCPS_SYN_SENT;
callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
tcp_sendseqinit(tp);
/*
Index: netinet/tcp_var.h
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v
retrieving revision 1.82
diff -u -r1.82 tcp_var.h
--- netinet/tcp_var.h 19 Jul 2002 18:27:39 -0000 1.82
+++ netinet/tcp_var.h 21 Jul 2002 02:26:36 -0000
@@ -124,10 +124,12 @@
u_long snd_wnd; /* send window */
u_long snd_cwnd; /* congestion-controlled window */
+ u_long snd_bwnd; /* bandwidth-controlled window */
u_long snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
+ u_long snd_bandwidth; /* calculated bandwidth or 0 */
tcp_seq snd_recover; /* for use in fast recovery */
u_int t_maxopd; /* mss plus options */
@@ -137,6 +139,9 @@
int t_rtttime; /* round trip time */
tcp_seq t_rtseq; /* sequence number being timed */
+ int t_bw_rtttime; /* used for bandwidth calculation */
+ tcp_seq t_bw_rtseq; /* used for bandwidth calculation */
+
int t_rxtcur; /* current retransmit value (ticks) */
u_int t_maxseg; /* maximum segment size */
int t_srtt; /* smoothed round-trip time */
@@ -144,6 +149,7 @@
int t_rxtshift; /* log(2) of rexmt exp. backoff */
u_int t_rttmin; /* minimum rtt allowed */
+ u_int t_rttbest; /* best rtt we've seen */
u_long t_rttupdated; /* number of times rtt sampled */
u_long max_sndwnd; /* largest window peer has offered */
@@ -473,6 +479,7 @@
struct tcpcb *
tcp_timers(struct tcpcb *, int);
void tcp_trace(int, int, struct tcpcb *, void *, struct tcphdr *, int);
+void tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq);
void syncache_init(void);
void syncache_unreach(struct in_conninfo *, struct tcphdr *);
int syncache_expand(struct in_conninfo *, struct tcphdr *,
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-net" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200208170233.g7H2XgqG047569>
