Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 15 Dec 2004 12:27:52 -0600 (CST)
From:      Dan Nelson <dnelson@allantgroup.com>
To:        FreeBSD-gnats-submit@FreeBSD.org
Cc:        Matthew Dillon <dillon@apollo.backplane.com>
Subject:   kern/75122: [PATCH] Incorrect inflight bandwidth calculation on first packet
Message-ID:  <200412151827.iBFIRqDB019997@dan.emsphone.com>
Resent-Message-ID: <200412151830.iBFIUW8I052812@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         75122
>Category:       kern
>Synopsis:       [PATCH] Incorrect inflight bandwidth calculation on first packet
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Dec 15 18:30:31 GMT 2004
>Closed-Date:
>Last-Modified:
>Originator:     Dan Nelson
>Release:        FreeBSD 5.3-STABLE i386
>Organization:
The Allant Group
>Environment:
System: FreeBSD dan.emsphone.com 5.3-STABLE FreeBSD 5.3-STABLE #383: Wed Dec 15 11:29:24 CST 2004 zsh@dan.emsphone.com:/usr/src/sys/i386/compile/DANSMP i386

	
>Description:

Matt, I'm CC'ing you because it looks like the bug is also in
Dragonfly, so I think you'll want to commit something similar.

The inflight window scaling algorithm keeps a decaying average of a
socket's bandwidth, but on the first call to tcp_xmit_bandwidth_limit,
the sequence number of the previous packet is not known, so the
(ack_seq - tp->t_bw_rtseq) clause just gives you the raw sequence
number, resulting in a random value for the calculated bandwidth.

Until enough packets have traveled so the average has decayed to the
correct value, the calculated window is large enough that it's not even
used.  On a dialup, for example, it never gets a chance.

	
>How-To-Repeat:
	

Run net.inet.tcp.hostcache.list, and take a look at the BANDWIDTH
column.  Note that there is a secondary bug in this sysctl output that
multiplies the bandwidth by 8, even though the original value is
already in octets/sec.

>Fix:

The minimal fix is to check for (tp->t_bw_rtttime == 0 || tp->t_bw_rtseq
== 0 || (int)bw < 0), and if it evalutes true, store the current values
and return.  I recommend committing the whole patch though.  It changed
the debugging output to print the instantaneous bw, and the current and
previous average bw.

Index: tcp_hostcache.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_hostcache.c,v
retrieving revision 1.7
diff -u -r1.7 tcp_hostcache.c
--- tcp_hostcache.c	16 Aug 2004 18:32:07 -0000	1.7
+++ tcp_hostcache.c	15 Dec 2004 17:42:02 -0000
@@ -175,7 +175,7 @@
      &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run");
 
 SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
-	CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
+	CTLTYPE_STRING | CTLFLAG_RD, 0, 0,
 	sysctl_tcp_hc_list, "A", "List of all hostcache entries");
 
 
@@ -676,7 +676,7 @@
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
 			    msec(hc_entry->rmx_rttvar *
 				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
-			    hc_entry->rmx_bandwidth * 8,
+			    hc_entry->rmx_bandwidth,
 			    hc_entry->rmx_cwnd,
 			    hc_entry->rmx_sendpipe,
 			    hc_entry->rmx_recvpipe,
Index: tcp_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v
retrieving revision 1.201.2.4
diff -u -r1.201.2.4 tcp_subr.c
--- tcp_subr.c	1 Dec 2004 05:37:28 -0000	1.201.2.4
+++ tcp_subr.c	15 Dec 2004 17:43:19 -0000
@@ -1907,7 +1907,7 @@
 void
 tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
 {
-	u_long bw;
+	u_long bw, avgbw;
 	u_long bwnd;
 	int save_ticks;
 
@@ -1933,18 +1933,26 @@
 	 * effectively reset t_bw_rtttime for this case.
 	 */
 	save_ticks = ticks;
-	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
+
+	/* Must wait at least 1 tick to calculate anything */
+	if (save_ticks == tp->t_bw_rtttime)
 		return;
 
 	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
 	    (save_ticks - tp->t_bw_rtttime);
+
+	/* If this is our first time through, or if something has wrapped, 
+	   record the current values and return */
+	if (tp->t_bw_rtttime == 0 || tp->t_bw_rtseq == 0 || (int)bw < 0) {	
+		tp->t_bw_rtttime = save_ticks;
+		tp->t_bw_rtseq = ack_seq;
+		return;
+	}
+
 	tp->t_bw_rtttime = save_ticks;
 	tp->t_bw_rtseq = ack_seq;
-	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
-		return;
-	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
 
-	tp->snd_bandwidth = bw;
+	avgbw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
 
 	/*
 	 * Calculate the semi-static bandwidth delay product, plus two maximal
@@ -1975,22 +1983,25 @@
 	 *	    no other choice.
 	 */
 #define USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
-	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
+	bwnd = (int64_t)avgbw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
 #undef USERTT
 
 	if (tcp_inflight_debug > 0) {
 		static int ltime;
 		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
 			ltime = ticks;
-			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
+			printf("%p bw %lu(%lu,%lu) rttbest %d srtt %d bwnd %ld\n",
 			    tp,
-			    bw,
+			    avgbw, tp->snd_bandwidth, bw,
 			    tp->t_rttbest,
 			    tp->t_srtt,
 			    bwnd
 			);
 		}
 	}
+	
+	tp->snd_bandwidth = avgbw;
+	
 	if ((long)bwnd < tcp_inflight_min)
 		bwnd = tcp_inflight_min;
 	if (bwnd > tcp_inflight_max)
>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200412151827.iBFIRqDB019997>