Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 20 Sep 2002 17:53:49 -0700 (PDT)
From:      Julian Elischer <julian@elischer.org>
To:        net@freebsd.org
Subject:   Tcp question.
Message-ID:  <Pine.BSF.4.21.0209201718100.21069-100000@InterJet.elischer.org>

next in thread | raw e-mail | index | archive | help


OK so I have 3 machines:


A------router--------B-------router--------C


If I send data from B to A I see 7MB/sec.
If I send data from B to C I see 700KB/sec.

tcpdump shows some odd behaviour in the slow link:
(tcpdump run from (B))

The initial negotiation appears as (DAL-DMZ-IFACE is B, DAL-DMZ-HOST is C):
000000 DAL-DMZ-IFACE.916 > DAL-DMZ-HOST.ssh: S [tcp sum ok] 
   1643471912:1643471912(0) win 16384 
   <mss 1460,nop,wscale 0,nop,nop,timestamp 16260087 0>
   (DF) (ttl 64, id 5122, len 60)
000554 DAL-DMZ-HOST.ssh > DAL-DMZ-IFACE.916: S [tcp sum ok]
   2813576059:2813576059(0) ack 1643471913 win 24624 
   <nop,nop,timestamp 259781373 16260087,nop,wscale 0,mss 1380>
   (DF) (ttl 64, id 18212, len 60)
000496 DAL-DMZ-IFACE.916 > DAL-DMZ-HOST.ssh: . [tcp sum ok]
   ack 1 win 16416 
   <nop,nop,timestamp 16260087 259781373> (DF) (ttl 64, id 5123, len 52)
014218 DAL-DMZ-HOST.ssh > DAL-DMZ-IFACE.916: P [tcp sum ok]
   1:29(28) ack 1 win 24624 
   <nop,nop,timestamp 259781374 16260087> (DF) (ttl 64, id 18213, len 80)

A snapshot once the transfer has started shows:

003589 C.ssh > B.916: P 3725:3769(44) ack 60722 win 24624 
  nop,nop,timestamp 259781841 16260556> (DF)
  (ttl 64, id 18250, len 96)
001155 B.916 > C.ssh: . 60722:62090(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5212, len 1420)
000014 B.916 > C.ssh: . 62090:63458(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5213, len 1420)
000014 B.916 > C.ssh: . 63458:64826(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5214, len 1420)
000012 B.916 > C.ssh: . 64826:66194(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5215, len 1420)
000026 B.916 > C.ssh: . 66194:67562(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5216, len 1420)
000014 B.916 > C.ssh: . 67562:68930(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5217, len 1420)
000013 B.916 > C.ssh: . 68930:70298(1368) ack 3769 win 16416 
  nop,nop,timestamp 16260556 259781841> (DF)
  (ttl 64, id 5218, len 1420)
000765 C.ssh > B.916: . [tcp sum ok] ack 63458 win 24624 
  nop,nop,timestamp 259781842 16260556> (DF)
  (ttl 64, id 18251, len 52)
000221 C.ssh > B.916: . [tcp sum ok] ack 66194 win 24624 
  nop,nop,timestamp 259781842 16260556> (DF)
  (ttl 64, id 18252, len 52)
**why wait here**?
003030 C.ssh > B.916: . [tcp sum ok] ack 68930 win 24624 
  nop,nop,timestamp 259781842 16260556> (DF)
  (ttl 64, id 18253, len 52)
000279 C.ssh > B.916: P 3769:3813(44) ack 70298 win 24624 
  nop,nop,timestamp 259781842 16260556> (DF)
  (ttl 64, id 18254, len 96)
000711 B.916 > C.ssh: P 70298:70810(512) ack 3813 win 16372 
  nop,nop,timestamp 16260557 259781842> (DF)
  (ttl 64, id 5219, len 564)

My question is:
"Why, when the acks have come back for up to offset 66194,
does the sender not start sending more data?" I might also add that I don't see
why it stopped sending in the first place, because the window is NOT full.

Contrast this with the transfer from B to A  (TEN times the throughput)

initial setup packets:
000000 B.914 > A.ssh: S [tcp sum ok]
   2201265429:2201265429(0) win 16384
   <mss 1460,nop,wscale 0,nop,nop,timestamp 16265986 0>
   (DF) (ttl 64, id 20682, len 60)
000359 A.ssh > B.914: S [tcp sum ok]
   434721721:434721721(0) ack 2201265430 win 17376
   <mss 1460,nop,wscale 0,nop,nop,timestamp 891011454 16265986>
   (DF) (ttl 62, id 13590, len 60)
000048 B.914 > A.ssh: . [tcp sum ok] ack 1 win 17376
   <nop,nop,timestamp 16265986 891011454> (DF)
   (ttl 64, id 20683, len 52)
002443 A.ssh > B.914: P 1:55(54) ack 1 win 17376
   <nop,nop,timestamp 891011454 16265986> (DF)
   (ttl 62, id 13593, len 106)

Data once the transfer is under way:

000233 A.ssh > B.914: . [tcp sum ok] ack 185678 win 5256
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13716, len 52)
000240 A.ssh > B.914: . [tcp sum ok] ack 188574 win 2360
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13717, len 52)
000218 A.ssh > B.914: . [tcp sum ok] ack 190022 win 17296
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13718, len 52)
000069 B.914 > A.ssh: P 190022:191470(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20841, len 1500)
000654 B.914 > A.ssh: . 191470:192918(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20842, len 1500)
000022 B.914 > A.ssh: . 192918:194366(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20843, len 1500)
000021 B.914 > A.ssh: . 194366:195814(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20844, len 1500)
000012 B.914 > A.ssh: . 195814:197262(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20845, len 1500)
000021 B.914 > A.ssh: . 197262:198710(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20846, len 1500)
000024 B.914 > A.ssh: . 198710:200158(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20847, len 1500)
000012 B.914 > A.ssh: . 200158:201606(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20848, len 1500)
000021 B.914 > A.ssh: . 201606:203054(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20849, len 1500)
000023 B.914 > A.ssh: . 203054:204502(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20850, len 1500)
000019 B.914 > A.ssh: . 204502:205950(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20851, len 1500)
000640 A.ssh > B.914: . [tcp sum ok] ack 192918 win 14400
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13719, len 52)
000240 A.ssh > B.914: . [tcp sum ok] ack 195814 win 11504
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13720, len 52)
000227 A.ssh > B.914: . [tcp sum ok] ack 198710 win 8608
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13721, len 52)
000121 A.ssh > B.914: . [tcp sum ok] ack 200158 win 17376
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13722, len 52)
000021 B.914 > A.ssh: . 205950:207398(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20852, len 1500)
000011 B.914 > A.ssh: . 207398:208846(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20853, len 1500)
000012 B.914 > A.ssh: . 208846:210294(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20854, len 1500)
000013 B.914 > A.ssh: . 210294:211742(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20855, len 1500)
000029 B.914 > A.ssh: P 211742:213190(1448) ack 367 win 17376
   <nop,nop,timestamp 16266292 891011760> (DF) [tos 0x8] 
   (ttl 64, id 20856, len 1500)
000154 A.ssh > B.914: . [tcp sum ok] ack 203054 win 14480
   <nop,nop,timestamp 891011760 16266292> (DF) [tos 0x8] 
   (ttl 62, id 13723, len 52)


This is FreeBSD 4.4
plus the following TCP patch, which came from 4.5.

Index: sys/netinet/tcp_var.h
===================================================================
RCS file: /build/cvs/freebsd/src/sys/netinet/tcp_var.h,v
retrieving revision 1.56.2.8
diff -u -r1.56.2.8 tcp_var.h
--- sys/netinet/tcp_var.h	22 Aug 2001 00:59:13 -0000	1.56.2.8
+++ sys/netinet/tcp_var.h	7 Mar 2002 18:40:18 -0000
@@ -95,6 +95,7 @@
 #define	TF_SENDCCNEW	0x08000		/* send CCnew instead of CC in SYN */
 #define	TF_MORETOCOME	0x10000		/* More data to be appended to sock */
 #define	TF_LQ_OVERFLOW	0x20000		/* listen queue overflow */
+#define TF_RXWIN0SENT	0x40000		/* sent a receiver win 0 in response */
 	int	t_force;		/* 1 if forcing out a byte */
 
 	tcp_seq	snd_una;		/* send unacknowledged */
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /build/cvs/freebsd/src/sys/kern/uipc_socket.c,v
retrieving revision 1.68.2.16
diff -u -r1.68.2.16 uipc_socket.c
--- sys/kern/uipc_socket.c	14 Jun 2001 20:46:06 -0000	1.68.2.16
+++ sys/kern/uipc_socket.c	7 Mar 2002 18:37:50 -0000
@@ -910,6 +910,14 @@
 		    !sosendallatonce(so) && !nextrecord) {
 			if (so->so_error || so->so_state & SS_CANTRCVMORE)
 				break;
+			/*
+			 * The window might have closed to zero, make
+			 * sure we send an ack now that we've drained
+			 * the buffer or we might end up blocking until
+			 * the idle takes over (5 seconds).
+			 */
+			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
+				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 			error = sbwait(&so->so_rcv);
 			if (error) {
 				sbunlock(&so->so_rcv);
Index: sys/netinet/tcp_input.c
===================================================================
RCS file: /build/cvs/freebsd/src/sys/netinet/tcp_input.c,v
retrieving revision 1.107.2.16
diff -u -r1.107.2.16 tcp_input.c
--- sys/netinet/tcp_input.c	22 Aug 2001 00:59:12 -0000	1.107.2.16
+++ sys/netinet/tcp_input.c	7 Mar 2002 18:38:37 -0000
@@ -158,10 +158,15 @@
 #endif
 
 /*
- * Indicate whether this ack should be delayed.
+ * Indicate whether this ack should be delayed.  We can delay the ack if
+ *	- delayed acks are enabled and
+ *	- there is no delayed ack timer in progress and
+ *	- our last ack wasn't a 0-sized window.  We never want to delay
+ *	  the ack that opens up a 0-sized window.
  */
 #define DELAY_ACK(tp) \
-	(tcp_delack_enabled && !callout_pending(tp->tt_delack))
+	(tcp_delack_enabled && !callout_pending(tp->tt_delack) && \
+	(tp->t_flags & TF_RXWIN0SENT) == 0)
 
 static int
 tcp_reass(tp, th, tlenp, m)
@@ -840,7 +845,7 @@
 #endif
 			tp = intotcpcb(inp);
 			tp->t_state = TCPS_LISTEN;
-			tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT);
+			tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT|TF_NODELAY);
 
 			/* Compute proper scaling value from buffer space */
 			while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
Index: sys/netinet/tcp_output.c
===================================================================
RCS file: /build/cvs/freebsd/src/sys/netinet/tcp_output.c,v
retrieving revision 1.39.2.10
diff -u -r1.39.2.10 tcp_output.c
--- sys/netinet/tcp_output.c	7 Jul 2001 04:30:38 -0000	1.39.2.10
+++ sys/netinet/tcp_output.c	7 Mar 2002 18:45:18 -0000
@@ -116,7 +116,9 @@
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
 	int idle, sendalot;
+#if 0
 	int maxburst = TCP_MAXBURST;
+#endif
 	struct rmxp_tao *taop;
 	struct rmxp_tao tao_noncached;
 #ifdef INET6
@@ -268,28 +270,38 @@
 	win = sbspace(&so->so_rcv);
 
 	/*
-	 * Sender silly window avoidance.  If connection is idle
-	 * and can send all data, a maximum segment,
-	 * at least a maximum default-size segment do it,
-	 * or are forced, do it; otherwise don't bother.
-	 * If peer's buffer is tiny, then send
-	 * when window is at least half open.
-	 * If retransmitting (possibly after persist timer forced us
-	 * to send into a small window), then must resend.
+	 * Sender silly window avoidance.   We transmit under the following
+	 * conditions when len is non-zero:
+	 *
+	 *	- We have a full segment
+	 *	- This is the last buffer in a write()/send() and we are
+	 *	  either idle or running NODELAY
+	 *	- we've timed out (e.g. persist timer)
+	 *	- we have more then 1/2 the maximum send window's worth of
+	 *	  data (receiver may be limited the window size)
+	 *	- we need to retransmit
 	 */
 	if (len) {
 		if (len == tp->t_maxseg)
 			goto send;
-		if (!(tp->t_flags & TF_MORETOCOME) &&
-		    (idle || tp->t_flags & TF_NODELAY) &&
-		    (tp->t_flags & TF_NOPUSH) == 0 &&
-		    len + off >= so->so_snd.sb_cc)
+		/*
+		 * NOTE! on localhost connections an 'ack' from the remote
+		 * end may occur synchronously with the output and cause
+		 * us to flush a buffer queued with moretocome.  XXX
+		 *
+		 * note: the len + off check is almost certainly unnecessary.
+		 */
+		if (!(tp->t_flags & TF_MORETOCOME) &&	/* normal case */
+		    (idle || (tp->t_flags & TF_NODELAY)) &&
+		    len + off >= so->so_snd.sb_cc &&
+		    (tp->t_flags & TF_NOPUSH) == 0) {
 			goto send;
-		if (tp->t_force)
+		}
+		if (tp->t_force)			/* typ. timeout case */
 			goto send;
 		if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
 			goto send;
-		if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+		if (SEQ_LT(tp->snd_nxt, tp->snd_max))	/* retransmit case */
 			goto send;
 	}
 
@@ -688,6 +700,20 @@
 	if (win > (long)TCP_MAXWIN << tp->rcv_scale)
 		win = (long)TCP_MAXWIN << tp->rcv_scale;
 	th->th_win = htons((u_short) (win>>tp->rcv_scale));
+
+	/*
+	 * Adjust the RXWIN0SENT flag - indicate that we have advertised
+	 * a 0 window.  This may cause the remote transmitter to stall.  This
+	 * flag tells soreceive() to disable delayed acknowledgements when
+	 * draining the buffer.  This can occur if the receiver is attempting
+	 * to read more data then can be buffered prior to transmitting on
+	 * the connection.
+	 */
+	if (win == 0)
+		tp->t_flags |= TF_RXWIN0SENT;
+	else
+		tp->t_flags &= ~TF_RXWIN0SENT;
+
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		th->th_urp = htons((u_short)(tp->snd_up - tp->snd_nxt));
 		th->th_flags |= TH_URG;
@@ -912,7 +938,17 @@
 	tp->t_flags &= ~TF_ACKNOW;
 	if (tcp_delack_enabled)
 		callout_stop(tp->tt_delack);
+#if 0
+	/*
+	 * This completely breaks TCP if newreno is turned on.  What happens
+	 * is that if delayed-acks are turned on on the receiver, this code
+	 * on the transmitter effectively destroys the TCP window, forcing
+	 * it to four packets (1.5Kx4 = 6K window).
+	 */
 	if (sendalot && (!tcp_do_newreno || --maxburst))
+		goto again;
+#endif
+	if (sendalot)
 		goto again;
 	return (0);
 }








To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-net" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?Pine.BSF.4.21.0209201718100.21069-100000>