Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 16 Feb 2009 16:53:43 GMT
From:      Andre Oppermann <andre@FreeBSD.org>
To:        Perforce Change Reviews <perforce@freebsd.org>
Subject:   PERFORCE change 157800 for review
Message-ID:  <200902161653.n1GGrhNM085561@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=157800

Change 157800 by andre@andre_flirtbox on 2009/02/16 16:53:08

	Checkpoint WIP.

Affected files ...

.. //depot/projects/tcp_new/netinet/tcp_input.c#7 edit
.. //depot/projects/tcp_new/netinet/tcp_output.c#4 edit
.. //depot/projects/tcp_new/netinet/tcp_var.h#3 edit

Differences ...

==== //depot/projects/tcp_new/netinet/tcp_input.c#7 (text+ko) ====

@@ -179,20 +179,6 @@
 #endif
 
 /*
- * Indicate whether this ack should be delayed.  We can delay the ack if
- *	- there is no delayed ack timer in progress and
- *	- our last ack wasn't a 0-sized window.  We never want to delay
- *	  the ack that opens up a 0-sized window and
- *		- delayed acks are enabled or
- *		- this is a half-synchronized T/TCP connection.
- */
-#define DELAY_ACK(tp)							\
-	((!tcp_timer_active(tp, TT_DELACK) &&				\
-	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
-	    (tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
-
-
-/*
  * TCP input handling is split into multiple parts:
  *   tcp6_input is a thin wrapper around tcp_input for the extended
  *	ip6_protox[] call format in ip6_input
@@ -362,7 +348,7 @@
 		tcpstat.tcps_rcvbadoff++;
 		goto drop;
 	}
-	tlen -= off;	/* tlen is used instead of ti->ti_len */
+	tlen -= off;	/* tlen is used instead of th->th_len */
 	if (off > sizeof (struct tcphdr)) {
 		if (isipv6) {
 #ifdef INET6
@@ -932,9 +918,10 @@
 	 * discouraged to shrink the window.
 	 *  RFC793: section 3.7, page 42-44
 	 *  RFC1122: section 4.2.2.16
+	 *
+	 * XXXAO: Fix up.  rcv_wnd is an absolute pointer in seq space.
 	 */
-	rwin = sbspace(&so->so_rcv);
-	rwin = imax(rwin, (int)(tp->rcv_advwin - tp->rcv_nxt));
+	rwin = tp->rcv_wnd - tp->rcv_nxt;
 
 	/*
 	 * Validation checks on any incoming segment.
@@ -947,7 +934,7 @@
 	 * into established state and initializations of the timers.
 	 */
 	case TCPS_SYN_RECEIVED:
-		tp->t_starttime = tcp_uptime();
+		tp->t_starttime = time_uptime;
 		TCPS_TRANS(tp, TCPS_ESTABLISHED);
 		soisconnected(so);
 
@@ -963,7 +950,7 @@
 	 */
 	case TCPS_SYN_SENT:
 		/*
-		 * RST is handled separately below.
+		 * RST is handled separately below.
 		 *  RFC793: section 3.9, page 66-67, second check
 		 */
 		if (thflags & TH_RST)
@@ -1029,11 +1016,14 @@
 		 *  RFC793: section 3.1, page 18-19
 		 *  RFC1122: section 4.2.2.6
 		 *  RFC1191: section 3.1
+		 *
+		 * NB: MSS is computed twice.  Once when we send the initial
+		 * SYN and once when we get back the SYN-ACK.
 		 */
 		if (to.to_flags & TOF_MSS)
-			tcp_mss(tp, to.to_mss);
+			tp->snd_mss = tcp_mss(tptoinpinc(tp), to.to_mss, 0);
 		else
-			tcp_mss(tp, tcp_mssdflt);
+			tp->snd_mss = tcp_mss(tptoinpinc(tp), 0, 0);
 
 		/*
 		 * Do window scaling on this connection?
@@ -1129,7 +1119,7 @@
 		tp->snd_wu_ack = th->th_ack;
 		th->th_seq++;		/* SYN is acked */
 
-		tp->t_starttime = tcp_uptime();
+		tp->t_starttime = time_uptime;
 		TCPS_TRANS(tp, TCPS_ESTABLISHED);
 #ifdef MAC
 		SOCK_LOCK(so);
@@ -1218,7 +1208,7 @@
 		 *
 		 * We store the receive time as uptime with second
 		 * resolution.  This makes us independent from the
-		 * wrap-around after 2^32 / hz (24.8 days at 1ms hz).
+		 * wrap-around after 2^32 / 2 / hz (24.8 days at 1ms hz).
 		 *
 		 * XXXAO: Linux says PAWS is broken.  Analyze if true or not.
 		 * Retransmitted segments are not presented for further processing.
@@ -1425,6 +1415,7 @@
 		case TCPS_SYN_SENT:
 			/*
 			 * In TCPS_SYN_SENT the RST MUST carry the ACK flag.
+			 *  RFC793: section 3.4, page 37, Reset Processing
 			 *  RFC793: section 3.9, page 66, first check
 			 */
 			if (!(thflags & TH_ACK)) {
@@ -1434,12 +1425,17 @@
 			}
 
 			/*
-			 * The ACK must be within what we sent but does
-			 * not have to ACK the SYN.
+			 * The ACK must acknowledge the SYN and any data
+			 * we may have sent with the original SYN.
+			 *  RFC793: section 3.4, page 37, Reset Processing
 			 *  RFC793: section 3.9, page 66, first check
+			 *
+			 * NB: We accept ACKing the SYN w/o and with data
+			 * as some implementations refuse to ACK data in
+			 * a SYN.
 			 */
-			if (SEQ_LT(th->th_ack, tp->snd_una) ||
-			    SEQ_GT(th->th_ack, th->snd_nxt)) {
+			if (th->th_ack != tp->snd_una ||
+			    th->th_ack != th->snd_nxt) {
 				tcplog("RST does not match, segment ignored");
 				tcpstat.tcps_badrst++;
 				goto drop;
@@ -1735,6 +1731,8 @@
 	/*
 	 * Update send SACK information and tell us how much more
 	 * data has left the network (relative to last SACK we got).
+	 * XXXAO: Determine if there was a duplicate ACK going on
+	 * based on the changes of the SACK information.
 	 */
 	if ((to.to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))
 		sacked = tcp_sack_doack(tp, &to, th->th_ack);
@@ -1759,7 +1757,7 @@
 	/*
 	 * Update congestion control information.
 	 */
-	nudgeoutput = tcp_congest(tp, th, tiwin, acked, tlen, sacked);
+	nudgeoutput |= tcp_congest(tp, th, tiwin, acked, tlen, sacked);
 
 	/*
 	 * Drop acknowledged data from send socket buffer
@@ -1783,12 +1781,10 @@
 		 * data from the socket buffer.
 		 */
 		if (acked > so->so_snd.sb_cc) {
-			tp->snd_wnd -= so->so_snd.sb_cc;
 			sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
 			ourfinisacked = 1;
 		} else {
 			sbdrop_locked(&so->so_snd, acked);
-			tp->snd_wnd -= acked;
 			ourfinisacked = 0;
 		}
 
@@ -1896,6 +1892,19 @@
 			 * NB: Continue with segment.
 			 */
 		}
+
+		/*
+		 * Stop the retransmit timer if all data we sent
+		 * has been acknowledged.  Otherwise restart it
+		 * if we still have outstanding data.
+		 *
+		 * XXXAO: Refine the test.  The TF_NEEDFIN check may not be
+		 * enough.
+		 */
+		if (tp->snd_una == tp->snd_nxt && !(tp->t_flags & TF_NEEDFIN))
+			tcp_timer_activate(TT_RXMIT, 0);
+		else
+			tcp_timer_activate(TT_RXMIT, tp->snd_rto);
 	}
 
 	/*
@@ -1918,7 +1927,7 @@
 	 */
 	if ((thflags & TH_URG) && th->th_urp > 0 && tlen > 0 &&
 	    !TCPS_HAVERCVDFIN(tp->t_state)) {
-		tcp_do_urg(tp, th, tlen);
+		tcp_do_urg(tp, th, &tlen);
 	} else if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) {
 		tp->rcv_up = tp->rcv_nxt;
 	}
@@ -2090,6 +2099,11 @@
 				}
 
 				/*
+				 * Update size of receive window.
+				 */
+				tp->rcv_wnd = sbspace(so->so_rcv);
+
+				/*
 				 * NB: sorwakeup_locked implicitly unlocks.
 				 */
 				sorwakeup_locked(so);
@@ -2249,12 +2263,15 @@
 	 *
 	 * XXXAO: Multi-delack?
 	 */
-	if (nudgeoutput || (tp->t_flags & TF_ACKNOW))
+	if ((tp->t_flags & TF_ACKNOW) || tp->snd_delack > 1 ||
+	    nudgeoutput || (tp->t_flags & TF_RXWIN0SENT) ||
+	    !tcp_delack_enabled) {
 		(void) tcp_output(tp);
-	else if (tp->t_flags & TF_DELACK) {
-		tp->t_flags &= ~TF_DELACK;
+	} else if (SEQ_GT(tp->rcv_nxt, tp->snd_lastack)) {
+		tp->snd_delack++;
 		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 	}
+
 	INP_UNLOCK(tp->t_inpcb);
 	return;
 
@@ -2374,7 +2391,7 @@
  * XXXAO: Report violations of the options specs.
  */
 static void
-tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
+tcp_do_options(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int opt, optlen;
 
@@ -2471,68 +2488,106 @@
  *  Finish this function and validate against all relevant RFCs.
  *  Use bintime second part for t_rcvtime.
  *  And a couple of other things.
- *
- * XXXAO: Linux talks about some problem with the RTO algorithm.
- * Figure out what the problem is.
- *
- * XXXAO: The sliding window of eight measurements from RFC793 is
- * way too little when using timestamps in fast networks.
- * Average 10ms of measurements and integrate that into a 1000ms
- * sliding window.  The same for the variance.  When using timestamps.
  */
 static void
 tcp_do_time(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
     int acked, int tlen, int sacked)
 {
-	int delta, rtt;
+	int rtt;
 	int tick = tcp_ticks;
 
+	INP_LOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp != NULL && th != NULL && to != NULL,
-	    ("%s: ", __func__));
-	INP_LOCK_ASSERT(tp->t_inpcb);
+	    ("%s: insufficient parameters", __func__));
 
 	/*
+	 * 1. We received a valid segment.
+	 *
 	 * Make note of most recent segment received time.
 	 */
-	tp->t_rcvtime = tcp_ticks;		/* XXX: ticks64 */
+	tp->t_rcvtime = tcp_uptime();
+	tp->t_rcvticks = tick;
 
 	/*
+	 * 2. If timestamps are used decide which to reflect.
+	 *
 	 * When using timestamps and delayed ACKs we should reply
 	 * with the TSval from the earliest unacknowledged segment.
-	 *  RFC1323: Section 3.4, Page 15, Case (A)
+	 *  RFC1323: section 3.4, Page 15, Case (A)
 	 *
 	 * On packet loss echo the TSval from the latest segment
 	 * that filled a hole.  Only reflect timestamps that advance
 	 * the left edge of the window.
-	 *  RFC1323: Section 3.4, Page 15, Case (B & C)
+	 *  RFC1323: section 3.4, Page 15, Case (B & C)
+	 *
+	 * Corrected algorithm.
+	 *  Stevens Vol.2: section 26.6, page 870
+	 *  Braden93
+	 *
+	 * If SACK is enabled we should be able to reflect every
+	 * timestamp as long as it is GEQ the one before.  This
+	 * way we avoid late out-of-order segments.  Whenever more
+	 * data was sacked advance reflected timestamp.
 	 *
-	 * XXXAO: With SACK we could do better.
-	 * if (sacked > 0) ...
+	 * Does this give PAWS problems?
 	 */
 	if (to->to_flags & TOF_TS) {
-		if ((!(tp->t_flags & TF_DELACK) && th->th_seq == tp->rcv_nxt) ||
-		    (!TAILQ_EMPTY(tp->rcv_trq) && th->th_seq == tp->rcv_nxt))
+#ifdef TCP_RFC1323_BRADEN
+		if (TS_GEQ(to->to_tsval, tp->snd_tsecr) &&
+		    SEQ_LEQ(th->th_ack, tp->snd_lastack)) {
+#endif
+#ifdef TCP_RFC1323bis_plusSACK
+		if (TS_GT(to->to_tsval, tp->snd_tsecr) &&
+		    ((th->th_seq == tp->rcv_nxt && tp->snd_delack == 0) ||
+		     sacked > 0) {
+#endif
 			tp->snd_tsecr = to->to_tsval;
-			tp->snd_tsecrts = tcp_ticks;	/* XXX: ticks64 */
+			tp->snd_tsecrts = tcp_ticks;
+		}
+		KASSERT(!TS_GT(to->to_secr, tick),
+		    ("%s: timestamp newer than our time", __func__));
 		/*
 		 * Remember highest most recent reflected TS.
 		 */
-		if (to->to_tsecr > tp->ts_recent)
+		if (SEQ_LEQ(th->th_seq, tp->snd_lastack) &&
+		    TS_GT(to->to_tsecr > tp->ts_recent))
 			tp->ts_recent = to->to_tsecr;
+			tp->ts_recentts = tick;
 	}
 
 	/*
+	 * 3. If timestamps are used calculate the current RTT.
+	 */
+	if (to->to_flags & TOF_TS) {
+		rtt = tick - to->to_tsecr;
+	} else if (acked > 0 && tp->snd_rtseq != 0 &&
+	    SEQ_GT(th->th_ack, tp->snd_rtseq) &&
+	    TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) {
+		rtt = tick - tp->snd_rtts;
+		tp->snd_rtseq = 0;
+	} else
+		return;
+
+	/*
+	 * 4. If no timestamps are used see whether new data was ack'ed
+	 *    and if so, calculate the current RTT.
+	 *
 	 * We can only measure the RTT if new data was acknowledged.
 	 * That means we can only update the RTT estimates when we
 	 * are sending data.
 	 *
 	 * XXXAO: Not really true with timestamps and a steady receive
 	 * stream.
+	 *
+	 * Karn's algorithm.  Only update on non-retransmitted segments.
+	 * Compute the time delta in ticks (1/hz).
+	 *
+	 * XXXAO: How to deal with retransmits when using timestamps?
 	 */
-	if (acked == 0)
-		return;
 
 	/*
+	 * 5. Update at all?
+	 *
 	 * If we haven't sent anything for more than one RTO ignore
 	 * the time measurement or our estimate will be way off.
 	 */
@@ -2543,35 +2598,85 @@
 	}
 
 	/*
-	 * Karns algorithm.  Only update on non-retransmitted segments.
+	 * Remember the lowest RTT we've ever seen.
+	 * Must be at least 1 tick.
+	 */
+	if (tp->t_rttlowest > rtt)
+		tp->t_rttlowest = max(rtt, 1);
+
+	/*
+	 * Recompute the SRTT, RTTVAR and RTO.
 	 *
-	 * XXXAO: How to deal with retransmits when using timestamps?
+	 * XXXAO: Make it pluggable so that different algorithms
+	 * can be tested.
 	 */
+	tp->snd_rto = tcp_do_rto(tp, rtt);
 
 	/*
-	 * Compute the time delta in ticks (1/hz).
+	 * We received an ack for a packet that wasn't retransmitted;
+	 * it is probably safe to discard any error indications we've
+	 * received recently.  This isn't quite right, but close enough
+	 * for now (a route might have failed after we sent a segment,
+	 * and the return path might not be symmetrical).
+	 * XXXAO: Doesn't belong here.
 	 */
-	if (to->to_flags & TOF_TS) {
-		rtt = tick - to->to_tsecr;
-	} else if (tp->t_rtseq != 0 && SEQ_GT(th->th_ack, tp->t_rtseq) &&
-	    TAILQ_EMPTY(tp->rcv_trq) && tp->snd_rtoshift == 0) {
-		rtt = tick - tp->t_rtseq;
-		tp->t_rtseq = 0;
-	} else
-		return;
+	tp->t_softerror = 0;
 
 	/*
-	 * Limit delta to some reasonable amount.
+	 * Statistics.
 	 */
-	rtt = min(60*hz, max(1, rtt));
+	tp->t_rttupdated++;
+	tcpstat.tcps_rttupdated++;
+
+	return;
+}
+
+/*
+ * Compute the SRTT, RTTVAR and return the updated RTO.
+ *  RFC1122: section 4.2.3.1
+ *  RFC2988: entire document
+ *
+ * External parameters that affect the RTO calculation:
+ *  minimum RTO value (fixed sysctl)
+ *  maximum RTO value (fixed sysctl)
+ *  initial RTO value (fixed sysctl)
+ *
+ * XXXAO: Linux talks about some problem with the RTO algorithm.
+ * Figure out what the problem is.
+ *
+ * XXXAO: The sliding window of eight measurements from RFC793 is
+ * way too little when using timestamps in fast networks.
+ * Average 10ms of measurements and integrate that into a 1000ms
+ * sliding window.  The same for the variance.  When using timestamps.
+ * Or integrate over one RTO.
+ *
+ * XXXAO: We should use rttlowest as base and all deviations from it
+ * count as RTT variance.  Use a squared algorithm to bias it to the
+ * upper level.  Trying to calculate the actual RTT is futile and
+ * very volatile.  rttlowest is a very good and fairly stable statistic
+ * baseline.  One can't get better than speed of light in optical media.
+ * Everything faster than one tick doesn't concern us anyway.  Having
+ * a stable baseline simplifies and improves a number of statistical
+ * calculations and assumptions.  Some magic has to be applied when
+ * a better lower baseline is measured though.
+ */
+static int
+tcp_do_rto(struct tcpcb *tp, int rtt)
+{
+	int delta, rto;
 
+	INP_LOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tp != NULL,
+	    ("%s: insufficient parameters", __func__));
+	    
 	/*
-	 * Remember the lowest RTT we've ever seen.
+	 * Limit delta to some reasonable amount.
 	 */
-	if (tp->t_rttlowest > rtt)
-		tp->rttlowest = rtt;
+	rtt = min(60 * hz, max(1, rtt));
 
 	/*
+	 * 6. Integrate new measurement.
+	 *
 	 * Compute smoothed RTT and smoothed RTT variance.
 	 */
 	if (tp->t_srtt) {
@@ -2595,7 +2700,7 @@
 		 * rttvar is stored as fixed point with 4 bits after the
 		 * binary point (scaled by 16).  The following is
 		 * equivalent to rfc793 smoothing with an alpha of .75
-		 * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
+		 * (rttvar = rttvar * 3/4 + |delta| / 4).  This replaces
 		 * rfc793's wired-in beta.
 		 */
 		if (delta < 0)
@@ -2616,29 +2721,17 @@
 	tp->t_rxtshift = 0;
 
 	/*
+	 * 7. Recompute RTO timer.
+	 *
 	 * The retransmit should happen at rtt + 4 * rttvar.
 	 * XXX: Backoff.
 	 *  RFC2988, Section 2, Page 2-3, Cases 2.1 through 2.5
 	 */
-	tp->snd_rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) +
-			max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)),
-			tcp_rexmit_min);
+	rto = max(((tp->t_srtt >> TCP_RTT_SHIFT) +
+		max(4 * (tp->t_rttvar >> TCP_RTTVAR_SHIFT), TCPTV_REXMTMAX)),
+		tcp_rexmit_min);
 
-	/*
-	 * We received an ack for a packet that wasn't retransmitted;
-	 * it is probably safe to discard any error indications we've
-	 * received recently.  This isn't quite right, but close enough
-	 * for now (a route might have failed after we sent a segment,
-	 * and the return path might not be symmetrical).
-	 * XXX: Doesn't belong here.
-	 */
-	tp->t_softerror = 0;
-
-	/*
-	 * Statistics.
-	 */
-	tp->t_rttupdated++;
-	tcpstat.tcps_rttupdated++;
+	return (rto);
 }
 
 /*
@@ -2757,7 +2850,7 @@
  */
 int
 tcp_do_wu(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
-    int tiwin, in acked, int tlen, int sacked)
+    int tiwin, int acked, int tlen, int sacked)
 {
 
 	KASSERT(tp != NULL && th != NULL,
@@ -2810,14 +2903,14 @@
 		 *  RFC793: section 3.7, page 42-44, "Managing the Window"
 		 *  RFC1122: section 4.2.2.16
 		 */
-		if (SEQ_DELTA(tp->snd_nxt, tp->snd_una) + tiwin < tp->snd_wnd)
+		if (SEQ_DELTA(tp->snd_nxt, tp->snd_una + acked) + tiwin < tp->snd_wnd)
 			tcplog("peer shrank the window");
 
 		/*
 		 * Update the window and keep track of this update.
 		 */
 		tp->snd_wnd = tiwin;
-		if (th->th_seq > tp->snd_wu_seq)
+		if (SEQ_GT(th->th_seq, tp->snd_wu_seq))
 			tp->snd_wu_seq = th->th_seq;
 		if (tp->snd_wnd > tp->snd_maxwnd)
 			tp->snd_maxwnd = tp->snd_wnd;
@@ -2936,61 +3029,63 @@
 }
 
 /*
- * Determine a reasonable value for maxseg size.
- * If the route is known, check route for mtu.
- * If none, use an mss that can be handled on the outgoing
- * interface without forcing IP to fragment.
+ * Determine a reasonable value for MSS size.  If the route is known,
+ * check route for mtu.  If none, use an MSS that can be handled on
+ * the outgoing interface without forcing IP to fragment.
  * If no route is found, route has no mtu, or the destination
  * isn't local, use a default, hopefully conservative size (usually
  * 512 or the default IP max size, but no more than the mtu of the
  * interface), as we can't discover anything about intervening
  * gateways or networks.
- * We also initialize the congestion/slow start window to be a single
- * segment if the destination isn't local.
- * While looking at the routing entry, we also initialize other
- * path-dependent parameters from pre-set or cached values in the
- * routing entry.
+ *  RFC793: section x
  *
- * Also take into account the space needed for options that we
- * send regularly.  Make maxseg shorter by that amount to assure
- * that we can send maxseg amount of data even when the options
- * are present.  Store the upper limit of the length of options plus
- * data in maxopd. XXX: No longer needed.
- *
- * NOTE that this routine is only called when we process an incoming
- * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
- *
- * XXXAO:
- *  Split up and simplify this function.
- *  Move initialization of cached values into its own function.
+ * NB: If no offer received pass as zero.
  */
-void
-tcp_mss(struct tcpcb *tp, int offer)
+uint16_t
+tcp_mss(struct in_conninfo *inc, int offer, int mtuflags)
 {
-	struct inpcb *inp = tp->t_inpcb;
-	struct socket *so = inp->inp_socket;
-	u_long bufsize;
-	u_long maxmtu;
-	int rtt, mss;
-	int origoffer = offer;
-	int mtuflags = 0;
+	uint16_t mss = 0;
+	uint32_t maxmtu = 0;
+	uint32_t thcmtu = 0;
+	int min_protoh;
+#ifdef INET6
+	int isipv6 = inc->inc_isipv6 ? 1 : 0;
+#endif
+
+	KASSERT(inc != NULL,
+	    ("%s: NULL in_conninfo pointer", __func__));
+
 #ifdef INET6
-	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+	if (isipv6) {
+		mss = tcp_v6mssdflt;
+		maxmtu = tcp_maxmtu6(inc, mtuflags);
+		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+	} else
 #endif
-	struct hc_metrics_lite metrics;
+	{
+		mss = tcp_mssdflt;
+		maxmtu = tcp_maxmtu(inc, mtuflags);
+		min_protoh = sizeof(struct tcpiphdr);
+	}
+	thcmtu = tcp_hc_getmtu(inc);	/* IPv4 and IPv6 */
 
 	/*
-	 * Initialize.
-	 * If there is no route to sender,
-	 * we stay with the default mss.
+	 * Determine MTU.
 	 */
-	mss = tcp_mssopt(tcpcbtoinc(tp), &mtuflags);
+	if (maxmtu && thcmtu)
+		mss = min(maxmtu, thcmtu) - min_protoh;
+	else if (maxmtu || thcmtu)
+		mss = max(maxmtu, thcmtu) - min_protoh;
+
+	if (offer == 0)
+		return (mss);
 
 	/*
 	 * Prevent DoS attack with too small MSS. Round up
 	 * to at least minmss.
 	 */
 	offer = max(offer, tcp_minmss);
+
 	/*
 	 * Sanity check: make sure that maxopd will be large
 	 * enough to allow some data on segments even if the
@@ -3000,14 +3095,61 @@
 	offer = max(offer, 64);
 
 	/*
-	 * maxopd stores the maximum length of data AND options
-	 * in a segment; maxseg is the amount of data in a normal
-	 * segment.  We need to store this value (maxopd) apart
-	 * from maxseg, because now every segment carries options
-	 * and thus we normally have somewhat less data in segments.
+	 * Use a symmetric MSS.  It is very unlikely that we
+	 * have a different MSS on the way back.
+	 *
+	 * XXXAO: More comment
 	 */
-	tp->snd_mss = mss = min(mss, offer);
-	tp->t_maxopd = mss;
+	mss = min(mss, offer);
+
+	return (mss);
+}
+
+/*
+ * Return the initial send window for a new connection or
+ * after an idle timeout.
+ *  RFC3390: entire document
+ *
+ *  min(4*MSS, max(2*MSS, 4380 bytes))
+ *
+ * NB: MSS must already be initialized.
+ */
+int
+tcp_init_cwnd(struct tcpcb *tp)
+{
+	int cwnd;
+
+	if (tcp_do_rfc3390)
+		cwnd = min(4 * tp->snd_mss, max(2 * tp->snd_mss, 4380));
+#ifdef INET6
+	else if (isipv6 && in6_localaddr(&inp->in6p_faddr))
+		cwnd = tp->snd_mss * ss_fltsz_local;
+#endif
+	else if (in_localaddr(inp->inp_faddr))
+		cwnd = tp->snd_mss * ss_fltsz_local;
+	else
+		cwnd = tp->snd_mss * ss_fltsz;
+
+	return (cwnd);
+}
+
+/*
+ * Prime some TCP variables from cached values.
+ */
+static void
+tcp_init_values(struct tcpcb *tp)
+{
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	u_long bufsize;
+	u_long maxmtu;
+	int rtt, mss;
+	int origoffer = offer;
+	int mtuflags = 0;
+#ifdef INET6
+	int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+	struct hc_metrics_lite metrics;
 
 	/*
 	 * rmx information is now retrieved from tcp_hostcache.
@@ -3038,17 +3180,6 @@
 				 min(tp->snd_wnd, so->so_snd.sb_hiwat)));
 	else
 #endif
-	if (tcp_do_rfc3390)
-		tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
-	else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
-		 (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
-	else if (in_localaddr(inp->inp_faddr))
-#endif
-		tp->snd_cwnd = mss * ss_fltsz_local;
-	else
-		tp->snd_cwnd = mss * ss_fltsz;
 
 	/*
 	 * If there's a pipesize, change the socket buffer to that size,
@@ -3125,40 +3256,3 @@
 		tp->t_flags |= TF_TSO;
 
 }
-
-/*
- * Determine the MSS option to send on an outgoing SYN.
- */
-int
-tcp_mssopt(struct in_conninfo *inc, int mtuflags)
-{
-	int mss = 0;
-	u_long maxmtu = 0;
-	u_long thcmtu = 0;
-	size_t min_protoh;
-#ifdef INET6
-	int isipv6 = inc->inc_isipv6 ? 1 : 0;
-#endif
-
-	KASSERT(inc != NULL, ("%s: NULL in_conninfo pointer", __func__));
-
-#ifdef INET6
-	if (isipv6) {
-		mss = tcp_v6mssdflt;
-		maxmtu = tcp_maxmtu6(inc, mtuflags);
-		min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
-	} else
-#endif
-	{
-		mss = tcp_mssdflt;
-		maxmtu = tcp_maxmtu(inc, mtuflags);
-		min_protoh = sizeof(struct tcpiphdr);
-	}
-	thcmtu = tcp_hc_getmtu(inc);	/* IPv4 and IPv6 */
-	if (maxmtu && thcmtu)
-		mss = min(maxmtu, thcmtu) - min_protoh;
-	else if (maxmtu || thcmtu)
-		mss = max(maxmtu, thcmtu) - min_protoh;
-
-	return (mss);
-}

==== //depot/projects/tcp_new/netinet/tcp_output.c#4 (text+ko) ====

@@ -27,11 +27,9 @@
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.139 2007/07/01 11:38:27 gnn Exp $
  */
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_output.c,v 1.145 2007/11/30 23:46:51 bz Exp $");
-
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
@@ -74,85 +72,82 @@
 #include <netinet/tcp_debug.h>
 #endif
 
-#ifdef IPSEC
+#ifdef FAST_IPSEC
 #include <netipsec/ipsec.h>
-#endif /*IPSEC*/
+#endif /*FAST_IPSEC*/
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
-#ifdef notyet
-extern struct mbuf *m_copypack();
-#endif
-
-int path_mtu_discovery = 1;
+int 	tcp_do_pmtud = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW,
-	&path_mtu_discovery, 1, "Enable Path MTU Discovery");
-
-int ss_fltsz = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, slowstart_flightsize, CTLFLAG_RW,
-	&ss_fltsz, 1, "Slow start flight size");
+    &tcp_do_pmtud, 1, "Enable Path MTU Discovery");
 
-int ss_fltsz_local = 4;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize, CTLFLAG_RW,
-	&ss_fltsz_local, 1, "Slow start flight size for local networks");
-
-int     tcp_do_newreno = 1;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
-	&tcp_do_newreno, 0, "Enable NewReno Algorithms");
-
 int	tcp_do_tso = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
-	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
+    &tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 
 int	tcp_do_autosndbuf = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
-	&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+    &tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
 
 int	tcp_autosndbuf_inc = 8*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
-	&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+    &tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
 
 int	tcp_autosndbuf_max = 256*1024;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
-	&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+    &tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
 
-
 /*
  * Tcp output routine: figure out what should be sent and send it.
+ *
+ * 1. How much to send, if any
+ *  1.1 subject to Nagle's algorithm (don't send small segments)
+ *  1.2 subject to send window
+ *  1.3 subject to congestion window
+ * 2. Send window probe (persist mode)
+ * 3. Send an outstanding ACK
+ *  3.1 subject to delayed ack
+ * 4. Send a window update
+ *  4.1 subject to silly window avoidance
+ *  4.2 subject to delayed ack
+ * 5. Send retransmit
+ * 6. Send urgent data
+ * 7. Send based on flags
  */
 int
 tcp_output(struct tcpcb *tp)
 {
-	struct socket *so = tp->t_inpcb->inp_socket;
-	long len, recwin, sendwin;
-	int off, flags, error;
-	struct mbuf *m;
-	struct ip *ip = NULL;
-	struct ipovly *ipov = NULL;
-	struct tcphdr *th;
+	int off, flags, error, optlen;
+	tcp_win len, recwin, swin;
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	struct tcphdr ths;
+	struct tcpopt to;
 	u_char opt[TCP_MAXOLEN];
-	unsigned ipoptlen, optlen, hdrlen;
-#ifdef IPSEC
-	unsigned ipsec_optlen = 0;
+#ifdef TCP_SIGNATURE
+	int sigoff = 0;
 #endif
-	int idle, sendalot;
-	int sack_rxmit, sack_bytes_rxmt;
-	struct sackhole *p;
-	int tso = 0;
-	struct tcpopt to;
-#if 0
-	int maxburst = TCP_MAXBURST;
-#endif
-#ifdef INET6
-	struct ip6_hdr *ip6 = NULL;
-	int isipv6;
+	INP_LOCK_ASSERT(tp->t_inpcb);
+	KASSERT(tp->t_state > TCPS_LISTEN,
+	    ("%s: TCPS_LISTEN invalid", __func__));
+	KASSERT(tp->t_state != TCPS_SYN_RECEIVED,
+	    ("%s: TCPS_SYN_RECEIVED invalid", __func__));
+	KASSERT(tp->t_state < TCPS_TIME_WAIT,
+	    ("%s: TCPS_TIME_WAIT invalid", __func__));
 
-	isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
-#endif
+	KASSERT(SEQ_GEQ(tp->snd_rxmit, tp->snd_una),
+	    ("%s: snd_rxmit < snd_una", __func__))
+	KASSERT(SEQ_LEQ(tp->snd_rxmit, tp->snd_nxt),
+	    ("%s: snd_rxmit > snd_nxt", __func__))
 
-	INP_LOCK_ASSERT(tp->t_inpcb);
+	/*
+	 * Get standard flags.  Removal of inappropriate flags for a
+	 * specific segment is handled by the segmentation code.
+	 */
+	flags = tcp_outflags[tp->t_state];
 
 	/*
 	 * Determine length of data that should be transmitted,
@@ -160,792 +155,610 @@
 	 * If there is some data or critical controls (SYN, RST)
 	 * to send, then transmit; otherwise, investigate further.
 	 */
-	idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-	if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
-		/*
-		 * We have been idle for "a while" and no acks are
-		 * expected to clock out any data we send --
-		 * slow start to get ack "clock" running again.
-		 *
-		 * Set the slow-start flight size depending on whether
-		 * this is a local network or not.
-		 */
-		int ss = ss_fltsz;
-#ifdef INET6
-		if (isipv6) {
-			if (in6_localaddr(&tp->t_inpcb->in6p_faddr))
-				ss = ss_fltsz_local;
-		} else
-#endif /* INET6 */
-		if (in_localaddr(tp->t_inpcb->inp_faddr))
-			ss = ss_fltsz_local;
-		tp->snd_cwnd = tp->t_maxseg * ss;
-	}
-	tp->t_flags &= ~TF_LASTIDLE;
-	if (idle) {
-		if (tp->t_flags & TF_MORETOCOME) {
-			tp->t_flags |= TF_LASTIDLE;
-			idle = 0;
-		}
-	}
-again:
-	/*
-	 * If we've recently taken a timeout, snd_max will be greater than
-	 * snd_nxt.  There may be SACK information that allows us to avoid
-	 * resending already delivered data.  Adjust snd_nxt accordingly.
-	 */
-	if ((tp->t_flags & TF_SACK_PERMIT) &&
-	    SEQ_LT(tp->snd_nxt, tp->snd_max))
-		tcp_sack_adjust(tp);
-	sendalot = 0;
-	off = tp->snd_nxt - tp->snd_una;
-	sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+
+
 
-	flags = tcp_outflags[tp->t_state];
 	/*
-	 * Send any SACK-generated retransmissions.  If we're explicitly trying
-	 * to send out new data (when sendalot is 1), bypass this function.
-	 * If we retransmit in fast recovery mode, decrement snd_cwnd, since
-	 * we're replacing a (future) new transmission with a retransmission
-	 * now, and we previously incremented snd_cwnd in tcp_input().
+	 * We have been idle for "a while" and no acks are
+	 * expected to clock out any data we send --
+	 * slow start to get ack "clock" running again.
+	 *
+	 * Set the slow-start flight size depending on whether
+	 * this is a local network or not.
 	 */
-	/*
-	 * Still in sack recovery , reset rxmit flag to zero.
-	 */
-	sack_rxmit = 0;
-	sack_bytes_rxmt = 0;
-	len = 0;
-	p = NULL;
-	if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
-	    (p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
-		long cwin;
-		
-		cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
-		if (cwin < 0)
-			cwin = 0;
-		/* Do not retransmit SACK segments beyond snd_recover */
-		if (SEQ_GT(p->end, tp->snd_recover)) {
-			/*
-			 * (At least) part of sack hole extends beyond
-			 * snd_recover. Check to see if we can rexmit data
-			 * for this hole.
-			 */

>>> TRUNCATED FOR MAIL (1000 lines) <<<



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200902161653.n1GGrhNM085561>