Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 24 Nov 2007 19:13:02 GMT
From:      Rui Paulo <rpaulo@FreeBSD.org>
To:        Perforce Change Reviews <perforce@FreeBSD.org>
Subject:   PERFORCE change 129464 for review
Message-ID:  <200711241913.lAOJD2WH064564@repoman.freebsd.org>

next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=129464

Change 129464 by rpaulo@rpaulo_zoo on 2007/11/24 19:12:59

	End host TCP ECN implementation. My Google Summer Of Code
	project for 2006.
	
	Obtained from:	NetBSD

Affected files ...

.. //depot/projects/tcpecn/netinet/tcp_input.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_output.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_syncache.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/tcpecn/netinet/tcp_var.h#2 edit

Differences ...

==== //depot/projects/tcpecn/netinet/tcp_input.c#2 (text+ko) ====

@@ -128,6 +128,14 @@
     &tcp_do_rfc3390, 0,
     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
 
+int	tcp_do_ecn = 0;
+int	tcp_ecn_maxretries = 1;
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
+    &tcp_do_ecn, 0, "TCP ECN support");
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
+    &tcp_ecn_maxretries, 0, "Max retries before giving up on ECN");
+
 static int tcp_insecure_rst = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
     &tcp_insecure_rst, 0,
@@ -152,14 +160,32 @@
 
 static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
-		     struct socket *, struct tcpcb *, int, int);
+		     struct socket *, struct tcpcb *, int, int, uint8_t);
 static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 static void	 tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 static void	 tcp_xmit_timer(struct tcpcb *, int);
 static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+static void inline
+		 tcp_congestion_exp(struct tcpcb *);
 
+/* Halve ssthresh (floor: two segments) and enter fast recovery. */
+static void inline
+tcp_congestion_exp(struct tcpcb *tp)
+{
+	u_int segs;
+
+	/* Half of the smaller window, counted in whole segments. */
+	segs = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
+	tp->snd_ssthresh = (segs < 2 ? 2 : segs) * tp->t_maxseg;
+	ENTER_FASTRECOVERY(tp);
+	tp->snd_recover = tp->snd_max;
+	/* On an ECN connection, also flag that a CWR is owed to the peer. */
+	if (tp->t_flags & TF_ECN_PERMIT)
+		tp->t_flags |= TF_ECN_SND_CWR;
+}
+
 /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
 #ifdef INET6
 #define ND6_HINT(tp) \
@@ -238,6 +264,7 @@
 	int drop_hdrlen;
 	int thflags;
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
+	uint8_t iptos;
 #ifdef IPFIREWALL_FORWARD
 	struct m_tag *fwd_tag;
 #endif
@@ -347,6 +374,13 @@
 		ip->ip_v = IPVERSION;
 	}
 
+#ifdef INET6
+	if (isipv6)
+		iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+	else
+#endif
+		iptos = ip->ip_tos;
+
 	/*
 	 * Check that TCP offset makes sense,
 	 * pull out TCP options and adjust length.		XXX
@@ -642,7 +676,8 @@
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
-			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
+			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+			    iptos);
 			INP_INFO_UNLOCK_ASSERT(&tcbinfo);
 			return;
 		}
@@ -842,7 +877,7 @@
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
-	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen);
+	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
 	INP_INFO_UNLOCK_ASSERT(&tcbinfo);
 	return;
 
@@ -866,7 +901,7 @@
 
 static void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
-    struct tcpcb *tp, int drop_hdrlen, int tlen)
+    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
 {
 	int thflags, acked, ourfinisacked, needoutput = 0;
 	int headlocked = 1;
@@ -909,6 +944,35 @@
 	tiwin = th->th_win << tp->snd_scale;
 
 	/*
+	 * TCP ECN processing.
+	 */
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		/* CWR ends the old echo first; a CE mark below re-arms it. */
+		if (thflags & TH_CWR)
+			tp->t_flags &= ~TF_ECN_SND_ECE;
+		switch (iptos & IPTOS_ECN_MASK) {
+		case IPTOS_ECN_CE:
+			tp->t_flags |= TF_ECN_SND_ECE;
+			tcpstat.tcps_ecn_ce++;
+			break;
+		case IPTOS_ECN_ECT0:
+			tcpstat.tcps_ecn_ect0++;
+			break;
+		case IPTOS_ECN_ECT1:
+			tcpstat.tcps_ecn_ect1++;
+			break;
+		}
+
+		/*
+		 * The peer echoed congestion (ECE).
+		 * Ignore if we are already trying to recover.
+		 */
+		if ((thflags & TH_ECE) &&
+		    SEQ_GEQ(tp->snd_una, tp->snd_recover))
+			tcp_congestion_exp(tp);
+	}
+
+	/*
 	 * Parse options on any incoming segment.
 	 */
 	tcp_dooptions(&to, (u_char *)(th + 1),
@@ -976,7 +1040,8 @@
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    th->th_seq == tp->rcv_nxt &&
-	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR))
+	        == TH_ACK &&
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
@@ -1253,6 +1318,8 @@
 	 * Otherwise this is an acceptable SYN segment
 	 *	initialize tp->rcv_nxt and tp->irs
 	 *	if seg contains ack then advance tp->snd_una
+	 *	if seg contains an ECE and ECN support is enabled, the stream
+	 *	    is ECN capable.
 	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 	 *	arrange for segment to be acked (eventually)
 	 *	continue processing rest of data/controls, beginning with URG
@@ -1297,6 +1364,12 @@
 				    tcp_delacktime);
 			else
 				tp->t_flags |= TF_ACKNOW;
+			/* ECN-setup SYN-ACK: ECE set, CWR clear (RFC 3168). */
+			if (tcp_do_ecn &&
+			    (thflags & (TH_ECE|TH_CWR)) == TH_ECE) {
+				tp->t_flags |= TF_ECN_PERMIT;
+				tcpstat.tcps_ecn_shs++;
+			}
 			/*
 			 * Received <SYN,ACK> in SYN_SENT[*] state.
 			 * Transitions:
@@ -1758,6 +1831,9 @@
 				 * so bump cwnd by the amount in the receiver
 				 * to keep a constant cwnd packets in the
 				 * network.
+				 *
+				 * When using TCP ECN, notify the peer that
+				 * we reduced the cwnd.
 				 */
 				if (!tcp_timer_active(tp, TT_REXMT) ||
 				    th->th_ack != tp->snd_una)
@@ -1789,7 +1865,6 @@
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
-					u_int win;
 
 					/*
 					 * If we're doing sack, check to
@@ -1803,20 +1878,15 @@
 							tp->t_dupacks = 0;
 							break;
 						}
-					} else if (tcp_do_newreno) {
+					} else if (tcp_do_newreno ||
+					    tcp_do_ecn) {
 						if (SEQ_LEQ(th->th_ack,
 						    tp->snd_recover)) {
 							tp->t_dupacks = 0;
 							break;
 						}
 					}
-					win = min(tp->snd_wnd, tp->snd_cwnd) /
-					    2 / tp->t_maxseg;
-					if (win < 2)
-						win = 2;
-					tp->snd_ssthresh = win * tp->t_maxseg;
-					ENTER_FASTRECOVERY(tp);
-					tp->snd_recover = tp->snd_max;
+					tcp_congestion_exp(tp);
 					tcp_timer_activate(tp, TT_REXMT, 0);
 					tp->t_rtttime = 0;
 					if (tp->t_flags & TF_SACK_PERMIT) {

==== //depot/projects/tcpecn/netinet/tcp_output.c#2 (text+ko) ====

@@ -884,6 +884,49 @@
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
 	/*
+	 * If we are starting a connection, send ECN setup
+	 * SYN packet. If we are on a retransmit, we may
+	 * resend those bits a number of times as per
+	 * RFC 3168.
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
+		if (tp->t_rxtshift >= 1) {
+			if (tp->t_rxtshift <= tcp_ecn_maxretries)
+				flags |= TH_ECE|TH_CWR;
+		} else
+			flags |= TH_ECE|TH_CWR;
+	}
+	
+	if (tp->t_state == TCPS_ESTABLISHED &&
+	    (tp->t_flags & TF_ECN_PERMIT)) {
+		/*
+		 * If the peer has ECN, mark data packets with
+		 * ECN capable transmission (ECT).
+		 * Ignore pure ack packets, retransmissions and window probes.
+		 */
+		if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
+		    !((tp->t_flags & TF_FORCEDATA) && len == 1)) {
+#ifdef INET6
+			if (isipv6)
+				ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+			else
+#endif
+				ip->ip_tos |= IPTOS_ECN_ECT0;
+			tcpstat.tcps_ecn_ect0++;
+		}
+		
+		/*
+		 * Reply with proper ECN notifications.
+		 */
+		if (tp->t_flags & TF_ECN_SND_CWR) {
+			flags |= TH_CWR;
+			tp->t_flags &= ~TF_ECN_SND_CWR;
+		} 
+		if (tp->t_flags & TF_ECN_SND_ECE)
+			flags |= TH_ECE;
+	}
+	
+	/*
 	 * If we are doing retransmissions, then snd_nxt will
 	 * not reflect the first unsent octet.  For ACK only
 	 * packets, we do not want the sequence number of the

==== //depot/projects/tcpecn/netinet/tcp_syncache.c#2 (text+ko) ====

@@ -127,7 +127,7 @@
 	u_int8_t	sc_ip_tos;		/* IPv4 TOS */
 	u_int8_t	sc_requested_s_scale:4,
 			sc_requested_r_scale:4;
-	u_int8_t	sc_flags;
+	u_int16_t	sc_flags;
 #define SCF_NOOPT	0x01			/* no TCP options */
 #define SCF_WINSCALE	0x02			/* negotiated window scaling */
 #define SCF_TIMESTAMP	0x04			/* negotiated timestamps */
@@ -135,6 +135,7 @@
 #define SCF_UNREACH	0x10			/* icmp unreachable received */
 #define SCF_SIGNATURE	0x20			/* send MD5 digests */
 #define SCF_SACK	0x80			/* send SACK option */
+#define SCF_ECN		0x100			/* send ECN setup packet */
 #ifdef MAC
 	struct label	*sc_label;		/* MAC label reference */
 #endif
@@ -778,6 +779,9 @@
 			tp->t_flags |= TF_SACK_PERMIT;
 	}
 
+	if (sc->sc_flags & SCF_ECN)
+		tp->t_flags |= TF_ECN_PERMIT;
+
 	/*
 	 * Set up MSS and get cached values from tcp_hostcache.
 	 * This might overwrite some of the defaults we just set.
@@ -1190,7 +1194,9 @@
 		sc->sc_peer_mss = to->to_mss;	/* peer mss may be zero */
 	if (noopt)
 		sc->sc_flags |= SCF_NOOPT;
-
+	if ((th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR) && tcp_do_ecn)
+		sc->sc_flags |= SCF_ECN;	/* ECE+CWR: ECN-setup SYN */
+	
 	if (tcp_syncookies) {
 		syncookie_generate(sch, sc, &flowtmp);
 #ifdef INET6
@@ -1325,6 +1331,41 @@
 	th->th_win = htons(sc->sc_wnd);
 	th->th_urp = 0;
 
+	if (sc->sc_flags & SCF_ECN) {
+		th->th_flags |= TH_ECE;
+		tcpstat.tcps_ecn_shs++;
+		
+		/*
+		 * draft-ietf-tcpm-ecnsyn-00.txt
+		 *
+		 * "[...] a TCP node MAY respond to an ECN-setup
+		 * SYN packet by setting ECT in the responding
+		 * ECN-setup SYN/ACK packet, indicating to routers 
+		 * that the SYN/ACK packet is ECN-Capable.
+		 * This allows a congested router along the path
+		 * to mark the packet instead of dropping the
+		 * packet as an indication of congestion."
+		 *
+		 * "[...] There can be a great benefit in setting
+		 * an ECN-capable codepoint in SYN/ACK packets [...]
+		 * Congestion is most likely to occur in
+		 * the server-to-client direction.  As a result,
+		 * setting an ECN-capable codepoint in SYN/ACK
+		 * packets can reduce the occurrence of three-second
+		 * retransmit timeouts resulting from the drop
+		 * of SYN/ACK packets."
+		 *
+		 * Page 4 and 6, January 2006.
+		 */
+#ifdef INET6
+		if (sc->sc_inc.inc_isipv6)
+			ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+		else
+#endif
+			ip->ip_tos |= IPTOS_ECN_ECT0;
+		tcpstat.tcps_ecn_ect0++;
+	}
+	
 	/* Tack on the TCP options. */
 	if ((sc->sc_flags & SCF_NOOPT) == 0) {
 		to.to_flags = 0;

==== //depot/projects/tcpecn/netinet/tcp_usrreq.c#2 (text+ko) ====

@@ -1712,6 +1712,10 @@
 		db_printf("%sTF_TSO", comma ? ", " : "");
 		comma = 1;
 	}
+	if (t_flags & TF_ECN_PERMIT) {
+		db_printf("%sTF_ECN_PERMIT", comma ? ", " : "");
+		comma = 1;
+	}
 }
 
 static void

==== //depot/projects/tcpecn/netinet/tcp_var.h#2 (text+ko) ====

@@ -123,6 +123,9 @@
 #define	TF_SIGNATURE	0x400000	/* require MD5 digests (RFC2385) */
 #define	TF_FORCEDATA	0x800000	/* force out a byte */
 #define	TF_TSO		0x1000000	/* TSO enabled on this connection */
+#define	TF_ECN_PERMIT	0x2000000	/* connection ECN-ready */
+#define	TF_ECN_SND_CWR	0x4000000	/* ECN CWR in queue */
+#define TF_ECN_SND_ECE	0x8000000	/* ECN ECE in queue */
 
 	tcp_seq	snd_una;		/* send unacknowledged */
 	tcp_seq	snd_max;		/* highest sequence number sent;
@@ -429,6 +432,12 @@
 	u_long  tcps_sack_rcv_blocks;	    /* SACK blocks (options) received */
 	u_long  tcps_sack_send_blocks;	    /* SACK blocks (options) sent     */
 	u_long  tcps_sack_sboverflow; 	    /* times scoreboard overflowed */
+	
+	/* ECN related stats */
+	u_long	tcps_ecn_ce;		/* ECN Congestion Experienced */
+	u_long	tcps_ecn_ect0;		/* ECN Capable Transport */
+	u_long	tcps_ecn_ect1;		/* ECN Capable Transport */
+	u_long	tcps_ecn_shs;		/* ECN successful handshakes */
 };
 
 /*
@@ -505,6 +514,8 @@
 
 extern	int tcp_do_sack;		/* SACK enabled/disabled */
 extern	int tcp_sc_rst_sock_fail;	/* RST on sock alloc failure */
+extern	int tcp_do_ecn;			/* TCP ECN enabled/disabled */
+extern	int tcp_ecn_maxretries;
 
 int	 tcp_addoptions(struct tcpopt *, u_char *);
 struct tcpcb *



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200711241913.lAOJD2WH064564>