From owner-p4-projects@FreeBSD.ORG Sat Nov 24 19:13:03 2007 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id E72E716A46B; Sat, 24 Nov 2007 19:13:02 +0000 (UTC) Delivered-To: perforce@FreeBSD.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 8EE4C16A41A for ; Sat, 24 Nov 2007 19:13:02 +0000 (UTC) (envelope-from rpaulo@FreeBSD.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 8244B13C461 for ; Sat, 24 Nov 2007 19:13:02 +0000 (UTC) (envelope-from rpaulo@FreeBSD.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id lAOJD20t064567 for ; Sat, 24 Nov 2007 19:13:02 GMT (envelope-from rpaulo@FreeBSD.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.1/8.14.1/Submit) id lAOJD2WH064564 for perforce@freebsd.org; Sat, 24 Nov 2007 19:13:02 GMT (envelope-from rpaulo@FreeBSD.org) Date: Sat, 24 Nov 2007 19:13:02 GMT Message-Id: <200711241913.lAOJD2WH064564@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to rpaulo@FreeBSD.org using -f From: Rui Paulo To: Perforce Change Reviews Cc: Subject: PERFORCE change 129464 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 24 Nov 2007 19:13:03 -0000 http://perforce.freebsd.org/chv.cgi?CH=129464 Change 129464 by rpaulo@rpaulo_zoo on 2007/11/24 19:12:59 End host TCP ECN implementation. My Google Summer Of Code project for 2006. Obtained from: NetBSD Affected files ... .. //depot/projects/tcpecn/netinet/tcp_input.c#2 edit .. //depot/projects/tcpecn/netinet/tcp_output.c#2 edit .. //depot/projects/tcpecn/netinet/tcp_syncache.c#2 edit .. 
//depot/projects/tcpecn/netinet/tcp_usrreq.c#2 edit .. //depot/projects/tcpecn/netinet/tcp_var.h#2 edit Differences ... ==== //depot/projects/tcpecn/netinet/tcp_input.c#2 (text+ko) ==== @@ -128,6 +128,14 @@ &tcp_do_rfc3390, 0, "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)"); +int tcp_do_ecn = 0; +int tcp_ecn_maxretries = 1; +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN"); +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW, + &tcp_do_ecn, 0, "TCP ECN support"); +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW, + &tcp_ecn_maxretries, 0, "Max retries before giving up on ECN"); + static int tcp_insecure_rst = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW, &tcp_insecure_rst, 0, @@ -152,14 +160,32 @@ static void tcp_dooptions(struct tcpopt *, u_char *, int, int); static void tcp_do_segment(struct mbuf *, struct tcphdr *, - struct socket *, struct tcpcb *, int, int); + struct socket *, struct tcpcb *, int, int, uint8_t); static void tcp_dropwithreset(struct mbuf *, struct tcphdr *, struct tcpcb *, int, int); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static void tcp_xmit_timer(struct tcpcb *, int); static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); +static void inline + tcp_congestion_exp(struct tcpcb *); +static void inline +tcp_congestion_exp(struct tcpcb *tp) +{ + u_int win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / + 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_ssthresh = win * tp->t_maxseg; + ENTER_FASTRECOVERY(tp); + tp->snd_recover = tp->snd_max; + if (tp->t_flags & TF_ECN_PERMIT) + tp->t_flags |= TF_ECN_SND_CWR; +} + /* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. 
*/ #ifdef INET6 #define ND6_HINT(tp) \ @@ -238,6 +264,7 @@ int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ + uint8_t iptos; #ifdef IPFIREWALL_FORWARD struct m_tag *fwd_tag; #endif @@ -347,6 +374,13 @@ ip->ip_v = IPVERSION; } +#ifdef INET6 + if (isipv6) + iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + else +#endif + iptos = ip->ip_tos; + /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX @@ -642,7 +676,8 @@ * contains. tcp_do_segment() consumes * the mbuf chain and unlocks the inpcb. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); + tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, + iptos); INP_INFO_UNLOCK_ASSERT(&tcbinfo); return; } @@ -842,7 +877,7 @@ * state. tcp_do_segment() always consumes the mbuf chain, unlocks * the inpcb, and unlocks pcbinfo. */ - tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen); + tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); INP_INFO_UNLOCK_ASSERT(&tcbinfo); return; @@ -866,7 +901,7 @@ static void tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int drop_hdrlen, int tlen) + struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { int thflags, acked, ourfinisacked, needoutput = 0; int headlocked = 1; @@ -909,6 +944,35 @@ tiwin = th->th_win << tp->snd_scale; /* + * TCP ECN processing. + */ + if (tp->t_flags & TF_ECN_PERMIT) { + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->t_flags |= TF_ECN_SND_ECE; + tcpstat.tcps_ecn_ce++; + break; + case IPTOS_ECN_ECT0: + tcpstat.tcps_ecn_ect0++; + break; + case IPTOS_ECN_ECT1: + tcpstat.tcps_ecn_ect1++; + break; + } + + if (thflags & TH_CWR) + tp->t_flags &= ~TF_ECN_SND_ECE; + + /* + * Congestion experienced. + * Ignore if we are already trying to recover. + */ + if ((thflags & TH_ECE) && + SEQ_GEQ(tp->snd_una, tp->snd_recover)) + tcp_congestion_exp(tp); + } + + /* * Parse options on any incoming segment. 
*/ tcp_dooptions(&to, (u_char *)(th + 1), @@ -976,7 +1040,8 @@ */ if (tp->t_state == TCPS_ESTABLISHED && th->th_seq == tp->rcv_nxt && - (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && + (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK|TH_ECE|TH_CWR)) + == TH_ACK && tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && @@ -1253,6 +1318,8 @@ * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una + * if seg contains an ECE and ECN support is enabled, the stream + * is ECN capable. * if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG @@ -1297,6 +1364,12 @@ tcp_delacktime); else tp->t_flags |= TF_ACKNOW; + + if ((thflags & TH_ECE) && tcp_do_ecn) { + tp->t_flags |= TF_ECN_PERMIT; + tcpstat.tcps_ecn_shs++; + } + /* * Received in SYN_SENT[*] state. * Transitions: @@ -1758,6 +1831,9 @@ * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. + * + * When using TCP ECN, notify the peer that + * we reduced the cwnd. 
*/ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) @@ -1789,7 +1865,6 @@ goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; - u_int win; /* * If we're doing sack, check to @@ -1803,20 +1878,15 @@ tp->t_dupacks = 0; break; } - } else if (tcp_do_newreno) { + } else if (tcp_do_newreno || + tcp_do_ecn) { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } - win = min(tp->snd_wnd, tp->snd_cwnd) / - 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_ssthresh = win * tp->t_maxseg; - ENTER_FASTRECOVERY(tp); - tp->snd_recover = tp->snd_max; + tcp_congestion_exp(tp); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { ==== //depot/projects/tcpecn/netinet/tcp_output.c#2 (text+ko) ==== @@ -884,6 +884,49 @@ tp->snd_nxt == tp->snd_max) tp->snd_nxt--; /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) { + if (tp->t_rxtshift >= 1) { + if (tp->t_rxtshift <= tcp_ecn_maxretries) + flags |= TH_ECE|TH_CWR; + } else + flags |= TH_ECE|TH_CWR; + } + + if (tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags & TF_ECN_PERMIT)) { + /* + * If the peer has ECN, mark data packets with + * ECN capable transmission (ECT). + * Ignore pure ack packets, retransmissions and window probes. + */ + if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && + !((tp->t_flags & TF_FORCEDATA) && len == 1)) { +#ifdef INET6 + if (isipv6) + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + else +#endif + ip->ip_tos |= IPTOS_ECN_ECT0; + tcpstat.tcps_ecn_ect0++; + } + + /* + * Reply with proper ECN notifications. 
+ */ + if (tp->t_flags & TF_ECN_SND_CWR) { + flags |= TH_CWR; + tp->t_flags &= ~TF_ECN_SND_CWR; + } + if (tp->t_flags & TF_ECN_SND_ECE) + flags |= TH_ECE; + } + + /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only * packets, we do not want the sequence number of the ==== //depot/projects/tcpecn/netinet/tcp_syncache.c#2 (text+ko) ==== @@ -127,7 +127,7 @@ u_int8_t sc_ip_tos; /* IPv4 TOS */ u_int8_t sc_requested_s_scale:4, sc_requested_r_scale:4; - u_int8_t sc_flags; + u_int16_t sc_flags; #define SCF_NOOPT 0x01 /* no TCP options */ #define SCF_WINSCALE 0x02 /* negotiated window scaling */ #define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ @@ -135,6 +135,7 @@ #define SCF_UNREACH 0x10 /* icmp unreachable received */ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ #define SCF_SACK 0x80 /* send SACK option */ +#define SCF_ECN 0x100 /* send ECN setup packet */ #ifdef MAC struct label *sc_label; /* MAC label reference */ #endif @@ -778,6 +779,9 @@ tp->t_flags |= TF_SACK_PERMIT; } + if (sc->sc_flags & SCF_ECN) + tp->t_flags |= TF_ECN_PERMIT; + /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. @@ -1190,7 +1194,9 @@ sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (noopt) sc->sc_flags |= SCF_NOOPT; - + if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn) + sc->sc_flags |= SCF_ECN; + if (tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); #ifdef INET6 @@ -1325,6 +1331,41 @@ th->th_win = htons(sc->sc_wnd); th->th_urp = 0; + if (sc->sc_flags & SCF_ECN) { + th->th_flags |= TH_ECE; + tcpstat.tcps_ecn_shs++; + + /* + * draft-ietf-tcpm-ecnsyn-00.txt + * + * "[...] a TCP node MAY respond to an ECN-setup + * SYN packet by setting ECT in the responding + * ECN-setup SYN/ACK packet, indicating to routers + * that the SYN/ACK packet is ECN-Capable. 
+ * This allows a congested router along the path + * to mark the packet instead of dropping the + * packet as an indication of congestion." + * + * "[...] There can be a great benefit in setting + * an ECN-capable codepoint in SYN/ACK packets [...] + * Congestion is most likely to occur in + * the server-to-client direction. As a result, + * setting an ECN-capable codepoint in SYN/ACK + * packets can reduce the occurrence of three-second + * retransmit timeouts resulting from the drop + * of SYN/ACK packets." + * + * Pages 4 and 6, January 2006. + */ +#ifdef INET6 + if (sc->sc_inc.inc_isipv6) + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + else +#endif + ip->ip_tos |= IPTOS_ECN_ECT0; + tcpstat.tcps_ecn_ect0++; + } + /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; ==== //depot/projects/tcpecn/netinet/tcp_usrreq.c#2 (text+ko) ==== @@ -1712,6 +1712,10 @@ db_printf("%sTF_TSO", comma ? ", " : ""); comma = 1; } + if (t_flags & TF_ECN_PERMIT) { + db_printf("%sTF_ECN_PERMIT", comma ? 
", " : ""); + comma = 1; + } } static void ==== //depot/projects/tcpecn/netinet/tcp_var.h#2 (text+ko) ==== @@ -123,6 +123,9 @@ #define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x800000 /* force out a byte */ #define TF_TSO 0x1000000 /* TSO enabled on this connection */ +#define TF_ECN_PERMIT 0x2000000 /* connection ECN-ready */ +#define TF_ECN_SND_CWR 0x4000000 /* ECN CWR in queue */ +#define TF_ECN_SND_ECE 0x8000000 /* ECN ECE in queue */ tcp_seq snd_una; /* send unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -429,6 +432,12 @@ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ u_long tcps_sack_sboverflow; /* times scoreboard overflowed */ + + /* ECN related stats */ + u_long tcps_ecn_ce; /* ECN Congestion Experienced */ + u_long tcps_ecn_ect0; /* ECN Capable Transport */ + u_long tcps_ecn_ect1; /* ECN Capable Transport */ + u_long tcps_ecn_shs; /* ECN successful handshakes */ }; /* @@ -505,6 +514,8 @@ extern int tcp_do_sack; /* SACK enabled/disabled */ extern int tcp_sc_rst_sock_fail; /* RST on sock alloc failure */ +extern int tcp_do_ecn; /* TCP ECN enabled/disabled */ +extern int tcp_ecn_maxretries; int tcp_addoptions(struct tcpopt *, u_char *); struct tcpcb *