From owner-p4-projects@FreeBSD.ORG Fri Aug 5 16:09:10 2011 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id 97CC91065672; Fri, 5 Aug 2011 16:09:09 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 5A053106566B for ; Fri, 5 Aug 2011 16:09:09 +0000 (UTC) (envelope-from cnicutar@freebsd.org) Received: from skunkworks.freebsd.org (skunkworks.freebsd.org [IPv6:2001:4f8:fff6::2d]) by mx1.freebsd.org (Postfix) with ESMTP id 479308FC13 for ; Fri, 5 Aug 2011 16:09:09 +0000 (UTC) Received: from skunkworks.freebsd.org (localhost [127.0.0.1]) by skunkworks.freebsd.org (8.14.4/8.14.4) with ESMTP id p75G99TF076737 for ; Fri, 5 Aug 2011 16:09:09 GMT (envelope-from cnicutar@freebsd.org) Received: (from perforce@localhost) by skunkworks.freebsd.org (8.14.4/8.14.4/Submit) id p75G99g0076734 for perforce@freebsd.org; Fri, 5 Aug 2011 16:09:09 GMT (envelope-from cnicutar@freebsd.org) Date: Fri, 5 Aug 2011 16:09:09 GMT Message-Id: <201108051609.p75G99g0076734@skunkworks.freebsd.org> X-Authentication-Warning: skunkworks.freebsd.org: perforce set sender to cnicutar@freebsd.org using -f From: Catalin Nicutar To: Perforce Change Reviews Precedence: bulk Cc: Subject: PERFORCE change 197225 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 05 Aug 2011 16:09:10 -0000 http://p4web.freebsd.org/@@197225?ac=10 Change 197225 by cnicutar@cnicutar_cronos on 2011/08/05 16:08:46 Forward-port UTO kernel changes from 8 to HEAD. Affected files ... .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp.h#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_input.c#2 edit .. 
//depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_output.c#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_subr.c#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.c#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.h#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_timer.c#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_usrreq.c#2 edit .. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_var.h#2 edit Differences ... ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp.h#2 (text+ko) ==== @@ -96,6 +96,8 @@ #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +#define TCPOPT_UTO 28 +#define TCPOLEN_UTO 4 /* Miscellaneous constants */ #define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */ @@ -103,6 +105,14 @@ /* + * The timeout ranges for TCP UTO have security implications; in particular, + * long timeouts might allow for denial-of-service attacks. + */ +#define TCP_UTOMIN 100 /* Minimum acceptable timeout. */ +#define TCP_UTOMAX 600 /* Maximum advertised timeout. */ + + +/* * The default maximum segment size (MSS) to be used for new TCP connections * when path MTU discovery is not enabled. 
* @@ -158,6 +168,8 @@ #define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ #define TCP_INFO 0x20 /* retrieve tcp_info structure */ #define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */ +#define TCP_SNDUTO_TIMEOUT 0x80 /* get/set sent UTO value */ +#define TCP_RCVUTO_TIMEOUT 0x100 /* accept UTO suggestion */ #define TCP_CA_NAME_MAX 16 /* max congestion control name length */ ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_input.c#2 (text+ko) ==== @@ -1324,6 +1324,21 @@ (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif tcp_dooptions(&to, optp, optlen, TO_SYN); + + if (to.to_flags & TOF_UTO) { + /* + * Storing the value even if the user might not + * accept it. Also, not clamping it just yet. + */ + tp->rcv_uto = (to.to_uto & UTO_MINS) ? + (to.to_uto & ~(UTO_MINS)) * 60 : to.to_uto; + /* + * XXX-CN Using option both for send and receive. + * Clear it for syncache. + */ + to.to_flags &= ~TOF_UTO; + } + syncache_add(&inc, &to, th, inp, &so, m); /* * Entry added to syncache and mbuf consumed. @@ -1511,6 +1526,18 @@ (thflags & TH_SYN) ? TO_SYN : 0); /* + * Processing received UTO even if the user doesn't accept it + * yet. The user might want to accept it later (perhaps after + * authentication) but the peer need not send it again. + * The value is converted to seconds and not clamped (the user + * needs to know the real value received). + */ + if (to.to_flags & TOF_UTO) { + tp->rcv_uto = (to.to_uto & UTO_MINS) ? + (to.to_uto & ~(UTO_MINS)) * 60 : to.to_uto; + } + + /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. 
Normalize * timestamp if syncookies were used when this connection @@ -3169,6 +3196,17 @@ to->to_sacks = cp + 2; TCPSTAT_INC(tcps_sack_rcv_blocks); break; + case TCPOPT_UTO: + if (optlen != TCPOLEN_UTO) + continue; + if (!V_uto_enable) + continue; + to->to_flags |= TOF_UTO; + bcopy((char *)cp + 2, + (char *)&to->to_uto, sizeof(to->to_uto)); + to->to_uto = htons(to->to_uto); + /* Avoid converting to seconds: it might overflow. */ + break; default: continue; } ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_output.c#2 (text+ko) ==== @@ -705,6 +705,18 @@ to.to_sacks = (u_char *)tp->sackblks; } } + /* UTO */ + if (tp->t_flags & TF_SND_UTO) { + to.to_uto = tp->snd_uto; + to.to_flags |= TOF_UTO; + /* + * The option is sent with the SYN and with the first + * non-SYN segment. + */ + if (!(flags & TH_SYN)) + tp->t_flags &= ~TF_SND_UTO; + + } #ifdef TCP_SIGNATURE /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) @@ -1491,6 +1503,39 @@ TCPSTAT_INC(tcps_sack_send_blocks); break; } + case TOF_UTO: + while (optlen % 4) { + optlen += TCPOLEN_NOP; + *optp++ = TCPOPT_NOP; + } + if (TCP_MAXOLEN - optlen < TCPOLEN_UTO) + continue; + optlen += TCPOLEN_UTO; + *optp++ = TCPOPT_UTO; + *optp++ = TCPOLEN_UTO; + + if (to->to_uto > UTO_MINS_TH) { + /* + * If the timeout is larger than UTO_MINS_TH + * we'll specify minutes. + * XXX-CN UTO_MINS_TH is arbitrary. + */ + to->to_uto /= 60; + to->to_uto |= UTO_MINS; + } + + /* + * XXX-CN to_uto is 32b because the user is allowed + * to specify more than 16b of seconds (dividing the + * value by 60 will make it fit). 
+ */ + { + uint16_t uto = to->to_uto; + uto = htons(uto); + bcopy((u_char *)&uto, optp, sizeof(uto)); + optp += sizeof(uto); + } + break; default: panic("%s: unknown TCP option type", __func__); break; ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_subr.c#2 (text+ko) ==== @@ -161,6 +161,24 @@ "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, uto, CTLFLAG_RW, 0, "TCP UTO"); + +VNET_DEFINE(int, uto_enable) = 1; +SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, enable, CTLFLAG_RW, + &VNET_NAME(uto_enable), 0, + "Enable TCP UTO for all connections"); + +VNET_DEFINE(int, uto_min_timeout) = TCP_UTOMIN; +SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, min_timeout, CTLFLAG_RW, + &VNET_NAME(uto_min_timeout), 0, + "Minimum accepted timeout for a connection"); + +VNET_DEFINE(int, uto_max_timeout) = 600; +SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, max_timeout, CTLFLAG_RW, + &VNET_NAME(uto_max_timeout), 0, + "Maximum accepted timeout for a connection"); + + /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.c#2 (text+ko) ==== @@ -827,6 +827,14 @@ #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; + if (sc->sc_flags & SCF_SND_UTO) { + tp->t_flags |= TF_SND_UTO; + tp->snd_uto = sc->sc_snd_uto; + } + if (sc->sc_flags & SCF_RCV_UTO) { + tp->t_flags |= TF_RCV_UTO; + tp->rcv_uto = sc->sc_rcv_uto; + } } if (sc->sc_flags & SCF_ECN) @@ -1039,6 +1047,14 @@ struct syncache scs; struct ucred *cred; + /* + * The client may have sent us an UTO suggestion; even if it hasn't, + * we need to inherit the current disposition (i.e. will the resulting + * socket accept suggestions?). 
+ */ + uint8_t rcv_uto_tf = 0; + uint32_t rcv_uto = 0; + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, @@ -1063,6 +1079,19 @@ sb_hiwat = so->so_rcv.sb_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); + /* Set User Timeout to send in SYN-ACK. */ + if (tp->t_flags & TF_SND_UTO) { + /* Also inherited after connection is established. */ + to->to_uto = tp->snd_uto; + to->to_flags |= TOF_UTO; + } + + if (tp->t_flags & TF_RCV_UTO) { + /* Remember received timeout to pass on. */ + rcv_uto_tf = 1; + rcv_uto = tp->rcv_uto; + } + /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; @@ -1271,7 +1300,17 @@ sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; + if (to->to_flags & TOF_UTO) { + sc->sc_snd_uto = to->to_uto; + sc->sc_flags |= SCF_SND_UTO; + } + /* Inherit received UTO. */ + if (rcv_uto_tf) { + sc->sc_rcv_uto = rcv_uto; + sc->sc_flags |= SCF_RCV_UTO; + } + if (V_tcp_syncookies) { syncookie_generate(sch, sc, &flowtmp); #ifdef INET6 @@ -1438,6 +1477,10 @@ } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; + if (sc->sc_flags & SCF_SND_UTO) { + to.to_uto = sc->sc_snd_uto; + to.to_flags |= TOF_UTO; + } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.h#2 (text+ko) ==== @@ -82,7 +82,8 @@ struct label *sc_label; /* MAC label reference */ struct ucred *sc_cred; /* cred cache for jail checks */ - u_int32_t sc_spare[2]; /* UTO */ + u_int32_t sc_snd_uto; /* user timeout to send */ + u_int32_t sc_rcv_uto; /* user timeout received */ }; /* @@ -96,6 +97,8 @@ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ #define SCF_SACK 0x80 /* send SACK option */ #define SCF_ECN 0x100 /* send ECN setup packet */ +#define SCF_SND_UTO 0x200 /* send UTO */ +#define SCF_RCV_UTO 0x400 /* 
receive UTO suggestions */ #define SYNCOOKIE_SECRET_SIZE 8 /* dwords */ #define SYNCOOKIE_LIFETIME 16 /* seconds */ ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_timer.c#2 (text+ko) ==== @@ -67,6 +67,9 @@ #include #endif +/* XXX-CN this will have to move */ +#define ticks_to_secs(t) ((t) / hz) + int tcp_keepinit; SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW, &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection"); @@ -309,6 +312,18 @@ return; } callout_deactivate(&tp->t_timers->tt_keep); + if ((tp->snd_uto) || ((tp->t_flags & TF_RCV_UTO) && tp->rcv_uto)) { + /* + * This connection is using UTO (either sending or has + * received a value). We need to stop sending keepalives + * (RFC 5482 4.2). + * Returning without resetting the timer. + */ + INP_WUNLOCK(inp); + INP_INFO_WUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); + return; + } /* * Keep-alive timer went off; send something * or drop connection if idle for too long. @@ -447,6 +462,7 @@ int rexmt; int headlocked; struct inpcb *inp; + int uto_left = 0; #ifdef TCPDEBUG int ostate; @@ -477,12 +493,45 @@ } callout_deactivate(&tp->t_timers->tt_rexmt); tcp_free_sackholes(tp); + + if (tp->t_rxtshift == 0) + /* UTO starting again since it's the first retransmit. */ + tp->t_suto = 0; + + if (tp->snd_uto || ((tp->t_flags & TF_RCV_UTO) && tp->rcv_uto)) { + /* + * Since we're using UTO for this connection we need to + * compute how much time we've got left. + */ + uto_left = 0; + if (tp->t_flags & TF_RCV_UTO) + /* Clamping the received value. */ + uto_left = min(V_uto_max_timeout, + max(V_uto_min_timeout, tp->rcv_uto)); + + /* Taking the longer timeout. */ + uto_left = max(tp->snd_uto, uto_left); + + /* Subtract time that has passed since the first retransmit. */ + if (tp->t_suto) + uto_left -= ticks_to_secs(ticks - tp->t_suto); + + /* + * The user may choose a value that's less than TCP_MAXRXTSHIFT + * retransmits. 
+ */ + if (uto_left <= 0) + /* Before or after the retransmits, UTO was exceeded. */ + goto timeoutdrop; + } + /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. */ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT && uto_left <= 0) { +timeoutdrop: tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); in_pcbref(inp); @@ -525,13 +574,22 @@ tp->t_flags &= ~TF_WASCRECOVERY; tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); tp->t_flags |= TF_PREVVALID; + tp->t_suto = ticks; /* Keep track of UTO start. */ } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if (tp->t_state == TCPS_SYN_SENT) rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift]; - else - rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + else { + if (tp->t_rxtshift <= TCP_MAXRXTSHIFT) + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + else + rexmt = TCPTV_REXMTMAX; + } + /* We might want to wait less than an entire backoff. */ + if (uto_left) + rexmt = min(rexmt, uto_left * hz); + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_usrreq.c#2 (text+ko) ==== @@ -1322,6 +1322,44 @@ INP_WUNLOCK(inp); break; #endif /* TCP_SIGNATURE */ + case TCP_SNDUTO_TIMEOUT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval == 0) { + /* Disable sending the option. */ + tp->t_flags &= ~TF_SND_UTO; + tp->snd_uto = 0; + } else if (optval >= V_uto_min_timeout && + optval <= V_uto_max_timeout) { + /* The timeout is acceptable. 
*/ + tp->snd_uto = optval; + tp->t_flags |= TF_SND_UTO; + } else + error = EINVAL; + + INP_WUNLOCK(inp); + break; + + case TCP_RCVUTO_TIMEOUT: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + INP_WLOCK_RECHECK(inp); + if (optval <= 0) + /* This connection will ignore suggestions. */ + tp->t_flags &= ~TF_RCV_UTO; + else + tp->t_flags |= TF_RCV_UTO; + INP_WUNLOCK(inp); + break; case TCP_NODELAY: case TCP_NOOPT: INP_WUNLOCK(inp); @@ -1454,7 +1492,16 @@ error = sooptcopyout(sopt, &optval, sizeof optval); break; #endif - + case TCP_SNDUTO_TIMEOUT: + optval = tp->snd_uto; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + case TCP_RCVUTO_TIMEOUT: + optval = tp->rcv_uto; + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &optval, sizeof optval); + break; case TCP_NODELAY: optval = tp->t_flags & TF_NODELAY; INP_WUNLOCK(inp); ==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_var.h#2 (text+ko) ==== @@ -203,9 +203,13 @@ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ - uint32_t t_ispare[12]; /* 4 keep timers, 5 UTO, 3 TBD */ + uint32_t t_ispare[9]; /* 4 keep timers, 2 UTO, 3 TBD */ void *t_pspare2[4]; /* 4 TBD */ uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) 
*/ + + uint32_t snd_uto; /* sent timeout */ + uint32_t rcv_uto; /* received suggestion from peer */ + int t_suto; /* uto starting time */ }; /* @@ -225,6 +229,8 @@ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */ +#define TF_SND_UTO 0x004000 /* send UTO option */ +#define TF_RCV_UTO 0x008000 /* accept UTO suggestions */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ @@ -292,7 +298,8 @@ #define TOF_TS 0x0010 /* timestamp */ #define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ #define TOF_SACK 0x0080 /* Peer sent SACK option */ -#define TOF_MAXOPT 0x0100 +#define TOF_UTO 0x0100 /* user timeout option */ +#define TOF_MAXOPT 0x0200 u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ @@ -300,7 +307,7 @@ u_int16_t to_mss; /* maximum segment size */ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ - u_int32_t to_spare; /* UTO */ + u_int32_t to_uto; /* UTO */ }; /* @@ -308,6 +315,12 @@ */ #define TO_SYN 0x01 /* parse SYN-only options */ +/* + * Values for TCP UTO. + */ +#define UTO_MINS 0x8000 /* Highest bit set means "minutes". */ +#define UTO_MINS_TH 3600 /* Send minutes if >= one hour. 
*/ + struct hc_metrics_lite { /* must stay in sync with hc_metrics */ u_long rmx_mtu; /* MTU for this path */ u_long rmx_ssthresh; /* outbound gateway buffer limit */ @@ -611,6 +624,10 @@ VNET_DECLARE(int, ss_fltsz_local); VNET_DECLARE(int, tcp_do_rfc3465); VNET_DECLARE(int, tcp_abc_l_var); +VNET_DECLARE(int, uto_enable); +VNET_DECLARE(int, uto_min_timeout); +VNET_DECLARE(int, uto_max_timeout); + #define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcpstat VNET(tcpstat) @@ -623,6 +640,9 @@ #define V_ss_fltsz_local VNET(ss_fltsz_local) #define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) +#define V_uto_enable VNET(uto_enable) +#define V_uto_min_timeout VNET(uto_min_timeout) +#define V_uto_max_timeout VNET(uto_max_timeout) VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */ VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */