Date: Mon, 8 Mar 1999 21:57:44 -0500 (EST) From: Garrett Wollman <wollman@khavrinen.lcs.mit.edu> To: dg@root.com, net@freebsd.org Subject: TCP Timer scaleability (with patch) Message-ID: <199903090257.VAA21463@khavrinen.lcs.mit.edu>
next in thread | raw e-mail | index | archive | help
One of the problems with the BSD-style TCP is that, for a machine with large numbers of outstanding TCP connections, TCP timer processing can take a significant amount of overhead, and -- what's worse -- often results in an ``avalanche'' of outgoing traffic as many connections trigger some sort of response at the same time. The following patch eliminates much of this behavior. Note that this is still very, very preliminary, but it does look promising -- at least, the TCP connection I'm using right now to type this message is operational. There is still one big chunk missing in this patch, which is the conversion of the RTT estimator and related clunkery to measure in timer ticks rather than slow-timer ticks. Once this is done, it should eliminate all the locations where slow run-time multiplies and divides are done in the patch below. I expect this to have a significant beneficial effect on large servers like wcarchive; for smaller machines, it's probably a bit of a wash: slightly slower TCP over fast links versus much faster retransmission over lossy links. The other thing I'm not quite sure about is whether I've found all of the right locations to set t_starttime (the inverse of t_duration in -current TCP). I think it should be set whenever the connection hits or crosses ESTABLISHED, and left zero otherwise, but there may be other places where it is needed. (The old code would increment t_duration in any state other than LISTEN, but I don't think that's right -- some of the T/TCP code, for example, cared about the actual duration, and not how long this particular socket was sitting idle before the connection began.) -GAWollman ------------------------------------ ? tcp.patch ? sys/compile/GENERIC ? 
sys/compile/FORLENNH Index: sys/net/if_vlan.c =================================================================== RCS file: /home/cvs/src/sys/net/if_vlan.c,v retrieving revision 1.4 diff -u -r1.4 if_vlan.c --- if_vlan.c 1998/12/04 22:54:52 1.4 +++ if_vlan.c 1999/03/09 02:09:58 @@ -48,7 +48,9 @@ #include <sys/param.h> #include <sys/kernel.h> +#include <sys/linker_set.h> #include <sys/mbuf.h> +#include <sys/queue.h> #include <sys/socket.h> #include <sys/sockio.h> #include <sys/sysctl.h> @@ -69,6 +71,7 @@ #include <netinet/if_ether.h> #endif +SYSCTL_DECL(_net_link); /* XXX */ SYSCTL_NODE(_net_link, IFT_8021_VLAN, vlan, CTLFLAG_RW, 0, "IEEE 802.1Q VLAN"); SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW, 0, "for consistency"); Index: sys/netinet/tcp_input.c =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.84 diff -u -r1.84 tcp_input.c --- tcp_input.c 1999/02/06 00:47:45 1.84 +++ tcp_input.c 1999/03/09 02:10:08 @@ -491,9 +491,9 @@ * Segment received on connection. * Reset idle time and keep-alive timer. */ - tp->t_idle = 0; + tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* * Process options if not in LISTEN state, @@ -559,9 +559,11 @@ if ((to.to_flag & TOF_TS) != 0) tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); - else if (tp->t_rtt && + else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) - tcp_xmit_timer(tp, tp->t_rtt); + tcp_xmit_timer(tp, 1 + + TCPT_SLOWHZ(ticks + - tp->t_rtttime)); acked = ti->ti_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; @@ -579,9 +581,11 @@ * decide between more output or persist. 
*/ if (tp->snd_una == tp->snd_max) - tp->t_timer[TCPT_REXMT] = 0; - else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + callout_stop(tp->tt_rexmt); + else if (!callout_pending(tp->tt_persist)) + callout_reset(tp->tt_rexmt, + tp->t_rxtcur, + tcp_timer_rexmt, tp); sowwakeup(so); if (so->so_snd.sb_cc) @@ -728,6 +732,7 @@ taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; tp->t_state = TCPS_ESTABLISHED; /* @@ -751,7 +756,8 @@ tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); tcpstat.tcps_connects++; soisconnected(so); - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + callout_reset(tp->tt_keep, tcp_keepinit, + tcp_timer_keep, tp); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; goto trimthenstep6; @@ -770,7 +776,7 @@ */ tp->t_flags |= TF_ACKNOW; tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; goto trimthenstep6; @@ -878,13 +884,15 @@ * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ + tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; tiflags &= ~TH_SYN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); } } else { /* @@ -897,7 +905,7 @@ * If there was no CC option, clear cached CC value. 
*/ tp->t_flags |= TF_ACKNOW; - tp->t_timer[TCPT_REXMT] = 0; + callout_stop(tp->tt_rexmt); if (to.to_flag & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { @@ -907,12 +915,16 @@ * SYN-SENT* -> FIN-WAIT-1* */ taop->tao_cc = to.to_cc; + tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + callout_reset(tp->tt_keep, + tcp_keepidle, + tcp_timer_keep, + tp); } tp->t_flags |= TF_NEEDSYN; } else @@ -971,7 +983,7 @@ if ((tiflags & TH_SYN) && (to.to_flag & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && - tp->t_duration > TCPTV_MSL) + (ticks - tp->t_starttime) > TCPTV_MSL) goto dropwithreset; if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); @@ -1287,12 +1299,14 @@ * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ + tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; tp->t_flags &= ~TF_NEEDFIN; } else { tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + callout_reset(tp->tt_keep, tcp_keepidle, + tcp_timer_keep, tp); } /* * If segment contains data or ACK, will call tcp_reass() @@ -1347,7 +1361,7 @@ * to keep a constant cwnd packets in the * network. 
*/ - if (tp->t_timer[TCPT_REXMT] == 0 || + if (!callout_pending(tp->tt_rexmt) || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { @@ -1359,8 +1373,8 @@ if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_timer[TCPT_REXMT] = 0; - tp->t_rtt = 0; + callout_stop(tp->tt_rexmt); + tp->t_rtttime = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1428,8 +1442,9 @@ */ if (to.to_flag & TOF_TS) tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); - else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) - tcp_xmit_timer(tp,tp->t_rtt); + else if (tp->t_rtttime && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, + 1 + TCPT_SLOWHZ(ticks - tp->t_rtttime)); /* * If all outstanding data is acked, stop retransmit @@ -1438,10 +1453,11 @@ * timer, using current (possibly backed-off) value. */ if (ti->ti_ack == tp->snd_max) { - tp->t_timer[TCPT_REXMT] = 0; + callout_stop(tp->tt_rexmt); needoutput = 1; - } else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + } else if (!callout_pending(tp->tt_persist)) + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); /* * If no data (only SYN) was ACK'd, @@ -1497,7 +1513,8 @@ */ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); } tp->t_state = TCPS_FIN_WAIT_2; } @@ -1515,11 +1532,14 @@ tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_duration < TCPTV_MSL) - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + (ticks - tp->t_starttime) < TCPTV_MSL) + callout_reset(tp->tt_2msl, + tp->t_rxtcur * + TCPTV_TWTRUNC, + tcp_timer_2msl, tp); else - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + callout_reset(tp->tt_2msl, 2*TCPTV_MSL, + tcp_timer_2msl, tp); soisdisconnected(so); } break; @@ -1543,7 +1563,8 @@ * it and restart the finack timer. 
*/ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + callout_reset(tp->tt_2msl, 2 * TCPTV_MSL, + tcp_timer_2msl, tp); goto dropafterack; } } @@ -1679,6 +1700,7 @@ * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: + tp->t_starttime = ticks; case TCPS_ESTABLISHED: tp->t_state = TCPS_CLOSE_WAIT; break; @@ -1701,14 +1723,16 @@ tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_duration < TCPTV_MSL) { - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + (ticks - tp->t_starttime) < TCPTV_MSL) { + callout_reset(tp->tt_2msl, + tp->t_rxtcur * TCPTV_TWTRUNC, + tcp_timer_2msl, tp); /* For transaction client, force ACK now. */ tp->t_flags |= TF_ACKNOW; } else - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + callout_reset(tp->tt_2msl, 2 * TCPTV_MSL, + tcp_timer_2msl, tp); soisdisconnected(so); break; @@ -1716,7 +1740,8 @@ * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + callout_reset(tp->tt_2msl, 2 * TCPTV_MSL, + tcp_timer_2msl, tp); break; } } @@ -1997,7 +2022,7 @@ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); } - tp->t_rtt = 0; + tp->t_rtttime = 0; tp->t_rxtshift = 0; /* @@ -2011,8 +2036,12 @@ * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). 
*/ - TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + { + int newrxt; + TCPT_RANGESET(newrxt, TCP_REXMTVAL(tp), + max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + tp->t_rxtcur = TCPT_TICKS(newrxt); + } /* * We received an ack for a packet that wasn't retransmitted; @@ -2119,10 +2148,14 @@ /* default variation is +- 1 rtt */ tp->t_rttvar = tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + { + int newrxt; + TCPT_RANGESET(newrxt, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rxtcur = TCPT_TICKS(newrxt); } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); } /* * if there's an mtu associated with the route, use it Index: sys/netinet/tcp_output.c =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.32 diff -u -r1.32 tcp_output.c --- tcp_output.c 1999/01/20 17:31:59 1.32 +++ tcp_output.c 1999/03/09 02:10:11 @@ -40,6 +40,7 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> @@ -93,7 +94,7 @@ * to send, then transmit; otherwise, investigate further. 
*/ idle = (tp->snd_max == tp->snd_una); - if (idle && tp->t_idle >= tp->t_rxtcur) + if (idle && (ticks - tp->t_rcvtime) >= tp->t_rxtcur) /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- @@ -143,7 +144,7 @@ flags &= ~TH_FIN; win = 1; } else { - tp->t_timer[TCPT_PERSIST] = 0; + callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } @@ -194,10 +195,10 @@ */ len = 0; if (win == 0) { - tp->t_timer[TCPT_REXMT] = 0; + callout_stop(tp->tt_rexmt); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; - if (tp->t_timer[TCPT_PERSIST] == 0) + if (!callout_pending(tp->tt_persist)) tcp_setpersist(tp); } } @@ -285,11 +286,11 @@ * persisting to move a small or zero window * (re)transmitting and thereby not persisting * - * tp->t_timer[TCPT_PERSIST] - * is set when we are in persist state. + * callout_pending(tp->tt_persist) + * is true when we are in persist state. * tp->t_force * is set when we are called to send a persist packet. - * tp->t_timer[TCPT_REXMT] + * callout_pending(tp->tt_rexmt) * is set when we are retransmitting * The output side is idle when both timers are zero. * @@ -299,8 +300,8 @@ * if window is nonzero, transmit what we can, * otherwise force out a byte. */ - if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && - tp->t_timer[TCPT_PERSIST] == 0) { + if (so->so_snd.sb_cc && !callout_pending(tp->tt_rexmt) && + !callout_pending(tp->tt_persist)) { tp->t_rxtshift = 0; tcp_setpersist(tp); } @@ -563,7 +564,8 @@ * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ - if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) + if (len || (flags & (TH_SYN|TH_FIN)) + || callout_pending(tp->tt_persist)) ti->ti_seq = htonl(tp->snd_nxt); else ti->ti_seq = htonl(tp->snd_max); @@ -609,7 +611,7 @@ * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. 
*/ - if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { + if (tp->t_force == 0 || !callout_pending(tp->tt_persist)) { tcp_seq startseq = tp->snd_nxt; /* @@ -630,8 +632,8 @@ * Time this transmission if not a retransmission and * not currently timing anything. */ - if (tp->t_rtt == 0) { - tp->t_rtt = 1; + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } @@ -645,11 +647,12 @@ * Initialize shift counter which is used for backoff * of retransmit time. */ - if (tp->t_timer[TCPT_REXMT] == 0 && + if (!callout_pending(tp->tt_rexmt) && tp->snd_nxt != tp->snd_una) { - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; - if (tp->t_timer[TCPT_PERSIST]) { - tp->t_timer[TCPT_PERSIST] = 0; + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + if (callout_pending(tp->tt_persist)) { + callout_stop(tp->tt_persist); tp->t_rxtshift = 0; } } @@ -742,16 +745,17 @@ tcp_setpersist(tp) register struct tcpcb *tp; { - register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int tt; - if (tp->t_timer[TCPT_REXMT]) - panic("tcp_output REXMT"); + if (callout_pending(tp->tt_rexmt)) + panic("tcp_setpersist: retransmit pending"); /* * Start/restart persistance timer. 
*/ - TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], - t * tcp_backoff[tp->t_rxtshift], - TCPTV_PERSMIN, TCPTV_PERSMAX); + TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + TCPTV_PERSMIN, TCPTV_PERSMAX); + callout_reset(tp->tt_persist, TCPT_TICKS(tt), tcp_timer_persist, tp); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } Index: sys/netinet/tcp_subr.c =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_subr.c,v retrieving revision 1.52 diff -u -r1.52 tcp_subr.c --- tcp_subr.c 1999/02/04 03:27:43 1.52 +++ tcp_subr.c 1999/03/09 02:10:14 @@ -39,13 +39,14 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/callout.h> #include <sys/kernel.h> -#include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/mbuf.h> +#include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/protosw.h> +#include <sys/sysctl.h> #include <vm/vm_zone.h> @@ -117,6 +118,7 @@ char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; } inp_tp_u; struct tcpcb tcb; + struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; }; #undef ALIGNMENT #undef ALIGNM1 @@ -132,6 +134,12 @@ tcp_iss = random(); /* wrong, but better than a constant */ tcp_ccgen = 1; tcp_cleartaocache(); + + tcp_keepinit = TCPTV_KEEP_INIT; + tcp_keepidle = TCPTV_KEEP_IDLE; + tcp_keepintvl = TCPTV_KEEPINTVL; + tcp_maxpersistidle = TCPTV_KEEP_IDLE; + LIST_INIT(&tcb); tcbinfo.listhead = &tcb; if (!(getenv_int("net.inet.tcp.tcbhashsize", &hashsize))) @@ -145,6 +153,7 @@ &tcbinfo.porthashmask); tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets, ZONE_INTERRUPT, 0); + if (max_protohdr < sizeof(struct tcpiphdr)) max_protohdr = sizeof(struct tcpiphdr); if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) @@ -299,6 +308,12 @@ tp->t_segq = NULL; tp->t_maxseg = tp->t_maxopd = tcp_mssdflt; + /* Set up our timeouts. 
*/ + callout_init(tp->tt_rexmt = &it->inp_tp_rexmt); + callout_init(tp->tt_persist = &it->inp_tp_persist); + callout_init(tp->tt_keep = &it->inp_tp_keep); + callout_init(tp->tt_2msl = &it->inp_tp_2msl); + if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) @@ -312,7 +327,7 @@ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = TCPTV_MIN; - tp->t_rxtcur = TCPTV_RTOBASE; + tp->t_rxtcur = TCPT_TICKS(TCPTV_RTOBASE); tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; inp->inp_ip_ttl = ip_defttl; @@ -362,6 +377,15 @@ int dosavessthresh; /* + * Make sure that all of our timers are stopped before we + * delete the PCB. + */ + callout_stop(tp->tt_rexmt); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_2msl); + + /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as the 16 samples. 
@@ -693,7 +717,7 @@ tp->t_maxseg = mss; tcpstat.tcps_mturesent++; - tp->t_rtt = 0; + tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_output(tp); } Index: sys/netinet/tcp_timer.c =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_timer.c,v retrieving revision 1.28 diff -u -r1.28 tcp_timer.c --- tcp_timer.c 1998/04/24 09:25:35 1.28 +++ tcp_timer.c 1999/03/09 02:10:15 @@ -63,15 +63,15 @@ #include <netinet/tcp_debug.h> #endif -int tcp_keepinit = TCPTV_KEEP_INIT; +int tcp_keepinit; SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLFLAG_RW, &tcp_keepinit , 0, ""); -int tcp_keepidle = TCPTV_KEEP_IDLE; +int tcp_keepidle; SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLFLAG_RW, &tcp_keepidle , 0, ""); -static int tcp_keepintvl = TCPTV_KEEPINTVL; +int tcp_keepintvl; SYSCTL_INT(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLFLAG_RW, &tcp_keepintvl , 0, ""); @@ -81,7 +81,7 @@ static int tcp_keepcnt = TCPTV_KEEPCNT; /* max idle probes */ -static int tcp_maxpersistidle = TCPTV_KEEP_IDLE; +int tcp_maxpersistidle; /* max idle time in persist */ int tcp_maxidle; @@ -120,7 +120,6 @@ { register struct inpcb *ip, *ipnxt; register struct tcpcb *tp; - register int i; int s; #ifdef TCPDEBUG int ostate; @@ -143,29 +142,6 @@ tp = intotcpcb(ip); if (tp == 0 || tp->t_state == TCPS_LISTEN) continue; - for (i = 0; i < TCPT_NTIMERS; i++) { - if (tp->t_timer[i] && --tp->t_timer[i] == 0) { -#ifdef TCPDEBUG - ostate = tp->t_state; -#endif - tp = tcp_timers(tp, i); - if (tp == NULL) - goto tpgone; -#ifdef TCPDEBUG - if (tp->t_inpcb->inp_socket->so_options - & SO_DEBUG) - tcp_trace(TA_USER, ostate, tp, - (struct tcpiphdr *)0, - PRU_SLOWTIMO); -#endif - } - } - tp->t_idle++; - tp->t_duration++; - if (tp->t_rtt) - tp->t_rtt++; -tpgone: - ; } tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */ #ifdef TCP_COMPAT_42 @@ -183,10 +159,10 @@ tcp_canceltimers(tp) struct tcpcb *tp; { - register int i; - - for (i = 0; i < TCPT_NTIMERS; 
i++) - tp->t_timer[i] = 0; + callout_stop(tp->tt_2msl); + callout_stop(tp->tt_persist); + callout_stop(tp->tt_keep); + callout_stop(tp->tt_rexmt); } int tcp_backoff[TCP_MAXRXTSHIFT + 1] = @@ -197,175 +173,242 @@ /* * TCP timer processing. */ -struct tcpcb * -tcp_timers(tp, timer) - register struct tcpcb *tp; - int timer; -{ - register int rexmt; - switch (timer) { +void +tcp_timer_2msl(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; +#ifdef TCPDEBUG + int ostate; + ostate = tp->t_state; +#endif /* * 2 MSL timeout in shutdown went off. If we're closed but * still waiting for peer to close and connection has been idle * too long, or if 2MSL time is up from TIME_WAIT, delete connection * control block. Otherwise, check again in a bit. */ - case TCPT_2MSL: - if (tp->t_state != TCPS_TIME_WAIT && - tp->t_idle <= tcp_maxidle) - tp->t_timer[TCPT_2MSL] = tcp_keepintvl; - else - tp = tcp_close(tp); - break; + if (tp->t_state != TCPS_TIME_WAIT && + (ticks - tp->t_rcvtime) <= tcp_maxidle) + callout_reset(tp->tt_2msl, tcp_keepintvl, + tcp_timer_2msl, tp); + else + tp = tcp_close(tp); +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif +} + +void +tcp_timer_keep(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif /* - * Retransmission timer went off. Message has not - * been acked within retransmit interval. Back off - * to a longer retransmit interval and retransmit one segment. + * Keep-alive timer went off; send something + * or drop connection if idle for too long. */ - case TCPT_REXMT: - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { - tp->t_rxtshift = TCP_MAXRXTSHIFT; - tcpstat.tcps_timeoutdrop++; - tp = tcp_drop(tp, tp->t_softerror ? 
- tp->t_softerror : ETIMEDOUT); - break; - } - tcpstat.tcps_rexmttimeo++; - rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; - TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; - /* - * If losing, let the lower level know and try for - * a better route. Also, if we backed off this far, - * our srtt estimate is probably bogus. Clobber it - * so we'll take the next rtt measurement as our srtt; - * move the current srtt into rttvar to keep the current - * retransmit times until then. - */ - if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { - in_losing(tp->t_inpcb); - tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); - tp->t_srtt = 0; - } - tp->snd_nxt = tp->snd_una; - /* - * Force a segment to be sent. - */ - tp->t_flags |= TF_ACKNOW; + tcpstat.tcps_keeptimeo++; + if (tp->t_state < TCPS_ESTABLISHED) + goto dropit; + if ((always_keepalive || + tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && + tp->t_state <= TCPS_CLOSING) { + if ((ticks - tp->t_rcvtime) >= tcp_keepidle + tcp_maxidle) + goto dropit; /* - * If timing a segment in this window, stop the timer. + * Send a packet designed to force a response + * if the peer is up and reachable: + * either an ACK if the connection is still alive, + * or an RST if the peer has closed the connection + * due to timeout or reboot. + * Using sequence number tp->snd_una-1 + * causes the transmitted zero-length segment + * to lie outside the receive window; + * by the protocol spec, this requires the + * correspondent TCP to respond. */ - tp->t_rtt = 0; + tcpstat.tcps_keepprobe++; +#ifdef TCP_COMPAT_42 /* - * Close the congestion window down to one segment - * (we'll open it by one segment for each ack we get). - * Since we probably have a window's worth of unacked - * data accumulated, this "slow start" keeps us from - * dumping all that data as back-to-back packets (which - * might overwhelm an intermediate gateway). 
- * - * There are two phases to the opening: Initially we - * open by one mss on each ack. This makes the window - * size increase exponentially with time. If the - * window is larger than the path can handle, this - * exponential growth results in dropped packet(s) - * almost immediately. To get more time between - * drops but still "push" the network to take advantage - * of improving conditions, we switch from exponential - * to linear window opening at some threshhold size. - * For a threshhold, we use half the current window - * size, truncated to a multiple of the mss. - * - * (the minimum cwnd that will give us exponential - * growth is 2 mss. We don't allow the threshhold - * to go below this.) + * The keepalive packet must have nonzero length + * to get a 4.2 host to respond. */ - { - u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; - if (win < 2) - win = 2; - tp->snd_cwnd = tp->t_maxseg; - tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_dupacks = 0; - } - (void) tcp_output(tp); - break; + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt - 1, tp->snd_una - 1, 0); +#else + tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); +#endif + callout_reset(tp->tt_keep, tcp_keepintvl, tcp_timer_keep, tp); + } else + callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); + +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + return; + +dropit: + tcpstat.tcps_keepdrops++; + tp = tcp_drop(tp, ETIMEDOUT); +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif +} + +void +tcp_timer_persist(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + /* * Persistance timer into zero window. * Force a byte to be output, if possible. 
*/ - case TCPT_PERSIST: - tcpstat.tcps_persisttimeo++; - /* - * Hack: if the peer is dead/unreachable, we do not - * time out if the window is closed. After a full - * backoff, drop the connection if the idle time - * (no responses to probes) reaches the maximum - * backoff that we would use if retransmitting. - */ - if (tp->t_rxtshift == TCP_MAXRXTSHIFT && - (tp->t_idle >= tcp_maxpersistidle || - tp->t_idle >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { - tcpstat.tcps_persistdrop++; - tp = tcp_drop(tp, ETIMEDOUT); - break; - } - tcp_setpersist(tp); - tp->t_force = 1; - (void) tcp_output(tp); - tp->t_force = 0; - break; - + tcpstat.tcps_persisttimeo++; /* - * Keep-alive timer went off; send something - * or drop connection if idle for too long. + * Hack: if the peer is dead/unreachable, we do not + * time out if the window is closed. After a full + * backoff, drop the connection if the idle time + * (no responses to probes) reaches the maximum + * backoff that we would use if retransmitting. */ - case TCPT_KEEP: - tcpstat.tcps_keeptimeo++; - if (tp->t_state < TCPS_ESTABLISHED) - goto dropit; - if ((always_keepalive || - tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && - tp->t_state <= TCPS_CLOSING) { - if (tp->t_idle >= tcp_keepidle + tcp_maxidle) - goto dropit; - /* - * Send a packet designed to force a response - * if the peer is up and reachable: - * either an ACK if the connection is still alive, - * or an RST if the peer has closed the connection - * due to timeout or reboot. - * Using sequence number tp->snd_una-1 - * causes the transmitted zero-length segment - * to lie outside the receive window; - * by the protocol spec, this requires the - * correspondent TCP to respond. - */ - tcpstat.tcps_keepprobe++; -#ifdef TCP_COMPAT_42 - /* - * The keepalive packet must have nonzero length - * to get a 4.2 host to respond. 
- */ - tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, - tp->rcv_nxt - 1, tp->snd_una - 1, 0); -#else - tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0); -#endif - tp->t_timer[TCPT_KEEP] = tcp_keepintvl; - } else - tp->t_timer[TCPT_KEEP] = tcp_keepidle; - break; - dropit: - tcpstat.tcps_keepdrops++; + if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + ((ticks - tp->t_rcvtime) >= tcp_maxpersistidle || + (ticks - tp->t_rcvtime) + >= TCPT_TICKS(TCP_REXMTVAL(tp) * tcp_totbackoff))) { + tcpstat.tcps_persistdrop++; tp = tcp_drop(tp, ETIMEDOUT); - break; + goto out; + } + tcp_setpersist(tp); + tp->t_force = 1; + (void) tcp_output(tp); + tp->t_force = 0; + +out: +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif +} + +void +tcp_timer_rexmt(xtp) + void *xtp; +{ + struct tcpcb *tp = xtp; + int rexmt; +#ifdef TCPDEBUG + int ostate; + + ostate = tp->t_state; +#endif + + /* + * Retransmission timer went off. Message has not + * been acked within retransmit interval. Back off + * to a longer retransmit interval and retransmit one segment. + */ + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tp->t_rxtshift = TCP_MAXRXTSHIFT; + tcpstat.tcps_timeoutdrop++; + tp = tcp_drop(tp, tp->t_softerror ? + tp->t_softerror : ETIMEDOUT); + goto out; + } + tcpstat.tcps_rexmttimeo++; + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + { + int newrxt; + TCPT_RANGESET(newrxt, rexmt, + tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rxtcur = TCPT_TICKS(newrxt); + callout_reset(tp->tt_rexmt, tp->t_rxtcur, tcp_timer_rexmt, tp); + } + /* + * If losing, let the lower level know and try for + * a better route. Also, if we backed off this far, + * our srtt estimate is probably bogus. Clobber it + * so we'll take the next rtt measurement as our srtt; + * move the current srtt into rttvar to keep the current + * retransmit times until then. 
+ */ + if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) { + in_losing(tp->t_inpcb); + tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_srtt = 0; + } + tp->snd_nxt = tp->snd_una; + /* + * Force a segment to be sent. + */ + tp->t_flags |= TF_ACKNOW; + /* + * If timing a segment in this window, stop the timer. + */ + tp->t_rtttime = 0; + /* + * Close the congestion window down to one segment + * (we'll open it by one segment for each ack we get). + * Since we probably have a window's worth of unacked + * data accumulated, this "slow start" keeps us from + * dumping all that data as back-to-back packets (which + * might overwhelm an intermediate gateway). + * + * There are two phases to the opening: Initially we + * open by one mss on each ack. This makes the window + * size increase exponentially with time. If the + * window is larger than the path can handle, this + * exponential growth results in dropped packet(s) + * almost immediately. To get more time between + * drops but still "push" the network to take advantage + * of improving conditions, we switch from exponential + * to linear window opening at some threshhold size. + * For a threshhold, we use half the current window + * size, truncated to a multiple of the mss. + * + * (the minimum cwnd that will give us exponential + * growth is 2 mss. We don't allow the threshhold + * to go below this.) 
+ */ + { + u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; + if (win < 2) + win = 2; + tp->snd_cwnd = tp->t_maxseg; + tp->snd_ssthresh = win * tp->t_maxseg; + tp->t_dupacks = 0; } - return (tp); + (void) tcp_output(tp); + +out: +#ifdef TCPDEBUG + if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) + tcp_trace(TA_USER, ostate, tp, (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif } Index: sys/netinet/tcp_timer.h =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_timer.h,v retrieving revision 1.13 diff -u -r1.13 tcp_timer.h --- tcp_timer.h 1997/09/07 05:26:48 1.13 +++ tcp_timer.h 1999/03/09 02:10:16 @@ -38,17 +38,6 @@ #define _NETINET_TCP_TIMER_H_ /* - * Definitions of the TCP timers. These timers are counted - * down PR_SLOWHZ times a second. - */ -#define TCPT_NTIMERS 4 - -#define TCPT_REXMT 0 /* retransmit */ -#define TCPT_PERSIST 1 /* retransmit persistence */ -#define TCPT_KEEP 2 /* keep alive */ -#define TCPT_2MSL 3 /* 2*msl quiet time timer */ - -/* * The TCPT_REXMT timer is used to force retransmissions. * The TCP has the TCPT_REXMT timer set whenever segments * have been sent for which ACKs are expected but not yet @@ -87,7 +76,7 @@ /* * Time constants. */ -#define TCPTV_MSL ( 30*PR_SLOWHZ) /* max seg lifetime (hah!) */ +#define TCPTV_MSL ( 30*hz) /* max seg lifetime (hah!) 
*/ #define TCPTV_SRTTBASE 0 /* base roundtrip time; if 0, no idea yet */ #define TCPTV_RTOBASE ( 3*PR_SLOWHZ) /* assumed RTO if no info */ @@ -96,9 +85,9 @@ #define TCPTV_PERSMIN ( 5*PR_SLOWHZ) /* retransmit persistence */ #define TCPTV_PERSMAX ( 60*PR_SLOWHZ) /* maximum persist interval */ -#define TCPTV_KEEP_INIT ( 75*PR_SLOWHZ) /* initial connect keep alive */ -#define TCPTV_KEEP_IDLE (120*60*PR_SLOWHZ) /* dflt time before probing */ -#define TCPTV_KEEPINTVL ( 75*PR_SLOWHZ) /* default probe interval */ +#define TCPTV_KEEP_INIT ( 75*hz) /* initial connect keepalive */ +#define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ +#define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ #define TCPTV_MIN ( 1*PR_SLOWHZ) /* minimum allowable value */ @@ -118,20 +107,36 @@ /* * Force a time value to be in a certain range. */ -#define TCPT_RANGESET(tv, value, tvmin, tvmax) { \ +#define TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ (tv) = (value); \ if ((u_long)(tv) < (u_long)(tvmin)) \ (tv) = (tvmin); \ else if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ -} +} while(0) +/* + * Convert slow-timeout ticks to timer ticks. We don't really want to do + * this as it is rather expensive, so this is only a transitional stage + * until we are able to update all the code which counts timer ticks. 
+ */ +#define TCPT_TICKS(stt) ((stt) * hz / PR_SLOWHZ) +#define TCPT_SLOWHZ(tt) (((tt) * PR_SLOWHZ) / hz) + #ifdef KERNEL extern int tcp_keepinit; /* time to establish connection */ extern int tcp_keepidle; /* time before keepalive probes begin */ +extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_maxidle; /* time to drop after starting probes */ +extern int tcp_maxpersistidle; extern int tcp_ttl; /* time to live for TCP segs */ extern int tcp_backoff[]; -#endif -#endif +void tcp_timer_2msl __P((void *xtp)); +void tcp_timer_keep __P((void *xtp)); +void tcp_timer_persist __P((void *xtp)); +void tcp_timer_rexmt __P((void *xtp)); + +#endif /* KERNEL */ + +#endif /* !_NETINET_TCP_TIMER_H_ */ Index: sys/netinet/tcp_usrreq.c =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.40 diff -u -r1.40 tcp_usrreq.c --- tcp_usrreq.c 1999/01/20 17:31:59 1.40 +++ tcp_usrreq.c 1999/03/09 02:10:18 @@ -514,7 +514,7 @@ if (oinp) { if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_duration < TCPTV_MSL && + (ticks - otp->t_starttime) < TCPTV_MSL && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else @@ -540,7 +540,7 @@ soisconnecting(so); tcpstat.tcps_connattempt++; tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp); tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; tcp_sendseqinit(tp); @@ -787,7 +787,8 @@ soisdisconnected(tp->t_inpcb->inp_socket); /* To prevent the connection hanging in FIN_WAIT_2 forever. 
*/ if (tp->t_state == TCPS_FIN_WAIT_2) - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + callout_reset(tp->tt_2msl, tcp_maxidle, + tcp_timer_2msl, tp); } return (tp); } Index: sys/netinet/tcp_var.h =================================================================== RCS file: /home/cvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.50 diff -u -r1.50 tcp_var.h --- tcp_var.h 1999/02/16 10:49:52 1.50 +++ tcp_var.h 1999/03/09 02:10:20 @@ -49,7 +49,10 @@ int t_dupacks; /* consecutive dup acks recd */ struct tcpiphdr *t_template; /* skeletal packet for transmit */ - int t_timer[TCPT_NTIMERS]; /* tcp timers */ + struct callout *tt_rexmt; /* retransmit timer */ + struct callout *tt_persist; /* retransmit persistence */ + struct callout *tt_keep; /* keepalive */ + struct callout *tt_2msl; /* 2*msl TIME_WAIT timer */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ int t_state; /* state of this connection */ @@ -98,12 +101,12 @@ */ u_int t_maxopd; /* mss plus options */ - u_int t_idle; /* inactivity time */ - u_long t_duration; /* connection duration */ - int t_rtt; /* round trip time */ + u_long t_rcvtime; /* inactivity time */ + u_long t_starttime; /* time connection was established */ + int t_rtttime; /* round trip time */ tcp_seq t_rtseq; /* sequence number being timed */ - int t_rxtcur; /* current retransmit value */ + int t_rxtcur; /* current retransmit value (ticks) */ u_int t_maxseg; /* maximum segment size */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ Index: sys/sys/callout.h =================================================================== RCS file: /home/cvs/src/sys/sys/callout.h,v retrieving revision 1.12 diff -u -r1.12 callout.h --- callout.h 1999/03/06 04:46:20 1.12 +++ callout.h 1999/03/09 02:10:22 @@ -75,7 +75,8 @@ #define callout_fired(c) ((c)->c_flags & CALLOUT_FIRED) void callout_init __P((struct callout *)); -#define callout_pending(c) (((c)->c_flags & CALLOUT_PENDING) ? 
\ +#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING) +#define callout_pending_at(c) (((c)->c_flags & CALLOUT_PENDING) ? \ ((c)->c_time - ticks) : 0) void callout_reset __P((struct callout *, int, void (*)(void *), void *)); void callout_stop __P((struct callout *)); To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-net" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199903090257.VAA21463>