From owner-freebsd-net Fri Apr 9 15: 9: 4 1999 Delivered-To: freebsd-net@freebsd.org Received: from sumatra.americantv.com (sumatra.americantv.com [207.170.17.37]) by hub.freebsd.org (Postfix) with ESMTP id 1887A15FA5 for ; Fri, 9 Apr 1999 14:49:59 -0700 (PDT) (envelope-from jlemon@americantv.com) Received: from right.PCS (right.PCS [148.105.10.31]) by sumatra.americantv.com (8.8.5/8.8.5) with ESMTP id MAA18608 for ; Fri, 9 Apr 1999 12:20:15 -0500 (CDT) Received: (from jlemon@localhost) by right.PCS (8.6.13/8.6.4) id MAA23764; Fri, 9 Apr 1999 12:19:45 -0500 Message-ID: <19990409121944.42560@right.PCS> Date: Fri, 9 Apr 1999 12:19:44 -0500 From: Jonathan Lemon To: freebsd-net@freebsd.org Subject: patch for slowtimeout handling Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii X-Mailer: Mutt 0.61.1 Sender: owner-freebsd-net@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.org Attached is a patch that alters the handling of the TCP timers, especially the slowtimo() function. Its main design goal is to handle cases where there are a large number of sockets in the TIME_WAIT state (large being defined to be about 40K or so). The general concept of the patch is that timers can either be expressed by the current integer countdown and linear list scan, or on a callout basis. Switching from one mode to another depends on where the connection is in the TCP state machine. Currently, I only switch to a callout basis upon entering the TIME_WAIT state. (The rationale for this was that I didn't want to add the overhead of adding/removing entries from the timing wheel every time I reset the keepalive timers). There currently is a separate timing wheel for each TCP timer, again because I wanted to make sure that TIME_WAIT entries did not impact the normal processing. Each entry ticks down at the normal slowtimo() rate (1/2 second). The 4 timing wheels could be merged, or just scrapped and put on the existing wheel in kern_timeout.c. 
(This wasn't done since these patches are against the RELENG_3 branch, which doesn't have Garrett's external callout interface yet). I'd like to solicit feedback on these patches, and on the possibility of integrating the changes (probably in some modified form) into the tree. -- Jonathan Index: tcp_fsm.h =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_fsm.h,v retrieving revision 1.10 diff -u -r1.10 tcp_fsm.h --- tcp_fsm.h 1997/08/16 19:15:38 1.10 +++ tcp_fsm.h 1999/04/09 16:48:21 @@ -63,6 +63,20 @@ #define TCPS_HAVEESTABLISHED(s) ((s) >= TCPS_ESTABLISHED) #define TCPS_HAVERCVDFIN(s) ((s) >= TCPS_TIME_WAIT) +#if 0 +#define TCP_NEWSTATE(tp, state) ((tp)->t_state = state) +#else +#define TCP_NEWSTATE(tp, state) tcp_newstate(tp, state) +#define TCP_NEWSTATE_OPT(tp, state) \ +do { \ + if (state == TCPS_TIME_WAIT || \ + (tp)->t_state == TCPS_TIME_WAIT) \ + tcp_newstate(tp, state); \ + else \ + (tp)->t_state = state; \ +} while(0) +#endif + #ifdef TCPOUTFLAGS /* * Flags used when sending segments in tcp_output. 
Index: tcp_input.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_input.c,v retrieving revision 1.82 diff -u -r1.82 tcp_input.c --- tcp_input.c 1998/12/03 20:23:20 1.82 +++ tcp_input.c 1999/02/27 06:45:04 @@ -88,7 +88,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW, &tcp_delack_enabled, 0, ""); -u_long tcp_now; +u_long tcp_now = 1; /* we treat 0 specially */ struct inpcbhead tcb; struct inpcbinfo tcbinfo; @@ -115,7 +115,7 @@ (tp)->t_segq == NULL && \ (tp)->t_state == TCPS_ESTABLISHED) { \ if (tcp_delack_enabled) \ - tp->t_flags |= TF_DELACK; \ + TCPDELACK_SETF(tp, TF_DELACK); \ else \ tp->t_flags |= TF_ACKNOW; \ (tp)->rcv_nxt += (ti)->ti_len; \ @@ -477,7 +477,7 @@ } inp->inp_options = ip_srcroute(); tp = intotcpcb(inp); - tp->t_state = TCPS_LISTEN; + TCP_NEWSTATE(tp, TCPS_LISTEN); tp->t_flags |= tp0->t_flags & (TF_NOPUSH|TF_NOOPT); /* Compute proper scaling value from buffer space */ @@ -491,9 +491,9 @@ * Segment received on connection. * Reset idle time and keep-alive timer. */ - tp->t_idle = 0; + TCP_SETIDLE(tp, 0); if (TCPS_HAVEESTABLISHED(tp->t_state)) - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepidle); /* * Process options if not in LISTEN state, @@ -559,9 +559,9 @@ if ((to.to_flag & TOF_TS) != 0) tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); - else if (tp->t_rtt && + else if (TCP_RTTVAL(tp) && SEQ_GT(ti->ti_ack, tp->t_rtseq)) - tcp_xmit_timer(tp, tp->t_rtt); + tcp_xmit_timer(tp, TCP_RTTVAL(tp)); acked = ti->ti_ack - tp->snd_una; tcpstat.tcps_rcvackpack++; tcpstat.tcps_rcvackbyte += acked; @@ -579,9 +579,9 @@ * decide between more output or persist. 
*/ if (tp->snd_una == tp->snd_max) - tp->t_timer[TCPT_REXMT] = 0; - else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + TCP_SETTMR(tp, TCPT_REXMT, 0); + else if (TCP_TMRVAL(tp, TCPT_PERSIST) == 0) + TCP_SETTMR(tp, TCPT_REXMT, tp->t_rxtcur); sowwakeup(so); if (so->so_snd.sb_cc) @@ -606,7 +606,7 @@ sbappend(&so->so_rcv, m); sorwakeup(so); if (tcp_delack_enabled) { - tp->t_flags |= TF_DELACK; + TCPDELACK_SETF(tp, TF_DELACK); } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); @@ -728,7 +728,7 @@ taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { taop->tao_cc = to.to_cc; - tp->t_state = TCPS_ESTABLISHED; + TCP_NEWSTATE(tp, TCPS_ESTABLISHED); /* * If there is a FIN, or if there is data and the @@ -739,7 +739,7 @@ */ if (tcp_delack_enabled && ((tiflags & TH_FIN) || (ti->ti_len != 0 && in_localaddr(inp->inp_faddr)))) - tp->t_flags |= (TF_DELACK | TF_NEEDSYN); + TCPDELACK_SETF(tp, TF_DELACK | TF_NEEDSYN); else tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); @@ -751,7 +751,7 @@ tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN); tcpstat.tcps_connects++; soisconnected(so); - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepinit); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; goto trimthenstep6; @@ -769,8 +769,8 @@ * do a standard 3-way handshake. */ tp->t_flags |= TF_ACKNOW; - tp->t_state = TCPS_SYN_RECEIVED; - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + TCP_NEWSTATE(tp, TCPS_SYN_RECEIVED); + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepinit); dropsocket = 0; /* committed to socket */ tcpstat.tcps_accepts++; goto trimthenstep6; @@ -869,7 +869,7 @@ * ACKNOW will be turned on later. 
*/ if (tcp_delack_enabled && ti->ti_len != 0) - tp->t_flags |= TF_DELACK; + TCPDELACK_SETF(tp, TF_DELACK); else tp->t_flags |= TF_ACKNOW; /* @@ -879,12 +879,12 @@ * SYN_SENT* --> FIN_WAIT_1 */ if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + TCP_NEWSTATE(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; tiflags &= ~TH_SYN; } else { - tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + TCP_NEWSTATE(tp, TCPS_ESTABLISHED); + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepidle); } } else { /* @@ -897,7 +897,7 @@ * If there was no CC option, clear cached CC value. */ tp->t_flags |= TF_ACKNOW; - tp->t_timer[TCPT_REXMT] = 0; + TCP_SETTMR(tp, TCPT_REXMT, 0); if (to.to_flag & TOF_CC) { if (taop->tao_cc != 0 && CC_GT(to.to_cc, taop->tao_cc)) { @@ -908,19 +908,19 @@ */ taop->tao_cc = to.to_cc; if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + TCP_NEWSTATE(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { - tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + TCP_NEWSTATE(tp, TCPS_ESTABLISHED); + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepidle); } tp->t_flags |= TF_NEEDSYN; } else - tp->t_state = TCPS_SYN_RECEIVED; + TCP_NEWSTATE(tp, TCPS_SYN_RECEIVED); } else { /* CC.NEW or no option => invalidate cache */ taop->tao_cc = 0; - tp->t_state = TCPS_SYN_RECEIVED; + TCP_NEWSTATE(tp, TCPS_SYN_RECEIVED); } } @@ -971,7 +971,7 @@ if ((tiflags & TH_SYN) && (to.to_flag & TOF_CC) && tp->cc_recv != 0) { if (tp->t_state == TCPS_TIME_WAIT && - tp->t_duration > TCPTV_MSL) + TCP_DURATIONVAL(tp) > TCPTV_MSL) goto dropwithreset; if (CC_GT(to.to_cc, tp->cc_recv)) { tp = tcp_close(tp); @@ -1055,7 +1055,7 @@ case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: - tp->t_state = TCPS_CLOSED; + TCP_NEWSTATE(tp, TCPS_CLOSED); tcpstat.tcps_drops++; tp = tcp_close(tp); break; @@ -1288,11 +1288,11 @@ * SYN-RECEIVED* -> FIN-WAIT-1 */ if (tp->t_flags & TF_NEEDFIN) { - tp->t_state = TCPS_FIN_WAIT_1; + TCP_NEWSTATE(tp, TCPS_FIN_WAIT_1); 
tp->t_flags &= ~TF_NEEDFIN; } else { - tp->t_state = TCPS_ESTABLISHED; - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + TCP_NEWSTATE(tp, TCPS_ESTABLISHED); + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepidle); } /* * If segment contains data or ACK, will call tcp_reass() @@ -1347,7 +1347,7 @@ * to keep a constant cwnd packets in the * network. */ - if (tp->t_timer[TCPT_REXMT] == 0 || + if (TCP_TMRVAL(tp, TCPT_REXMT) == 0 || ti->ti_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks == tcprexmtthresh) { @@ -1359,7 +1359,7 @@ if (win < 2) win = 2; tp->snd_ssthresh = win * tp->t_maxseg; - tp->t_timer[TCPT_REXMT] = 0; + TCP_SETTMR(tp, TCPT_REXMT, 0); tp->t_rtt = 0; tp->snd_nxt = ti->ti_ack; tp->snd_cwnd = tp->t_maxseg; @@ -1428,8 +1428,8 @@ */ if (to.to_flag & TOF_TS) tcp_xmit_timer(tp, tcp_now - to.to_tsecr + 1); - else if (tp->t_rtt && SEQ_GT(ti->ti_ack, tp->t_rtseq)) - tcp_xmit_timer(tp,tp->t_rtt); + else if (TCP_RTTVAL(tp) && SEQ_GT(ti->ti_ack, tp->t_rtseq)) + tcp_xmit_timer(tp, TCP_RTTVAL(tp)); /* * If all outstanding data is acked, stop retransmit @@ -1438,10 +1438,10 @@ * timer, using current (possibly backed-off) value. 
*/ if (ti->ti_ack == tp->snd_max) { - tp->t_timer[TCPT_REXMT] = 0; + TCP_SETTMR(tp, TCPT_REXMT, 0); needoutput = 1; - } else if (tp->t_timer[TCPT_PERSIST] == 0) - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + } else if (TCP_TMRVAL(tp, TCPT_PERSIST) == 0) + TCP_SETTMR(tp, TCPT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, @@ -1497,9 +1497,9 @@ */ if (so->so_state & SS_CANTRCVMORE) { soisdisconnected(so); - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + TCP_SETTMR(tp, TCPT_2MSL, tcp_maxidle); } - tp->t_state = TCPS_FIN_WAIT_2; + TCP_NEWSTATE(tp, TCPS_FIN_WAIT_2); } break; @@ -1511,15 +1511,15 @@ */ case TCPS_CLOSING: if (ourfinisacked) { - tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_duration < TCPTV_MSL) - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + TCP_DURATIONVAL(tp) < TCPTV_MSL) + TCP_SETTMR(tp, TCPT_2MSL, + tp->t_rxtcur * TCPTV_TWTRUNC); else - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + TCP_SETTMR(tp, TCPT_2MSL, 2 * TCPTV_MSL); + TCP_NEWSTATE(tp, TCPS_TIME_WAIT); soisdisconnected(so); } break; @@ -1543,7 +1543,7 @@ * it and restart the finack timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + TCP_SETTMR(tp, TCPT_2MSL, 2 * TCPTV_MSL); goto dropafterack; } } @@ -1667,7 +1667,7 @@ * more input can be expected, send ACK now. */ if (tcp_delack_enabled && (tp->t_flags & TF_NEEDSYN)) - tp->t_flags |= TF_DELACK; + TCPDELACK_SETF(tp, TF_DELACK); else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; @@ -1680,7 +1680,7 @@ */ case TCPS_SYN_RECEIVED: case TCPS_ESTABLISHED: - tp->t_state = TCPS_CLOSE_WAIT; + TCP_NEWSTATE(tp, TCPS_CLOSE_WAIT); break; /* @@ -1688,7 +1688,7 @@ * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: - tp->t_state = TCPS_CLOSING; + TCP_NEWSTATE(tp, TCPS_CLOSING); break; /* @@ -1697,18 +1697,18 @@ * standard timers. 
*/ case TCPS_FIN_WAIT_2: - tp->t_state = TCPS_TIME_WAIT; tcp_canceltimers(tp); /* Shorten TIME_WAIT [RFC-1644, p.28] */ if (tp->cc_recv != 0 && - tp->t_duration < TCPTV_MSL) { - tp->t_timer[TCPT_2MSL] = - tp->t_rxtcur * TCPTV_TWTRUNC; + TCP_DURATIONVAL(tp) < TCPTV_MSL) { + TCP_SETTMR(tp, TCPT_2MSL, + tp->t_rxtcur * TCPTV_TWTRUNC); /* For transaction client, force ACK now. */ tp->t_flags |= TF_ACKNOW; } else - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + TCP_SETTMR(tp, TCPT_2MSL, 2 * TCPTV_MSL); + TCP_NEWSTATE(tp, TCPS_TIME_WAIT); soisdisconnected(so); break; @@ -1716,7 +1716,7 @@ * In TIME_WAIT state restart the 2 MSL time_wait timer. */ case TCPS_TIME_WAIT: - tp->t_timer[TCPT_2MSL] = 2 * TCPTV_MSL; + TCP_SETTMR(tp, TCPT_2MSL, 2 * TCPTV_MSL); break; } } @@ -1997,7 +1997,7 @@ tp->t_srtt = rtt << TCP_RTT_SHIFT; tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); } - tp->t_rtt = 0; + tp->t_rtt = 0; tp->t_rxtshift = 0; /* Index: tcp_output.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_output.c,v retrieving revision 1.32 diff -u -r1.32 tcp_output.c --- tcp_output.c 1999/01/20 17:31:59 1.32 +++ tcp_output.c 1999/02/27 06:46:16 @@ -93,7 +93,7 @@ * to send, then transmit; otherwise, investigate further. 
*/ idle = (tp->snd_max == tp->snd_una); - if (idle && tp->t_idle >= tp->t_rxtcur) + if (idle && TCP_IDLEVAL(tp) >= tp->t_rxtcur) /* * We have been idle for "a while" and no acks are * expected to clock out any data we send -- @@ -143,7 +143,7 @@ flags &= ~TH_FIN; win = 1; } else { - tp->t_timer[TCPT_PERSIST] = 0; + TCP_SETTMR(tp, TCPT_PERSIST, 0); tp->t_rxtshift = 0; } } @@ -194,10 +194,10 @@ */ len = 0; if (win == 0) { - tp->t_timer[TCPT_REXMT] = 0; + TCP_SETTMR(tp, TCPT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; - if (tp->t_timer[TCPT_PERSIST] == 0) + if (TCP_TMRVAL(tp, TCPT_PERSIST) == 0) tcp_setpersist(tp); } } @@ -299,8 +299,8 @@ * if window is nonzero, transmit what we can, * otherwise force out a byte. */ - if (so->so_snd.sb_cc && tp->t_timer[TCPT_REXMT] == 0 && - tp->t_timer[TCPT_PERSIST] == 0) { + if (so->so_snd.sb_cc && TCP_TMRVAL(tp, TCPT_REXMT) == 0 && + TCP_TMRVAL(tp, TCPT_PERSIST) == 0) { tp->t_rxtshift = 0; tcp_setpersist(tp); } @@ -563,7 +563,7 @@ * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) */ - if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST]) + if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TMRVAL(tp, TCPT_PERSIST)) ti->ti_seq = htonl(tp->snd_nxt); else ti->ti_seq = htonl(tp->snd_max); @@ -609,7 +609,7 @@ * In transmit state, time the transmission and arrange for * the retransmit. In persist state, just set snd_max. */ - if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) { + if (tp->t_force == 0 || TCP_TMRVAL(tp, TCPT_PERSIST) == 0) { tcp_seq startseq = tp->snd_nxt; /* @@ -631,7 +631,7 @@ * not currently timing anything. */ if (tp->t_rtt == 0) { - tp->t_rtt = 1; + TCP_SETRTT(tp, 1); tp->t_rtseq = startseq; tcpstat.tcps_segstimed++; } @@ -645,11 +645,11 @@ * Initialize shift counter which is used for backoff * of retransmit time. 
*/ - if (tp->t_timer[TCPT_REXMT] == 0 && + if (TCP_TMRVAL(tp, TCPT_REXMT) == 0 && tp->snd_nxt != tp->snd_una) { - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; - if (tp->t_timer[TCPT_PERSIST]) { - tp->t_timer[TCPT_PERSIST] = 0; + TCP_SETTMR(tp, TCPT_REXMT, tp->t_rxtcur); + if (TCP_TMRVAL(tp, TCPT_PERSIST)) { + TCP_SETTMR(tp, TCPT_PERSIST, 0); tp->t_rxtshift = 0; } } @@ -732,7 +732,7 @@ if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + win; tp->last_ack_sent = tp->rcv_nxt; - tp->t_flags &= ~(TF_ACKNOW|TF_DELACK); + TCPDELACK_CLRF(tp, TF_ACKNOW|TF_DELACK); if (sendalot) goto again; return (0); @@ -743,15 +743,17 @@ register struct tcpcb *tp; { register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; + int persist; - if (tp->t_timer[TCPT_REXMT]) + if (TCP_TMRVAL(tp, TCPT_REXMT)) panic("tcp_output REXMT"); /* * Start/restart persistance timer. */ - TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], + TCPT_RANGESET(persist, t * tcp_backoff[tp->t_rxtshift], TCPTV_PERSMIN, TCPTV_PERSMAX); + TCP_SETTMR(tp, TCPT_PERSIST, persist); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; } Index: tcp_subr.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_subr.c,v retrieving revision 1.49.2.1 diff -u -r1.49.2.1 tcp_subr.c --- tcp_subr.c 1999/02/04 06:40:28 1.49.2.1 +++ tcp_subr.c 1999/02/26 21:38:08 @@ -85,6 +85,10 @@ SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW, &tcp_do_rfc1644 , 0, ""); +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp, TCPCTL_TCBHASHSIZE, tcbhashsize, + CTLFLAG_RD, &tcp_tcbhashsize, 0, ""); + SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD, &tcbinfo.ipi_count, 0, "Number of active PCBs"); @@ -140,6 +144,7 @@ printf("WARNING: TCB hash size not a power of 2\n"); hashsize = 512; /* safe default */ } + tcp_tcbhashsize = hashsize; tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, 
&tcbinfo.porthashmask); @@ -149,6 +154,7 @@ max_protohdr = sizeof(struct tcpiphdr); if (max_linkhdr + sizeof(struct tcpiphdr) > MHLEN) panic("tcp_init"); + tcp_timer_init(); } /* @@ -315,8 +321,11 @@ tp->t_rxtcur = TCPTV_RTOBASE; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; + TCP_SETIDLE(tp, 0); + TCP_SETDURATION(tp, 0); inp->inp_ip_ttl = ip_defttl; inp->inp_ppcb = (caddr_t)tp; + tcpt_attach(tp); return (tp); /* XXX */ } @@ -333,7 +342,7 @@ struct socket *so = tp->t_inpcb->inp_socket; if (TCPS_HAVERCVDSYN(tp->t_state)) { - tp->t_state = TCPS_CLOSED; + TCP_NEWSTATE(tp, TCPS_CLOSED); (void) tcp_output(tp); tcpstat.tcps_drops++; } else @@ -449,6 +458,8 @@ } if (tp->t_template) (void) m_free(dtom(tp->t_template)); +tcp_canceltimers(tp); + tcpt_detach(tp); inp->inp_ppcb = NULL; soisdisconnected(so); in_pcbdetach(inp); Index: tcp_timer.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_timer.c,v retrieving revision 1.28 diff -u -r1.28 tcp_timer.c --- tcp_timer.c 1998/04/24 09:25:35 1.28 +++ tcp_timer.c 1999/04/09 16:51:55 @@ -1,4 +1,4 @@ -/* +/*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. 
* @@ -40,11 +40,14 @@ #include #include #include +#include #include #include #include #include +#include + #include /* before tcp_seq.h, for tcp_random18() */ #include @@ -85,21 +88,147 @@ /* max idle time in persist */ int tcp_maxidle; +LIST_HEAD(tcptw_list, tcpt_callout); +LIST_HEAD(, tcpt_callout) active_head; +TAILQ_HEAD(, tcpt_callout) delack_head; + +struct tcpt_timingwheel { + int tw_size; + int tw_mask; + struct tcptw_list *tw_wheel; +}; +struct tcpt_timingwheel tcpt_timingwheel[TCPT_NTIMERS]; + +struct vm_zone *tcptzone; +MALLOC_DEFINE(M_TCPTW, "tcptw", "TCP Timing wheel"); + +/* must be a power of 2 */ +static int tcpt_wheelsize[] = { 32768, 32768, 32768, 32768 }; + +void +tcp_timer_init() +{ + int i, s; + struct tcpt_timingwheel *tw; + + for (i = 0; i < TCPT_NTIMERS; i++) { + tw = &tcpt_timingwheel[i]; + tw->tw_size = tcpt_wheelsize[i]; + tw = &tcpt_timingwheel[i]; + MALLOC(tw->tw_wheel, struct tcptw_list *, + tw->tw_size * sizeof(struct tcptw_list), + M_TCPTW, M_NOWAIT); + tw->tw_mask = tw->tw_size - 1; + for (s = 0; s < tw->tw_size; s++) + LIST_INIT(&tw->tw_wheel[s]); + } + tcptzone = zinit("tcptw", sizeof(struct tcpt_callout), maxsockets, + ZONE_INTERRUPT, 0); + TAILQ_INIT(&delack_head); + LIST_INIT(&active_head); +} + +void +tcpt_attach(tp) + struct tcpcb *tp; +{ + struct tcpt_callout *tc; + + tc = zalloci(tcptzone); + if (tc == NULL) + panic("tcpt_attach: no buffers"); + tc->tc_use_callout = 0; + tc->tc_tp = tp; + LIST_INSERT_HEAD(&active_head, tc, tc_le_active); + tp->timer_callout = (caddr_t)tc; +} + +void +tcpt_detach(tp) + struct tcpcb *tp; +{ + struct tcpt_callout *tc = (struct tcpt_callout *)tp->timer_callout; + int i; + + if (tc->tc_use_callout) { + for (i = 0; i < TCPT_NTIMERS; i++) { + if (tp->t_timer[i] == 0) + continue; + LIST_REMOVE(tc, tc_le[i]); + } + } else { + LIST_REMOVE(tc, tc_le_active); + } + + zfreei(tcptzone, tp->timer_callout); + tp->timer_callout = (caddr_t)0; +} + +/* + * Note: a hazard exists here where it could be possible 
that the + * callout is on _no_ lists. E.g.: we enter TIME_WAIT state with + * no timeouts set; this will remove us from the active list, but + * not add us to any other lists. + */ +void +tcp_newstate(tp, state) + struct tcpcb *tp; + int state; +{ + struct tcpt_callout *tc = (struct tcpt_callout *)tp->timer_callout; + int i, use_callout; + + use_callout = 0; /* default */ + + switch (state) { + case TCPS_TIME_WAIT: + use_callout = 1; + } + tp->t_state = state; + if (use_callout == tc->tc_use_callout) + return; + + if (tc->tc_use_callout == 0) { + LIST_REMOVE(tc, tc_le_active); + tc->tc_use_callout = 1; + + for (i = 0; i < TCPT_NTIMERS; i++) { + struct tcpt_timingwheel *tw; + struct tcptw_list *bucket; + + if (tp->t_timer[i] == 0) + continue; + tw = &tcpt_timingwheel[i]; + bucket = &tw->tw_wheel[tp->t_timer[i] & tw->tw_mask]; + LIST_INSERT_HEAD(bucket, tc, tc_le[i]); + } + } else { + for (i = 0; i < TCPT_NTIMERS; i++) { + if (tp->t_timer[i] == 0) + continue; + LIST_REMOVE(tc, tc_le[i]); + } + tc->tc_use_callout = 0; + LIST_INSERT_HEAD(&active_head, tc, tc_le_active); + } +} + /* * Fast timeout routine for processing delayed acks */ void tcp_fasttimo() { - register struct inpcb *inp; - register struct tcpcb *tp; + struct tcpt_callout *tc, *ntc; + struct tcpcb *tp; int s; if (tcp_delack_enabled) { s = splnet(); - for (inp = tcb.lh_first; inp != NULL; inp = inp->inp_list.le_next) { - if ((tp = (struct tcpcb *)inp->inp_ppcb) && - (tp->t_flags & TF_DELACK)) { + for (tc = LIST_FIRST(&active_head); tc; tc = ntc) { + ntc = LIST_NEXT(tc, tc_le_active); + tp = tc->tc_tp; + if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; tcpstat.tcps_delack++; @@ -118,10 +247,11 @@ void tcp_slowtimo() { - register struct inpcb *ip, *ipnxt; - register struct tcpcb *tp; - register int i; - int s; + struct tcpt_timingwheel *tw; + struct tcpt_callout *tc, *ntc; + struct tcptw_list *bucket; + struct tcpcb *tp; + int s, i, curtick; #ifdef TCPDEBUG int ostate; 
#endif @@ -129,22 +259,18 @@ s = splnet(); tcp_maxidle = tcp_keepcnt * tcp_keepintvl; + curtick = tcp_now + 1; - ip = tcb.lh_first; - if (ip == NULL) { - splx(s); - return; - } - /* - * Search through tcb's and update active timers. - */ - for (; ip != NULL; ip = ipnxt) { - ipnxt = ip->inp_list.le_next; - tp = intotcpcb(ip); - if (tp == 0 || tp->t_state == TCPS_LISTEN) + for (tc = LIST_FIRST(&active_head); tc; tc = ntc) { + ntc = LIST_NEXT(tc, tc_le_active); + tp = tc->tc_tp; + if (tp->t_state == TCPS_LISTEN) continue; for (i = 0; i < TCPT_NTIMERS; i++) { - if (tp->t_timer[i] && --tp->t_timer[i] == 0) { + if (tp->t_timer[i] && tp->t_timer[i] < curtick) + printf("Error1: timer < tick\n"); + if (tp->t_timer[i] && tp->t_timer[i] == curtick) { + tp->t_timer[i] = 0; #ifdef TCPDEBUG ostate = tp->t_state; #endif @@ -160,13 +286,43 @@ #endif } } - tp->t_idle++; - tp->t_duration++; - if (tp->t_rtt) - tp->t_rtt++; tpgone: ; } + + for (i = 0; i < TCPT_NTIMERS; i++) { + tw = &tcpt_timingwheel[i]; + bucket = &tw->tw_wheel[curtick & tw->tw_mask]; + for (tc = LIST_FIRST(bucket); tc; tc = ntc) { + ntc = LIST_NEXT(tc, tc_le[i]); +#ifdef TCPDEBUG + if (tc->tc_tp->t_timer[i] && tc->tc_tp->t_timer[i] < curtick) + printf("Error2: timer < tick\n"); + if (tc->tc_tp->t_flags & TF_DELACK) + printf("Error3: delack on timingwheel\n"); +#endif + if (tc->tc_tp->t_timer[i] == curtick) { + LIST_REMOVE(tc, tc_le[i]); + tc->tc_tp->t_timer[i] = 0; +#ifdef TCPDEBUG + ostate = tp->t_state; +#endif + tp = tcp_timers(tc->tc_tp, i); + if (tp == NULL) + goto tpgone2; +#ifdef TCPDEBUG + if (tp->t_inpcb->inp_socket->so_options + & SO_DEBUG) + tcp_trace(TA_USER, ostate, tp, + (struct tcpiphdr *)0, + PRU_SLOWTIMO); +#endif + } +tpgone2: + ; + } + } + tcp_iss += TCP_ISSINCR/PR_SLOWHZ; /* increment iss */ #ifdef TCP_COMPAT_42 if ((int)tcp_iss < 0) @@ -177,16 +333,68 @@ } /* + * for now, allocate the timer structure at the same time as the tcp block + */ +void +tcp_settimer(tp, timer, timeout) + struct tcpcb 
*tp; + int timer; + int timeout; +{ + struct tcpt_callout *tc = (struct tcpt_callout *)tp->timer_callout; + struct tcpt_timingwheel *tw; + struct tcptw_list *bucket; + +#if 0 + /* + * normal processing. This has been hoisted up into the macro. + */ + if (tc->tc_use_callout == 0) { + tp->t_timer[timer] = (timeout == 0 ? 0 : tcp_now + timeout); + return; + } +#endif + + tw = &tcpt_timingwheel[timer]; + + if (tp->t_timer[timer]) { + /* + * XXX + * one could check that the new timeout doesn't + * put the structure back in the same bucket. + */ + LIST_REMOVE(tc, tc_le[timer]); + tp->t_timer[timer] = 0; + } + if (timeout == 0) + return; + + tp->t_timer[timer] = tcp_now + timeout; + bucket = &tw->tw_wheel[tc->tc_tp->t_timer[timer] & tw->tw_mask]; + LIST_INSERT_HEAD(bucket, tc, tc_le[timer]); +} + +/* * Cancel all timers for TCP tp. */ void tcp_canceltimers(tp) struct tcpcb *tp; { + struct tcpt_callout *tc = (struct tcpt_callout *)tp->timer_callout; register int i; - for (i = 0; i < TCPT_NTIMERS; i++) - tp->t_timer[i] = 0; + if (tc->tc_use_callout) { + for (i = 0; i < TCPT_NTIMERS; i++) { + if (tp->t_timer[i] == 0) + continue; + LIST_REMOVE(tc, tc_le[i]); + tp->t_timer[i] = 0; + } + } else { + for (i = 0; i < TCPT_NTIMERS; i++) + tp->t_timer[i] = 0; + } } int tcp_backoff[TCP_MAXRXTSHIFT + 1] = @@ -214,8 +422,8 @@ */ case TCPT_2MSL: if (tp->t_state != TCPS_TIME_WAIT && - tp->t_idle <= tcp_maxidle) - tp->t_timer[TCPT_2MSL] = tcp_keepintvl; + TCP_IDLEVAL(tp) <= tcp_maxidle) + TCP_SETTMR(tp, TCPT_2MSL, tcp_keepintvl); else tp = tcp_close(tp); break; @@ -237,7 +445,7 @@ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); - tp->t_timer[TCPT_REXMT] = tp->t_rxtcur; + TCP_SETTMR(tp, TCPT_REXMT, tp->t_rxtcur); /* * If losing, let the lower level know and try for * a better route. Also, if we backed off this far, @@ -309,8 +517,8 @@ * backoff that we would use if retransmitting. 
*/ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && - (tp->t_idle >= tcp_maxpersistidle || - tp->t_idle >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + (TCP_IDLEVAL(tp) >= tcp_maxpersistidle || + TCP_IDLEVAL(tp) >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { tcpstat.tcps_persistdrop++; tp = tcp_drop(tp, ETIMEDOUT); break; @@ -332,7 +540,7 @@ if ((always_keepalive || tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { - if (tp->t_idle >= tcp_keepidle + tcp_maxidle) + if (TCP_IDLEVAL(tp) >= tcp_keepidle + tcp_maxidle) goto dropit; /* * Send a packet designed to force a response @@ -358,9 +566,9 @@ tcp_respond(tp, tp->t_template, (struct mbuf *)NULL, tp->rcv_nxt, tp->snd_una - 1, 0); #endif - tp->t_timer[TCPT_KEEP] = tcp_keepintvl; + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepintvl); } else - tp->t_timer[TCPT_KEEP] = tcp_keepidle; + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepidle); break; dropit: tcpstat.tcps_keepdrops++; Index: tcp_timer.h =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_timer.h,v retrieving revision 1.13 diff -u -r1.13 tcp_timer.h --- tcp_timer.h 1997/09/07 05:26:48 1.13 +++ tcp_timer.h 1999/04/09 16:45:01 @@ -37,6 +37,7 @@ #ifndef _NETINET_TCP_TIMER_H_ #define _NETINET_TCP_TIMER_H_ +#include /* * Definitions of the TCP timers. These timers are counted * down PR_SLOWHZ times a second. 
@@ -125,6 +126,74 @@ else if ((u_long)(tv) > (u_long)(tvmax)) \ (tv) = (tvmax); \ } + +/* + * Set a tcp timer + */ +#if 0 +#define TCP_SETTMR(tp, timer, timeout) \ + (tp)->t_timer[timer] = timeout +#define TCP_TMRVAL(tp, timer) \ + (tp)->t_timer[timer] +#define TCPDELACK_SETF(tp, flags) \ + (tp)->t_flags |= (flags) +#define TCPDELACK_CLRF(tp, flags) \ + (tp)->t_flags &= ~(flags) +#define TCP_SETIDLE(tp, val) ((tp)->t_idle = val) +#define TCP_IDLEVAL(tp) (tp)->t_idle +#define TCP_SETDURATION(tp, val) ((tp)->t_duration = val) +#define TCP_DURATIONVAL(tp) (tp)->t_duration +#define TCP_SETRTT(tp, val) (tp)->t_rtt = val +#define TCP_RTTVAL(tp) (tp)->t_rtt +#else +struct tcpt_callout { + struct tcpcb *tc_tp; /* backpointer to tcpcb */ + int tc_use_callout; + TAILQ_ENTRY(tcpt_callout) tc_tqe_delack; + LIST_ENTRY(tcpt_callout) tc_le[TCPT_NTIMERS]; + LIST_ENTRY(tcpt_callout) tc_le_active; +}; +struct tcpcb; +void tcp_timer_init __P((void)); +void tcp_settimer __P((struct tcpcb *, int, int)); +int tcp_readtimer __P((struct tcpcb *, int)); +void tcp_newstate __P((struct tcpcb *, int)); +void tcpt_attach __P((struct tcpcb *)); +void tcpt_detach __P((struct tcpcb *)); +#define TCP_SETTMR(tp, timer, timeout) \ +do { \ + if (((struct tcpt_callout *)tp->timer_callout)->tc_use_callout) \ + tcp_settimer(tp, timer, timeout); \ + else \ + tp->t_timer[timer] = \ + (timeout == 0 ? 0 : tcp_now + timeout); \ +} while(0) +#define TCP_TMRVAL(tp, timer) (tp)->t_timer[timer] +#define TCPDELACK_SETF(tp, flags) (tp)->t_flags |= (flags) +#define TCPDELACK_CLRF(tp, flags) (tp)->t_flags &= ~(flags) + +#define TCPDELACK_SETF_X(tp, flags) \ +do { \ + (tp)->t_flags |= (flags); \ + tcp_setdelack(tp, 1); \ +} while(0) +#define TCPDELACK_CLRF_X(tp, flags) \ +do { \ + (tp)->t_flags &= ~(flags); \ + tcp_setdelack(tp, 0); \ +} while(0) +/* + * these are slightly wrong, since they age all connections regardless + * of the state; the original tcp_slowtimo did not age LISTEN sockets. 
+ */ +#define TCP_SETIDLE(tp, val) ((tp)->t_idle = tcp_now + val) +#define TCP_IDLEVAL(tp) (tcp_now - (tp)->t_idle) +#define TCP_SETDURATION(tp, val) ((tp)->t_duration = tcp_now + val) +#define TCP_DURATIONVAL(tp) (tcp_now - (tp)->t_duration) +#define TCP_SETRTT(tp, val) (tp)->t_rtt = tcp_now + val +#define TCP_RTTVAL(tp) \ + ((tp)->t_rtt ? tcp_now - (tp)->t_rtt : 0) +#endif #ifdef KERNEL extern int tcp_keepinit; /* time to establish connection */ Index: tcp_usrreq.c =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.40 diff -u -r1.40 tcp_usrreq.c --- tcp_usrreq.c 1999/01/20 17:31:59 1.40 +++ tcp_usrreq.c 1999/02/26 21:19:39 @@ -207,7 +207,7 @@ if (inp->inp_lport == 0) error = in_pcbbind(inp, (struct sockaddr *)0, p); if (error == 0) - tp->t_state = TCPS_LISTEN; + TCP_NEWSTATE(tp, TCPS_LISTEN); COMMON_END(PRU_LISTEN); } @@ -514,7 +514,7 @@ if (oinp) { if (oinp != inp && (otp = intotcpcb(oinp)) != NULL && otp->t_state == TCPS_TIME_WAIT && - otp->t_duration < TCPTV_MSL && + TCP_DURATIONVAL(otp) < TCPTV_MSL && (otp->t_flags & TF_RCVD_CC)) otp = tcp_close(otp); else @@ -539,8 +539,8 @@ soisconnecting(so); tcpstat.tcps_connattempt++; - tp->t_state = TCPS_SYN_SENT; - tp->t_timer[TCPT_KEEP] = tcp_keepinit; + TCP_NEWSTATE(tp, TCPS_SYN_SENT); + TCP_SETTMR(tp, TCPT_KEEP, tcp_keepinit); tp->iss = tcp_iss; tcp_iss += TCP_ISSINCR/2; tcp_sendseqinit(tp); @@ -715,7 +715,7 @@ so->so_state |= nofd; return (ENOBUFS); } - tp->t_state = TCPS_CLOSED; + TCP_NEWSTATE(tp, TCPS_CLOSED); return (0); } @@ -766,7 +766,7 @@ case TCPS_CLOSED: case TCPS_LISTEN: - tp->t_state = TCPS_CLOSED; + TCP_NEWSTATE(tp, TCPS_CLOSED); tp = tcp_close(tp); break; @@ -776,18 +776,18 @@ break; case TCPS_ESTABLISHED: - tp->t_state = TCPS_FIN_WAIT_1; + TCP_NEWSTATE(tp, TCPS_FIN_WAIT_1); break; case TCPS_CLOSE_WAIT: - tp->t_state = TCPS_LAST_ACK; + TCP_NEWSTATE(tp, TCPS_LAST_ACK); break; } if (tp && tp->t_state >= 
TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* To prevent the connection hanging in FIN_WAIT_2 forever. */ if (tp->t_state == TCPS_FIN_WAIT_2) - tp->t_timer[TCPT_2MSL] = tcp_maxidle; + TCP_SETTMR(tp, TCPT_2MSL, tcp_maxidle); } return (tp); } Index: tcp_var.h =================================================================== RCS file: /home/ncvs/src/sys/netinet/tcp_var.h,v retrieving revision 1.49 diff -u -r1.49 tcp_var.h --- tcp_var.h 1999/01/20 17:32:00 1.49 +++ tcp_var.h 1999/02/26 20:02:42 @@ -131,6 +131,7 @@ /* RFC 1644 variables */ tcp_cc cc_send; /* send connection count */ tcp_cc cc_recv; /* receive connection count */ + caddr_t timer_callout; /* timer callout */ }; /* @@ -305,7 +306,8 @@ #define TCPCTL_RECVSPACE 9 /* receive buffer space */ #define TCPCTL_KEEPINIT 10 /* receive buffer space */ #define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */ -#define TCPCTL_MAXID 12 +#define TCPCTL_TCBHASHSIZE 12 /* size of TCBHASH (read-only) */ +#define TCPCTL_MAXID 13 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -320,6 +322,7 @@ { "recvspace", CTLTYPE_INT }, \ { "keepinit", CTLTYPE_INT }, \ { "pcblist", CTLTYPE_STRUCT }, \ + { "pcbhashsize", CTLTYPE_INT }, \ } #ifdef KERNEL To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-net" in the body of the message