From owner-p4-projects@FreeBSD.ORG Sun Jul 19 22:15:52 2009 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id 6F74D106571C; Sun, 19 Jul 2009 22:15:51 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 113C31065716 for ; Sun, 19 Jul 2009 22:15:51 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id F26EB8FC22 for ; Sun, 19 Jul 2009 22:15:50 +0000 (UTC) (envelope-from andre@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.3/8.14.3) with ESMTP id n6JMFosE023342 for ; Sun, 19 Jul 2009 22:15:50 GMT (envelope-from andre@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.3/8.14.3/Submit) id n6JMFoBe023340 for perforce@freebsd.org; Sun, 19 Jul 2009 22:15:50 GMT (envelope-from andre@freebsd.org) Date: Sun, 19 Jul 2009 22:15:50 GMT Message-Id: <200907192215.n6JMFoBe023340@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to andre@freebsd.org using -f From: Andre Oppermann To: Perforce Change Reviews Cc: Subject: PERFORCE change 166292 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 19 Jul 2009 22:15:52 -0000 http://perforce.freebsd.org/chv.cgi?CH=166292 Change 166292 by andre@andre_t61 on 2009/07/19 22:15:42 Rename tcp_reass_enabled to tcp_reass_enable. Change tcp_reass_maxblocks to a limit per connection. Add tcp_reass_globalmaxblocks as global zonelimit. Change tcp_reass_qtimo to tcp_reass_timeout as fixed timeout in milliseconds. Update sysctl descriptions. 
Decouple reassembly block zonelimit from nmbclusters. Add per connection block counter, tracking and limiting code. Combine all exit cases where the mbuf is freed into goto label 'done'. Differentiate between space and time efficiency through the use of m_collapse(). Add RFC2883 D-SACK support for duplicate retransmits. Fix merging of next/previous block test. Affected files ... .. //depot/projects/tcp_reass/netinet/tcp_reass.c#36 edit .. //depot/projects/tcp_reass/netinet/tcp_var.h#20 edit Differences ... ==== //depot/projects/tcp_reass/netinet/tcp_reass.c#36 (text+ko) ==== @@ -107,25 +107,30 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static int tcp_reass_enabled = 1; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR, - &tcp_reass_enabled, 0, - "Enable/disable use of TCP Reassembly Queue"); +static int tcp_reass_enable = 1; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_RW, + &tcp_reass_enable, 0, + "Enable/disable use of TCP reassembly queue"); -static int tcp_reass_maxblocks = 65535; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN, +static int tcp_reass_maxblocks = 32; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RW, &tcp_reass_maxblocks, 0, - "Global maximum number of TCP Segment Blocks in Reassembly Queue"); + "Per connection limit of TCP segment blocks in reassembly queue"); + +static int tcp_reass_globalmaxblocks = 65535; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, globalmaxblocks, CTLFLAG_RDTUN, + &tcp_reass_globalmaxblocks, 0, + "Global limit of TCP segment blocks in reassembly queue"); -static int tcp_reass_qtimo = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW, - &tcp_reass_qtimo, 0, - "Reassembly Queue Timeout in multiples of the Retransmission Timeout"); +static int tcp_reass_timeout = 30 * HZ; +SYSCTL_PROC(_net_inet_tcp_reass, OID_AUTO, timeout, CTLTYPE_INT|CTLFLAG_RW, + &tcp_reass_timeout, NULL, 
sysctl_msec_to_ticks, "I", + "Reassembly queue flush timeout in milliseconds"); static int tcp_reass_spacetime = 0; SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW, &tcp_reass_spacetime, 0, - "Reassembly Queue strategy of space vs. time efficiency"); + "Reassembly queue strategy of space vs. time efficiency"); static struct tcp_reass_block * tcp_reass_merge(struct tcp_reass_block *, struct tcp_reass_block *); @@ -158,30 +163,17 @@ #endif /* - * Adjust TCP reassembly zone limits when the nmbclusters zone changes. - */ -static void -tcp_reass_zone_change(void *tag) -{ - - tcp_reass_maxblocks = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); -} - -/* * Initialize TCP reassembly zone on startup. */ void tcp_reass_init(void) { - TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks", - &tcp_reass_maxblocks); + TUNABLE_INT_FETCH("net.inet.tcp.reass.globalmaxblocks", + &tcp_reass_globalmaxblocks); tcp_reass_zone = uma_zcreate("tcpreass", sizeof(struct tcp_reass_block), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); - EVENTHANDLER_REGISTER(nmbclusters_change, - tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); + uma_zone_set_max(tcp_reass_zone, tcp_reass_globalmaxblocks); } /* @@ -252,6 +244,7 @@ if (trb->trb_m != NULL) m_freem(trb->trb_m); tp->rcv_reass_size -= SEQ_DELTA(trb->trb_seqs, trb->trb_seqe); + tp->rcv_reass_blocks--; uma_zfree(tcp_reass_zone, trb); } @@ -326,11 +319,8 @@ th = NULL; /* Prevent further use. */ /* Check if it is really neccessary to do all the work. */ - if (!tcp_reass_enabled && RB_EMPTY(&tp->rcv_reass)) { - *tlenp = 0; - m_freem(m); - return (0); - } + if (!tcp_reass_enable && RB_EMPTY(&tp->rcv_reass)) + goto done; KASSERT(SEQ_LT(tp->rcv_nxt, th_seq), ("%s: sequence number below rcv_nxt", __func__)); @@ -359,16 +349,13 @@ * buffer vs. actual real data with 2k clusters and 1500 byte * packets by introducing a correction factor of 11/8th. 
*/ - /* if (th_seq != tp->rcv_nxt && - tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) { + tp->rcv_reass_blocks > tcp_reass_maxblocks) { + //(sbspace(&so->so_rcv) / 8 * 11) TCPSTAT_INC(tcps_reass_overflow); TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); - *tlenp = 0; - return (0); + goto done; } - */ /* * FIN handling is a bit tricky. @@ -390,10 +377,9 @@ tcp_timer_activate(tp, TT_REASS, 0); return (thflags); } - } else if (*tlenp == 0) { - m_freem(m); - return (0); - } else + } else if (*tlenp == 0) + goto done; + else thflags &= ~TH_FIN; /* Get rid of packet header and mtags. */ @@ -401,7 +387,8 @@ /* Trim empty mbufs from head of chain. */ m = m_trimhead(m); /* Compact mbuf chain. */ - m = m_collapse(m, M_DONTWAIT, 1024); + if (tcp_reass_spacetime) + m = m_collapse(m, M_DONTWAIT, 1024); KASSERT(m != NULL, ("%s: m is NULL after collapse", __func__)); @@ -420,9 +407,9 @@ if (SEQ_GEQ(trbs.trb_seqs, trb->trb_seqs) && SEQ_LEQ(trbs.trb_seqe, trb->trb_seqe)) { tcp_reass_sacktrack(tp, trb); - m_freem(m); - *tlenp = 0; - return (0); + tp->rcv_reass_dsack.start = trbs.trb_seqs; + tp->rcv_reass_dsack.end = trbs.trb_seqe; + goto done; } tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe); @@ -433,7 +420,7 @@ /* Merge in next blocks if there is overlap. */ while ((trbn = RB_NEXT(tcp_ra, &tp->rcv_reass, trb)) != NULL && - SEQ_GEQ(trbn->trb_seqs, trb->trb_seqe)) { + SEQ_GEQ(trb->trb_seqe, trbn->trb_seqs)) { trbn = tcp_reass_merge(trb, trbn); tcp_reass_free(tp, trbn); } @@ -446,7 +433,7 @@ /* Merge in previous blocks if there is overlap. 
*/ while ((trbn = RB_PREV(tcp_ra, &tp->rcv_reass, trb)) != NULL && - SEQ_GEQ(trbn->trb_seqe, trb->trb_seqs)) { + SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) { trbn = tcp_reass_merge(trb, trbn); tcp_reass_free(tp, trbn); } @@ -460,6 +447,7 @@ KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__)); tcp_reass_sacktrack(tp, trb); tp->rcv_reass_size += SEQ_DELTA(trb->trb_seqs, trb->trb_seqe); + tp->rcv_reass_blocks++; } else if (tp->rcv_nxt == th_seq) { trbn = RB_INSERT(tcp_ra, &tp->rcv_reass, &trbs); KASSERT(trbn == NULL, ("%s: RB_INSERT failed", __func__)); @@ -483,7 +471,7 @@ TCPSTAT_INC(tcps_reass_missingseg); SOCKBUF_LOCK(&so->so_rcv); - + /* We can only ever dequeue one block. */ trb = RB_MIN(tcp_ra, &tp->rcv_reass); if (!(so->so_rcv.sb_state & SBS_CANTRCVMORE)) { sbappendstream_locked(&so->so_rcv, trb->trb_m); @@ -506,14 +494,19 @@ * the sequence space and if queue is not empty. Otherwise * deactivate it. */ - if (tcp_reass_qtimo && !RB_EMPTY(&tp->rcv_reass)) + if (tcp_reass_timeout && !RB_EMPTY(&tp->rcv_reass)) tcp_timer_activate(tp, TT_REASS, - tp->t_rxtcur * tcp_reass_qtimo); + tp->t_rxtcur * tcp_reass_timeout); else tcp_timer_activate(tp, TT_REASS, 0); ND6_HINT(tp); return (thflags); + +done: + m_freem(m); + *tlenp = 0; + return (0); } /* @@ -538,7 +531,11 @@ } trb->trb_seqe = trbn->trb_seqe; trb->trb_mt->m_next = trbn->trb_m; - trb->trb_mt = trbn->trb_mt; + if (tcp_reass_spacetime) { + trb->trb_mt = m_collapse(trb->trb_mt, M_DONTWAIT, 1024); + trb->trb_mt = m_last(trb->trb_mt, NULL); + } else + trb->trb_mt = trbn->trb_mt; } else if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqe)) { if (SEQ_LEQ(trb->trb_seqs, trbn->trb_seqs)) return (trbn); @@ -547,8 +544,12 @@ trb->trb_m = m_trimhead(trb->trb_m); } trb->trb_seqs = trbn->trb_seqs; + trb->trb_m = trbn->trb_m; trbn->trb_mt->m_next = trb->trb_m; - trb->trb_m = trbn->trb_m; + if (tcp_reass_spacetime) { + trbn->trb_mt = m_collapse(trbn->trb_mt, M_DONTWAIT, 1024); + trb->trb_mt = m_last(trbn->trb_mt, NULL); + } } else 
return (NULL); @@ -562,13 +563,15 @@ /* * Put the sequence number of the reassembly queue blocks into * the SACK options of an outgoing segment. + * RFC2018: section ... + * RFC2883: section ... */ int tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks) { + int nsacks = 0; + tcp_seq sack_seq; struct tcp_reass_block *trb; - tcp_seq sack_seq; - int nsacks = 0; INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(numsacks > 0, @@ -576,9 +579,24 @@ KASSERT(!LIST_EMPTY(&tp->rcv_reass_sack), ("%s: sack list empty", __func__)); + /* DSACK */ + if (tp->rcv_reass_dsack.start == tp->rcv_reass_dsack.end) { + sack_seq = htonl(tp->rcv_reass_dsack.start); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + sack_seq = htonl(tp->rcv_reass_dsack.end); + bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); + optp += sizeof(sack_seq); + tp->rcv_reass_dsack.start = 0; + tp->rcv_reass_dsack.end = 0; + numsacks--; + nsacks++; + } + /* - * The most recent block must appear first. RFC2018, Section 4. - * Add the other blocks in most recent created or updated order. + * The most recent block must appear first. Add the other + * blocks in most recent created or updated order. + * RFC2018: section 4 */ LIST_FOREACH(trb, &tp->rcv_reass_sack, trb_sack) { if (numsacks < 1) ==== //depot/projects/tcp_reass/netinet/tcp_var.h#20 (text+ko) ==== @@ -106,8 +106,10 @@ */ struct tcpcb { RB_HEAD(tcp_ra, tcp_reass_block) rcv_reass; /* segment reassembly queue */ + int rcv_reass_size; /* segment reassembly memory usage */ + int rcv_reass_blocks; /* blocks in reassembly queue */ LIST_HEAD(tcp_ras, tcp_reass_block) rcv_reass_sack; /* last additions to reass queue */ - int rcv_reass_size; /* segment reassembly memory usage */ + struct sackblk rcv_reass_dsack; /* DSACK block */ int t_dupacks; /* consecutive dup acks recd */