Date: Thu, 16 Jul 2009 08:49:24 GMT
From: Andre Oppermann <andre@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 166159 for review
Message-ID: <200907160849.n6G8nOhk042411@repoman.freebsd.org>
http://perforce.freebsd.org/chv.cgi?CH=166159

Change 166159 by andre@andre_t61 on 2009/07/16 08:49:06

	Move queue integrity test to its own function.

Affected files ...

.. //depot/projects/tcp_reass/netinet/tcp_reass.c#30 edit

Differences ...

==== //depot/projects/tcp_reass/netinet/tcp_reass.c#30 (text+ko) ====

@@ -1,731 +1,740 @@
-/*-
- * Copyright (c) 2007
- *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
-
-/*
- * Operational overview of TCP reassembly:
- *
- * It is the purpose of TCP reassembly to store segments that are received
- * out of order.  This happens when packets are lost along the way due to
- * various reasons.  The most common one is traffic overload which causes
- * routers to stop accepting packets for brief moments.
- *
- * Upon arrival of the missing segment(s) the whole chain of stored segments
- * is moved into the socket buffer.  In case of multiple missing segments
- * the first consecutive part is moved with the remainder being kept in
- * store until the next missing segment arrives.
- *
- * While in reassembly mode *all* arriving segments are put into the
- * reassembly queue.
- *
- * Instead of storing all segments on their own we build blocks of
- * consecutive segments chained together.  We use a tailq because a new
- * segment has the highest probability to fit the tail of the chain.  If
- * not, the second highest probability is the beginning of the chain for
- * being the missing segment.  Otherwise we cycle through each consecutive
- * block until a match is found.  If a segment matches the end of one block
- * and the start of the next block the two blocks are joined together.  If
- * no match is found a new block is created.
- *
- * This system is very efficient and deals well with long chains
- * and many holes.
- *
- *   trq_tail ----------------------------------------------\
- *   trq_head --> [block] ------> [block] ------> [block] <-/
- *                 m_next          m_next          m_next
- *                   |               |               |
- *                 m_next          m_next          m_next
- *                   |               |               |
- *                 m_next          m_next          m_next
- *
- * The reassembly queue's block structure is also used to track SACK
- * information as a data receiver.  A doubly-linked list is added
- * that links the blocks in the reverse order of their arrival or update.
- * This makes us fully compliant with RFC 2018 Section 4, including all
- * optional parts marked as "SHOULD".
- *
- * TODO:
- * A further improvement is to merge the content of mbufs together if the
- * preceding one has enough space to hold the data of the new one.  When
- * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
- * them in place.  Only when trimming from the tail does it actually free
- * them.  Normally we don't get mbuf chains so this isn't too much of a
- * concern right now.  Use m_collapse() to compact the mbuf chains within
- * the blocks.
- */
-
-#include "opt_inet.h"
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-#include <sys/syslog.h>
-#include <sys/systm.h>
-#include <sys/vimage.h>
-
-#include <vm/uma.h>
-
-#include <netinet/in.h>
-#include <netinet/in_pcb.h>
-#include <netinet/in_systm.h>
-#include <netinet/tcp.h>
-#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_seq.h>
-#include <netinet/tcp_timer.h>
-#include <netinet/tcp_var.h>
-
-static VNET_DEFINE(int, tcp_reass_maxseg);
-VNET_DEFINE(int, tcp_reass_qsize);
-static VNET_DEFINE(int, tcp_reass_maxqlen);
-static VNET_DEFINE(int, tcp_reass_overflows);
-
-#define	V_tcp_reass_maxseg	VNET_GET(tcp_reass_maxseg)
-#define	V_tcp_reass_maxqlen	VNET_GET(tcp_reass_maxqlen)
-#define	V_tcp_reass_overflows	VNET_GET(tcp_reass_overflows)
-
-SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
-    "TCP Segment Reassembly Queue");
-
-static int tcp_reass_enabled = 1;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
-    &tcp_reass_enabled, 0,
-    "Enable/disable use of TCP Reassembly Queue");
-
-static int tcp_reass_maxblocks = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
-    &tcp_reass_maxblocks, 0,
-    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
-
-static int tcp_reass_qsize = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
-    &tcp_reass_qsize, 0,
-    "Global number of TCP Segment Blocks currently in Reassembly Queue");
-
-static int tcp_reass_qtimo = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
-    &tcp_reass_qtimo, 0,
-    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
-
-static int tcp_reass_spacetime = 0;
-SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
-    &tcp_reass_spacetime, 0,
-    "Reassembly Queue strategy of space vs. time efficiency");
-
-static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
-
-uma_zone_t tcp_reass_zone;
-
-static __inline void
-sack_track(struct tcpcb *tp, struct trq *tqe)
-{
-
-	if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
-		LIST_REMOVE(tqe, trq_s);
-		LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
-	}
-}
-
-/* Trim empty mbufs from head of chain. */
-static struct mbuf *
-m_trimhead(struct mbuf *m)
-{
-	struct mbuf *n;
-
-	while (m->m_len == 0) {
-		n = m;
-		m = m->m_next;
-		m_free(n);
-	}
-	return (m);
-}
-
-static u_int
-m_storagesize(struct mbuf *m)
-{
-	u_int mcnt;
-
-	for (mcnt = 0; m != NULL; m = m->m_next)
-		mcnt += (m->m_flags & M_EXT) ?
-		    m->m_ext.ext_size + MSIZE : MSIZE;
-	return (mcnt);
-}
-
-/*
- * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
- */
-static void
-tcp_reass_zone_change(void *tag)
-{
-
-	tcp_reass_maxblocks = nmbclusters / 16;
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-}
-
-VNET_DEFINE(uma_zone_t, tcp_reass_zone);
-
-/*
- * Initialize TCP reassembly zone on startup.
- */
-void
-tcp_reass_init(void)
-{
-
-	/* XXX: nmbclusters may be zero. */
-	tcp_reass_maxblocks = nmbclusters / 16;
-	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
-	    &tcp_reass_maxblocks);
-	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
-	EVENTHANDLER_REGISTER(nmbclusters_change,
-	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
-}
-
-/*
- * Insert segments into the reassembly queue.
- *
- * NB: We must always consume the mbuf.  Either by appending it to
- * the queue or by freeing it.
- */
-int
-tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
-{
-	struct trq *tqe, *tqen;
-	struct socket *so = tp->t_inpcb->inp_socket;
-	struct mbuf *n;
-	int i, thflags = 0, mcnt;
-	tcp_seq th_seq;
-	struct trq tqes;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	/*
-	 * Call with th==NULL after becoming established to
-	 * force pre-ESTABLISHED data up to user socket.
-	 * XXX: Was used for T/TCP, of which some code remains.
-	 */
-	if (th == NULL) {
-		if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
-		    TAILQ_EMPTY(&tp->t_trq) ||
-		    ((tqe = TAILQ_FIRST(&tp->t_trq)) &&
-		    tqe->trq_seq != tp->rcv_nxt))
-			return (0);
-		goto present;
-	}
-
-	/*
-	 * Store TCP header information in local variables as
-	 * we may lose access to it after mbuf compacting.
-	 */
-	thflags = th->th_flags;
-	th_seq = th->th_seq;
-	th = NULL;		/* Prevent further use. */
-
-	/* Check if it is really necessary to do all the work. */
-	if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) {
-		*tlenp = 0;
-		m_freem(m);
-		return (0);
-	}
-
-	KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq),
-	    ("%s: sequence number below rcv_nxt", __func__));
-	KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)),
-	    ("%s: got missing segment but queue is empty", __func__));
-
-#ifdef INVARIANTS
-	i = 0;
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: overlapping blocks", __func__));
-		i++;
-	}
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
-		i--;
-	}
-	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
-	    "SACK list are not equally long", __func__));
-#endif
-
-	/*
-	 * Limit the number of segments in the reassembly queue to prevent
-	 * holding on to too many segments (and thus running out of mbufs).
-	 * Make sure to let the missing segment that caused this queue
-	 * through in any case.
-	 *
-	 * Count the gross space used by the mbufs in the reassembly queue
-	 * and limit it to the free space in the socket buffer.  This way
-	 * the reassembly queue can never consume more mbuf space than the
-	 * socket buffer got allocated anyway and it reflects the actual
-	 * amount of kernel memory used.  This effectively prevents mbuf
-	 * exhaustion due to pathological traffic (one byte segments with
-	 * a hole each time) on a single connection.
-	 *
-	 * Counting the gross mbuf space effectively sets the net data
-	 * limit lower than the socket buffer would allow.
-	 * Don't underestimate the effective free space in the socket
-	 * buffer vs. actual real data with 2k clusters and 1500 byte
-	 * packets by introducing a correction factor of 11/8th.
-	 */
-	if (th_seq != tp->rcv_nxt &&
-	    tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) {
-		TCPSTAT_INC(tcps_reass_overflow);
-		TCPSTAT_INC(tcps_rcvmemdrop);
-		m_freem(m);
-		*tlenp = 0;
-		return (0);
-	}
-
-	/* Get rid of packet header and mtags. */
-	m_demote(m, 1);
-
-	/* Trim empty mbufs from head of chain. */
-	m = m_trimhead(m);
-
-	/* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */
-	mcnt = m_storagesize(m);
-
-	/*
-	 * FIN handling is a bit tricky.
-	 * We cannot trust a FIN that goes into the reassembly queue.
-	 * It can be easily spoofed as it may be anywhere in the receive
-	 * window (see RST attack mitigation in tcp-secure).
-	 * For this reason (and complexity avoidance) we generally ignore
-	 * any FIN arriving at the reassembly queue with one exception:
-	 * when it exactly matches rcv_nxt together with any data in the
-	 * same segment we can conclude it to be genuine and proceed with
-	 * flushing any other data waiting in the reassembly queue.
-	 * A FIN is part of the sequence space and will get retransmitted
-	 * if it was genuine.
-	 * This approach is based on a discussion on the TCPM mailing list.
-	 */
-	if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) {
-		tcp_reass_qfree(tp);
-		tqe = NULL;
-		if (m->m_len == 0) {
-			tcp_timer_activate(tp, TT_REASS, 0);
-			return (thflags);
-		}
-		goto insert;
-	} else
-		thflags &= ~TH_FIN;
-
-	/* Check if this is the first segment. */
-	if (TAILQ_EMPTY(&tp->t_trq))
-		goto insert;
-
-	/* Starting point for the following tests. */
-	tqe = TAILQ_LAST(&tp->t_trq, trq_head);
-
-	/* Check if this segment directly attaches to the end. */
-	if (tqe->trq_seq + tqe->trq_len == th_seq) {
-		tqe->trq_len += *tlenp;
-		tqe->trq_mcnt += mcnt;
-		tp->t_trqmcnt += mcnt;
-		tqe->trq_ml->m_next = m;
-		tqe->trq_ml = m_last(m);
-		if (tcp_reass_spacetime) {
-			tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024);
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			tqe->trq_mcnt = m_storagesize(tqe->trq_m);
-			tp->t_trqmcnt += tqe->trq_mcnt;
-		}
-		sack_track(tp, tqe);
-		/* TCP statistics. */
-		TCPSTAT_INC(tcps_rcvoopack);
-		TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-		TCPSTAT_INC(tcps_reass_tail);
-		return (0);
-	}
-
-	/* Check if beyond last block. */
-	if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		goto insert;
-
-	/* Check if this is the missing segment. */
-	if (tp->rcv_nxt == th_seq) {
-		tqe = TAILQ_FIRST(&tp->t_trq);
-		KASSERT(SEQ_GT(tqe->trq_seq, th_seq),
-		    ("%s: first block starts below missing segment", __func__));
-		/* Check if segment prepends first block. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) {
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			goto present;
-		}
-		goto insert;	/* No statistics, this segment is in line. */
-	}
-
-	/* TCP statistics. */
-	TCPSTAT_INC(tcps_rcvoopack);
-	TCPSTAT_ADD(tcps_rcvoobyte, *tlenp);
-
-	/* See where it fits. */
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		/* Segment is after this block's coverage. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-			continue;
-		/* Segment is after the previous one but before this one. */
-		if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp))
-			break;		/* Insert as new block. */
-
-		/* Segment is already fully covered. */
-		if (SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			TCPSTAT_INC(tcps_rcvduppack);
-			TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
-			TCPSTAT_INC(tcps_reass_covered);
-			/*
-			 * XXXAO: What should we report in SACK for a
-			 * duplicate?  See RFC 2883: D-SACK (Duplicate SACK).
-			 */
-			sack_track(tp, tqe);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-
-		/* Segment covers and extends on both ends. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			/* Replace block content. */
-			tp->t_trqmcnt -= tqe->trq_mcnt;
-			m_freem(tqe->trq_m);
-			tqe->trq_len = *tlenp;
-			tqe->trq_mcnt = mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			tqe->trq_m = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges next block to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_replace);
-			return (0);
-		}
-
-		/* Segment prepends to this block. */
-		if (SEQ_GT(tqe->trq_seq, th_seq) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) {
-			KASSERT(!(thflags & TH_FIN),
-			    ("%s: new segment with FIN can't prepend", __func__));
-			/* Trim tail of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) {
-				m_adj(m, -i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-				/* Update accounting. */
-				mcnt = m_storagesize(m);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_seq = th_seq;
-			n = m_last(m);
-			n->m_next = tqe->trq_m;
-			tqe->trq_m = m;
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_prepend);
-			return (0);
-		}
-
-		/* Segment appends to this block. */
-		if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) &&
-		    SEQ_LEQ(tqe->trq_seq, th_seq) &&
-		    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) {
-			/* Trim head of segment. */
-			if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len,
-			    th_seq))) {
-				m_adj(m, i);
-				*tlenp -= i;
-				/* TCP statistics. */
-				TCPSTAT_INC(tcps_rcvpartduppack);
-				TCPSTAT_ADD(tcps_rcvpartdupbyte, i);
-			}
-			tqe->trq_len += *tlenp;
-			tqe->trq_mcnt += mcnt;
-			tp->t_trqmcnt += mcnt;
-			tqe->trq_ml->m_next = m;
-			tqe->trq_ml = m_last(m);
-			/* Check if segment bridges two blocks to merge. */
-			if (tqen != NULL &&
-			    SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))
-				tcp_reass_merge(tp, tqe, tqen);
-			sack_track(tp, tqe);
-			TCPSTAT_INC(tcps_reass_append);
-			return (0);
-		}
-	}
-
-insert:
-	/* Prepare to insert into block queue. */
-	if (tp->rcv_nxt == th_seq) {
-		/*
-		 * Use a temporary struct trq on the stack for the missing
-		 * segment to prevent blocking of all reassembly queues
-		 * due to zone exhaustion.
-		 */
-		tqen = &tqes;
-	} else {
-		tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO));
-		if (tqen == NULL) {
-			TCPSTAT_INC(tcps_rcvmemdrop);
-			m_freem(m);
-			*tlenp = 0;
-			return (0);
-		}
-		TCPSTAT_INC(tcps_reass_blocks);
-	}
-	tcp_reass_qsize++;
-	if (tcp_reass_spacetime) {
-		m = m_collapse(m, M_DONTWAIT, 1024);
-		mcnt = m_storagesize(m);
-	}
-	tqen->trq_seq = th_seq;
-	tqen->trq_len = *tlenp;
-	tqen->trq_mcnt = mcnt;
-	tp->t_trqmcnt += mcnt;
-	tqen->trq_m = m;
-	tqen->trq_ml = m_last(m);
-
-	/* Where to insert. */
-	if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq))
-		TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q);
-	else if (tqe != NULL)
-		TAILQ_INSERT_BEFORE(tqe, tqen, trq_q);
-	else {
-		KASSERT(TAILQ_EMPTY(&tp->t_trq),
-		    ("%s: inserting first element but queue not empty",
-		    __func__));
-		TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q);
-		/*
-		 * Flush the reassembly queue after x times the
-		 * current retransmit interval measured from the
-		 * arrival time of the first segment.
-		 */
-		if (tcp_reass_qtimo)
-			tcp_timer_activate(tp, TT_REASS,
-			    tp->t_rxtcur * tcp_reass_qtimo);
-	}
-	LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s);
-
-	/* Missing segment? */
-	if (tp->rcv_nxt != th_seq)
-		return (0);
-present:
-	/*
-	 * Present data to user, advancing rcv_nxt through the
-	 * completed sequence space.
-	 */
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: queue empty at present", __func__));
-	KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt,
-	    ("%s: first block does not match rcv_nxt", __func__));
-	TCPSTAT_INC(tcps_reass_missingseg);
-
-	SOCKBUF_LOCK(&so->so_rcv);
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
-		    ("%s: trq_seq < rcv_nxt", __func__));
-		KASSERT(tqen == NULL ||
-		    SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-		    ("%s: block overlaps into next one", __func__));
-
-		if (tqe->trq_seq != tp->rcv_nxt)
-			break;
-		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(tqe->trq_m);
-		else
-			sbappendstream_locked(&so->so_rcv, tqe->trq_m);
-		tp->rcv_nxt += tqe->trq_len;
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		if (tqe != &tqes)
-			uma_zfree(tcp_reass_zone, tqe);
-		tcp_reass_qsize--;
-	}
-	/* NB: sorwakeup_locked() does an implicit socket buffer unlock. */
-	sorwakeup_locked(so);
-
-	/*
-	 * Restart the reassembly queue flush timer after advancing
-	 * the sequence space and if the queue is not empty.  Otherwise
-	 * deactivate it.
-	 */
-	if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq))
-		tcp_timer_activate(tp, TT_REASS,
-		    tp->t_rxtcur * tcp_reass_qtimo);
-	else
-		tcp_timer_activate(tp, TT_REASS, 0);
-
-	ND6_HINT(tp);
-	return (thflags);
-}
-
-/*
- * Merge one or more consecutive blocks together.
- */
-static void
-tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen)
-{
-	int i;
-
-	KASSERT(tqe != NULL && tqen != NULL,
-	    ("%s: incomplete input", __func__));
-	KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
-	    ("%s: blocks do not overlap, nothing to merge", __func__));
-
-	/* Appended block may reach beyond next block. */
-	while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len,
-	    tqen->trq_seq + tqen->trq_len)) {
-		/* TCP statistics. */
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len);
-		TCPSTAT_INC(tcps_reass_covered);
-		tp->t_trqmcnt -= tqen->trq_mcnt;
-		m_freem(tqen->trq_m);
-		TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-		LIST_REMOVE(tqen, trq_s);
-		uma_zfree(tcp_reass_zone, tqen);
-		tcp_reass_qsize--;
-		/* And the one after that. */
-		if ((tqen = TAILQ_NEXT(tqe, trq_q)) == NULL)
-			return;
-	}
-
-	/* Trim head of next block. */
-	if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, tqen->trq_seq))) {
-		m_adj(tqen->trq_m, i);
-		tqen->trq_len -= i;
-		TCPSTAT_ADD(tcps_rcvpartdupbyte, i);	/* Statistics. */
-		/* Dispose of empty mbufs. */
-		if (tcp_reass_spacetime) {
-			tqen->trq_m = m_trimhead(tqen->trq_m);
-			tqen->trq_mcnt = m_storagesize(tqen->trq_m);
-		}
-		KASSERT(tqen->trq_m != NULL,
-		    ("%s: no remaining mbufs in block", __func__));
-	}
-
-	/* Merge blocks together. */
-	tqe->trq_len += tqen->trq_len;
-	tqe->trq_mcnt += tqen->trq_mcnt;
-	tqe->trq_ml->m_next = tqen->trq_m;
-	tqe->trq_ml = tqen->trq_ml;
-	TAILQ_REMOVE(&tp->t_trq, tqen, trq_q);
-	LIST_REMOVE(tqen, trq_s);
-	uma_zfree(tcp_reass_zone, tqen);
-	tcp_reass_qsize--;
-	TCPSTAT_INC(tcps_reass_merge);
-}
-
-/*
- * Put the sequence numbers of the reassembly queue blocks into
- * the SACK options of an outgoing segment.
- */
-int
-tcp_reass_sack(struct tcpcb *tp, u_char *optp, int numsacks)
-{
-	struct trq *tqe;
-	tcp_seq sack_seq;
-	int nsacks = 0;
-
-	KASSERT(numsacks > 0,
-	    ("%s: zero sack blocks to add", __func__));
-	KASSERT(!TAILQ_EMPTY(&tp->t_trq),
-	    ("%s: reassembly queue empty", __func__));
-	KASSERT(!LIST_EMPTY(&tp->t_trq_sack),
-	    ("%s: sack list empty", __func__));
-
-	/*
-	 * The most recent block must appear first.  RFC 2018, Section 4.
-	 * Add the other blocks in most recently created or updated order.
-	 */
-	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
-		if (numsacks < 1)
-			break;
-		sack_seq = htonl(tqe->trq_seq);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		sack_seq = htonl(tqe->trq_seq + tqe->trq_len);
-		bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq));
-		optp += sizeof(sack_seq);
-		numsacks--;
-		nsacks++;
-	}
-
-	return (nsacks);
-}
-
-/*
- * Free the reassembly queue on tcpcb disposal or on general memory shortage.
- */
-void
-tcp_reass_qfree(struct tcpcb *tp)
-{
-	struct trq *tqe, *tqen;
-
-	INP_WLOCK_ASSERT(tp->t_inpcb);
-
-	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
-		m_freem(tqe->trq_m);
-		KASSERT(tp->t_trqmcnt >= tqe->trq_mcnt,
-		    ("%s: t_trqmcnt incorrect", __func__));
-		tp->t_trqmcnt -= tqe->trq_mcnt;
-		TAILQ_REMOVE(&tp->t_trq, tqe, trq_q);
-		LIST_REMOVE(tqe, trq_s);
-		uma_zfree(tcp_reass_zone, tqe);
-		tcp_reass_qsize--;
-	}
-	tcp_timer_activate(tp, TT_REASS, 0);
-}
+/*-
+ * Copyright (c) 2007
+ *	Andre Oppermann, Internet Business Solutions AG.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $");
+
+/*
+ * Operational overview of TCP reassembly:
+ *
+ * It is the purpose of TCP reassembly to store segments that are received
+ * out of order.  This happens when packets are lost along the way due to
+ * various reasons.  The most common one is traffic overload which causes
+ * routers to stop accepting packets for brief moments.
+ *
+ * Upon arrival of the missing segment(s) the whole chain of stored segments
+ * is moved into the socket buffer.  In case of multiple missing segments
+ * the first consecutive part is moved with the remainder being kept in
+ * store until the next missing segment arrives.
+ *
+ * While in reassembly mode *all* arriving segments are put into the
+ * reassembly queue.
+ *
+ * Instead of storing all segments on their own we build blocks of
+ * consecutive segments chained together.  We use a tailq because a new
+ * segment has the highest probability to fit the tail of the chain.  If
+ * not, the second highest probability is the beginning of the chain for
+ * being the missing segment.  Otherwise we cycle through each consecutive
+ * block until a match is found.  If a segment matches the end of one block
+ * and the start of the next block the two blocks are joined together.  If
+ * no match is found a new block is created.
+ *
+ * This system is very efficient and deals well with long chains
+ * and many holes.
+ *
+ *   trq_tail ----------------------------------------------\
+ *   trq_head --> [block] ------> [block] ------> [block] <-/
+ *                 m_next          m_next          m_next
+ *                   |               |               |
+ *                 m_next          m_next          m_next
+ *                   |               |               |
+ *                 m_next          m_next          m_next
+ *
+ * The reassembly queue's block structure is also used to track SACK
+ * information as a data receiver.  A doubly-linked list is added
+ * that links the blocks in the reverse order of their arrival or update.
+ * This makes us fully compliant with RFC 2018 Section 4, including all
+ * optional parts marked as "SHOULD".
+ *
+ * TODO:
+ * A further improvement is to merge the content of mbufs together if the
+ * preceding one has enough space to hold the data of the new one.  When
+ * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves
+ * them in place.  Only when trimming from the tail does it actually free
+ * them.  Normally we don't get mbuf chains so this isn't too much of a
+ * concern right now.  Use m_collapse() to compact the mbuf chains within
+ * the blocks.
+ */
+
+#include "opt_inet.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vimage.h>
+
+#include <vm/uma.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+
+static VNET_DEFINE(int, tcp_reass_maxseg);
+VNET_DEFINE(int, tcp_reass_qsize);
+static VNET_DEFINE(int, tcp_reass_maxqlen);
+static VNET_DEFINE(int, tcp_reass_overflows);
+
+VNET_DEFINE(uma_zone_t, tcp_reass_zone);
+
+#define	V_tcp_reass_maxseg	VNET_GET(tcp_reass_maxseg)
+#define	V_tcp_reass_maxqlen	VNET_GET(tcp_reass_maxqlen)
+#define	V_tcp_reass_overflows	VNET_GET(tcp_reass_overflows)
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_enabled = 1;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR,
+    &tcp_reass_enabled, 0,
+    "Enable/disable use of TCP Reassembly Queue");
+
+static int tcp_reass_maxblocks = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN,
+    &tcp_reass_maxblocks, 0,
+    "Global maximum number of TCP Segment Blocks in Reassembly Queue");
+
+static int tcp_reass_qsize = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD,
+    &tcp_reass_qsize, 0,
+    "Global number of TCP Segment Blocks currently in Reassembly Queue");
+
+static int tcp_reass_qtimo = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW,
+    &tcp_reass_qtimo, 0,
+    "Reassembly Queue Timeout in multiples of the Retransmission Timeout");
+
+static int tcp_reass_spacetime = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW,
+    &tcp_reass_spacetime, 0,
+    "Reassembly Queue strategy of space vs. time efficiency");
+
+static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *);
+
+static __inline void
+sack_track(struct tcpcb *tp, struct trq *tqe)
+{
+
+	if (LIST_FIRST(&tp->t_trq_sack) != tqe) {
+		LIST_REMOVE(tqe, trq_s);
+		LIST_INSERT_HEAD(&tp->t_trq_sack, tqe, trq_s);
+	}
+}
+
+/* Trim empty mbufs from head of chain. */
+static struct mbuf *
+m_trimhead(struct mbuf *m)
+{
+	struct mbuf *n;
+
+	while (m->m_len == 0) {
+		n = m;
+		m = m->m_next;
+		m_free(n);
+	}
+	return (m);
+}
+
+static u_int
+m_storagesize(struct mbuf *m)
+{
+	u_int mcnt;
+
+	for (mcnt = 0; m != NULL; m = m->m_next)
+		mcnt += (m->m_flags & M_EXT) ?
+		    m->m_ext.ext_size + MSIZE : MSIZE;
+	return (mcnt);
+}
+
+/*
+ * Adjust TCP reassembly zone limits when the nmbclusters zone changes.
+ */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+	tcp_reass_maxblocks = nmbclusters / 16;
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+}
+
+#ifdef INVARIANTS
+static int
+tcp_reass_verify(struct tcpcb *tp)
+{
+	struct trq *tqe, *tqen;
+	int i = 0;
+
+	TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) {
+		KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt),
+		    ("%s: trq_seq < rcv_nxt", __func__));
+		KASSERT(tqen == NULL ||
+		    SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq),
+		    ("%s: overlapping blocks", __func__));
+		i++;
+	}
+	LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) {
+		i--;
+	}
+	KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered "
+	    "SACK list are not equally long", __func__));
+	return (0);
+}
+#endif
+
+/*
+ * Initialize TCP reassembly zone on startup.
+ */
+void
+tcp_reass_init(void)
+{
+
+	/* XXX: nmbclusters may be zero. */
+	tcp_reass_maxblocks = nmbclusters / 16;
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks",
+	    &tcp_reass_maxblocks);
+	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks);
+	EVENTHANDLER_REGISTER(nmbclusters_change,
+	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
+
+/*
+ * Insert segments into the reassembly queue.
+ *
+ * NB: We must always consume the mbuf.  Either by appending it to
+ * the queue or by freeing it.
+ */
+int
+tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
+{
+	struct trq *tqe, *tqen;
+	struct socket *so = tp->t_inpcb->inp_socket;
+	struct mbuf *n;
+	int i, thflags = 0, mcnt;
+	tcp_seq th_seq;
+	struct trq tqes;
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	/*

>>> TRUNCATED FOR MAIL (1000 lines) <<<
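
The call sites of the new tcp_reass_verify() fall inside the truncated part
of the diff, so they are not visible here.  A minimal sketch of how the
extracted check can be invoked from tcp_reass(); the placement and calling
style below are assumptions for illustration, not part of the visible diff:

	/*
	 * Hypothetical call site: tcp_reass_verify() performs its checks
	 * through internal KASSERTs, so a caller compiled with INVARIANTS
	 * can invoke it for the side effects and ignore the return value.
	 */
	#ifdef INVARIANTS
		(void)tcp_reass_verify(tp);
	#endif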
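
An aside on the 11/8th correction factor in the queue limit check
(tp->t_trqmcnt > sbspace(&so->so_rcv) / 8 * 11): gross mbuf accounting
overstates the net data held.  A worked example, assuming MSIZE = 256 and
2 KB clusters (both sizes are assumptions of this note, not taken from
the diff):

	gross  = 2048 + 256 = 2304 bytes   (one cluster plus mbuf header)
	net    = 1460 bytes                (MSS-sized payload)
	ratio  = 2304 / 1460 ~= 1.58
	factor = 11 / 8    =  1.375

Scaling the free socket buffer space by 11/8 thus recovers most, but not
all, of the overstatement, keeping the effective net data limit below what
the socket buffer would nominally allow.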