Date: Fri, 17 Jul 2009 18:22:51 GMT From: Andre Oppermann <andre@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 166205 for review Message-ID: <200907171822.n6HIMpnc007001@repoman.freebsd.org>
index | next in thread | raw e-mail
http://perforce.freebsd.org/chv.cgi?CH=166205 Change 166205 by andre@andre_flirtbox on 2009/07/17 18:22:49 More fixes. Affected files ... .. //depot/projects/tcp_reass/netinet/tcp_input.c#12 edit .. //depot/projects/tcp_reass/netinet/tcp_output.c#14 edit .. //depot/projects/tcp_reass/netinet/tcp_reass.c#32 edit .. //depot/projects/tcp_reass/netinet/tcp_sack.c#8 edit .. //depot/projects/tcp_reass/netinet/tcp_subr.c#10 edit .. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#12 edit .. //depot/projects/tcp_reass/netinet/tcp_var.h#16 edit Differences ... ==== //depot/projects/tcp_reass/netinet/tcp_input.c#12 (text+ko) ==== @@ -1245,7 +1245,7 @@ tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && - LIST_EMPTY(&tp->t_segq) && + TAILQ_EMPTY(&tp->t_trq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { @@ -1386,10 +1386,6 @@ panic("%s: ti_locked %d on pure data " "segment", __func__, ti_locked); ti_locked = TI_UNLOCKED; - - /* Clean receiver SACK report if present */ - if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) - tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* @@ -2560,7 +2556,6 @@ */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { - tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue @@ -2575,7 +2570,7 @@ * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && - LIST_EMPTY(&tp->t_segq) && + TAILQ_EMPTY(&tp->t_trq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; @@ -2600,11 +2595,9 @@ * m_adj() doesn't actually frees any mbufs * when trimming from the head. */ - thflags = tcp_reass(tp, th, &tlen, m); + thflags |= tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } - if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) - tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into ==== //depot/projects/tcp_reass/netinet/tcp_output.c#14 (text+ko) ==== @@ -49,6 +49,7 @@ #include <sys/socketvar.h> #include <sys/sysctl.h> #include <sys/vimage.h> +#include <sys/syslog.h> #include <net/if.h> #include <net/route.h> @@ -141,7 +142,7 @@ struct mbuf *m; struct ip *ip = NULL; struct ipovly *ipov = NULL; - struct tcphdr *th; + struct tcphdr *th = NULL; u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; #ifdef IPSEC @@ -152,6 +153,8 @@ struct sackhole *p; int tso = 0; struct tcpopt to; + char *s; + int ipout = 0; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -476,7 +479,7 @@ if (len > tp->t_maxseg) { if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && + TAILQ_EMPTY(&tp->t_trq) && sack_rxmit == 0 && tp->t_inpcb->inp_options == NULL && tp->t_inpcb->in6p_options == NULL #ifdef IPSEC @@ -683,10 +686,9 @@ to.to_flags |= TOF_SACKPERM; else if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->t_flags & TF_SACK_PERMIT) && - tp->rcv_numsacks > 0) { + !TAILQ_EMPTY(&tp->t_trq)) { to.to_flags |= TOF_SACK; - to.to_nsacks = tp->rcv_numsacks; - to.to_sacks = (u_char *)tp->sackblks; + to.to_sacks = (u_char *)tp; } } #ifdef TCP_SIGNATURE @@ -1184,6 +1186,8 @@ if (V_path_mtu_discovery) ip->ip_off |= IP_DF; + ipout = 1; + error = ip_output(m, tp->t_inpcb->inp_options, NULL, ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0, tp->t_inpcb); @@ -1217,6 +1221,13 @@ } out: SOCKBUF_UNLOCK_ASSERT(&so->so_snd); /* Check gotos. */ + + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, NULL, NULL, NULL))) { + log(LOG_DEBUG, "%s; %s: error %i while sending (ip_output %i)\n", + s, __func__, error, ipout); + free(s, M_TCPLOG); + } + switch (error) { case EPERM: tp->t_softerror = error; @@ -1410,8 +1421,6 @@ case TOF_SACK: { int sackblks = 0; - struct sackblk *sack = (struct sackblk *)to->to_sacks; - tcp_seq sack_seq; while (!optlen || optlen % 4 != 2) { optlen += TCPOLEN_NOP; @@ -1421,19 +1430,11 @@ continue; optlen += TCPOLEN_SACKHDR; *optp++ = TCPOPT_SACK; - sackblks = min(to->to_nsacks, - (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); - *optp++ = TCPOLEN_SACKHDR + sackblks * TCPOLEN_SACK; - while (sackblks--) { - sack_seq = htonl(sack->start); - bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); - optp += sizeof(sack_seq); - sack_seq = htonl(sack->end); - bcopy((u_char *)&sack_seq, optp, sizeof(sack_seq)); - optp += sizeof(sack_seq); - optlen += TCPOLEN_SACK; - sack++; - } + sackblks = tcp_reass_sack((struct tcpcb *)to->to_sacks, + optp + 1, (TCP_MAXOLEN - optlen) / TCPOLEN_SACK); + *optp++ = TCPOLEN_SACKHDR + (sackblks * TCPOLEN_SACK); + optlen += TCPOLEN_SACK * sackblks; + optp += sizeof(tcp_seq) * 2 * sackblks; TCPSTAT_INC(tcps_sack_send_blocks); break; } ==== //depot/projects/tcp_reass/netinet/tcp_reass.c#32 (text+ko) ==== @@ -1,740 +1,742 @@ -/*- - * Copyright (c) 2007 - * Andre Oppermann, Internet Business Solutions AG. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 - * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $ - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $"); - -/* - * Operational overview of TCP reassembly: - * - * It is the purpose of tcp reassembly to store segments that are received - * out of order. This happens when packets are lost along the way due to - * various reasons. The most common one is traffic overload which causes - * routers to stop accepting packets for brief moments. - * - * Upon arrival of the missing segment(s) the whole chain of stored segments - * is moved into the socket buffer. In case of multiple missing segments - * the first consequtive part is moved with the remainder being kept in - * store until the next missing segment arrives. - * - * While in reassembly mode *all* arrving segments are put into the reassembly - * queue. - * - * Instead of storing all segments on their own we build blocks of consequtive - * segments chained together. We use a tailq because a new segments has the - * highest probability to fit the tail of the chain. If not, the second - * highest probability is the beginning of the chain for being the missing - * segment. Otherwise we cycle through each consequtive block until a match - * is found. If a segment matches the end of one block and the start of the - * next block the two blocks are joined together. If no match is found a - * new block is created. - * - * This system is very efficient and can deal efficiently with long chains - * and many holes. - * - * trq_tail ----------------------------------------------\ - * trq_head --> [block] ------> [block] ------> [block] <-/ - * m_next m_next m_next - * | | | - * m_next m_next m_next - * | | | - * m_next m_next m_next - * - * - * The reassembly queues block structure is also used to track SACK - * information as a data receiver. A double-linked list is added - * that links the blocks the reverse order of their arrival or updating. - * This makes us fully compliant to RFC2018 Section 4 including all - * optional parts marked as "SHOULD". - * - * TODO: - * A further improvement is to merge the content of mbufs together if the - * preceeding one has enough space to hold the data of the new one. When - * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves - * them in place. Only when trimming from the tail it actually frees them. - * Normally we don't get mbuf chains so this isn't too much of a concern - * right now. Use m_collapse() to compact the mbuf chains within the - * blocks. - */ - -#include "opt_inet.h" - -#include <sys/param.h> -#include <sys/kernel.h> -#include <sys/malloc.h> -#include <sys/mbuf.h> -#include <sys/socket.h> -#include <sys/socketvar.h> -#include <sys/sysctl.h> -#include <sys/syslog.h> -#include <sys/systm.h> -#include <sys/vimage.h> - -#include <vm/uma.h> - -#include <netinet/in.h> -#include <netinet/in_pcb.h> -#include <netinet/in_systm.h> -#include <netinet/tcp.h> -#include <netinet/tcp_fsm.h> -#include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> -#include <netinet/tcp_var.h> - -static VNET_DEFINE(int, tcp_reass_maxseg); -VNET_DEFINE(int, tcp_reass_qsize); -static VNET_DEFINE(int, tcp_reass_maxqlen); -static VNET_DEFINE(int, tcp_reass_overflows); - -VNET_DEFINE(uma_zone_t, tcp_reass_zone); - -#define V_tcp_reass_maxseg VNET_GET(tcp_reass_maxseg) -#define V_tcp_reass_maxqlen VNET_GET(tcp_reass_maxqlen) -#define V_tcp_reass_overflows VNET_GET(tcp_reass_overflows) - -SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, - "TCP Segment Reassembly Queue"); - -static int tcp_reass_enabled = 1; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR, - &tcp_reass_enabled, 0, - "Enable/disable use of TCP Reassembly Queue"); - -static int tcp_reass_maxblocks = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN, - &tcp_reass_maxblocks, 0, - "Global maximum number of TCP Segment Blocks in Reassembly Queue"); - -static int tcp_reass_qsize = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD, - &tcp_reass_qsize, 0, - "Global number of TCP Segment Blocks currently in Reassembly Queue"); - -static int tcp_reass_qtimo = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW, - &tcp_reass_qtimo, 0, - "Reassembly Queue Timeout in multiples of the Retransmission Timeout"); - -static int tcp_reass_spacetime = 0; -SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW, - &tcp_reass_spacetime, 0, - "Reassembly Queue strategy of space vs. time efficiency"); - -static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *); - -static __inline void -sack_track(struct tcpcb *tp, struct trq *tqe) { - if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) { - LIST_REMOVE((tqe), trq_s); - LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s); - } -} - -/* Trim empty mbufs from head of chain. */ -static struct mbuf * -m_trimhead(struct mbuf *m) { - struct mbuf *n; - - while (m->m_len == 0) { - n = m; - m = m->m_next; - m_free(n); - } - return (m); -} - -static u_int -m_storagesize(m) { +/*- + * Copyright (c) 2007 + * Andre Oppermann, Internet Business Solutions AG. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.362 2009/07/14 22:48:30 rwatson Exp $"); + +/* + * Operational overview of TCP reassembly: + * + * It is the purpose of tcp reassembly to store segments that are received + * out of order. This happens when packets are lost along the way due to + * various reasons. The most common one is traffic overload which causes + * routers to stop accepting packets for brief moments. + * + * Upon arrival of the missing segment(s) the whole chain of stored segments + * is moved into the socket buffer. In case of multiple missing segments + * the first consequtive part is moved with the remainder being kept in + * store until the next missing segment arrives. + * + * While in reassembly mode *all* arrving segments are put into the reassembly + * queue. + * + * Instead of storing all segments on their own we build blocks of consequtive + * segments chained together. We use a tailq because a new segments has the + * highest probability to fit the tail of the chain. If not, the second + * highest probability is the beginning of the chain for being the missing + * segment. Otherwise we cycle through each consequtive block until a match + * is found. If a segment matches the end of one block and the start of the + * next block the two blocks are joined together. If no match is found a + * new block is created. + * + * This system is very efficient and can deal efficiently with long chains + * and many holes. + * + * trq_tail ----------------------------------------------\ + * trq_head --> [block] ------> [block] ------> [block] <-/ + * m_next m_next m_next + * | | | + * m_next m_next m_next + * | | | + * m_next m_next m_next + * + * + * The reassembly queues block structure is also used to track SACK + * information as a data receiver. A double-linked list is added + * that links the blocks the reverse order of their arrival or updating. + * This makes us fully compliant to RFC2018 Section 4 including all + * optional parts marked as "SHOULD". + * + * TODO: + * A further improvement is to merge the content of mbufs together if the + * preceeding one has enough space to hold the data of the new one. When + * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves + * them in place. Only when trimming from the tail it actually frees them. + * Normally we don't get mbuf chains so this isn't too much of a concern + * right now. Use m_collapse() to compact the mbuf chains within the + * blocks. + */ + +#include "opt_inet.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/vimage.h> + +#include <vm/uma.h> + +#include <net/if.h> +#include <net/route.h> + +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/ip_options.h> +#include <netinet/tcp.h> +#include <netinet/tcp_fsm.h> +#include <netinet/tcp_seq.h> +#include <netinet/tcp_timer.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> + +VNET_DEFINE(uma_zone_t, tcp_reass_zone); + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, + "TCP Segment Reassembly Queue"); + +static int tcp_reass_enabled = 1; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, enable, CTLFLAG_WR, + &tcp_reass_enabled, 0, + "Enable/disable use of TCP Reassembly Queue"); + +static int tcp_reass_maxblocks = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxblocks, CTLFLAG_RDTUN, + &tcp_reass_maxblocks, 0, + "Global maximum number of TCP Segment Blocks in Reassembly Queue"); + +int tcp_reass_qsize = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, curblocks, CTLFLAG_RD, + &tcp_reass_qsize, 0, + "Global number of TCP Segment Blocks currently in Reassembly Queue"); + +static int tcp_reass_qtimo = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, queue_timeout, CTLFLAG_RW, + &tcp_reass_qtimo, 0, + "Reassembly Queue Timeout in multiples of the Retransmission Timeout"); + +static int tcp_reass_spacetime = 0; +SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, space_time, CTLFLAG_RW, + &tcp_reass_spacetime, 0, + "Reassembly Queue strategy of space vs. time efficiency"); + +static void tcp_reass_merge(struct tcpcb *, struct trq *, struct trq *); + +static __inline void +sack_track(struct tcpcb *tp, struct trq *tqe) +{ + + if (LIST_FIRST(&tp->t_trq_sack) != (tqe)) { + LIST_REMOVE((tqe), trq_s); + LIST_INSERT_HEAD(&tp->t_trq_sack, (tqe), trq_s); + } +} + +/* Trim empty mbufs from head of chain. */ +static struct mbuf * +m_trimhead(struct mbuf *m) +{ + struct mbuf *n; + + while (m->m_len == 0) { + n = m; + m = m->m_next; + m_free(n); + } + return (m); +} + +static u_int +m_storagesize(struct mbuf *m) +{ u_int mcnt; - - for (mcnt = 0; m != NULL; m = m->m_next) - mcnt += (m->m_flags & M_EXT) ? - m->m_ext.ext_size + MSIZE : MSIZE; - return (mcnt); -} - -/* - * Adjust TCP reassembly zone limits when the nmbclusters zone changes. - */ -static void -tcp_reass_zone_change(void *tag) -{ - - tcp_reass_maxblocks = nmbclusters / 16; - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); -} - -#ifdef INVARIANTS -static int -tcp_reass_verify(struct tcpcb *tp) -{ - struct trq *tqe, *tqen; - int i = 0; - - TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { - KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt), - ("%s: trq_seq < rcv_nxt", __func__)); - KASSERT(tqen == NULL || - SEQ_LT(tqe->trq_seq + tqe->trq_len, tqen->trq_seq), - ("%s: overlapping blocks", __func__)); - i++; - } - LIST_FOREACH(tqe, &tp->t_trq_sack, trq_s) { - i--; - } - KASSERT(i == 0, ("%s: SEQ# ordered tailq and arrival ordered " - "SACK list are not equally long", __func__)); - return (0); -} -#endif - -/* - * Initialize TCP reassembly zone on startup. - */ -void -tcp_reass_init(void) -{ - - /* XXX: nmbclusters may be zero. */ - tcp_reass_maxblocks = nmbclusters / 16; - TUNABLE_INT_FETCH("net.inet.tcp.reass.maxblocks", - &tcp_reass_maxblocks); - tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - uma_zone_set_max(tcp_reass_zone, tcp_reass_maxblocks); - EVENTHANDLER_REGISTER(nmbclusters_change, - tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); -} - -/* - * Insert segments into the reassembly queue. - * - * NB: We must always consume the mbuf. Either by appeding it to - * the queue or by freeing it. - */ -int -tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) -{ - struct trq *tqe, *tqen; - struct socket *so = tp->t_inpcb->inp_socket; - struct mbuf *n; - int i, thflags = 0, mcnt; - tcp_seq th_seq; - struct trq tqes; - - INP_WLOCK_ASSERT(tp->t_inpcb); - - /* - * Call with th==NULL after becoming established to - * force pre-ESTABLISHED data up to user socket. - * XXX: Was used for T/TCP of which code remains. - */ - if (th == NULL) { - if (!TCPS_HAVEESTABLISHED(tp->t_state) || - TAILQ_EMPTY(&tp->t_trq) || - ((tqe = TAILQ_FIRST(&tp->t_trq)) && - tqe->trq_seq != tp->rcv_nxt)) - return (0); - goto present; - } - - /* - * Store TCP header information in local variables as - * we may lose access to it after mbuf compacting. - */ - thflags = th->th_flags; - th_seq = th->th_seq; - th = NULL; /* Prevent further use. */ - - /* Check if it is really neccessary to do all the work. */ - if (!tcp_reass_enabled && TAILQ_EMPTY(&tp->t_trq)) { - *tlenp = 0; - m_freem(m); - return (0); - } - - KASSERT(SEQ_LEQ(tp->rcv_nxt, th_seq), - ("%s: sequence number below rcv_nxt", __func__)); - KASSERT(!(tp->rcv_nxt == th_seq) || !(TAILQ_EMPTY(&tp->t_trq)), - ("%s: got missing segment but queue is empty", __func__)); - KASSERT(tcp_reass_verify(tp), - ("%s: reassembly queue inconsistent", __func__)); - - /* - * Limit the number of segments in the reassembly queue to prevent - * holding on to too many segments (and thus running out of mbufs). - * Make sure to let the missing segment through which caused this - * queue. - * - * Count the gross space used by the mbufs in the reassembly queue - * and limit it to the free space in the socket buffer. This way - * the reassembly queue can never consume more mbuf space than the - * socket buffer got allocated anyway and it reflects the actual - * amount of kernel memory used. This effectively prevents mbuf - * exhaustion due to pathological traffic (one byte segments with - * a hole each time) on a single connection. - * - * Counting the gross mbuf space effectively sets the net data - * limit lower than the socket buffer would allow. - * Don't underestimates the effective free space in the socket - * buffer vs. actual real data with 2k clusters and 1500 byte - * packets by introducing a correction factor of 11/8th. - */ - if (th_seq != tp->rcv_nxt && - tp->t_trqmcnt > (sbspace(&so->so_rcv) / 8 * 11)) { - TCPSTAT_INC(tcps_reass_overflow); - TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); - *tlenp = 0; - return (0); - } - - /* Get rid of packet header and mtags. */ - m_demote(m, 1); - - /* Trim empty mbufs from head of chain. */ - m = m_trimhead(m); - - /* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */ - mcnt = m_storagesize(m); - - /* - * FIN handling is a bit tricky. - * We cannot trust a FIN that goes into the reassembly queue. - * It can be easily spoofed as it may be anywhere in the receive - * window (see RST attack mitigation in tcp-secure). - * For this reason (and complexity avoidance) we generally ignore - * any FIN arriving at the reassembly queue with one exception; - * When it exactly matches rcv_nxt together with any data in the - * same segment we can conclude it to be genuine and proceed with - * flushing any other data waiting in the reassembly queue. - * A FIN is part of the sequence space and will get retransmitted - * if it was genuine. - * This approach is based on a discussion on TCPM mailing list. - */ - if ((thflags & TH_FIN) && tp->rcv_nxt == th_seq) { - tcp_reass_qfree(tp); - tqe = NULL; - if (m->m_len == 0) { - tcp_timer_activate(tp, TT_REASS, 0); - return (thflags); - } - goto insert; - } else - thflags &= ~TH_FIN; - - /* Check if this is the first segment. */ - if (TAILQ_EMPTY(&tp->t_trq)) - goto insert; - - /* Starting point for the following tests. */ - tqe = TAILQ_LAST(&tp->t_trq, trq_head); - - /* Check if this segment directly attaches to the end. */ - if (tqe->trq_seq + tqe->trq_len == th_seq) { - tqe->trq_len += *tlenp; - tqe->trq_mcnt += mcnt; - tp->t_trqmcnt += mcnt; - tqe->trq_ml->m_next = m; - tqe->trq_ml = m_last(m); - if (tcp_reass_spacetime) { - tqe->trq_m = m_collapse(tqe->trq_m, M_DONTWAIT, 1024); - tp->t_trqmcnt -= tqe->trq_mcnt; - tqe->trq_mcnt = m_storagesize(tqe->trq_m); - tqe->trq_mcnt += tp->t_trqmcnt; - } - sack_track(tqe); - /* TCP statistics. */ - TCPSTAT_INC(tcps_rcvoopack); - TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); - TCPSTAT_INC(tcps_reass_tail); - return (0); - } - - /* Check if beyond last block. */ - if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq)) - goto insert; - - /* Check if this is the missing segment. */ - if (tp->rcv_nxt == th_seq) { - tqe = TAILQ_FIRST(&tp->t_trq); - KASSERT(SEQ_GT(tqe->trq_seq, th_seq), - ("%s: first block starts below missing segment", __func__)); - /* Check if segment prepends first block. */ - if (SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp)) { - /* Trim tail of segment. */ - if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) { - m_adj(m, -i); - *tlenp -= i; - /* TCP statistics. */ - TCPSTAT_INC(tcps_rcvpartduppack); - TCPSTAT_ADD(tcps_rcvpartdupbyte, i); - /* Update accounting. */ - mcnt = m_storagesize(m); - } - tqe->trq_len += *tlenp; - tqe->trq_mcnt += mcnt; - tp->t_trqmcnt += mcnt; - tqe->trq_seq = th_seq; - n = m_last(m); - n->m_next = tqe->trq_m; - tqe->trq_m = m; - goto present; - } - goto insert; /* No statistics, this segment is in line. */ - } - - /* TCP statistics. */ - TCPSTAT_INC(tcps_rcvoopack); - TCPSTAT_ADD(tcps_rcvoobyte, *tlenp); - - /* See where it fits. */ - TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { - /* Segment is after this blocks coverage. */ - if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq)) - continue; - /* Segment is after the previous one but before this one. */ - if (SEQ_GT(tqe->trq_seq, th_seq + *tlenp)) - break; /* Insert as new block. */ - - /* Segment is already fully covered. */ - if (SEQ_LEQ(tqe->trq_seq, th_seq) && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) { - TCPSTAT_INC(tcps_rcvduppack); - TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); - TCPSTAT_INC(tcps_reass_covered); - /* - * XXXAO: What to SACK report when duplicate? - * See RFC2883: D-SACK (Duplicate SACK) - */ - sack_track(tqe); - m_freem(m); - *tlenp = 0; - return (0); - } - - /* Segment covers and extends on both ends. */ - if (SEQ_GT(tqe->trq_seq, th_seq) && - SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) { - /* Replace block content. */ - tp->t_trqmcnt -= tqe->trq_mcnt; - m_freem(tqe->trq_m); - tqe->trq_len = *tlenp; - tqe->trq_mcnt = mcnt; - tp->t_trqmcnt += mcnt; - tqe->trq_seq = th_seq; - tqe->trq_m = m; - tqe->trq_ml = m_last(m); - /* Check if segment bridges next block to merge. */ - if (tqen != NULL && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) - tcp_reass_merge(tp, tqe, tqen); - sack_track(tqe); - TCPSTAT_INC(tcps_reass_replace); - return (0); - } - - /* Segment prepends to this block. */ - if (SEQ_GT(tqe->trq_seq, th_seq) && - SEQ_LEQ(tqe->trq_seq, th_seq + *tlenp) && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp)) { - KASSERT(!(thflags & TH_FIN), - ("%s: new segment with FIN can't prepend", __func__)); - /* Trim tail of segment. */ - if ((i = SEQ_DELTA(tqe->trq_seq, th_seq + *tlenp))) { - m_adj(m, -i); - *tlenp -= i; - /* TCP statistics. */ - TCPSTAT_INC(tcps_rcvpartduppack); - TCPSTAT_ADD(tcps_rcvpartdupbyte, i); - /* Update accounting. */ - mcnt = m_storagesize(m); - } - tqe->trq_len += *tlenp; - tqe->trq_mcnt += mcnt; - tp->t_trqmcnt += mcnt; - tqe->trq_seq = th_seq; - n = m_last(m); - n->m_next = tqe->trq_m; - tqe->trq_m = m; - sack_track(tqe); - TCPSTAT_INC(tcps_reass_prepend); - return (0); - } - - /* Segment appends to this block. */ - if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq + *tlenp) && - SEQ_LEQ(tqe->trq_seq, th_seq) && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th_seq)) { - /* Trim head of segment. */ - if ((i = SEQ_DELTA(tqe->trq_seq + tqe->trq_len, th_seq))) { - m_adj(m, i); - *tlenp -= i; - /* TCP Statistics. */ - TCPSTAT_INC(tcps_rcvpartduppack); - TCPSTAT_ADD(tcps_rcvpartdupbyte, i); - } - tqe->trq_len += *tlenp; - tqe->trq_mcnt += mcnt; - tp->t_trqmcnt += mcnt; - tqe->trq_ml->m_next = m; - tqe->trq_ml = m_last(m); - /* Check if segment bridges two blocks to merge. */ - if (tqen != NULL && - SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) - tcp_reass_merge(tp, tqe, tqen); - sack_track(tqe); - TCPSTAT_INC(tcps_reass_append); - return (0); - } - } - -insert: - /* Prepare to insert into block queue. */ - if (tp->rcv_nxt == th_seq) { - /* - * Use temporary struct trq on the stack for missing - * segment to prevent blocking of all reassembly queues - * due to zone exhaustion. - */ - tqen = &tqes; - } else { - tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO)); - if (tqen == NULL) { - TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); - *tlenp = 0; - return (0); - } - TCPSTAT_INC(tcps_reass_blocks); - } - tcp_reass_qsize++; - if (tcp_reass_spacetime) { - m = m_collapse(); - mcnt = m_storagesize(m); - } - tqen->trq_seq = th_seq; - tqen->trq_len = *tlenp; - tqen->trq_mcnt = mcnt; - tp->t_trqmcnt += mcnt; - tqen->trq_m = m; - tqen->trq_ml = m_last(m); - - /* Where to insert. */ - if (tqe != NULL && SEQ_LT(tqe->trq_seq + tqe->trq_len, th_seq)) - TAILQ_INSERT_AFTER(&tp->t_trq, tqe, tqen, trq_q); - else if (tqe != NULL) - TAILQ_INSERT_BEFORE(tqe, tqen, trq_q); - else { - KASSERT(TAILQ_EMPTY(&tp->t_trq), - ("%s: first element queue not empty", __func__)); - TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q); - /* - * Flush the reassembly queue after x times the - * current retransmit interval measured from the - * arrival time of the first segment. - */ - if (tcp_reass_qtimo) - tcp_timer_activate(tp, TT_REASS, - tp->t_rxtcur * tcp_reass_qtimo); - } - LIST_INSERT_HEAD(&tp->t_trq_sack, tqen, trq_s); - - /* Missing segment? */ - if (tp->rcv_nxt != th_seq) - return (0); -present: - /* - * Present data to user, advancing rcv_nxt through the - * completed sequence space. - */ - KASSERT(!TAILQ_EMPTY(&tp->t_trq), - ("%s: queue empty at present", __func__)); - KASSERT((TAILQ_FIRST(&tp->t_trq))->trq_seq == tp->rcv_nxt, - ("%s: first block does not match rcv_nxt", __func__)); - TCPSTAT_INC(tcps_reass_missingseg); - - SOCKBUF_LOCK(&so->so_rcv); - TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { - KASSERT(SEQ_GEQ(tqe->trq_seq, tp->rcv_nxt), - ("%s: trq_seq < rcv_nxt", __func__)); - KASSERT(tqen == NULL || - SEQ_LEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq), - ("%s: block overlaps into next one", __func__)); - - if (tqe->trq_seq != tp->rcv_nxt) - break; - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) - m_freem(tqe->trq_m); - else - sbappendstream_locked(&so->so_rcv, tqe->trq_m); - tp->rcv_nxt += tqe->trq_len; - tp->t_trqmcnt -= tqe->trq_mcnt; - TAILQ_REMOVE(&tp->t_trq, tqe, trq_q); - LIST_REMOVE(tqe, trq_s); - if (tqe != &tqes) - uma_zfree(tcp_reass_zone, tqe); - V_tcp_reass_qsize--; - } - /* NB: sorwakeup_locked() does a implicit socket buffer unlock. */ - sorwakeup_locked(so); - - /* - * Restart the reassembly queue flush timer after advancing - * the sequence space and if queue is not empty. Otherwise - * deactivate it. - */ - if (tcp_reass_qtimo && !TAILQ_EMPTY(&tp->t_trq)) - tcp_timer_activate(tp, TT_REASS, - tp->t_rxtcur * tcp_reass_qtimo); - else - tcp_timer_activate(tp, TT_REASS, 0); - - ND6_HINT(tp); - return (thflags); -} - -/* - * Merge one or more consecutive blocks together. - */ -static void -tcp_reass_merge(struct tcpcb *tp, struct trq *tqe, struct trq *tqen) -{ - int i; - - KASSERT(tqe != NULL && tqen != NULL, - ("%s: incomplete input", __func__)); - KASSERT(SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq), - ("%s: blocks do not overlap, nothing to merge", __func__)); - - /* Appended block may reach beyond next block. */ - while (SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq + tqen->trq_len)) { - /* TCP Statistics. */ - TCPSTAT_ADD(tcps_rcvpartdupbyte, tqen->trq_len); >>> TRUNCATED FOR MAIL (1000 lines) <<<help
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200907171822.n6HIMpnc007001>
