Date: Fri, 11 Jan 2008 21:20:41 GMT From: Andre Oppermann <andre@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 133069 for review Message-ID: <200801112120.m0BLKf75004734@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=133069 Change 133069 by andre@andre_flirtbox on 2008/01/11 21:20:25 Initial branch and import of new TCP reassembly queue. It's primarily a code dump. It compiles, doesn't crash and almost works as advertized. Description is at the top of the file. Lots of comments inline. Affected files ... .. //depot/projects/tcp_reass/netinet/tcp_input.c#2 edit .. //depot/projects/tcp_reass/netinet/tcp_reass.c#2 edit .. //depot/projects/tcp_reass/netinet/tcp_subr.c#2 edit .. //depot/projects/tcp_reass/netinet/tcp_usrreq.c#2 edit .. //depot/projects/tcp_reass/netinet/tcp_var.h#2 edit Differences ... ==== //depot/projects/tcp_reass/netinet/tcp_input.c#2 (text+ko) ==== @@ -980,7 +980,7 @@ tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && - LIST_EMPTY(&tp->t_segq) && + TAILQ_EMPTY(&tp->t_trq) && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { @@ -1705,8 +1705,7 @@ * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcphdr *)0, 0, - (struct mbuf *)0); + (void) tcp_reass(tp, NULL, NULL, NULL); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ @@ -2234,7 +2233,7 @@ * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && - LIST_EMPTY(&tp->t_segq) && + TAILQ_EMPTY(&tp->t_trq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp)) tp->t_flags |= TF_DELACK; ==== //depot/projects/tcp_reass/netinet/tcp_reass.c#2 (text+ko) ==== @@ -1,6 +1,6 @@ /*- - * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. + * Copyright (c) 2007 + * Andre Oppermann, Internet Business Solutions AG. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -27,14 +27,55 @@ * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 + * $FreeBSD: src/sys/netinet/tcp_reass.c,v 1.352 2007/05/13 22:16:13 andre Exp $ */ -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/sys/netinet/tcp_reass.c,v 1.353 2007/10/07 20:44:24 silby Exp $"); +/* + * Operational overview of TCP reassembly: + * + * It is the purpose of tcp reassembly to store segments that are received + * out of order. This happens when packets are lost along the way due to + * various reasons. The most common one is traffic overload which causes + * routers to stop accepting packets for brief moments. + * + * Upon arrival of the missing segment(s) the whole chain of stored segments + * is moved into the socket buffer. In case of multiple missing segments + * the first consecutive part is moved with the remainder being kept in + * store until the next missing segment arrives. + * + * While in reassembly mode *all* arriving segments are put into the reassembly + * queue. + * + * Instead of storing all segments on their own we build blocks of consecutive + * segments chained together. We use a tailq because a new segment has the + * highest probability to fit the tail of the chain. If not, the second + * highest probability is the beginning of the chain for being the missing + * segment. Otherwise we cycle through each consecutive block until a match + * is found. If a segment matches the end of one block and the start of the + * next block the two blocks are joined together. If no match is found a + * new block is created. + * + * This system is very efficient and can deal efficiently with long chains + * and many holes. + * + * trq_tail ----------------------------------------------\ + * trq_head --> [block] ------> [block] ------> [block] <-/ + * m_next m_next m_next + * | | | + * m_next m_next m_next + * | | | + * m_next m_next m_next + * + * + * A further improvement is to merge the content of mbufs together if the + * preceding one has enough space to hold the data of the new one.
When + * trimming the head of an mbuf chain m_adj() empties the mbufs but leaves + * them in place. Only when trimming from the tail it actually frees them. + * Normally we don't get mbuf chains so this isn't too much of a concern + * right now. TODO. + */ #include "opt_inet.h" -#include "opt_inet6.h" -#include "opt_tcpdebug.h" #include <sys/param.h> #include <sys/kernel.h> @@ -48,30 +89,13 @@ #include <vm/uma.h> -#include <net/if.h> -#include <net/route.h> - #include <netinet/in.h> #include <netinet/in_pcb.h> #include <netinet/in_systm.h> -#include <netinet/in_var.h> -#include <netinet/ip.h> -#include <netinet/ip_var.h> -#include <netinet/ip_options.h> -#include <netinet/ip6.h> -#include <netinet6/in6_pcb.h> -#include <netinet6/ip6_var.h> -#include <netinet6/nd6.h> #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> #include <netinet/tcp_seq.h> -#include <netinet/tcp_timer.h> #include <netinet/tcp_var.h> -#include <netinet6/tcp6_var.h> -#include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif /* TCPDEBUG */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); @@ -114,7 +138,7 @@ tcp_reass_maxseg = nmbclusters / 16; TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", &tcp_reass_maxseg); - tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), + tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct trq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcp_reass_zone, tcp_reass_maxseg); EVENTHANDLER_REGISTER(nmbclusters_change, @@ -124,23 +148,15 @@ int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { - struct tseg_qent *q; - struct tseg_qent *p = NULL; - struct tseg_qent *nq; - struct tseg_qent *te = NULL; + struct trq *tqe, *tqen; struct socket *so = tp->t_inpcb->inp_socket; - int flags; + struct mbuf *n; + int i, flags = 0, segs = 0; INP_LOCK_ASSERT(tp->t_inpcb); /* - * XXX: tcp_reass() is rather inefficient with its data 
structures - * and should be rewritten (see NetBSD for optimizations). While - * doing that it should move to its own file tcp_reass.c. - */ - - /* - * Call with th==NULL after become established to + * Call with th==NULL after becoming established to * force pre-ESTABLISHED data up to user socket. */ if (th == NULL) @@ -155,7 +171,7 @@ */ if (th->th_seq != tp->rcv_nxt && (tcp_reass_qsize + 1 >= tcp_reass_maxseg || - tp->t_segqlen >= tcp_reass_maxqlen)) { + tp->t_trqlen >= tcp_reass_maxqlen)) { tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); @@ -163,97 +179,145 @@ return (0); } - /* - * Allocate a new queue entry. If we can't, or hit the zone limit - * just drop the pkt. - */ - te = uma_zalloc(tcp_reass_zone, M_NOWAIT); - if (te == NULL) { - tcpstat.tcps_rcvmemdrop++; - m_freem(m); - *tlenp = 0; + /* Accounting. */ + tcpstat.tcps_rcvoopack++; + tcpstat.tcps_rcvoobyte += *tlenp; + /* NB: m_adj(m, -i) may free mbufs at the tail of a chain. */ + for (n = m; n; n = n->m_next) + segs++; + tp->t_trqlen += segs; + tcp_reass_qsize += segs; + + /* Get rid of packet header and mtags. */ + m_demote(m, 1); + + /* Check if this segment attaches to the end. */ + tqe = TAILQ_LAST(&tp->t_trq, trq_head); + if (tqe && tqe->trq_seq + tqe->trq_len == th->th_seq) { + tqe->trq_len += *tlenp; + tqe->trq_segs += segs; + tqe->trq_ml->m_next = m; + tqe->trq_ml = m_last(m); return (0); } - tp->t_segqlen++; - tcp_reass_qsize++; - /* - * Find a segment which begins after this one does. - */ - LIST_FOREACH(q, &tp->t_segq, tqe_q) { - if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) - break; - p = q; - } - - /* - * If there is a preceding segment, it may provide some of - * our data already. If so, drop the data from the incoming - * segment. If it provides all of our data, drop us. 
- */ - if (p != NULL) { - int i; - /* conversion to int (in i) handles seq wraparound */ - i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; - if (i > 0) { - if (i >= *tlenp) { - tcpstat.tcps_rcvduppack++; - tcpstat.tcps_rcvdupbyte += *tlenp; - m_freem(m); - uma_zfree(tcp_reass_zone, te); - tp->t_segqlen--; - tcp_reass_qsize--; - /* - * Try to present any queued data - * at the left window edge to the user. - * This is needed after the 3-WHS - * completes. - */ - goto present; /* ??? */ + /* Check if this is the missing segment. */ + if (tp->rcv_nxt == th->th_seq) { + tqe = TAILQ_FIRST(&tp->t_trq); + KASSERT(tqe != NULL, + ("%s: missing segment but nothing in queue", __func__)); + if (SEQ_LT(tqe->trq_seq, th->th_seq + *tlenp)) { + /* Trim tail. */ + if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) { + m_adj(m, i); + *tlenp -= i; + /* Update accounting. */ + if (segs > 1) { + for (n = m; n; n = n->m_next) + segs--; + tp->t_trqlen -= segs; + tcp_reass_qsize -= segs; + } } - m_adj(m, i); - *tlenp -= i; - th->th_seq += i; } + goto insert; } - tcpstat.tcps_rcvoopack++; - tcpstat.tcps_rcvoobyte += *tlenp; - /* - * While we overlap succeeding segments trim them or, - * if they are completely covered, dequeue them. - */ - while (q) { - int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; - if (i <= 0) + /* See where it fits. */ + TAILQ_FOREACH(tqe, &tp->t_trq, trq_q) { + /* Segment is after our coverage. */ + if (SEQ_LT(tqe->trq_seq + tqe->trq_len, th->th_seq)) + continue; + /* Segment is after the previous one but before us. */ + if (SEQ_GT(tqe->trq_seq, th->th_seq + *tlenp)) break; - if (i < q->tqe_len) { - q->tqe_th->th_seq += i; - q->tqe_len -= i; - m_adj(q->tqe_m, i); - break; + /* Segment is already fully covered. 
*/ + if (SEQ_LEQ(tqe->trq_seq, th->th_seq) && + SEQ_GEQ(tqe->trq_seq + tqe->trq_len, th->th_seq + *tlenp)) { + tcpstat.tcps_rcvduppack++; + tcpstat.tcps_rcvdupbyte += *tlenp; + m_freem(m); + tp->t_trqlen -= segs; + tcp_reass_qsize -= segs; + *tlenp = 0; + return (0); + } + /* Segment appends. */ + if (SEQ_LEQ(tqe->trq_seq + tqe->trq_len, th->th_seq)) { + /* Trim head. */ + if ((i = tqe->trq_seq + tqe->trq_len - th->th_seq)) { + m_adj(m, i); + *tlenp -= i; + } + tqe->trq_len += *tlenp; + tqe->trq_segs += segs; + tqe->trq_ml->m_next = m; + tqe->trq_ml = m_last(m); + /* Check for next block to merge. */ + if ((tqen = TAILQ_NEXT(tqe, trq_q)) && + SEQ_GEQ(tqe->trq_seq + tqe->trq_len, tqen->trq_seq)) { + /* Trim head. */ + if ((i = tqe->trq_seq + tqe->trq_len - + tqen->trq_seq)) { + m_adj(tqen->trq_m, i); + tqen->trq_len -= i; + } + tqe->trq_len += tqen->trq_len; + tqe->trq_segs += tqen->trq_segs; + tqe->trq_ml->m_next = tqen->trq_m; + tqe->trq_ml = tqen->trq_ml; + TAILQ_REMOVE(&tp->t_trq, tqen, trq_q); + uma_zfree(tcp_reass_zone, tqen); + } + return (0); + } + /* Segment prepends. */ + if (SEQ_GT(tqe->trq_seq, th->th_seq)) { + /* Trim tail. */ + if ((i = tqe->trq_seq - (th->th_seq + *tlenp))) { + m_adj(m, i); + *tlenp -= i; + /* Update accounting. */ + if (segs > 1) { + for (n = m; n; n = n->m_next) + segs--; + tp->t_trqlen -= segs; + tcp_reass_qsize -= segs; + } + } + tqe->trq_len += *tlenp; + tqe->trq_segs += segs; + tqe->trq_m = m; + n = m_last(m); + n->m_next = tqe->trq_m; + return (0); } + } - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); - m_freem(q->tqe_m); - uma_zfree(tcp_reass_zone, q); - tp->t_segqlen--; - tcp_reass_qsize--; - q = nq; +insert: + /* Prepare to insert into block queue. 
*/ + tqen = uma_zalloc(tcp_reass_zone, (M_NOWAIT|M_ZERO)); + if (tqen == NULL) { + tcpstat.tcps_rcvmemdrop++; + m_freem(m); + *tlenp = 0; + return (0); } + tqen->trq_seq = th->th_seq; + tqen->trq_len = *tlenp; + tqen->trq_segs = segs; + tqen->trq_m = m; + tqen->trq_ml = m_last(m); - /* Insert the new segment queue entry into place. */ - te->tqe_m = m; - te->tqe_th = th; - te->tqe_len = *tlenp; + /* Where to insert. */ + if (tqe) + TAILQ_INSERT_BEFORE(tqe, tqen, trq_q); + else + TAILQ_INSERT_HEAD(&tp->t_trq, tqen, trq_q); - if (p == NULL) { - LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); - } else { - LIST_INSERT_AFTER(p, te, tqe_q); - } - + /* Missing segment? */ + if (tp->rcv_nxt != th->th_seq) + return (0); present: /* * Present data to user, advancing rcv_nxt through @@ -261,25 +325,55 @@ */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = LIST_FIRST(&tp->t_segq); - if (!q || q->tqe_th->th_seq != tp->rcv_nxt) + tqe = TAILQ_FIRST(&tp->t_trq); + if (tqe == NULL || tqe->trq_seq != tp->rcv_nxt) return (0); SOCKBUF_LOCK(&so->so_rcv); - do { - tp->rcv_nxt += q->tqe_len; - flags = q->tqe_th->th_flags & TH_FIN; - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); + TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { + if (tqe->trq_seq != tp->rcv_nxt) + break; +#if 1 + /* XXX: This is bogus if we had a FIN. 
*/ + flags = tqe->trq_flags & TH_FIN; +#endif + tp->rcv_nxt += tqe->trq_len; if (so->so_rcv.sb_state & SBS_CANTRCVMORE) - m_freem(q->tqe_m); + m_freem(tqe->trq_m); else - sbappendstream_locked(&so->so_rcv, q->tqe_m); - uma_zfree(tcp_reass_zone, q); - tp->t_segqlen--; - tcp_reass_qsize--; - q = nq; - } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + sbappendstream_locked(&so->so_rcv, tqe->trq_m); + KASSERT(tp->t_trqlen >= tqe->trq_segs, + ("%s: t_trqlen incorrect", __func__)); + tp->t_trqlen -= tqe->trq_segs; + tcp_reass_qsize -= tqe->trq_segs; + TAILQ_REMOVE(&tp->t_trq, tqe, trq_q); + uma_zfree(tcp_reass_zone, tqe); + } + /* NB: sorwakeup_locked() does an implicit socket buffer unlock. */ + sorwakeup_locked(so); ND6_HINT(tp); - sorwakeup_locked(so); +#if 1 return (flags); +#else + return (0); +#endif +} + +/* + * Free the reassembly queue on tcpcb free and on general memory shortage. + */ +void +tcp_reass_qfree(struct tcpcb *tp) { + struct trq *tqe, *tqen; + + INP_LOCK_ASSERT(tp->t_inpcb); + + TAILQ_FOREACH_SAFE(tqe, &tp->t_trq, trq_q, tqen) { + m_freem(tqe->trq_m); + KASSERT(tp->t_trqlen >= tqe->trq_segs, + ("%s: t_trqlen incorrect", __func__)); + tp->t_trqlen -= tqe->trq_segs; + tcp_reass_qsize -= tqe->trq_segs; + TAILQ_REMOVE(&tp->t_trq, tqe, trq_q); + uma_zfree(tcp_reass_zone, tqe); + } } ==== //depot/projects/tcp_reass/netinet/tcp_subr.c#2 (text+ko) ==== @@ -593,7 +593,6 @@ return (NULL); tp = &tm->tcb; tp->t_timers = &tm->tt; - /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : @@ -611,7 +610,8 @@ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; - TAILQ_INIT(&tp->snd_holes); + TAILQ_INIT(&tp->snd_holes); /* Covered by M_ZERO. */ + TAILQ_INIT(&tp->t_trq); /* Covered by M_ZERO. 
*/ tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no @@ -665,7 +665,6 @@ void tcp_discardcb(struct tcpcb *tp) { - struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 @@ -742,18 +741,13 @@ tcp_hc_update(&inp->inp_inc, &metrics); } - /* free the reassembly queue, if any */ - while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { - LIST_REMOVE(q, tqe_q); - m_freem(q->tqe_m); - uma_zfree(tcp_reass_zone, q); - tp->t_segqlen--; - tcp_reass_qsize--; - } + /* Free the reassembly queue and other data structures. */ + tcp_reass_qfree(tp); + tcp_free_sackholes(tp); + /* Disconnect offload device, if any. */ tcp_offload_detach(tp); - tcp_free_sackholes(tp); inp->inp_ppcb = NULL; tp->t_inpcb = NULL; uma_zfree(tcpcb_zone, tp); @@ -794,37 +788,27 @@ return (tp); } +/* + * Walk the tcpbs, if existing, and flush the reassembly queue, + * if there is one... + * According to RFC xxx we may flush the reassembly queue even + * if we have indicated receipt of segments through SACK. + */ void tcp_drain(void) { + struct inpcb *inpb; + struct tcpcb *tcpb; if (do_tcpdrain) { - struct inpcb *inpb; - struct tcpcb *tcpb; - struct tseg_qent *te; - /* - * Walk the tcpbs, if existing, and flush the reassembly queue, - * if there is one... - * XXX: The "Net/3" implementation doesn't imply that the TCP - * reassembly queue should be flushed, but in a situation - * where we're really low on mbufs, this is potentially - * usefull. 
- */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.ipi_listhead, inp_list) { if (inpb->inp_vflag & INP_TIMEWAIT) continue; INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb)) != NULL) { - while ((te = LIST_FIRST(&tcpb->t_segq)) - != NULL) { - LIST_REMOVE(te, tqe_q); - m_freem(te->tqe_m); - uma_zfree(tcp_reass_zone, te); - tcpb->t_segqlen--; - tcp_reass_qsize--; - } + tcp_reass_qfree(tcpb); tcp_clean_sackreport(tcpb); } INP_UNLOCK(inpb); ==== //depot/projects/tcp_reass/netinet/tcp_usrreq.c#2 (text+ko) ==== @@ -1745,7 +1745,7 @@ db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", - LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); + TAILQ_FIRST(&tp->t_trq), tp->t_trqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", ==== //depot/projects/tcp_reass/netinet/tcp_var.h#2 (text+ko) ==== @@ -40,16 +40,20 @@ */ extern int tcp_do_rfc1323; -/* TCP segment queue entry */ -struct tseg_qent { - LIST_ENTRY(tseg_qent) tqe_q; - int tqe_len; /* TCP segment data length */ - struct tcphdr *tqe_th; /* a pointer to tcp header */ - struct mbuf *tqe_m; /* mbuf contains packet */ +/* TCP reassembly queue segment entry. */ +struct trq { + TAILQ_ENTRY(trq) trq_q; + tcp_seq trq_seq; /* start of segment */ + int trq_len; /* length of segment */ + int trq_segs; /* number of mbufs */ + int trq_flags; /* flags for segment chain */ +#define TRQ_FIN 0x01 /* FIN was on last segment */ + struct mbuf *trq_m; /* mbuf chain of data */ + struct mbuf *trq_ml; /* last mbuf in chain of data */ }; -LIST_HEAD(tsegqe_head, tseg_qent); -extern int tcp_reass_qsize; -extern struct uma_zone *tcp_reass_zone; +TAILQ_HEAD(trq_head, trq); +extern int tcp_reass_qsize; +extern struct uma_zone *tcp_reass_zone; struct sackblk { tcp_seq start; /* start seq no. of sack block */ @@ -92,8 +96,8 @@ * Organized for 16 byte cacheline efficiency. 
*/ struct tcpcb { - struct tsegqe_head t_segq; /* segment reassembly queue */ - int t_segqlen; /* segment reassembly queue length */ + struct trq_head t_trq; /* segment reassembly queue */ + int t_trqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ struct tcp_timer *t_timers; /* All the TCP timers in one struct */ @@ -531,6 +535,7 @@ const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); void tcp_reass_init(void); +void tcp_reass_qfree(struct tcpcb *); void tcp_input(struct mbuf *, int); u_long tcp_maxmtu(struct in_conninfo *, int *); u_long tcp_maxmtu6(struct in_conninfo *, int *);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200801112120.m0BLKf75004734>