Date: Sun, 4 May 2014 23:25:33 +0000 (UTC) From: Gleb Smirnoff <glebius@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r265338 - in head/sys: netinet sys Message-ID: <201405042325.s44NPXwD089621@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: glebius Date: Sun May 4 23:25:32 2014 New Revision: 265338 URL: http://svnweb.freebsd.org/changeset/base/265338 Log: The FreeBSD-SA-14:08.tcp was a lesson on not doing acrobatics with mixing on stack memory and UMA memory in one linked list. Thus, rewrite TCP reassembly code in terms of memory usage. The algorithm remains unchanged. We actually do not need extra memory to build a reassembly queue. Arriving mbufs are always packet header mbufs. So we got the length of data as pkthdr.len. We got m_nextpkt for linkage. And we need only one pointer to point at the tcphdr, use PH_loc for that. In tcpcb the t_segq fields becomes mbuf pointer. The t_segqlen field now counts not packets, but bytes in the queue. This gives us more precision when comparing to socket buffer limits. Sponsored by: Netflix Sponsored by: Nginx, Inc. Modified: head/sys/netinet/tcp_input.c head/sys/netinet/tcp_reass.c head/sys/netinet/tcp_subr.c head/sys/netinet/tcp_usrreq.c head/sys/netinet/tcp_var.h head/sys/sys/mbuf.h Modified: head/sys/netinet/tcp_input.c ============================================================================== --- head/sys/netinet/tcp_input.c Sun May 4 20:21:41 2014 (r265337) +++ head/sys/netinet/tcp_input.c Sun May 4 23:25:32 2014 (r265338) @@ -1639,8 +1639,7 @@ tcp_do_segment(struct mbuf *m, struct tc tp->snd_nxt == tp->snd_max && tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && - LIST_EMPTY(&tp->t_segq) && - ((to.to_flags & TOF_TS) == 0 || + tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 || TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) { /* @@ -2908,8 +2907,7 @@ dodata: /* XXX */ * immediately when segments are out of order (so * fast retransmit can work). */ - if (th->th_seq == tp->rcv_nxt && - LIST_EMPTY(&tp->t_segq) && + if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; Modified: head/sys/netinet/tcp_reass.c ============================================================================== --- head/sys/netinet/tcp_reass.c Sun May 4 20:21:41 2014 (r265337) +++ head/sys/netinet/tcp_reass.c Sun May 4 23:25:32 2014 (r265338) @@ -71,19 +71,10 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_var.h> #include <netinet6/tcp6_var.h> #include <netinet/tcpip.h> -#ifdef TCPDEBUG -#include <netinet/tcp_debug.h> -#endif /* TCPDEBUG */ static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0, "TCP Segment Reassembly Queue"); -static VNET_DEFINE(int, tcp_reass_maxseg) = 0; -#define V_tcp_reass_maxseg VNET(tcp_reass_maxseg) -SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN, - &VNET_NAME(tcp_reass_maxseg), 0, - "Global maximum number of TCP Segments in Reassembly Queue"); - static VNET_DEFINE(int, tcp_reass_overflows) = 0; #define V_tcp_reass_overflows VNET(tcp_reass_overflows) SYSCTL_VNET_INT(_net_inet_tcp_reass, OID_AUTO, overflows, @@ -91,78 +82,32 @@ SYSCTL_VNET_INT(_net_inet_tcp_reass, OID &VNET_NAME(tcp_reass_overflows), 0, "Global number of TCP Segment Reassembly Queue Overflows"); -static VNET_DEFINE(uma_zone_t, tcp_reass_zone); -#define V_tcp_reass_zone VNET(tcp_reass_zone) -SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET, - &VNET_NAME(tcp_reass_zone), - "Global number of TCP Segments currently in Reassembly Queue"); - -/* Initialize TCP reassembly queue */ -static void -tcp_reass_zone_change(void *tag) -{ - - /* Set the zone limit and read back the effective value. */ - V_tcp_reass_maxseg = nmbclusters / 16; - V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, - V_tcp_reass_maxseg); -} - -void -tcp_reass_init(void) -{ - - V_tcp_reass_maxseg = nmbclusters / 16; - TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments", - &V_tcp_reass_maxseg); - V_tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); - /* Set the zone limit and read back the effective value. */ - V_tcp_reass_maxseg = uma_zone_set_max(V_tcp_reass_zone, - V_tcp_reass_maxseg); - EVENTHANDLER_REGISTER(nmbclusters_change, - tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY); -} - -#ifdef VIMAGE -void -tcp_reass_destroy(void) -{ - - uma_zdestroy(V_tcp_reass_zone); -} -#endif - void tcp_reass_flush(struct tcpcb *tp) { - struct tseg_qent *qe; + struct mbuf *m; INP_WLOCK_ASSERT(tp->t_inpcb); - while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) { - LIST_REMOVE(qe, tqe_q); - m_freem(qe->tqe_m); - uma_zfree(V_tcp_reass_zone, qe); - tp->t_segqlen--; + while ((m = tp->t_segq) != NULL) { + tp->t_segq = m->m_nextpkt; + tp->t_segqlen -= m->m_pkthdr.len; + m_freem(m); } KASSERT((tp->t_segqlen == 0), - ("TCP reass queue %p segment count is %d instead of 0 after flush.", + ("TCP reass queue %p length is %d instead of 0 after flush.", tp, tp->t_segqlen)); } +#define M_TCPHDR(m) ((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr)) + int tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m) { - struct tseg_qent *q; - struct tseg_qent *p = NULL; - struct tseg_qent *nq; - struct tseg_qent *te = NULL; struct socket *so = tp->t_inpcb->inp_socket; - char *s = NULL; - int flags; - struct tseg_qent tqs; + struct mbuf *mq, *mp; + int flags, wakeup; INP_WLOCK_ASSERT(tp->t_inpcb); @@ -178,6 +123,10 @@ tcp_reass(struct tcpcb *tp, struct tcphd if (th == NULL) goto present; + M_ASSERTPKTHDR(m); + KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__, + *tlenp, m->m_pkthdr.len)); + /* * Limit the number of segments that can be queued to reduce the * potential for mbuf exhaustion. For best performance, we want to be @@ -188,19 +137,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd * Always let the missing segment through which caused this queue. * NB: Access to the socket buffer is left intentionally unlocked as we * can tolerate stale information here. - * - * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat - * should work but causes packets to be dropped when they shouldn't. - * Investigate why and re-evaluate the below limit after the behaviour - * is understood. */ if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) && - tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) { + tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) { + char *s; + V_tcp_reass_overflows++; TCPSTAT_INC(tcps_rcvmemdrop); m_freem(m); *tlenp = 0; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) { + if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, + NULL))) { log(LOG_DEBUG, "%s; %s: queue limit reached, " "segment dropped\n", s, __func__); free(s, M_TCPLOG); @@ -209,46 +156,13 @@ tcp_reass(struct tcpcb *tp, struct tcphd } /* - * Allocate a new queue entry. If we can't, or hit the zone limit - * just drop the pkt. - * - * Use a temporary structure on the stack for the missing segment - * when the zone is exhausted. Otherwise we may get stuck. - */ - te = uma_zalloc(V_tcp_reass_zone, M_NOWAIT); - if (te == NULL) { - if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) { - TCPSTAT_INC(tcps_rcvmemdrop); - m_freem(m); - *tlenp = 0; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, - NULL))) { - log(LOG_DEBUG, "%s; %s: global zone limit " - "reached, segment dropped\n", s, __func__); - free(s, M_TCPLOG); - } - return (0); - } else { - bzero(&tqs, sizeof(struct tseg_qent)); - te = &tqs; - if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, - NULL))) { - log(LOG_DEBUG, - "%s; %s: global zone limit reached, using " - "stack for missing segment\n", s, __func__); - free(s, M_TCPLOG); - } - } - } - tp->t_segqlen++; - - /* * Find a segment which begins after this one does. */ - LIST_FOREACH(q, &tp->t_segq, tqe_q) { - if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) + mp = NULL; + for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) { + if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq)) break; - p = q; + mp = mq; } /* @@ -256,18 +170,16 @@ tcp_reass(struct tcpcb *tp, struct tcphd * our data already. If so, drop the data from the incoming * segment. If it provides all of our data, drop us. */ - if (p != NULL) { + if (mp != NULL) { int i; + /* conversion to int (in i) handles seq wraparound */ - i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; + i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq; if (i > 0) { if (i >= *tlenp) { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp); m_freem(m); - if (te != &tqs) - uma_zfree(V_tcp_reass_zone, te); - tp->t_segqlen--; /* * Try to present any queued data * at the left window edge to the user. @@ -289,37 +201,40 @@ tcp_reass(struct tcpcb *tp, struct tcphd * While we overlap succeeding segments trim them or, * if they are completely covered, dequeue them. */ - while (q) { - int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; + while (mq) { + struct mbuf *nq; + int i; + + i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq; if (i <= 0) break; - if (i < q->tqe_len) { - q->tqe_th->th_seq += i; - q->tqe_len -= i; - m_adj(q->tqe_m, i); + if (i < mq->m_pkthdr.len) { + M_TCPHDR(mq)->th_seq += i; + m_adj(mq, i); + tp->t_segqlen -= i; break; } - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); - m_freem(q->tqe_m); - uma_zfree(V_tcp_reass_zone, q); - tp->t_segqlen--; - q = nq; + nq = mq->m_nextpkt; + tp->t_segqlen -= mq->m_pkthdr.len; + m_freem(mq); + if (mp) + mp->m_nextpkt = nq; + else + tp->t_segq = nq; + mq = nq; } /* Insert the new segment queue entry into place. */ - te->tqe_m = m; - te->tqe_th = th; - te->tqe_len = *tlenp; - - if (p == NULL) { - LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); + if (mp) { + m->m_nextpkt = mp->m_nextpkt; + mp->m_nextpkt = m; } else { - KASSERT(te != &tqs, ("%s: temporary stack based entry not " - "first element in queue", __func__)); - LIST_INSERT_AFTER(p, te, tqe_q); + m->m_nextpkt = tp->t_segq; + tp->t_segq = m ; } + m->m_pkthdr.pkt_tcphdr = th; + tp->t_segqlen += m->m_pkthdr.len; present: /* @@ -328,25 +243,30 @@ present: */ if (!TCPS_HAVEESTABLISHED(tp->t_state)) return (0); - q = LIST_FIRST(&tp->t_segq); - if (!q || q->tqe_th->th_seq != tp->rcv_nxt) - return (0); + + flags = 0; + wakeup = 0; SOCKBUF_LOCK(&so->so_rcv); - do { - tp->rcv_nxt += q->tqe_len; - flags = q->tqe_th->th_flags & TH_FIN; - nq = LIST_NEXT(q, tqe_q); - LIST_REMOVE(q, tqe_q); + while ((mq = tp->t_segq) != NULL && + M_TCPHDR(mq)->th_seq == tp->rcv_nxt) { + tp->t_segq = mq->m_nextpkt; + + tp->rcv_nxt += mq->m_pkthdr.len; + tp->t_segqlen -= mq->m_pkthdr.len; + flags = M_TCPHDR(mq)->th_flags & TH_FIN; + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) - m_freem(q->tqe_m); - else - sbappendstream_locked(&so->so_rcv, q->tqe_m); - if (q != &tqs) - uma_zfree(V_tcp_reass_zone, q); - tp->t_segqlen--; - q = nq; - } while (q && q->tqe_th->th_seq == tp->rcv_nxt); + m_freem(mq); + else { + mq->m_nextpkt = NULL; + sbappendstream_locked(&so->so_rcv, mq); + wakeup = 1; + } + } ND6_HINT(tp); - sorwakeup_locked(so); + if (wakeup) + sorwakeup_locked(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); return (flags); } Modified: head/sys/netinet/tcp_subr.c ============================================================================== --- head/sys/netinet/tcp_subr.c Sun May 4 20:21:41 2014 (r265337) +++ head/sys/netinet/tcp_subr.c Sun May 4 23:25:32 2014 (r265338) @@ -375,7 +375,6 @@ tcp_init(void) tcp_tw_init(); syncache_init(); tcp_hc_init(); - tcp_reass_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), @@ -433,7 +432,6 @@ tcp_destroy(void) { int error; - tcp_reass_destroy(); tcp_hc_destroy(); syncache_destroy(); tcp_tw_destroy(); Modified: head/sys/netinet/tcp_usrreq.c ============================================================================== --- head/sys/netinet/tcp_usrreq.c Sun May 4 20:21:41 2014 (r265337) +++ head/sys/netinet/tcp_usrreq.c Sun May 4 23:25:32 2014 (r265338) @@ -1949,7 +1949,7 @@ db_print_tcpcb(struct tcpcb *tp, const c db_print_indent(indent); db_printf("t_segq first: %p t_segqlen: %d t_dupacks: %d\n", - LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks); + tp->t_segq, tp->t_segqlen, tp->t_dupacks); db_print_indent(indent); db_printf("tt_rexmt: %p tt_persist: %p tt_keep: %p\n", Modified: head/sys/netinet/tcp_var.h ============================================================================== --- head/sys/netinet/tcp_var.h Sun May 4 20:21:41 2014 (r265337) +++ head/sys/netinet/tcp_var.h Sun May 4 23:25:32 2014 (r265338) @@ -46,15 +46,6 @@ VNET_DECLARE(int, tcp_do_rfc1323); #endif /* _KERNEL */ -/* TCP segment queue entry */ -struct tseg_qent { - LIST_ENTRY(tseg_qent) tqe_q; - int tqe_len; /* TCP segment data length */ - struct tcphdr *tqe_th; /* a pointer to tcp header */ - struct mbuf *tqe_m; /* mbuf contains packet */ -}; -LIST_HEAD(tsegqe_head, tseg_qent); - struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ @@ -100,7 +91,7 @@ do { \ * Organized for 16 byte cacheline efficiency. */ struct tcpcb { - struct tsegqe_head t_segq; /* segment reassembly queue */ + struct mbuf *t_segq; /* segment reassembly queue */ void *t_pspare[2]; /* new reassembly queue */ int t_segqlen; /* segment reassembly queue length */ int t_dupacks; /* consecutive dup acks recd */ @@ -663,11 +654,7 @@ char *tcp_log_addrs(struct in_conninfo * char *tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *, const void *); int tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *); -void tcp_reass_init(void); void tcp_reass_flush(struct tcpcb *); -#ifdef VIMAGE -void tcp_reass_destroy(void); -#endif void tcp_input(struct mbuf *, int); u_long tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); u_long tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); Modified: head/sys/sys/mbuf.h ============================================================================== --- head/sys/sys/mbuf.h Sun May 4 20:21:41 2014 (r265337) +++ head/sys/sys/mbuf.h Sun May 4 23:25:32 2014 (r265338) @@ -159,6 +159,7 @@ struct pkthdr { #define tso_segsz PH_per.sixteen[1] #define csum_phsum PH_per.sixteen[2] #define csum_data PH_per.thirtytwo[1] +#define pkt_tcphdr PH_loc.ptr /* * Description of external storage mapped into mbuf; valid only if M_EXT is
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201405042325.s44NPXwD089621>