Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 29 Jul 2015 17:59:14 +0000 (UTC)
From:      Patrick Kelsey <pkelsey@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r286027 - in head/sys: netinet sys
Message-ID:  <201507291759.t6THxEZ2061562@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: pkelsey
Date: Wed Jul 29 17:59:13 2015
New Revision: 286027
URL: https://svnweb.freebsd.org/changeset/base/286027

Log:
  Revert r265338, r271089 and r271123 as those changes do not handle
  non-inline urgent data and introduce an mbuf exhaustion attack vector
  similar to FreeBSD-SA-15:15.tcp, but not requiring VNETs.
  
  Address the issue described in FreeBSD-SA-15:15.tcp.
  
  Reviewed by:	glebius
  Approved by:	so
  Approved by:	jmallett (mentor)
  Security:	FreeBSD-SA-15:15.tcp
  Sponsored by:	Norse Corp, Inc.

Modified:
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_reass.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/mbuf.h

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/netinet/tcp_input.c	Wed Jul 29 17:59:13 2015	(r286027)
@@ -1665,7 +1665,8 @@ tcp_do_segment(struct mbuf *m, struct tc
 	    tp->snd_nxt == tp->snd_max &&
 	    tiwin && tiwin == tp->snd_wnd && 
 	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
-	    tp->t_segq == NULL && ((to.to_flags & TOF_TS) == 0 ||
+	    LIST_EMPTY(&tp->t_segq) &&
+	    ((to.to_flags & TOF_TS) == 0 ||
 	     TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 
 		/*
@@ -2903,7 +2904,8 @@ dodata:							/* XXX */
 		 * immediately when segments are out of order (so
 		 * fast retransmit can work).
 		 */
-		if (th->th_seq == tp->rcv_nxt && tp->t_segq == NULL &&
+		if (th->th_seq == tp->rcv_nxt &&
+		    LIST_EMPTY(&tp->t_segq) &&
 		    TCPS_HAVEESTABLISHED(tp->t_state)) {
 			if (DELAY_ACK(tp, tlen))
 				tp->t_flags |= TF_DELACK;

Modified: head/sys/netinet/tcp_reass.c
==============================================================================
--- head/sys/netinet/tcp_reass.c	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/netinet/tcp_reass.c	Wed Jul 29 17:59:13 2015	(r286027)
@@ -71,33 +71,80 @@ __FBSDID("$FreeBSD$");
 #include <netinet/tcp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW, 0,
+    "TCP Segment Reassembly Queue");
+
+static int tcp_reass_maxseg = 0;
+SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, maxsegments, CTLFLAG_RDTUN,
+    &tcp_reass_maxseg, 0,
+    "Global maximum number of TCP Segments in Reassembly Queue");
+
+static uma_zone_t tcp_reass_zone;
+SYSCTL_UMA_CUR(_net_inet_tcp_reass, OID_AUTO, cursegments, CTLFLAG_VNET,
+    &tcp_reass_zone,
+    "Global number of TCP Segments currently in Reassembly Queue");
+
+/* Initialize TCP reassembly queue */
+static void
+tcp_reass_zone_change(void *tag)
+{
+
+	/* Set the zone limit and read back the effective value. */
+	tcp_reass_maxseg = nmbclusters / 16;
+	tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+	    tcp_reass_maxseg);
+}
+
+void
+tcp_reass_global_init(void)
+{
+
+	tcp_reass_maxseg = nmbclusters / 16;
+	TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
+	    &tcp_reass_maxseg);
+	tcp_reass_zone = uma_zcreate("tcpreass", sizeof (struct tseg_qent),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	/* Set the zone limit and read back the effective value. */
+	tcp_reass_maxseg = uma_zone_set_max(tcp_reass_zone,
+	    tcp_reass_maxseg);
+	EVENTHANDLER_REGISTER(nmbclusters_change,
+	    tcp_reass_zone_change, NULL, EVENTHANDLER_PRI_ANY);
+}
 
 void
 tcp_reass_flush(struct tcpcb *tp)
 {
-	struct mbuf *m;
+	struct tseg_qent *qe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-	while ((m = tp->t_segq) != NULL) {
-		tp->t_segq = m->m_nextpkt;
-		tp->t_segqlen -= m->m_pkthdr.len;
-		m_freem(m);
+	while ((qe = LIST_FIRST(&tp->t_segq)) != NULL) {
+		LIST_REMOVE(qe, tqe_q);
+		m_freem(qe->tqe_m);
+		uma_zfree(tcp_reass_zone, qe);
+		tp->t_segqlen--;
 	}
 
 	KASSERT((tp->t_segqlen == 0),
-	    ("TCP reass queue %p length is %d instead of 0 after flush.",
+	    ("TCP reass queue %p segment count is %d instead of 0 after flush.",
 	    tp, tp->t_segqlen));
 }
 
-#define	M_TCPHDR(m)	((struct tcphdr *)((m)->m_pkthdr.pkt_tcphdr))
-
 int
 tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m)
 {
+	struct tseg_qent *q;
+	struct tseg_qent *p = NULL;
+	struct tseg_qent *nq;
+	struct tseg_qent *te = NULL;
 	struct socket *so = tp->t_inpcb->inp_socket;
-	struct mbuf *mq, *mp;
-	int flags, wakeup;
+	char *s = NULL;
+	int flags;
+	struct tseg_qent tqs;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
@@ -113,10 +160,6 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	if (th == NULL)
 		goto present;
 
-	M_ASSERTPKTHDR(m);
-	KASSERT(*tlenp == m->m_pkthdr.len, ("%s: tlenp %u len %u", __func__,
-	    *tlenp, m->m_pkthdr.len));
-
 	/*
 	 * Limit the number of segments that can be queued to reduce the
 	 * potential for mbuf exhaustion. For best performance, we want to be
@@ -127,15 +170,17 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * Always let the missing segment through which caused this queue.
 	 * NB: Access to the socket buffer is left intentionally unlocked as we
 	 * can tolerate stale information here.
+	 *
+	 * XXXLAS: Using sbspace(so->so_rcv) instead of so->so_rcv.sb_hiwat
+	 * should work but causes packets to be dropped when they shouldn't.
+	 * Investigate why and re-evaluate the below limit after the behaviour
+	 * is understood.
 	 */
 	if ((th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) &&
-	    tp->t_segqlen + m->m_pkthdr.len >= sbspace(&so->so_rcv)) {
-		char *s;
-
+	    tp->t_segqlen >= (so->so_rcv.sb_hiwat / tp->t_maxseg) + 1) {
 		TCPSTAT_INC(tcps_rcvreassfull);
 		*tlenp = 0;
-		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
-		    NULL))) {
+		if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: queue limit reached, "
 			    "segment dropped\n", s, __func__);
 			free(s, M_TCPLOG);
@@ -145,13 +190,46 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	}
 
 	/*
+	 * Allocate a new queue entry. If we can't, or hit the zone limit
+	 * just drop the pkt.
+	 *
+	 * Use a temporary structure on the stack for the missing segment
+	 * when the zone is exhausted. Otherwise we may get stuck.
+	 */
+	te = uma_zalloc(tcp_reass_zone, M_NOWAIT);
+	if (te == NULL) {
+		if (th->th_seq != tp->rcv_nxt || !TCPS_HAVEESTABLISHED(tp->t_state)) {
+			TCPSTAT_INC(tcps_rcvmemdrop);
+			m_freem(m);
+			*tlenp = 0;
+			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+			    NULL))) {
+				log(LOG_DEBUG, "%s; %s: global zone limit "
+				    "reached, segment dropped\n", s, __func__);
+				free(s, M_TCPLOG);
+			}
+			return (0);
+		} else {
+			bzero(&tqs, sizeof(struct tseg_qent));
+			te = &tqs;
+			if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL,
+			    NULL))) {
+				log(LOG_DEBUG,
+				    "%s; %s: global zone limit reached, using "
+				    "stack for missing segment\n", s, __func__);
+				free(s, M_TCPLOG);
+			}
+		}
+	}
+	tp->t_segqlen++;
+
+	/*
 	 * Find a segment which begins after this one does.
 	 */
-	mp = NULL;
-	for (mq = tp->t_segq; mq != NULL; mq = mq->m_nextpkt) {
-		if (SEQ_GT(M_TCPHDR(mq)->th_seq, th->th_seq))
+	LIST_FOREACH(q, &tp->t_segq, tqe_q) {
+		if (SEQ_GT(q->tqe_th->th_seq, th->th_seq))
 			break;
-		mp = mq;
+		p = q;
 	}
 
 	/*
@@ -159,16 +237,18 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * our data already.  If so, drop the data from the incoming
 	 * segment.  If it provides all of our data, drop us.
 	 */
-	if (mp != NULL) {
+	if (p != NULL) {
 		int i;
-
 		/* conversion to int (in i) handles seq wraparound */
-		i = M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len - th->th_seq;
+		i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
 		if (i > 0) {
 			if (i >= *tlenp) {
 				TCPSTAT_INC(tcps_rcvduppack);
 				TCPSTAT_ADD(tcps_rcvdupbyte, *tlenp);
 				m_freem(m);
+				if (te != &tqs)
+					uma_zfree(tcp_reass_zone, te);
+				tp->t_segqlen--;
 				/*
 				 * Try to present any queued data
 				 * at the left window edge to the user.
@@ -190,54 +270,37 @@ tcp_reass(struct tcpcb *tp, struct tcphd
 	 * While we overlap succeeding segments trim them or,
 	 * if they are completely covered, dequeue them.
 	 */
-	while (mq) {
-		struct mbuf *nq;
-		int i;
-
-		i = (th->th_seq + *tlenp) - M_TCPHDR(mq)->th_seq;
+	while (q) {
+		int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
 		if (i <= 0)
 			break;
-		if (i < mq->m_pkthdr.len) {
-			M_TCPHDR(mq)->th_seq += i;
-			m_adj(mq, i);
-			tp->t_segqlen -= i;
+		if (i < q->tqe_len) {
+			q->tqe_th->th_seq += i;
+			q->tqe_len -= i;
+			m_adj(q->tqe_m, i);
 			break;
 		}
 
-		nq = mq->m_nextpkt;
-		tp->t_segqlen -= mq->m_pkthdr.len;
-		m_freem(mq);
-		if (mp)
-			mp->m_nextpkt = nq;
-		else
-			tp->t_segq = nq;
-		mq = nq;
+		nq = LIST_NEXT(q, tqe_q);
+		LIST_REMOVE(q, tqe_q);
+		m_freem(q->tqe_m);
+		uma_zfree(tcp_reass_zone, q);
+		tp->t_segqlen--;
+		q = nq;
 	}
 
-	/*
-	 * Insert the new segment queue entry into place.  Try to collapse
-	 * mbuf chains if segments are adjacent.
-	 */
-	if (mp) {
-		if (M_TCPHDR(mp)->th_seq + mp->m_pkthdr.len == th->th_seq)
-			m_catpkt(mp, m);
-		else {
-			m->m_nextpkt = mp->m_nextpkt;
-			mp->m_nextpkt = m;
-			m->m_pkthdr.pkt_tcphdr = th;
-		}
+	/* Insert the new segment queue entry into place. */
+	te->tqe_m = m;
+	te->tqe_th = th;
+	te->tqe_len = *tlenp;
+
+	if (p == NULL) {
+		LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
 	} else {
-		mq = tp->t_segq;
-		tp->t_segq = m;
-		if (mq && th->th_seq + *tlenp == M_TCPHDR(mq)->th_seq) {
-			m->m_nextpkt = mq->m_nextpkt;
-			mq->m_nextpkt = NULL;
-			m_catpkt(m, mq);
-		} else
-			m->m_nextpkt = mq;
-		m->m_pkthdr.pkt_tcphdr = th;
+		KASSERT(te != &tqs, ("%s: temporary stack based entry not "
+		    "first element in queue", __func__));
+		LIST_INSERT_AFTER(p, te, tqe_q);
 	}
-	tp->t_segqlen += *tlenp;
 
 present:
 	/*
@@ -246,30 +309,25 @@ present:
 	 */
 	if (!TCPS_HAVEESTABLISHED(tp->t_state))
 		return (0);
-
-	flags = 0;
-	wakeup = 0;
+	q = LIST_FIRST(&tp->t_segq);
+	if (!q || q->tqe_th->th_seq != tp->rcv_nxt)
+		return (0);
 	SOCKBUF_LOCK(&so->so_rcv);
-	while ((mq = tp->t_segq) != NULL &&
-	    M_TCPHDR(mq)->th_seq == tp->rcv_nxt) {
-		tp->t_segq = mq->m_nextpkt;
-
-		tp->rcv_nxt += mq->m_pkthdr.len;
-		tp->t_segqlen -= mq->m_pkthdr.len;
-		flags = M_TCPHDR(mq)->th_flags & TH_FIN;
-
+	do {
+		tp->rcv_nxt += q->tqe_len;
+		flags = q->tqe_th->th_flags & TH_FIN;
+		nq = LIST_NEXT(q, tqe_q);
+		LIST_REMOVE(q, tqe_q);
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
-			m_freem(mq);
-		else {
-			mq->m_nextpkt = NULL;
-			sbappendstream_locked(&so->so_rcv, mq, 0);
-			wakeup = 1;
-		}
-	}
+			m_freem(q->tqe_m);
+		else
+			sbappendstream_locked(&so->so_rcv, q->tqe_m, 0);
+		if (q != &tqs)
+			uma_zfree(tcp_reass_zone, q);
+		tp->t_segqlen--;
+		q = nq;
+	} while (q && q->tqe_th->th_seq == tp->rcv_nxt);
 	ND6_HINT(tp);
-	if (wakeup)
-		sorwakeup_locked(so);
-	else
-		SOCKBUF_UNLOCK(&so->so_rcv);
+	sorwakeup_locked(so);
 	return (flags);
 }

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/netinet/tcp_subr.c	Wed Jul 29 17:59:13 2015	(r286027)
@@ -385,6 +385,8 @@ tcp_init(void)
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
+	tcp_reass_global_init();
+
 	/* XXX virtualize those bellow? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/netinet/tcp_usrreq.c	Wed Jul 29 17:59:13 2015	(r286027)
@@ -1977,7 +1977,7 @@ db_print_tcpcb(struct tcpcb *tp, const c
 
 	db_print_indent(indent);
 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
-	   tp->t_segq, tp->t_segqlen, tp->t_dupacks);
+	   LIST_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/netinet/tcp_var.h	Wed Jul 29 17:59:13 2015	(r286027)
@@ -46,6 +46,15 @@ VNET_DECLARE(int, tcp_do_rfc1323);
 
 #endif /* _KERNEL */
 
+/* TCP segment queue entry */
+struct tseg_qent {
+	LIST_ENTRY(tseg_qent) tqe_q;
+	int	tqe_len;		/* TCP segment data length */
+	struct	tcphdr *tqe_th;		/* a pointer to tcp header */
+	struct	mbuf	*tqe_m;		/* mbuf contains packet */
+};
+LIST_HEAD(tsegqe_head, tseg_qent);
+
 struct sackblk {
 	tcp_seq start;		/* start seq no. of sack block */
 	tcp_seq end;		/* end seq no. */
@@ -91,7 +100,7 @@ do {								\
  * Organized for 16 byte cacheline efficiency.
  */
 struct tcpcb {
-	struct	mbuf *t_segq;		/* segment reassembly queue */
+	struct	tsegqe_head t_segq;	/* segment reassembly queue */
 	void	*t_pspare[2];		/* new reassembly queue */
 	int	t_segqlen;		/* segment reassembly queue length */
 	int	t_dupacks;		/* consecutive dup acks recd */
@@ -667,6 +676,7 @@ char	*tcp_log_addrs(struct in_conninfo *
 char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, int *, struct mbuf *);
+void	 tcp_reass_global_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
 int	 tcp_input(struct mbuf **, int *, int);
 u_long	 tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h	Wed Jul 29 17:50:14 2015	(r286026)
+++ head/sys/sys/mbuf.h	Wed Jul 29 17:59:13 2015	(r286027)
@@ -150,7 +150,6 @@ struct pkthdr {
 #define	tso_segsz	PH_per.sixteen[1]
 #define	csum_phsum	PH_per.sixteen[2]
 #define	csum_data	PH_per.thirtytwo[1]
-#define	pkt_tcphdr	PH_loc.ptr
 
 /*
  * Description of external storage mapped into mbuf; valid only if M_EXT is



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201507291759.t6THxEZ2061562>