Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 14 Aug 2018 17:17:38 +0000 (UTC)
From:      "Jonathan T. Looney" <jtl@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r337776 - head/sys/netinet6
Message-ID:  <201808141717.w7EHHcPf010971@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jtl
Date: Tue Aug 14 17:17:37 2018
New Revision: 337776
URL: https://svnweb.freebsd.org/changeset/base/337776

Log:
  Improve IPv6 reassembly performance by hashing fragments into buckets.
  
  Currently, all IPv6 fragment reassembly queues are kept in a flat
  linked list. This has a number of implications. Two significant
  implications are: all reassembly operations share a common lock,
  and it is possible for the linked list to grow quite large.
  
  Improve IPv6 reassembly performance by hashing fragments into buckets,
  each of which has its own lock. Calculate the hash key using a Jenkins
  hash with a random seed.
  
  Reviewed by:	jhb
  Security:	FreeBSD-SA-18:10.ip
  Security:	CVE-2018-6923

Modified:
  head/sys/netinet6/frag6.c

Modified: head/sys/netinet6/frag6.c
==============================================================================
--- head/sys/netinet6/frag6.c	Tue Aug 14 17:15:47 2018	(r337775)
+++ head/sys/netinet6/frag6.c	Tue Aug 14 17:17:37 2018	(r337776)
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/hash.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
@@ -49,6 +50,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 
+#include <machine/atomic.h>
+
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
@@ -65,29 +68,41 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
-static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
-static void frag6_deq(struct ip6asfrag *);
-static void frag6_insque(struct ip6q *, struct ip6q *);
-static void frag6_remque(struct ip6q *);
-static void frag6_freef(struct ip6q *);
-
-static struct mtx ip6qlock;
 /*
- * These fields all protected by ip6qlock.
+ * Reassembly headers are stored in hash buckets.
  */
-VNET_DEFINE_STATIC(u_int, frag6_nfragpackets);
-VNET_DEFINE_STATIC(u_int, frag6_nfrags);
-VNET_DEFINE_STATIC(struct ip6q, ip6q);	/* ip6 reassemble queue */
+#define	IP6REASS_NHASH_LOG2	6
+#define	IP6REASS_NHASH		(1 << IP6REASS_NHASH_LOG2)
+#define	IP6REASS_HMASK		(IP6REASS_NHASH - 1)
 
+static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
+    uint32_t bucket __unused);
+static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
+static void frag6_insque_head(struct ip6q *, struct ip6q *,
+    uint32_t bucket __unused);
+static void frag6_remque(struct ip6q *, uint32_t bucket __unused);
+static void frag6_freef(struct ip6q *, uint32_t bucket);
+
+struct ip6qbucket {
+	struct ip6q	ip6q;
+	struct mtx	lock;
+};
+
+VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets);
+VNET_DEFINE_STATIC(volatile u_int, frag6_nfrags);
+VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
+VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed);
+
 #define	V_frag6_nfragpackets		VNET(frag6_nfragpackets)
 #define	V_frag6_nfrags			VNET(frag6_nfrags)
 #define	V_ip6q				VNET(ip6q)
+#define	V_ip6q_hashseed			VNET(ip6q_hashseed)
 
-#define	IP6Q_LOCK_INIT()	mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
-#define	IP6Q_LOCK()		mtx_lock(&ip6qlock)
-#define	IP6Q_TRYLOCK()		mtx_trylock(&ip6qlock)
-#define	IP6Q_LOCK_ASSERT()	mtx_assert(&ip6qlock, MA_OWNED)
-#define	IP6Q_UNLOCK()		mtx_unlock(&ip6qlock)
+#define	IP6Q_LOCK(i)		mtx_lock(&V_ip6q[(i)].lock)
+#define	IP6Q_TRYLOCK(i)		mtx_trylock(&V_ip6q[(i)].lock)
+#define	IP6Q_LOCK_ASSERT(i)	mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
+#define	IP6Q_UNLOCK(i)		mtx_unlock(&V_ip6q[(i)].lock)
+#define	IP6Q_HEAD(i)		(&V_ip6q[(i)].ip6q)
 
 static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
 
@@ -105,18 +120,22 @@ frag6_change(void *tag)
 void
 frag6_init(void)
 {
+	struct ip6q *q6;
+	int i;
 
 	V_ip6_maxfragpackets = nmbclusters / 4;
 	V_ip6_maxfrags = nmbclusters / 4;
-	V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
-
+	for (i = 0; i < IP6REASS_NHASH; i++) {
+		q6 = IP6Q_HEAD(i);
+		q6->ip6q_next = q6->ip6q_prev = q6;
+		mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
+	}
+	V_ip6q_hashseed = arc4random();
 	if (!IS_DEFAULT_VNET(curvnet))
 		return;
 
 	EVENTHANDLER_REGISTER(nmbclusters_change,
 	    frag6_change, NULL, EVENTHANDLER_PRI_ANY);
-
-	IP6Q_LOCK_INIT();
 }
 
 /*
@@ -157,12 +176,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 	struct mbuf *m = *mp, *t;
 	struct ip6_hdr *ip6;
 	struct ip6_frag *ip6f;
-	struct ip6q *q6;
+	struct ip6q *head, *q6;
 	struct ip6asfrag *af6, *ip6af, *af6dwn;
 	struct in6_ifaddr *ia;
 	int offset = *offp, nxt, i, next;
 	int first_frag = 0;
 	int fragoff, frgpartlen;	/* must be larger than u_int16_t */
+	uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp;
 	struct ifnet *dstifp;
 	u_int8_t ecn, ecn0;
 #ifdef RSS
@@ -231,7 +251,16 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 		return (ip6f->ip6f_nxt);
 	}
 
-	IP6Q_LOCK();
+	hashkeyp = hashkey;
+	memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
+	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+	memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
+	hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+	*hashkeyp = ip6f->ip6f_ident;
+	hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
+	hash &= IP6REASS_HMASK;
+	head = IP6Q_HEAD(hash);
+	IP6Q_LOCK(hash);
 
 	/*
 	 * Enforce upper bound on number of fragments.
@@ -240,10 +269,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 	 */
 	if (V_ip6_maxfrags < 0)
 		;
-	else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
+	else if (atomic_load_int(&V_frag6_nfrags) >= (u_int)V_ip6_maxfrags)
 		goto dropfrag;
 
-	for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
+	for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
 		if (ip6f->ip6f_ident == q6->ip6q_ident &&
 		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
 		    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
@@ -253,7 +282,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 		    )
 			break;
 
-	if (q6 == &V_ip6q) {
+	if (q6 == head) {
 		/*
 		 * the first fragment to arrive, create a reassembly queue.
 		 */
@@ -268,9 +297,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 		 */
 		if (V_ip6_maxfragpackets < 0)
 			;
-		else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
+		else if (atomic_load_int(&V_frag6_nfragpackets) >=
+		    (u_int)V_ip6_maxfragpackets)
 			goto dropfrag;
-		V_frag6_nfragpackets++;
+		atomic_add_int(&V_frag6_nfragpackets, 1);
 		q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
 		    M_NOWAIT);
 		if (q6 == NULL)
@@ -283,7 +313,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 		}
 		mac_ip6q_create(m, q6);
 #endif
-		frag6_insque(q6, &V_ip6q);
+		frag6_insque_head(q6, head, hash);
 
 		/* ip6q_nxt will be filled afterwards, from 1st fragment */
 		q6->ip6q_down	= q6->ip6q_up = (struct ip6asfrag *)q6;
@@ -324,14 +354,14 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 			icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 			    offset - sizeof(struct ip6_frag) +
 			    offsetof(struct ip6_frag, ip6f_offlg));
-			IP6Q_UNLOCK();
+			IP6Q_UNLOCK(hash);
 			return (IPPROTO_DONE);
 		}
 	} else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
 		icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
 		    offset - sizeof(struct ip6_frag) +
 		    offsetof(struct ip6_frag, ip6f_offlg));
-		IP6Q_UNLOCK();
+		IP6Q_UNLOCK(hash);
 		return (IPPROTO_DONE);
 	}
 	/*
@@ -350,7 +380,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 				int erroff = af6->ip6af_offset;
 
 				/* dequeue the fragment. */
-				frag6_deq(af6);
+				frag6_deq(af6, hash);
 				free(af6, M_FTABLE);
 
 				/* adjust pointer. */
@@ -448,7 +478,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
 		}
 		af6 = af6->ip6af_down;
 		m_freem(IP6_REASS_MBUF(af6->ip6af_up));
-		frag6_deq(af6->ip6af_up);
+		frag6_deq(af6->ip6af_up, hash);
 	}
 #else
 	/*
@@ -500,26 +530,26 @@ insert:
 	 * Move to front of packet queue, as we are
 	 * the most recently active fragmented packet.
 	 */
-	frag6_enq(ip6af, af6->ip6af_up);
-	V_frag6_nfrags++;
+	frag6_enq(ip6af, af6->ip6af_up, hash);
+	atomic_add_int(&V_frag6_nfrags, 1);
 	q6->ip6q_nfrag++;
 #if 0 /* xxx */
-	if (q6 != V_ip6q.ip6q_next) {
-		frag6_remque(q6);
-		frag6_insque(q6, &V_ip6q);
+	if (q6 != head->ip6q_next) {
+		frag6_remque(q6, hash);
+		frag6_insque_head(q6, head, hash);
 	}
 #endif
 	next = 0;
 	for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 	     af6 = af6->ip6af_down) {
 		if (af6->ip6af_off != next) {
-			IP6Q_UNLOCK();
+			IP6Q_UNLOCK(hash);
 			return IPPROTO_DONE;
 		}
 		next += af6->ip6af_frglen;
 	}
 	if (af6->ip6af_up->ip6af_mff) {
-		IP6Q_UNLOCK();
+		IP6Q_UNLOCK(hash);
 		return IPPROTO_DONE;
 	}
 
@@ -529,7 +559,7 @@ insert:
 	ip6af = q6->ip6q_down;
 	t = m = IP6_REASS_MBUF(ip6af);
 	af6 = ip6af->ip6af_down;
-	frag6_deq(ip6af);
+	frag6_deq(ip6af, hash);
 	while (af6 != (struct ip6asfrag *)q6) {
 		m->m_pkthdr.csum_flags &=
 		    IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
@@ -537,7 +567,7 @@ insert:
 		    IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;
 
 		af6dwn = af6->ip6af_down;
-		frag6_deq(af6);
+		frag6_deq(af6, hash);
 		while (t->m_next)
 			t = t->m_next;
 		m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@@ -564,13 +594,13 @@ insert:
 #endif
 
 	if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
-		frag6_remque(q6);
-		V_frag6_nfrags -= q6->ip6q_nfrag;
+		frag6_remque(q6, hash);
+		atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
 		mac_ip6q_destroy(q6);
 #endif
 		free(q6, M_FTABLE);
-		V_frag6_nfragpackets--;
+		atomic_subtract_int(&V_frag6_nfragpackets, 1);
 
 		goto dropfrag;
 	}
@@ -581,14 +611,14 @@ insert:
 	m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
 	    (caddr_t)&nxt);
 
-	frag6_remque(q6);
-	V_frag6_nfrags -= q6->ip6q_nfrag;
+	frag6_remque(q6, hash);
+	atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
 	mac_ip6q_reassemble(q6, m);
 	mac_ip6q_destroy(q6);
 #endif
 	free(q6, M_FTABLE);
-	V_frag6_nfragpackets--;
+	atomic_subtract_int(&V_frag6_nfragpackets, 1);
 
 	if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
 		int plen = 0;
@@ -610,7 +640,7 @@ insert:
 	m_tag_prepend(m, mtag);
 #endif
 
-	IP6Q_UNLOCK();
+	IP6Q_UNLOCK(hash);
 	IP6STAT_INC(ip6s_reassembled);
 	in6_ifstat_inc(dstifp, ifs6_reass_ok);
 
@@ -632,7 +662,7 @@ insert:
 	return nxt;
 
  dropfrag:
-	IP6Q_UNLOCK();
+	IP6Q_UNLOCK(hash);
 	in6_ifstat_inc(dstifp, ifs6_reass_fail);
 	IP6STAT_INC(ip6s_fragdropped);
 	m_freem(m);
@@ -643,19 +673,19 @@ insert:
  * Free a fragment reassembly header and all
  * associated datagrams.
  */
-void
-frag6_freef(struct ip6q *q6)
+static void
+frag6_freef(struct ip6q *q6, uint32_t bucket)
 {
 	struct ip6asfrag *af6, *down6;
 
-	IP6Q_LOCK_ASSERT();
+	IP6Q_LOCK_ASSERT(bucket);
 
 	for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
 	     af6 = down6) {
 		struct mbuf *m = IP6_REASS_MBUF(af6);
 
 		down6 = af6->ip6af_down;
-		frag6_deq(af6);
+		frag6_deq(af6, bucket);
 
 		/*
 		 * Return ICMP time exceeded error for the 1st fragment.
@@ -677,24 +707,25 @@ frag6_freef(struct ip6q *q6)
 			m_freem(m);
 		free(af6, M_FTABLE);
 	}
-	frag6_remque(q6);
-	V_frag6_nfrags -= q6->ip6q_nfrag;
+	frag6_remque(q6, bucket);
+	atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
 	mac_ip6q_destroy(q6);
 #endif
 	free(q6, M_FTABLE);
-	V_frag6_nfragpackets--;
+	atomic_subtract_int(&V_frag6_nfragpackets, 1);
 }
 
 /*
  * Put an ip fragment on a reassembly chain.
  * Like insque, but pointers in middle of structure.
  */
-void
-frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
+static void
+frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
+    uint32_t bucket __unused)
 {
 
-	IP6Q_LOCK_ASSERT();
+	IP6Q_LOCK_ASSERT(bucket);
 
 	af6->ip6af_up = up6;
 	af6->ip6af_down = up6->ip6af_down;
@@ -705,21 +736,24 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6
 /*
  * To frag6_enq as remque is to insque.
  */
-void
-frag6_deq(struct ip6asfrag *af6)
+static void
+frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
 {
 
-	IP6Q_LOCK_ASSERT();
+	IP6Q_LOCK_ASSERT(bucket);
 
 	af6->ip6af_up->ip6af_down = af6->ip6af_down;
 	af6->ip6af_down->ip6af_up = af6->ip6af_up;
 }
 
-void
-frag6_insque(struct ip6q *new, struct ip6q *old)
+static void
+frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket __unused)
 {
 
-	IP6Q_LOCK_ASSERT();
+	IP6Q_LOCK_ASSERT(bucket);
+	KASSERT(IP6Q_HEAD(bucket) == old,
+	    ("%s: attempt to insert at head of wrong bucket"
+	    " (bucket=%u, old=%p)", __func__, bucket, old));
 
 	new->ip6q_prev = old;
 	new->ip6q_next = old->ip6q_next;
@@ -727,11 +761,11 @@ frag6_insque(struct ip6q *new, struct ip6q *old)
 	old->ip6q_next = new;
 }
 
-void
-frag6_remque(struct ip6q *p6)
+static void
+frag6_remque(struct ip6q *p6, uint32_t bucket __unused)
 {
 
-	IP6Q_LOCK_ASSERT();
+	IP6Q_LOCK_ASSERT(bucket);
 
 	p6->ip6q_prev->ip6q_next = p6->ip6q_next;
 	p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
@@ -746,37 +780,42 @@ void
 frag6_slowtimo(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
-	struct ip6q *q6;
+	struct ip6q *head, *q6;
+	int i;
 
 	VNET_LIST_RLOCK_NOSLEEP();
-	IP6Q_LOCK();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		q6 = V_ip6q.ip6q_next;
-		if (q6)
-			while (q6 != &V_ip6q) {
-				--q6->ip6q_ttl;
-				q6 = q6->ip6q_next;
-				if (q6->ip6q_prev->ip6q_ttl == 0) {
-					IP6STAT_INC(ip6s_fragtimeout);
-					/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-					frag6_freef(q6->ip6q_prev);
+		for (i = 0; i < IP6REASS_NHASH; i++) {
+			IP6Q_LOCK(i);
+			head = IP6Q_HEAD(i);
+			q6 = head->ip6q_next;
+			if (q6)
+				while (q6 != head) {
+					--q6->ip6q_ttl;
+					q6 = q6->ip6q_next;
+					if (q6->ip6q_prev->ip6q_ttl == 0) {
+						IP6STAT_INC(ip6s_fragtimeout);
+						/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+						frag6_freef(q6->ip6q_prev, i);
+					}
 				}
+			/*
+			 * If we are over the maximum number of fragments
+			 * (due to the limit being lowered), drain off
+			 * enough to get down to the new limit.
+			 */
+			while (atomic_load_int(&V_frag6_nfragpackets) >
+			    (u_int)V_ip6_maxfragpackets &&
+			    head->ip6q_prev != head) {
+				IP6STAT_INC(ip6s_fragoverflow);
+				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+				frag6_freef(head->ip6q_prev, i);
 			}
-		/*
-		 * If we are over the maximum number of fragments
-		 * (due to the limit being lowered), drain off
-		 * enough to get down to the new limit.
-		 */
-		while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
-		    V_ip6q.ip6q_prev) {
-			IP6STAT_INC(ip6s_fragoverflow);
-			/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-			frag6_freef(V_ip6q.ip6q_prev);
+			IP6Q_UNLOCK(i);
 		}
 		CURVNET_RESTORE();
 	}
-	IP6Q_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
@@ -787,22 +826,25 @@ void
 frag6_drain(void)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
+	struct ip6q *head;
+	int i;
 
 	VNET_LIST_RLOCK_NOSLEEP();
-	if (IP6Q_TRYLOCK() == 0) {
-		VNET_LIST_RUNLOCK_NOSLEEP();
-		return;
-	}
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		while (V_ip6q.ip6q_next != &V_ip6q) {
-			IP6STAT_INC(ip6s_fragdropped);
-			/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-			frag6_freef(V_ip6q.ip6q_next);
+		for (i = 0; i < IP6REASS_NHASH; i++) {
+			if (IP6Q_TRYLOCK(i) == 0)
+				continue;
+			head = IP6Q_HEAD(i);
+			while (head->ip6q_next != head) {
+				IP6STAT_INC(ip6s_fragdropped);
+				/* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+				frag6_freef(head->ip6q_next, i);
+			}
+			IP6Q_UNLOCK(i);
 		}
 		CURVNET_RESTORE();
 	}
-	IP6Q_UNLOCK();
 	VNET_LIST_RUNLOCK_NOSLEEP();
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808141717.w7EHHcPf010971>