Date: Tue, 14 Aug 2018 17:17:38 +0000 (UTC)
From: "Jonathan T. Looney" <jtl@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r337776 - head/sys/netinet6
Message-ID: <201808141717.w7EHHcPf010971@repo.freebsd.org>
Author: jtl
Date: Tue Aug 14 17:17:37 2018
New Revision: 337776
URL: https://svnweb.freebsd.org/changeset/base/337776

Log:
  Improve IPv6 reassembly performance by hashing fragments into buckets.

  Currently, all IPv6 fragment reassembly queues are kept in a flat
  linked list.  This has a number of implications.  Two significant
  implications are: all reassembly operations share a common lock, and
  it is possible for the linked list to grow quite large.

  Improve IPv6 reassembly performance by hashing fragments into buckets,
  each of which has its own lock.  Calculate the hash key using a
  Jenkins hash with a random seed.

  Reviewed by:    jhb
  Security:       FreeBSD-SA-18:10.ip
  Security:       CVE-2018-6923

Modified:
  head/sys/netinet6/frag6.c

Modified: head/sys/netinet6/frag6.c
==============================================================================
--- head/sys/netinet6/frag6.c   Tue Aug 14 17:15:47 2018   (r337775)
+++ head/sys/netinet6/frag6.c   Tue Aug 14 17:17:37 2018   (r337776)
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");

 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/hash.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
@@ -49,6 +50,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/syslog.h>

+#include <machine/atomic.h>
+
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
@@ -65,29 +68,41 @@ __FBSDID("$FreeBSD$");

 #include <security/mac/mac_framework.h>

-static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
-static void frag6_deq(struct ip6asfrag *);
-static void frag6_insque(struct ip6q *, struct ip6q *);
-static void frag6_remque(struct ip6q *);
-static void frag6_freef(struct ip6q *);
-
-static struct mtx ip6qlock;
 /*
- * These fields all protected by ip6qlock.
+ * Reassembly headers are stored in hash buckets.
  */
-VNET_DEFINE_STATIC(u_int, frag6_nfragpackets);
-VNET_DEFINE_STATIC(u_int, frag6_nfrags);
-VNET_DEFINE_STATIC(struct ip6q, ip6q);          /* ip6 reassemble queue */
+#define IP6REASS_NHASH_LOG2     6
+#define IP6REASS_NHASH          (1 << IP6REASS_NHASH_LOG2)
+#define IP6REASS_HMASK          (IP6REASS_NHASH - 1)

+static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
+    uint32_t bucket __unused);
+static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
+static void frag6_insque_head(struct ip6q *, struct ip6q *,
+    uint32_t bucket __unused);
+static void frag6_remque(struct ip6q *, uint32_t bucket __unused);
+static void frag6_freef(struct ip6q *, uint32_t bucket);
+
+struct ip6qbucket {
+        struct ip6q     ip6q;
+        struct mtx      lock;
+};
+
+VNET_DEFINE_STATIC(volatile u_int, frag6_nfragpackets);
+VNET_DEFINE_STATIC(volatile u_int, frag6_nfrags);
+VNET_DEFINE_STATIC(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
+VNET_DEFINE_STATIC(uint32_t, ip6q_hashseed);
+
 #define V_frag6_nfragpackets            VNET(frag6_nfragpackets)
 #define V_frag6_nfrags                  VNET(frag6_nfrags)
 #define V_ip6q                          VNET(ip6q)
+#define V_ip6q_hashseed                 VNET(ip6q_hashseed)

-#define IP6Q_LOCK_INIT()        mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
-#define IP6Q_LOCK()             mtx_lock(&ip6qlock)
-#define IP6Q_TRYLOCK()          mtx_trylock(&ip6qlock)
-#define IP6Q_LOCK_ASSERT()      mtx_assert(&ip6qlock, MA_OWNED)
-#define IP6Q_UNLOCK()           mtx_unlock(&ip6qlock)
+#define IP6Q_LOCK(i)            mtx_lock(&V_ip6q[(i)].lock)
+#define IP6Q_TRYLOCK(i)         mtx_trylock(&V_ip6q[(i)].lock)
+#define IP6Q_LOCK_ASSERT(i)     mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
+#define IP6Q_UNLOCK(i)          mtx_unlock(&V_ip6q[(i)].lock)
+#define IP6Q_HEAD(i)            (&V_ip6q[(i)].ip6q)

 static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");

@@ -105,18 +120,22 @@ frag6_change(void *tag)
 void
 frag6_init(void)
 {
+        struct ip6q *q6;
+        int i;

         V_ip6_maxfragpackets = nmbclusters / 4;
         V_ip6_maxfrags = nmbclusters / 4;

-        V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
-
+        for (i = 0; i < IP6REASS_NHASH; i++) {
+                q6 = IP6Q_HEAD(i);
+                q6->ip6q_next = q6->ip6q_prev = q6;
+                mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
+        }
+        V_ip6q_hashseed = arc4random();
         if (!IS_DEFAULT_VNET(curvnet))
                 return;

         EVENTHANDLER_REGISTER(nmbclusters_change,
             frag6_change, NULL, EVENTHANDLER_PRI_ANY);
-
-        IP6Q_LOCK_INIT();
 }

@@ -157,12 +176,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
         struct mbuf *m = *mp, *t;
         struct ip6_hdr *ip6;
         struct ip6_frag *ip6f;
-        struct ip6q *q6;
+        struct ip6q *head, *q6;
         struct ip6asfrag *af6, *ip6af, *af6dwn;
         struct in6_ifaddr *ia;
         int offset = *offp, nxt, i, next;
         int first_frag = 0;
         int fragoff, frgpartlen;        /* must be larger than u_int16_t */
+        uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp;
         struct ifnet *dstifp;
         u_int8_t ecn, ecn0;
 #ifdef RSS
@@ -231,7 +251,16 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                 return (ip6f->ip6f_nxt);
         }

-        IP6Q_LOCK();
+        hashkeyp = hashkey;
+        memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
+        hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+        memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
+        hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+        *hashkeyp = ip6f->ip6f_ident;
+        hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
+        hash &= IP6REASS_HMASK;
+        head = IP6Q_HEAD(hash);
+        IP6Q_LOCK(hash);

         /*
          * Enforce upper bound on number of fragments.
@@ -240,10 +269,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
          */
         if (V_ip6_maxfrags < 0)
                 ;
-        else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
+        else if (atomic_load_int(&V_frag6_nfrags) >= (u_int)V_ip6_maxfrags)
                 goto dropfrag;

-        for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
+        for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
                 if (ip6f->ip6f_ident == q6->ip6q_ident &&
                     IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
                     IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
@@ -253,7 +282,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                     )
                         break;

-        if (q6 == &V_ip6q) {
+        if (q6 == head) {
                 /*
                  * the first fragment to arrive, create a reassembly queue.
                  */
@@ -268,9 +297,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                  */
                 if (V_ip6_maxfragpackets < 0)
                         ;
-                else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
+                else if (atomic_load_int(&V_frag6_nfragpackets) >=
+                    (u_int)V_ip6_maxfragpackets)
                         goto dropfrag;
-                V_frag6_nfragpackets++;
+                atomic_add_int(&V_frag6_nfragpackets, 1);
                 q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
                     M_NOWAIT);
                 if (q6 == NULL)
@@ -283,7 +313,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                 }
                 mac_ip6q_create(m, q6);
 #endif
-                frag6_insque(q6, &V_ip6q);
+                frag6_insque_head(q6, head, hash);

                 /* ip6q_nxt will be filled afterwards, from 1st fragment */
                 q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
@@ -324,14 +354,14 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                         icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
                             offset - sizeof(struct ip6_frag) +
                             offsetof(struct ip6_frag, ip6f_offlg));
-                        IP6Q_UNLOCK();
+                        IP6Q_UNLOCK(hash);
                         return (IPPROTO_DONE);
                 }
         } else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
                 icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
                     offset - sizeof(struct ip6_frag) +
                     offsetof(struct ip6_frag, ip6f_offlg));
-                IP6Q_UNLOCK();
+                IP6Q_UNLOCK(hash);
                 return (IPPROTO_DONE);
         }
         /*
@@ -350,7 +380,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                                 int erroff = af6->ip6af_offset;

                                 /* dequeue the fragment. */
-                                frag6_deq(af6);
+                                frag6_deq(af6, hash);
                                 free(af6, M_FTABLE);

                                 /* adjust pointer. */
@@ -448,7 +478,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
                         }
                         af6 = af6->ip6af_down;
                         m_freem(IP6_REASS_MBUF(af6->ip6af_up));
-                        frag6_deq(af6->ip6af_up);
+                        frag6_deq(af6->ip6af_up, hash);
                 }
 #else
         /*
@@ -500,26 +530,26 @@ insert:
          * Move to front of packet queue, as we are
          * the most recently active fragmented packet.
          */
-        frag6_enq(ip6af, af6->ip6af_up);
-        V_frag6_nfrags++;
+        frag6_enq(ip6af, af6->ip6af_up, hash);
+        atomic_add_int(&V_frag6_nfrags, 1);
         q6->ip6q_nfrag++;
 #if 0 /* xxx */
-        if (q6 != V_ip6q.ip6q_next) {
-                frag6_remque(q6);
-                frag6_insque(q6, &V_ip6q);
+        if (q6 != head->ip6q_next) {
+                frag6_remque(q6, hash);
+                frag6_insque_head(q6, head, hash);
         }
 #endif
         next = 0;
         for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
              af6 = af6->ip6af_down) {
                 if (af6->ip6af_off != next) {
-                        IP6Q_UNLOCK();
+                        IP6Q_UNLOCK(hash);
                         return IPPROTO_DONE;
                 }
                 next += af6->ip6af_frglen;
         }
         if (af6->ip6af_up->ip6af_mff) {
-                IP6Q_UNLOCK();
+                IP6Q_UNLOCK(hash);
                 return IPPROTO_DONE;
         }

@@ -529,7 +559,7 @@ insert:
         ip6af = q6->ip6q_down;
         t = m = IP6_REASS_MBUF(ip6af);
         af6 = ip6af->ip6af_down;
-        frag6_deq(ip6af);
+        frag6_deq(ip6af, hash);
         while (af6 != (struct ip6asfrag *)q6) {
                 m->m_pkthdr.csum_flags &=
                     IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags;
@@ -537,7 +567,7 @@ insert:
                     IP6_REASS_MBUF(af6)->m_pkthdr.csum_data;

                 af6dwn = af6->ip6af_down;
-                frag6_deq(af6);
+                frag6_deq(af6, hash);
                 while (t->m_next)
                         t = t->m_next;
                 m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@@ -564,13 +594,13 @@ insert:
 #endif

         if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
-                frag6_remque(q6);
-                V_frag6_nfrags -= q6->ip6q_nfrag;
+                frag6_remque(q6, hash);
+                atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
                 mac_ip6q_destroy(q6);
 #endif
                 free(q6, M_FTABLE);
-                V_frag6_nfragpackets--;
+                atomic_subtract_int(&V_frag6_nfragpackets, 1);
                 goto dropfrag;
         }

@@ -581,14 +611,14 @@ insert:
         m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t),
             (caddr_t)&nxt);

-        frag6_remque(q6);
-        V_frag6_nfrags -= q6->ip6q_nfrag;
+        frag6_remque(q6, hash);
+        atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
         mac_ip6q_reassemble(q6, m);
         mac_ip6q_destroy(q6);
 #endif
         free(q6, M_FTABLE);
-        V_frag6_nfragpackets--;
+        atomic_subtract_int(&V_frag6_nfragpackets, 1);

         if (m->m_flags & M_PKTHDR) {    /* Isn't it always true? */
                 int plen = 0;
@@ -610,7 +640,7 @@ insert:
         m_tag_prepend(m, mtag);
 #endif

-        IP6Q_UNLOCK();
+        IP6Q_UNLOCK(hash);
         IP6STAT_INC(ip6s_reassembled);
         in6_ifstat_inc(dstifp, ifs6_reass_ok);

@@ -632,7 +662,7 @@ insert:
         return nxt;

 dropfrag:
-        IP6Q_UNLOCK();
+        IP6Q_UNLOCK(hash);
         in6_ifstat_inc(dstifp, ifs6_reass_fail);
         IP6STAT_INC(ip6s_fragdropped);
         m_freem(m);
@@ -643,19 +673,19 @@ insert:
  * Free a fragment reassembly header and all
  * associated datagrams.
  */
-void
-frag6_freef(struct ip6q *q6)
+static void
+frag6_freef(struct ip6q *q6, uint32_t bucket)
 {
         struct ip6asfrag *af6, *down6;

-        IP6Q_LOCK_ASSERT();
+        IP6Q_LOCK_ASSERT(bucket);

         for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
             af6 = down6) {
                 struct mbuf *m = IP6_REASS_MBUF(af6);

                 down6 = af6->ip6af_down;
-                frag6_deq(af6);
+                frag6_deq(af6, bucket);

                 /*
                  * Return ICMP time exceeded error for the 1st fragment.
@@ -677,24 +707,25 @@ frag6_freef(struct ip6q *q6)
                 m_freem(m);
                 free(af6, M_FTABLE);
         }
-        frag6_remque(q6);
-        V_frag6_nfrags -= q6->ip6q_nfrag;
+        frag6_remque(q6, bucket);
+        atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
         mac_ip6q_destroy(q6);
 #endif
         free(q6, M_FTABLE);
-        V_frag6_nfragpackets--;
+        atomic_subtract_int(&V_frag6_nfragpackets, 1);
 }

 /*
  * Put an ip fragment on a reassembly chain.
  * Like insque, but pointers in middle of structure.
  */
-void
-frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
+static void
+frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
+    uint32_t bucket __unused)
 {

-        IP6Q_LOCK_ASSERT();
+        IP6Q_LOCK_ASSERT(bucket);

         af6->ip6af_up = up6;
         af6->ip6af_down = up6->ip6af_down;
@@ -705,21 +736,24 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6
 /*
  * To frag6_enq as remque is to insque.
  */
-void
-frag6_deq(struct ip6asfrag *af6)
+static void
+frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
 {

-        IP6Q_LOCK_ASSERT();
+        IP6Q_LOCK_ASSERT(bucket);

         af6->ip6af_up->ip6af_down = af6->ip6af_down;
         af6->ip6af_down->ip6af_up = af6->ip6af_up;
 }

-void
-frag6_insque(struct ip6q *new, struct ip6q *old)
+static void
+frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket __unused)
 {

-        IP6Q_LOCK_ASSERT();
+        IP6Q_LOCK_ASSERT(bucket);
+        KASSERT(IP6Q_HEAD(bucket) == old,
+            ("%s: attempt to insert at head of wrong bucket"
+            " (bucket=%u, old=%p)", __func__, bucket, old));

         new->ip6q_prev = old;
         new->ip6q_next = old->ip6q_next;
@@ -727,11 +761,11 @@ frag6_insque(struct ip6q *new, struct ip6q *old)
         old->ip6q_next = new;
 }

-void
-frag6_remque(struct ip6q *p6)
+static void
+frag6_remque(struct ip6q *p6, uint32_t bucket __unused)
 {

-        IP6Q_LOCK_ASSERT();
+        IP6Q_LOCK_ASSERT(bucket);

         p6->ip6q_prev->ip6q_next = p6->ip6q_next;
         p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
@@ -746,37 +780,42 @@
 void
 frag6_slowtimo(void)
 {
         VNET_ITERATOR_DECL(vnet_iter);
-        struct ip6q *q6;
+        struct ip6q *head, *q6;
+        int i;

         VNET_LIST_RLOCK_NOSLEEP();
-        IP6Q_LOCK();
         VNET_FOREACH(vnet_iter) {
                 CURVNET_SET(vnet_iter);
-                q6 = V_ip6q.ip6q_next;
-                if (q6)
-                        while (q6 != &V_ip6q) {
-                                --q6->ip6q_ttl;
-                                q6 = q6->ip6q_next;
-                                if (q6->ip6q_prev->ip6q_ttl == 0) {
-                                        IP6STAT_INC(ip6s_fragtimeout);
-                                        /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-                                        frag6_freef(q6->ip6q_prev);
+                for (i = 0; i < IP6REASS_NHASH; i++) {
+                        IP6Q_LOCK(i);
+                        head = IP6Q_HEAD(i);
+                        q6 = head->ip6q_next;
+                        if (q6)
+                                while (q6 != head) {
+                                        --q6->ip6q_ttl;
+                                        q6 = q6->ip6q_next;
+                                        if (q6->ip6q_prev->ip6q_ttl == 0) {
+                                                IP6STAT_INC(ip6s_fragtimeout);
+                                                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                                                frag6_freef(q6->ip6q_prev, i);
+                                        }
                                 }
+                        /*
+                         * If we are over the maximum number of fragments
+                         * (due to the limit being lowered), drain off
+                         * enough to get down to the new limit.
+                         */
+                        while (atomic_load_int(&V_frag6_nfragpackets) >
+                            (u_int)V_ip6_maxfragpackets &&
+                            head->ip6q_prev != head) {
+                                IP6STAT_INC(ip6s_fragoverflow);
+                                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                                frag6_freef(head->ip6q_prev, i);
                         }
-                /*
-                 * If we are over the maximum number of fragments
-                 * (due to the limit being lowered), drain off
-                 * enough to get down to the new limit.
-                 */
-                while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
-                    V_ip6q.ip6q_prev) {
-                        IP6STAT_INC(ip6s_fragoverflow);
-                        /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-                        frag6_freef(V_ip6q.ip6q_prev);
+                        IP6Q_UNLOCK(i);
                 }
                 CURVNET_RESTORE();
         }
-        IP6Q_UNLOCK();
         VNET_LIST_RUNLOCK_NOSLEEP();
 }

@@ -787,22 +826,25 @@
 void
 frag6_drain(void)
 {
         VNET_ITERATOR_DECL(vnet_iter);
+        struct ip6q *head;
+        int i;

         VNET_LIST_RLOCK_NOSLEEP();
-        if (IP6Q_TRYLOCK() == 0) {
-                VNET_LIST_RUNLOCK_NOSLEEP();
-                return;
-        }
         VNET_FOREACH(vnet_iter) {
                 CURVNET_SET(vnet_iter);
-                while (V_ip6q.ip6q_next != &V_ip6q) {
-                        IP6STAT_INC(ip6s_fragdropped);
-                        /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-                        frag6_freef(V_ip6q.ip6q_next);
+                for (i = 0; i < IP6REASS_NHASH; i++) {
+                        if (IP6Q_TRYLOCK(i) == 0)
+                                continue;
+                        head = IP6Q_HEAD(i);
+                        while (head->ip6q_next != head) {
+                                IP6STAT_INC(ip6s_fragdropped);
+                                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                                frag6_freef(head->ip6q_next, i);
+                        }
+                        IP6Q_UNLOCK(i);
                 }
                 CURVNET_RESTORE();
         }
-        IP6Q_UNLOCK();
         VNET_LIST_RUNLOCK_NOSLEEP();
 }
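To see the bucket-selection scheme from the log message in isolation, here is a minimal, hypothetical userland sketch.  It is not the committed kernel code: the kernel hashes the key with jenkins_hash32() from <sys/hash.h>, seeds it with arc4random() in frag6_init(), and protects each bucket with its own mutex, while the mix32() helper, the fixed seed, and the tight array sizing below are stand-ins chosen only so the sketch compiles on its own.

/*
 * Hypothetical userland sketch of the bucket selection described in the
 * log message: hash (source address, destination address, fragment ID)
 * with a seeded hash and mask the result down to one of IP6REASS_NHASH
 * buckets.  mix32() is only a stand-in for the kernel's jenkins_hash32().
 */
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IP6REASS_NHASH_LOG2     6
#define IP6REASS_NHASH          (1 << IP6REASS_NHASH_LOG2)
#define IP6REASS_HMASK          (IP6REASS_NHASH - 1)

/* Stand-in for jenkins_hash32(): any reasonable seeded 32-bit mix will do. */
static uint32_t
mix32(const uint32_t *key, size_t nwords, uint32_t seed)
{
        uint32_t h = seed;
        size_t i;

        for (i = 0; i < nwords; i++) {
                h ^= key[i];
                h *= 0x9e3779b1u;               /* multiplicative mixing step */
                h = (h << 13) | (h >> 19);      /* rotate to spread the bits */
        }
        return (h);
}

/* Pack the key in the same order as the patch: src, dst, then fragment ID. */
static uint32_t
frag6_bucket(const struct in6_addr *src, const struct in6_addr *dst,
    uint32_t ident, uint32_t seed)
{
        uint32_t hashkey[sizeof(struct in6_addr) / sizeof(uint32_t) * 2 + 1];
        uint32_t *hashkeyp = hashkey;

        memcpy(hashkeyp, src, sizeof(*src));
        hashkeyp += sizeof(*src) / sizeof(*hashkeyp);
        memcpy(hashkeyp, dst, sizeof(*dst));
        hashkeyp += sizeof(*dst) / sizeof(*hashkeyp);
        *hashkeyp = ident;

        return (mix32(hashkey, sizeof(hashkey) / sizeof(hashkey[0]), seed) &
            IP6REASS_HMASK);
}

int
main(void)
{
        struct in6_addr src, dst;
        uint32_t seed = 0x5eed1234;     /* the kernel would use arc4random() */

        inet_pton(AF_INET6, "2001:db8::1", &src);
        inet_pton(AF_INET6, "2001:db8::2", &dst);
        printf("fragment ID 0x1234 -> bucket %u of %d\n",
            (unsigned)frag6_bucket(&src, &dst, 0x1234, seed), IP6REASS_NHASH);
        return (0);
}

Masking with IP6REASS_HMASK reduces the hash to a bucket index without a modulo because IP6REASS_NHASH is a power of two, and seeding the hash with a random value at initialization makes it harder for a remote sender to steer all of its fragments into a single bucket, which is the resource-exhaustion concern tracked by FreeBSD-SA-18:10.ip and CVE-2018-6923.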