From owner-svn-src-all@freebsd.org Tue Aug 14 17:46:55 2018 Return-Path: Delivered-To: svn-src-all@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 38E94107FACA; Tue, 14 Aug 2018 17:46:55 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id E09177DFAA; Tue, 14 Aug 2018 17:46:54 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id A49371ED4F; Tue, 14 Aug 2018 17:46:54 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w7EHksGZ027122; Tue, 14 Aug 2018 17:46:54 GMT (envelope-from jtl@FreeBSD.org) Received: (from jtl@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id w7EHkskL027121; Tue, 14 Aug 2018 17:46:54 GMT (envelope-from jtl@FreeBSD.org) Message-Id: <201808141746.w7EHkskL027121@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: jtl set sender to jtl@FreeBSD.org using -f From: "Jonathan T. Looney" Date: Tue, 14 Aug 2018 17:46:54 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org Subject: svn commit: r337790 - stable/11/sys/netinet6 X-SVN-Group: stable-11 X-SVN-Commit-Author: jtl X-SVN-Commit-Paths: stable/11/sys/netinet6 X-SVN-Commit-Revision: 337790 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.27 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 14 Aug 2018 17:46:55 -0000 Author: jtl Date: Tue Aug 14 17:46:54 2018 New Revision: 337790 URL: https://svnweb.freebsd.org/changeset/base/337790 Log: MFC r337776: Improve IPv6 reassembly performance by hashing fragments into buckets. Currently, all IPv6 fragment reassembly queues are kept in a flat linked list. This has a number of implications. Two significant implications are: all reassembly operations share a common lock, and it is possible for the linked list to grow quite large. Improve IPv6 reassembly performance by hashing fragments into buckets, each of which has its own lock. Calculate the hash key using a Jenkins hash with a random seed. Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet6/frag6.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet6/frag6.c ============================================================================== --- stable/11/sys/netinet6/frag6.c Tue Aug 14 17:43:11 2018 (r337789) +++ stable/11/sys/netinet6/frag6.c Tue Aug 14 17:46:54 2018 (r337790) @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include #include +#include + #include #include #include @@ -63,29 +66,41 @@ __FBSDID("$FreeBSD$"); #include -static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *); -static void frag6_deq(struct ip6asfrag *); -static void frag6_insque(struct ip6q *, struct ip6q *); -static void frag6_remque(struct ip6q *); -static void frag6_freef(struct ip6q *); - -static struct mtx ip6qlock; /* - * These fields all protected by ip6qlock. + * Reassembly headers are stored in hash buckets. */ -static VNET_DEFINE(u_int, frag6_nfragpackets); -static VNET_DEFINE(u_int, frag6_nfrags); -static VNET_DEFINE(struct ip6q, ip6q); /* ip6 reassemble queue */ +#define IP6REASS_NHASH_LOG2 6 +#define IP6REASS_NHASH (1 << IP6REASS_NHASH_LOG2) +#define IP6REASS_HMASK (IP6REASS_NHASH - 1) +static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *, + uint32_t bucket __unused); +static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused); +static void frag6_insque_head(struct ip6q *, struct ip6q *, + uint32_t bucket __unused); +static void frag6_remque(struct ip6q *, uint32_t bucket __unused); +static void frag6_freef(struct ip6q *, uint32_t bucket); + +struct ip6qbucket { + struct ip6q ip6q; + struct mtx lock; +}; + +static VNET_DEFINE(volatile u_int, frag6_nfragpackets); +static VNET_DEFINE(volatile u_int, frag6_nfrags); +static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]); +static VNET_DEFINE(uint32_t, ip6q_hashseed); + #define V_frag6_nfragpackets VNET(frag6_nfragpackets) #define V_frag6_nfrags VNET(frag6_nfrags) #define V_ip6q VNET(ip6q) +#define V_ip6q_hashseed VNET(ip6q_hashseed) -#define IP6Q_LOCK_INIT() mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF); -#define IP6Q_LOCK() mtx_lock(&ip6qlock) -#define IP6Q_TRYLOCK() mtx_trylock(&ip6qlock) -#define IP6Q_LOCK_ASSERT() mtx_assert(&ip6qlock, MA_OWNED) -#define IP6Q_UNLOCK() mtx_unlock(&ip6qlock) +#define IP6Q_LOCK(i) mtx_lock(&V_ip6q[(i)].lock) +#define IP6Q_TRYLOCK(i) mtx_trylock(&V_ip6q[(i)].lock) +#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED) +#define IP6Q_UNLOCK(i) mtx_unlock(&V_ip6q[(i)].lock) +#define IP6Q_HEAD(i) (&V_ip6q[(i)].ip6q) static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header"); @@ -103,18 +118,22 @@ frag6_change(void *tag) void frag6_init(void) { + struct ip6q *q6; + int i; V_ip6_maxfragpackets = nmbclusters / 4; V_ip6_maxfrags = nmbclusters / 4; - V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q; - + for (i = 0; i < IP6REASS_NHASH; i++) { + q6 = IP6Q_HEAD(i); + q6->ip6q_next = q6->ip6q_prev = q6; + mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF); + } + V_ip6q_hashseed = arc4random(); if (!IS_DEFAULT_VNET(curvnet)) return; EVENTHANDLER_REGISTER(nmbclusters_change, frag6_change, NULL, EVENTHANDLER_PRI_ANY); - - IP6Q_LOCK_INIT(); } /* @@ -155,12 +174,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto) struct mbuf *m = *mp, *t; struct ip6_hdr *ip6; struct ip6_frag *ip6f; - struct ip6q *q6; + struct ip6q *head, *q6; struct ip6asfrag *af6, *ip6af, *af6dwn; struct in6_ifaddr *ia; int offset = *offp, nxt, i, next; int first_frag = 0; int fragoff, frgpartlen; /* must be larger than u_int16_t */ + uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp; struct ifnet *dstifp; u_int8_t ecn, ecn0; #ifdef RSS @@ -229,7 +249,16 @@ frag6_input(struct mbuf **mp, int *offp, int proto) return (ip6f->ip6f_nxt); } - IP6Q_LOCK(); + hashkeyp = hashkey; + memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr)); + hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); + memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr)); + hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp); + *hashkeyp = ip6f->ip6f_ident; + hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed); + hash &= IP6REASS_HMASK; + head = IP6Q_HEAD(hash); + IP6Q_LOCK(hash); /* * Enforce upper bound on number of fragments. @@ -241,7 +270,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags) goto dropfrag; - for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next) + for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next) if (ip6f->ip6f_ident == q6->ip6q_ident && IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) && IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst) @@ -251,7 +280,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) ) break; - if (q6 == &V_ip6q) { + if (q6 == head) { /* * the first fragment to arrive, create a reassembly queue. */ @@ -268,7 +297,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) ; else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets) goto dropfrag; - V_frag6_nfragpackets++; + atomic_add_int(&V_frag6_nfragpackets, 1); q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE, M_NOWAIT); if (q6 == NULL) @@ -281,7 +310,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) } mac_ip6q_create(m, q6); #endif - frag6_insque(q6, &V_ip6q); + frag6_insque_head(q6, head, hash); /* ip6q_nxt will be filled afterwards, from 1st fragment */ q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6; @@ -322,14 +351,14 @@ frag6_input(struct mbuf **mp, int *offp, int proto) icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); return (IPPROTO_DONE); } } else if (fragoff + frgpartlen > IPV6_MAXPACKET) { icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, offset - sizeof(struct ip6_frag) + offsetof(struct ip6_frag, ip6f_offlg)); - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); return (IPPROTO_DONE); } /* @@ -348,7 +377,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) int erroff = af6->ip6af_offset; /* dequeue the fragment. */ - frag6_deq(af6); + frag6_deq(af6, hash); free(af6, M_FTABLE); /* adjust pointer. */ @@ -446,7 +475,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto) } af6 = af6->ip6af_down; m_freem(IP6_REASS_MBUF(af6->ip6af_up)); - frag6_deq(af6->ip6af_up); + frag6_deq(af6->ip6af_up, hash); } #else /* @@ -498,26 +527,26 @@ insert: * Move to front of packet queue, as we are * the most recently active fragmented packet. */ - frag6_enq(ip6af, af6->ip6af_up); - V_frag6_nfrags++; + frag6_enq(ip6af, af6->ip6af_up, hash); + atomic_add_int(&V_frag6_nfrags, 1); q6->ip6q_nfrag++; #if 0 /* xxx */ - if (q6 != V_ip6q.ip6q_next) { - frag6_remque(q6); - frag6_insque(q6, &V_ip6q); + if (q6 != head->ip6q_next) { + frag6_remque(q6, hash); + frag6_insque_head(q6, head, hash); } #endif next = 0; for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = af6->ip6af_down) { if (af6->ip6af_off != next) { - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); return IPPROTO_DONE; } next += af6->ip6af_frglen; } if (af6->ip6af_up->ip6af_mff) { - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); return IPPROTO_DONE; } @@ -527,7 +556,7 @@ insert: ip6af = q6->ip6q_down; t = m = IP6_REASS_MBUF(ip6af); af6 = ip6af->ip6af_down; - frag6_deq(ip6af); + frag6_deq(ip6af, hash); while (af6 != (struct ip6asfrag *)q6) { m->m_pkthdr.csum_flags &= IP6_REASS_MBUF(af6)->m_pkthdr.csum_flags; @@ -535,7 +564,7 @@ insert: IP6_REASS_MBUF(af6)->m_pkthdr.csum_data; af6dwn = af6->ip6af_down; - frag6_deq(af6); + frag6_deq(af6, hash); while (t->m_next) t = t->m_next; m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset); @@ -562,13 +591,13 @@ insert: #endif if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) { - frag6_remque(q6); - V_frag6_nfrags -= q6->ip6q_nfrag; + frag6_remque(q6, hash); + atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); - V_frag6_nfragpackets--; + atomic_subtract_int(&V_frag6_nfragpackets, 1); goto dropfrag; } @@ -579,14 +608,14 @@ insert: m_copyback(m, ip6_get_prevhdr(m, offset), sizeof(uint8_t), (caddr_t)&nxt); - frag6_remque(q6); - V_frag6_nfrags -= q6->ip6q_nfrag; + frag6_remque(q6, hash); + atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_reassemble(q6, m); mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); - V_frag6_nfragpackets--; + atomic_subtract_int(&V_frag6_nfragpackets, 1); if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */ int plen = 0; @@ -608,7 +637,7 @@ insert: m_tag_prepend(m, mtag); #endif - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); IP6STAT_INC(ip6s_reassembled); in6_ifstat_inc(dstifp, ifs6_reass_ok); @@ -630,7 +659,7 @@ insert: return nxt; dropfrag: - IP6Q_UNLOCK(); + IP6Q_UNLOCK(hash); in6_ifstat_inc(dstifp, ifs6_reass_fail); IP6STAT_INC(ip6s_fragdropped); m_freem(m); @@ -641,19 +670,19 @@ insert: * Free a fragment reassembly header and all * associated datagrams. */ -void -frag6_freef(struct ip6q *q6) +static void +frag6_freef(struct ip6q *q6, uint32_t bucket) { struct ip6asfrag *af6, *down6; - IP6Q_LOCK_ASSERT(); + IP6Q_LOCK_ASSERT(bucket); for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; af6 = down6) { struct mbuf *m = IP6_REASS_MBUF(af6); down6 = af6->ip6af_down; - frag6_deq(af6); + frag6_deq(af6, bucket); /* * Return ICMP time exceeded error for the 1st fragment. @@ -675,24 +704,25 @@ frag6_freef(struct ip6q *q6) m_freem(m); free(af6, M_FTABLE); } - frag6_remque(q6); - V_frag6_nfrags -= q6->ip6q_nfrag; + frag6_remque(q6, bucket); + atomic_subtract_int(&V_frag6_nfrags, q6->ip6q_nfrag); #ifdef MAC mac_ip6q_destroy(q6); #endif free(q6, M_FTABLE); - V_frag6_nfragpackets--; + atomic_subtract_int(&V_frag6_nfragpackets, 1); } /* * Put an ip fragment on a reassembly chain. * Like insque, but pointers in middle of structure. */ -void -frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6) +static void +frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6, + uint32_t bucket __unused) { - IP6Q_LOCK_ASSERT(); + IP6Q_LOCK_ASSERT(bucket); af6->ip6af_up = up6; af6->ip6af_down = up6->ip6af_down; @@ -703,21 +733,24 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6 /* * To frag6_enq as remque is to insque. */ -void -frag6_deq(struct ip6asfrag *af6) +static void +frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused) { - IP6Q_LOCK_ASSERT(); + IP6Q_LOCK_ASSERT(bucket); af6->ip6af_up->ip6af_down = af6->ip6af_down; af6->ip6af_down->ip6af_up = af6->ip6af_up; } -void -frag6_insque(struct ip6q *new, struct ip6q *old) +static void +frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket __unused) { - IP6Q_LOCK_ASSERT(); + IP6Q_LOCK_ASSERT(bucket); + KASSERT(IP6Q_HEAD(bucket) == old, + ("%s: attempt to insert at head of wrong bucket" + " (bucket=%u, old=%p)", __func__, bucket, old)); new->ip6q_prev = old; new->ip6q_next = old->ip6q_next; @@ -725,11 +758,11 @@ frag6_insque(struct ip6q *new, struct ip6q *old) old->ip6q_next = new; } -void -frag6_remque(struct ip6q *p6) +static void +frag6_remque(struct ip6q *p6, uint32_t bucket __unused) { - IP6Q_LOCK_ASSERT(); + IP6Q_LOCK_ASSERT(bucket); p6->ip6q_prev->ip6q_next = p6->ip6q_next; p6->ip6q_next->ip6q_prev = p6->ip6q_prev; @@ -744,37 +777,42 @@ void frag6_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); - struct ip6q *q6; + struct ip6q *head, *q6; + int i; VNET_LIST_RLOCK_NOSLEEP(); - IP6Q_LOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - q6 = V_ip6q.ip6q_next; - if (q6) - while (q6 != &V_ip6q) { - --q6->ip6q_ttl; - q6 = q6->ip6q_next; - if (q6->ip6q_prev->ip6q_ttl == 0) { - IP6STAT_INC(ip6s_fragtimeout); - /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(q6->ip6q_prev); + for (i = 0; i < IP6REASS_NHASH; i++) { + IP6Q_LOCK(i); + head = IP6Q_HEAD(i); + q6 = head->ip6q_next; + if (q6) + while (q6 != head) { + --q6->ip6q_ttl; + q6 = q6->ip6q_next; + if (q6->ip6q_prev->ip6q_ttl == 0) { + IP6STAT_INC(ip6s_fragtimeout); + /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ + frag6_freef(q6->ip6q_prev, i); + } } + /* + * If we are over the maximum number of fragments + * (due to the limit being lowered), drain off + * enough to get down to the new limit. + */ + while (V_frag6_nfragpackets > + (u_int)V_ip6_maxfragpackets && + head->ip6q_prev != head) { + IP6STAT_INC(ip6s_fragoverflow); + /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ + frag6_freef(head->ip6q_prev, i); } - /* - * If we are over the maximum number of fragments - * (due to the limit being lowered), drain off - * enough to get down to the new limit. - */ - while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets && - V_ip6q.ip6q_prev) { - IP6STAT_INC(ip6s_fragoverflow); - /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(V_ip6q.ip6q_prev); + IP6Q_UNLOCK(i); } CURVNET_RESTORE(); } - IP6Q_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); } @@ -785,22 +823,25 @@ void frag6_drain(void) { VNET_ITERATOR_DECL(vnet_iter); + struct ip6q *head; + int i; VNET_LIST_RLOCK_NOSLEEP(); - if (IP6Q_TRYLOCK() == 0) { - VNET_LIST_RUNLOCK_NOSLEEP(); - return; - } VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - while (V_ip6q.ip6q_next != &V_ip6q) { - IP6STAT_INC(ip6s_fragdropped); - /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ - frag6_freef(V_ip6q.ip6q_next); + for (i = 0; i < IP6REASS_NHASH; i++) { + if (IP6Q_TRYLOCK(i) == 0) + continue; + head = IP6Q_HEAD(i); + while (head->ip6q_next != head) { + IP6STAT_INC(ip6s_fragdropped); + /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */ + frag6_freef(head->ip6q_next, i); + } + IP6Q_UNLOCK(i); } CURVNET_RESTORE(); } - IP6Q_UNLOCK(); VNET_LIST_RUNLOCK_NOSLEEP(); }