From owner-svn-src-stable@freebsd.org Tue Aug 14 17:54:40 2018 Return-Path: Delivered-To: svn-src-stable@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id CEE5C107FF61; Tue, 14 Aug 2018 17:54:40 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 80C8A7F0EC; Tue, 14 Aug 2018 17:54:40 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 5E10E1EEF0; Tue, 14 Aug 2018 17:54:40 +0000 (UTC) (envelope-from jtl@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w7EHsenw032466; Tue, 14 Aug 2018 17:54:40 GMT (envelope-from jtl@FreeBSD.org) Received: (from jtl@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id w7EHsejC032465; Tue, 14 Aug 2018 17:54:40 GMT (envelope-from jtl@FreeBSD.org) Message-Id: <201808141754.w7EHsejC032465@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: jtl set sender to jtl@FreeBSD.org using -f From: "Jonathan T. 
Looney" Date: Tue, 14 Aug 2018 17:54:40 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org Subject: svn commit: r337796 - stable/11/sys/netinet X-SVN-Group: stable-11 X-SVN-Commit-Author: jtl X-SVN-Commit-Paths: stable/11/sys/netinet X-SVN-Commit-Revision: 337796 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable@freebsd.org X-Mailman-Version: 2.1.27 Precedence: list List-Id: SVN commit messages for all the -stable branches of the src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 14 Aug 2018 17:54:41 -0000 Author: jtl Date: Tue Aug 14 17:54:39 2018 New Revision: 337796 URL: https://svnweb.freebsd.org/changeset/base/337796 Log: MFC r337780: Implement a limit on the number of IPv4 reassembly queues per bucket. There is a hashing algorithm which should distribute IPv4 reassembly queues across the available buckets in a relatively even way. However, if there is a flaw in the hashing algorithm which allows a large number of IPv4 fragment reassembly queues to end up in a single bucket, a per-bucket limit could help mitigate the performance impact of this flaw. Implement such a limit, with a default of twice the maximum number of reassembly queues divided by the number of buckets. Recalculate the limit any time the maximum number of reassembly queues changes. However, allow the user to override the value using a sysctl (net.inet.ip.maxfragbucketsize). 
Approved by: so Security: FreeBSD-SA-18:10.ip Security: CVE-2018-6923 Modified: stable/11/sys/netinet/ip_reass.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/netinet/ip_reass.c ============================================================================== --- stable/11/sys/netinet/ip_reass.c Tue Aug 14 17:52:06 2018 (r337795) +++ stable/11/sys/netinet/ip_reass.c Tue Aug 14 17:54:39 2018 (r337796) @@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -70,6 +71,7 @@ SYSCTL_DECL(_net_inet_ip); struct ipqbucket { TAILQ_HEAD(ipqhead, ipq) head; struct mtx lock; + int count; }; static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]); @@ -82,6 +84,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed); #define IPQ_UNLOCK(i) mtx_unlock(&V_ipq[i].lock) #define IPQ_LOCK_ASSERT(i) mtx_assert(&V_ipq[i].lock, MA_OWNED) +static VNET_DEFINE(int, ipreass_maxbucketsize); +#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize) + void ipreass_init(void); void ipreass_drain(void); void ipreass_slowtimo(void); @@ -89,25 +94,26 @@ void ipreass_slowtimo(void); void ipreass_destroy(void); #endif static int sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS); +static int sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS); static void ipreass_zone_change(void *); static void ipreass_drain_tomax(void); -static void ipq_free(struct ipqhead *, struct ipq *); +static void ipq_free(struct ipqbucket *, struct ipq *); static struct ipq * ipq_reuse(int); static inline void -ipq_timeout(struct ipqhead *head, struct ipq *fp) +ipq_timeout(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static inline void -ipq_drop(struct ipqhead *head, struct ipq *fp) +ipq_drop(struct ipqbucket *bucket, struct ipq *fp) { IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags); - ipq_free(head, fp); + ipq_free(bucket, fp); } static int maxfrags; @@ -136,6 +142,10 @@ static 
VNET_DEFINE(int, maxfragsperpacket); SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(maxfragsperpacket), 0, "Maximum number of IPv4 fragments allowed per packet"); +SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize, + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, + sysctl_maxfragbucketsize, "I", + "Maximum number of IPv4 fragment reassembly queue entries per bucket"); /* * Take incoming datagram fragment and try to reassemble it into @@ -241,9 +251,12 @@ ip_reass(struct mbuf *m) * If first fragment to arrive, create a reassembly queue. */ if (fp == NULL) { - fp = uma_zalloc(V_ipq_zone, M_NOWAIT); + if (V_ipq[hash].count < V_ipreass_maxbucketsize) + fp = uma_zalloc(V_ipq_zone, M_NOWAIT); if (fp == NULL) fp = ipq_reuse(hash); + if (fp == NULL) + goto dropfrag; #ifdef MAC if (mac_ipq_init(fp, M_NOWAIT) != 0) { uma_zfree(V_ipq_zone, fp); @@ -253,6 +266,7 @@ ip_reass(struct mbuf *m) mac_ipq_create(m, fp); #endif TAILQ_INSERT_HEAD(head, fp, ipq_list); + V_ipq[hash].count++; fp->ipq_nfrags = 1; atomic_add_int(&nfrags, 1); fp->ipq_ttl = IPFRAGTTL; @@ -360,7 +374,7 @@ ip_reass(struct mbuf *m) for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) { if (ntohs(GETIP(q)->ip_off) != next) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } next += ntohs(GETIP(q)->ip_len); @@ -368,7 +382,7 @@ ip_reass(struct mbuf *m) /* Make sure the last packet didn't have the IP_MF flag */ if (p->m_flags & M_IP_FRAG) { if (fp->ipq_nfrags > V_maxfragsperpacket) - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -379,7 +393,7 @@ ip_reass(struct mbuf *m) ip = GETIP(q); if (next + (ip->ip_hl << 2) > IP_MAXPACKET) { IPSTAT_INC(ips_toolong); - ipq_drop(head, fp); + ipq_drop(&V_ipq[hash], fp); goto done; } @@ -423,6 +437,7 @@ ip_reass(struct mbuf *m) ip->ip_src = fp->ipq_src; ip->ip_dst = fp->ipq_dst; TAILQ_REMOVE(head, fp, ipq_list); + 
V_ipq[hash].count--; uma_zfree(V_ipq_zone, fp); m->m_len += (ip->ip_hl << 2); m->m_data -= (ip->ip_hl << 2); @@ -486,17 +501,21 @@ done: void ipreass_init(void) { + int max; for (int i = 0; i < IPREASS_NHASH; i++) { TAILQ_INIT(&V_ipq[i].head); mtx_init(&V_ipq[i].lock, "IP reassembly", NULL, MTX_DEF | MTX_DUPOK); + V_ipq[i].count = 0; } V_ipq_hashseed = arc4random(); V_maxfragsperpacket = 16; V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - uma_zone_set_max(V_ipq_zone, nmbclusters / 32); + max = nmbclusters / 32; + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); if (IS_DEFAULT_VNET(curvnet)) { maxfrags = nmbclusters / 32; @@ -517,7 +536,7 @@ ipreass_slowtimo(void) IPQ_LOCK(i); TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp) if (--fp->ipq_ttl == 0) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -532,7 +551,10 @@ ipreass_drain(void) for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); while(!TAILQ_EMPTY(&V_ipq[i].head)) - ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head)); + ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head)); + KASSERT(V_ipq[i].count == 0, + ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i, + V_ipq[i].count, V_ipq)); IPQ_UNLOCK(i); } } @@ -560,9 +582,23 @@ ipreass_destroy(void) static void ipreass_drain_tomax(void) { + struct ipq *fp; int target; /* + * Make sure each bucket is under the new limit. If + * necessary, drop enough of the oldest elements from + * each bucket to get under the new limit. + */ + for (int i = 0; i < IPREASS_NHASH; i++) { + IPQ_LOCK(i); + while (V_ipq[i].count > V_ipreass_maxbucketsize && + (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL) + ipq_timeout(&V_ipq[i], fp); + IPQ_UNLOCK(i); + } + + /* * If we are over the maximum number of fragments, * drain off enough to get down to the new limit, * stripping off last elements on queues. 
Every @@ -570,13 +606,11 @@ ipreass_drain_tomax(void) */ target = uma_zone_get_max(V_ipq_zone); while (uma_zone_get_cur(V_ipq_zone) > target) { - struct ipq *fp; - for (int i = 0; i < IPREASS_NHASH; i++) { IPQ_LOCK(i); fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); if (fp != NULL) - ipq_timeout(&V_ipq[i].head, fp); + ipq_timeout(&V_ipq[i], fp); IPQ_UNLOCK(i); } } @@ -593,7 +627,8 @@ ipreass_zone_change(void *tag) VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - uma_zone_set_max(V_ipq_zone, max); + max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); CURVNET_RESTORE(); } @@ -625,6 +660,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) * and place an extreme upper bound. */ max = uma_zone_set_max(V_ipq_zone, max); + V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1); ipreass_drain_tomax(); V_noreass = 0; } else if (max == 0) { @@ -633,6 +669,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS) } else if (max == -1) { V_noreass = 0; uma_zone_set_max(V_ipq_zone, 0); + V_ipreass_maxbucketsize = INT_MAX; } else return (EINVAL); return (0); @@ -646,16 +683,15 @@ static struct ipq * ipq_reuse(int start) { struct ipq *fp; - int i; + int bucket, i; IPQ_LOCK_ASSERT(start); - for (i = start;; i++) { - if (i == IPREASS_NHASH) - i = 0; - if (i != start && IPQ_TRYLOCK(i) == 0) + for (i = 0; i < IPREASS_NHASH; i++) { + bucket = (start + i) % IPREASS_NHASH; + if (bucket != start && IPQ_TRYLOCK(bucket) == 0) continue; - fp = TAILQ_LAST(&V_ipq[i].head, ipqhead); + fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead); if (fp) { struct mbuf *m; @@ -666,22 +702,24 @@ ipq_reuse(int start) fp->ipq_frags = m->m_nextpkt; m_freem(m); } - TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list); - if (i != start) - IPQ_UNLOCK(i); - IPQ_LOCK_ASSERT(start); - return (fp); + TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list); + V_ipq[bucket].count--; + if (bucket != start) + IPQ_UNLOCK(bucket); + break; } - if (i != 
start) - IPQ_UNLOCK(i); + if (bucket != start) + IPQ_UNLOCK(bucket); } + IPQ_LOCK_ASSERT(start); + return (fp); } /* * Free a fragment reassembly header and all associated datagrams. */ static void -ipq_free(struct ipqhead *fhp, struct ipq *fp) +ipq_free(struct ipqbucket *bucket, struct ipq *fp) { struct mbuf *q; @@ -691,6 +729,26 @@ ipq_free(struct ipqhead *fhp, struct ipq *fp) fp->ipq_frags = q->m_nextpkt; m_freem(q); } - TAILQ_REMOVE(fhp, fp, ipq_list); + TAILQ_REMOVE(&bucket->head, fp, ipq_list); + bucket->count--; uma_zfree(V_ipq_zone, fp); +} + +/* + * Get or set the maximum number of reassembly queues per bucket. + */ +static int +sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS) +{ + int error, max; + + max = V_ipreass_maxbucketsize; + error = sysctl_handle_int(oidp, &max, 0, req); + if (error || !req->newptr) + return (error); + if (max <= 0) + return (EINVAL); + V_ipreass_maxbucketsize = max; + ipreass_drain_tomax(); + return (0); }