Date: Thu, 1 Aug 2019 14:17:32 +0000 (UTC)
From: Randall Stewart <rrs@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r350501 - in head/sys: conf dev/cxgbe dev/mlx5/mlx5_en net netinet
Message-ID: <201908011417.x71EHW0j029518@repo.freebsd.org>
Author: rrs Date: Thu Aug 1 14:17:31 2019 New Revision: 350501 URL: https://svnweb.freebsd.org/changeset/base/350501 Log: This adds the third step in getting BBR into the tree. BBR and an updated rack depend on having access to the new ratelimit api in this commit. Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D20953 Added: head/sys/netinet/tcp_ratelimit.c (contents, props changed) head/sys/netinet/tcp_ratelimit.h (contents, props changed) Modified: head/sys/conf/files head/sys/dev/cxgbe/adapter.h head/sys/dev/cxgbe/t4_main.c head/sys/dev/cxgbe/t4_sched.c head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c head/sys/net/if_dead.c head/sys/net/if_lagg.c head/sys/net/if_var.h head/sys/netinet/in_pcb.c head/sys/netinet/in_pcb.h Modified: head/sys/conf/files ============================================================================== --- head/sys/conf/files Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/conf/files Thu Aug 1 14:17:31 2019 (r350501) @@ -4276,6 +4276,7 @@ netinet/tcp_lro.c optional inet | inet6 netinet/tcp_output.c optional inet | inet6 netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6 netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6 +netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6 netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \ compile-with "${NORMAL_C} ${NO_WNONNULL}" netinet/tcp_reass.c optional inet | inet6 Modified: head/sys/dev/cxgbe/adapter.h ============================================================================== --- head/sys/dev/cxgbe/adapter.h Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/dev/cxgbe/adapter.h Thu Aug 1 14:17:31 2019 (r350501) @@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_ int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); void cxgbe_snd_tag_free(struct m_snd_tag *); void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *); +void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *); #endif /* t4_filter.c */ Modified: head/sys/dev/cxgbe/t4_main.c ============================================================================== --- head/sys/dev/cxgbe/t4_main.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/dev/cxgbe/t4_main.c Thu Aug 1 14:17:31 2019 (r350501) @@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi) ifp->if_snd_tag_modify = cxgbe_snd_tag_modify; ifp->if_snd_tag_query = cxgbe_snd_tag_query; ifp->if_snd_tag_free = cxgbe_snd_tag_free; + ifp->if_ratelimit_query = cxgbe_ratelimit_query; #endif ifp->if_capabilities = T4_CAP; Modified: head/sys/dev/cxgbe/t4_sched.c ============================================================================== --- head/sys/dev/cxgbe/t4_sched.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/dev/cxgbe/t4_sched.c Thu Aug 1 14:17:31 2019 (r350501) @@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst) } mtx_unlock(&cst->lock); } + +#define CXGBE_MAX_FLOWS 4000 /* Testing show so far thats all this adapter can do */ +#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be setup */ + +void +cxgbe_ratelimit_query(struct ifnet *ifp __unused, + struct if_ratelimit_query_results *q) +{ + /* + * This is a skeleton and needs future work + * by the driver supporters. It should be + * enhanced to look at the specific type of + * interface and select approprate values + * for these settings. 
This example goes + * with an earlier card (t5), it has a maximum + * number of 16 rates that the first guys in + * select (thus the flags value RT_IS_SELECTABLE). + * If it was a fixed table then we would setup a + * const array (example mlx5). Note the card tested + * can only support reasonably 4000 flows before + * the adapter has issues with sending so here + * we limit the number of flows using hardware + * pacing to that number, other cards may + * be able to raise or eliminate this limit. + */ + q->rate_table = NULL; + q->flags = RT_IS_SELECTABLE; + q->max_flows = CXGBE_MAX_FLOWS; + q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT; + q->min_segment_burst = 4; /* Driver emits 4 in a burst */ +} #endif Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c ============================================================================== --- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c Thu Aug 1 14:17:31 2019 (r350501) @@ -4070,7 +4070,49 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_sn } } +#define NUM_HDWR_RATES_MLX 13 +static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = { + 135375, /* 1,083,000 */ + 180500, /* 1,444,000 */ + 270750, /* 2,166,000 */ + 361000, /* 2,888,000 */ + 541500, /* 4,332,000 */ + 721875, /* 5,775,000 */ + 1082875, /* 8,663,000 */ + 1443875, /* 11,551,000 */ + 2165750, /* 17,326,000 */ + 2887750, /* 23,102,000 */ + 4331625, /* 34,653,000 */ + 5775500, /* 46,204,000 */ + 8663125 /* 69,305,000 */ +}; + static void +mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) +{ + /* + * This function needs updating by the driver maintainer! + * For the MLX card there are currently (ConectX-4?) 13 + * pre-set rates and others i.e. ConnectX-5, 6, 7?? + * + * This will change based on later adapters + * and this code should be updated to look at ifp + * and figure out the specific adapter type + * settings i.e. how many rates as well + * as if they are fixed (as is shown here) or + * if they are dynamic (example chelsio t4). Also if there + * is a maximum number of flows that the adapter + * can handle that too needs to be updated in + * the max_flows field. + */ + q->rate_table = adapter_rates_mlx; + q->flags = RT_IS_FIXED_TABLE; + q->max_flows = 0; /* mlx has no limit */ + q->number_of_rates = NUM_HDWR_RATES_MLX; + q->min_segment_burst = 1; +} + +static void mlx5e_snd_tag_free(struct m_snd_tag *pmt) { struct mlx5e_snd_tag *tag = @@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev) ifp->if_snd_tag_free = mlx5e_snd_tag_free; ifp->if_snd_tag_modify = mlx5e_snd_tag_modify; ifp->if_snd_tag_query = mlx5e_snd_tag_query; - +#ifdef RATELIMIT + ifp->if_ratelimit_query = mlx5e_ratelimit_query; +#endif /* set TSO limits so that we don't have to drop TX packets */ ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */; Modified: head/sys/net/if_dead.c ============================================================================== --- head/sys/net/if_dead.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/net/if_dead.c Thu Aug 1 14:17:31 2019 (r350501) @@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt) { } +static void +ifdead_ratelimit_query(struct ifnet *ifp __unused, + struct if_ratelimit_query_results *q) +{ + /* + * This guy does not support + * this interface. Not sure + * why we would specify a + * flag on the interface + * that says we do. 
+ */ + q->rate_table = NULL; + q->flags = RT_NOSUPPORT; + q->max_flows = 0; + q->number_of_rates = 0; +} + void if_dead(struct ifnet *ifp) { @@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp) ifp->if_snd_tag_modify = ifdead_snd_tag_modify; ifp->if_snd_tag_query = ifdead_snd_tag_query; ifp->if_snd_tag_free = ifdead_snd_tag_free; + ifp->if_ratelimit_query = ifdead_ratelimit_query; } Modified: head/sys/net/if_lagg.c ============================================================================== --- head/sys/net/if_lagg.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/net/if_lagg.c Thu Aug 1 14:17:31 2019 (r350501) @@ -144,6 +144,8 @@ static int lagg_snd_tag_modify(struct m_snd_tag *, static int lagg_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *); static void lagg_snd_tag_free(struct m_snd_tag *); +static void lagg_ratelimit_query(struct ifnet *, + struct if_ratelimit_query_results *); #endif static int lagg_setmulti(struct lagg_port *); static int lagg_clrmulti(struct lagg_port *); @@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, cadd ifp->if_snd_tag_modify = lagg_snd_tag_modify; ifp->if_snd_tag_query = lagg_snd_tag_query; ifp->if_snd_tag_free = lagg_snd_tag_free; + ifp->if_ratelimit_query = lagg_ratelimit_query; #endif ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS; @@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst) free(lst, M_LAGG); } +static void +lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q) +{ + /* + * For lagg, we have an indirect + * interface. The caller needs to + * get a ratelimit tag on the actual + * interface the flow will go on. + */ + q->rate_table = NULL; + q->flags = RT_IS_INDIRECT; + q->max_flows = 0; + q->number_of_rates = 0; +} #endif static int Modified: head/sys/net/if_var.h ============================================================================== --- head/sys/net/if_var.h Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/net/if_var.h Thu Aug 1 14:17:31 2019 (r350501) @@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header { struct if_snd_tag_alloc_rate_limit { struct if_snd_tag_alloc_header hdr; uint64_t max_rate; /* in bytes/s */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ + uint32_t reserved; /* alignment */ }; struct if_snd_tag_rate_limit_params { @@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params { uint32_t queue_level; /* 0 (empty) .. 65535 (full) */ #define IF_SND_QUEUE_LEVEL_MIN 0 #define IF_SND_QUEUE_LEVEL_MAX 65535 - uint32_t reserved; /* padding */ + uint32_t flags; /* M_NOWAIT or M_WAITOK */ }; union if_snd_tag_alloc_params { @@ -229,12 +231,38 @@ union if_snd_tag_query_params { struct if_snd_tag_rate_limit_params unlimited; }; +/* Query return flags */ +#define RT_NOSUPPORT 0x00000000 /* Not supported */ +#define RT_IS_INDIRECT 0x00000001 /* + * Interface like a lagg, select + * the actual interface for + * capabilities. + */ +#define RT_IS_SELECTABLE 0x00000002 /* + * No rate table, you select + * rates and the first + * number_of_rates are created. 
+ */ +#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */ +#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */ + +struct if_ratelimit_query_results { + const uint64_t *rate_table; /* Pointer to table if present */ + uint32_t flags; /* Flags indicating results */ + uint32_t max_flows; /* Max flows using, 0=unlimited */ + uint32_t number_of_rates; /* How many unique rates can be created */ + uint32_t min_segment_burst; /* The amount the adapter bursts at each send */ +}; + typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *, struct m_snd_tag **); typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *); typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *); typedef void (if_snd_tag_free_t)(struct m_snd_tag *); +typedef void (if_ratelimit_query_t)(struct ifnet *, + struct if_ratelimit_query_results *); + /* * Structure defining a network interface. */ @@ -374,6 +402,7 @@ struct ifnet { if_snd_tag_modify_t *if_snd_tag_modify; if_snd_tag_query_t *if_snd_tag_query; if_snd_tag_free_t *if_snd_tag_free; + if_ratelimit_query_t *if_ratelimit_query; /* Ethernet PCP */ uint8_t if_pcp; Modified: head/sys/netinet/in_pcb.c ============================================================================== --- head/sys/netinet/in_pcb.c Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/netinet/in_pcb.c Thu Aug 1 14:17:31 2019 (r350501) @@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim &VNET_NAME(ipport_randomtime), 0, "Minimum time to keep sequental port " "allocation before switching to a random one"); + +#ifdef RATELIMIT +counter_u64_t rate_limit_active; +counter_u64_t rate_limit_alloc_fail; +counter_u64_t rate_limit_set_ok; + +static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0, + "IP Rate Limiting"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, + &rate_limit_active, "Active rate limited connections"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, + &rate_limit_alloc_fail, "Rate limited connection failures"); +SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, + &rate_limit_set_ok, "Rate limited setting succeeded"); +#endif /* RATELIMIT */ + #endif /* INET */ /* @@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_p { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, + .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; struct ifnet *ifp; @@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_tx */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, - uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate) + uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) + { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 
@@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet * .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.max_rate = max_pacing_rate, + .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); - if (inp->inp_snd_tag != NULL) + if (*st != NULL) return (EINVAL); if (ifp->if_snd_tag_alloc == NULL) { error = EOPNOTSUPP; } else { error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); + + if (error == 0) { + counter_u64_add(rate_limit_set_ok, 1); + counter_u64_add(rate_limit_active, 1); + } else + counter_u64_add(rate_limit_alloc_fail, 1); } return (error); } +void +in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst) +{ + if (ifp == NULL) + return; + + /* + * If the device was detached while we still had reference(s) + * on the ifp, we assume if_snd_tag_free() was replaced with + * stubs. + */ + ifp->if_snd_tag_free(mst); + + /* release reference count on network interface */ + if_rele(ifp); + counter_u64_add(rate_limit_active, -1); +} + /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: @@ -3300,49 +3343,12 @@ in_pcbdetach_txrtlmt(struct inpcb *inp) m_snd_tag_rele(mst); } -/* - * This function should be called when the INP_RATE_LIMIT_CHANGED flag - * is set in the fast path and will attach/detach/modify the TX rate - * limit send tag based on the socket's so_max_pacing_rate value. - */ -void -in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) +int +in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { - struct socket *socket; - uint32_t max_pacing_rate; - bool did_upgrade; int error; - if (inp == NULL) - return; - - socket = inp->inp_socket; - if (socket == NULL) - return; - - if (!INP_WLOCKED(inp)) { - /* - * NOTE: If the write locking fails, we need to bail - * out and use the non-ratelimited ring for the - * transmit until there is a new chance to get the - * write lock. - */ - if (!INP_TRY_UPGRADE(inp)) - return; - did_upgrade = 1; - } else { - did_upgrade = 0; - } - /* - * NOTE: The so_max_pacing_rate value is read unlocked, - * because atomic updates are not required since the variable - * is checked at every mbuf we send. It is assumed that the - * variable read itself will be atomic. - */ - max_pacing_rate = socket->so_max_pacing_rate; - - /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new @@ -3376,13 +3382,61 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet * error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), - mb->m_pkthdr.flowid, max_pacing_rate); + mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + + return (error); +} + +/* + * This function should be called when the INP_RATE_LIMIT_CHANGED flag + * is set in the fast path and will attach/detach/modify the TX rate + * limit send tag based on the socket's so_max_pacing_rate value. 
+ */ +void +in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) +{ + struct socket *socket; + uint32_t max_pacing_rate; + bool did_upgrade; + int error; + + if (inp == NULL) + return; + + socket = inp->inp_socket; + if (socket == NULL) + return; + + if (!INP_WLOCKED(inp)) { + /* + * NOTE: If the write locking fails, we need to bail + * out and use the non-ratelimited ring for the + * transmit until there is a new chance to get the + * write lock. + */ + if (!INP_TRY_UPGRADE(inp)) + return; + did_upgrade = 1; + } else { + did_upgrade = 0; + } + + /* + * NOTE: The so_max_pacing_rate value is read unlocked, + * because atomic updates are not required since the variable + * is checked at every mbuf we send. It is assumed that the + * variable read itself will be atomic. + */ + max_pacing_rate = socket->so_max_pacing_rate; + + error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); + if (did_upgrade) INP_DOWNGRADE(inp); } @@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp) if (did_upgrade) INP_DOWNGRADE(inp); } + +static void +rl_init(void *st) +{ + rate_limit_active = counter_u64_alloc(M_WAITOK); + rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); + rate_limit_set_ok = counter_u64_alloc(M_WAITOK); +} + +SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif /* RATELIMIT */ Modified: head/sys/netinet/in_pcb.h ============================================================================== --- head/sys/netinet/in_pcb.h Thu Aug 1 14:13:04 2019 (r350500) +++ head/sys/netinet/in_pcb.h Thu Aug 1 14:17:31 2019 (r350501) @@ -883,8 +883,13 @@ struct sockaddr * in_sockaddr(in_port_t port, struct in_addr *addr); void in_pcbsosetlabel(struct socket *so); #ifdef RATELIMIT -int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t); +int +in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *, + struct mbuf *, uint32_t); +int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, + uint32_t, struct m_snd_tag **); void in_pcbdetach_txrtlmt(struct inpcb *); +void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst); int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t); int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *); int in_pcbquery_txrlevel(struct inpcb *, uint32_t *); Added: head/sys/netinet/tcp_ratelimit.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ head/sys/netinet/tcp_ratelimit.c Thu Aug 1 14:17:31 2019 (r350501) @@ -0,0 +1,1234 @@ +/*- + * + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2018-2019 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/** + * Author: Randall Stewart <rrs@netflix.com> + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#ifdef KERN_TLS +#include <sys/sockbuf_tls.h> +#endif +#include <sys/sysctl.h> +#include <sys/eventhandler.h> +#include <sys/mutex.h> +#include <sys/ck.h> +#define TCPSTATES /* for logging */ +#include <netinet/in.h> +#include <netinet/in_pcb.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif +#include <netinet/tcp_ratelimit.h> +#ifndef USECS_IN_SECOND +#define USECS_IN_SECOND 1000000 +#endif +/* + * For the purposes of each send, what is the size + * of an ethernet frame. + */ +#ifndef ETHERNET_SEGMENT_SIZE +#define ETHERNET_SEGMENT_SIZE 1500 +#endif +MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory"); +#ifdef RATELIMIT + +#define COMMON_RATE 180500 +uint64_t desired_rates[] = { + 62500, /* 500Kbps */ + 180500, /* 1.44Mpbs */ + 375000, /* 3Mbps */ + 500000, /* 4Mbps */ + 625000, /* 5Mbps */ + 750000, /* 6Mbps */ + 1000000, /* 8Mbps */ + 1250000, /* 10Mbps */ + 2500000, /* 20Mbps */ + 3750000, /* 30Mbps */ + 5000000, /* 40Meg */ + 6250000, /* 50Mbps */ + 12500000, /* 100Mbps */ + 25000000, /* 200Mbps */ + 50000000, /* 400Mbps */ + 100000000, /* 800Mbps */ + 12500, /* 100kbps */ + 25000, /* 200kbps */ + 875000, /* 7Mbps */ + 1125000, /* 9Mbps */ + 1875000, /* 15Mbps */ + 3125000, /* 25Mbps */ + 8125000, /* 65Mbps */ + 10000000, /* 80Mbps */ + 18750000, /* 150Mbps */ + 20000000, /* 250Mbps */ + 37500000, /* 350Mbps */ + 62500000, /* 500Mbps */ + 78125000, /* 625Mbps */ + 125000000, /* 1Gbps */ +}; +#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) +#define RS_ORDERED_COUNT 16 /* + * Number that are in order + * at the beginning of the table, + * over this a sort is required. + */ +#define RS_NEXT_ORDER_GROUP 16 /* + * The point in our table where + * we come fill in a second ordered + * group (index wise means -1). 
+ */ +#define ALL_HARDWARE_RATES 1004 /* + * 1Meg - 1Gig in 1 Meg steps + * plus 100, 200k and 500k and + * 10Gig + */ + +#define RS_ONE_MEGABIT_PERSEC 1000000 +#define RS_ONE_GIGABIT_PERSEC 1000000000 +#define RS_TEN_GIGABIT_PERSEC 10000000000 + +static struct head_tcp_rate_set int_rs; +static struct mtx rs_mtx; +uint32_t rs_number_alive; +uint32_t rs_number_dead; + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0, + "TCP Ratelimit stats"); +SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW, + &rs_number_alive, 0, + "Number of interfaces initialized for ratelimiting"); +SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW, + &rs_number_dead, 0, + "Number of interfaces departing from ratelimiting"); + +static void +rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs) +{ + /* + * Add sysctl entries for thus interface. + */ + if (rs->rs_flags & RS_INTF_NO_SUP) { + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "disable", CTLFLAG_RD, + &rs->rs_disable, 0, + "Disable this interface from new hdwr limiting?"); + } else { + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "disable", CTLFLAG_RW, + &rs->rs_disable, 0, + "Disable this interface from new hdwr limiting?"); + } + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "minseg", CTLFLAG_RW, + &rs->rs_min_seg, 0, + "What is the minimum we need to send on this interface?"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flow_limit", CTLFLAG_RW, + &rs->rs_flow_limit, 0, + "What is the limit for number of flows (0=unlimited)?"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "highest", CTLFLAG_RD, + &rs->rs_highest_valid, 0, + "Highest valid rate"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "lowest", CTLFLAG_RD, + &rs->rs_lowest_valid, 0, + "Lowest valid rate"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flags", CTLFLAG_RD, + &rs->rs_flags, 0, + "What lags are on the entry?"); + SYSCTL_ADD_S32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "numrates", CTLFLAG_RD, + &rs->rs_rate_cnt, 0, + "How many rates re there?"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, "flows_using", CTLFLAG_RD, + &rs->rs_flows_using, 0, + "How many flows are using this interface now?"); +#ifdef DETAILED_RATELIMIT_SYSCTL + if (rs->rs_rlt && rs->rs_rate_cnt > 0) { + /* Lets display the rates */ + int i; + struct sysctl_oid *rl_rates; + struct sysctl_oid *rl_rate_num; + char rate_num[16]; + rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_sysctl_root), + OID_AUTO, + "rate", + CTLFLAG_RW, 0, + "Ratelist"); + for( i = 0; i < rs->rs_rate_cnt; i++) { + sprintf(rate_num, "%d", i); + rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rates), + OID_AUTO, + rate_num, + CTLFLAG_RW, 0, + "Individual Rate"); + SYSCTL_ADD_U32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "flags", CTLFLAG_RD, + &rs->rs_rlt[i].flags, 0, + "Flags on this rate"); + SYSCTL_ADD_U32(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "pacetime", CTLFLAG_RD, + &rs->rs_rlt[i].time_between, 0, + "Time hardware inserts between 1500 byte sends"); + SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "rate", CTLFLAG_RD, + &rs->rs_rlt[i].rate, 0, + "Rate in bytes per second"); + } + } +#endif +} + +static 
void +rs_destroy(epoch_context_t ctx) +{ + struct tcp_rate_set *rs; + + rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx); + mtx_lock(&rs_mtx); + rs->rs_flags &= ~RS_FUNERAL_SCHD; + if (rs->rs_flows_using == 0) { + /* + * In theory its possible (but unlikely) + * that while the delete was occuring + * and we were applying the DEAD flag + * someone slipped in and found the + * interface in a lookup. While we + * decided rs_flows_using were 0 and + * scheduling the epoch_call, the other + * thread incremented rs_flow_using. This + * is because users have a pointer and + * we only use the rs_flows_using in an + * atomic fashion, i.e. the other entities + * are not protected. To assure this did + * not occur, we check rs_flows_using here + * before deleteing. + */ + sysctl_ctx_free(&rs->sysctl_ctx); + free(rs->rs_rlt, M_TCPPACE); + free(rs, M_TCPPACE); + rs_number_dead--; + } + mtx_unlock(&rs_mtx); + +} + +extern counter_u64_t rate_limit_set_ok; +extern counter_u64_t rate_limit_active; +extern counter_u64_t rate_limit_alloc_fail; + +static int +rl_attach_txrtlmt(struct ifnet *ifp, + uint32_t flowtype, + int flowid, + uint64_t cfg_rate, + struct m_snd_tag **tag) +{ + int error; + union if_snd_tag_alloc_params params = { + .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT, + .rate_limit.hdr.flowid = flowid, + .rate_limit.hdr.flowtype = flowtype, + .rate_limit.max_rate = cfg_rate, + .rate_limit.flags = M_NOWAIT, + }; + + if (ifp->if_snd_tag_alloc == NULL) { + error = EOPNOTSUPP; + } else { + error = ifp->if_snd_tag_alloc(ifp, ¶ms, tag); + if (error == 0) { + if_ref((*tag)->ifp); + counter_u64_add(rate_limit_set_ok, 1); + counter_u64_add(rate_limit_active, 1); + } else + counter_u64_add(rate_limit_alloc_fail, 1); + } + return (error); +} + +static void +populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act) +{ + /* + * The internal table is "special", it + * is two seperate ordered tables that + * must be merged. We get here when the + * adapter specifies a number of rates that + * covers both ranges in the table in some + * form. + */ + int i, at_low, at_high; + uint8_t low_disabled = 0, high_disabled = 0; + + for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) { + rs->rs_rlt[i].flags = 0; + rs->rs_rlt[i].time_between = 0; + if ((low_disabled == 0) && + (high_disabled || + (rate_table_act[at_low] < rate_table_act[at_high]))) { + rs->rs_rlt[i].rate = rate_table_act[at_low]; + at_low++; + if (at_low == RS_NEXT_ORDER_GROUP) + low_disabled = 1; + } else if (high_disabled == 0) { + rs->rs_rlt[i].rate = rate_table_act[at_high]; + at_high++; + if (at_high == MAX_HDWR_RATES) + high_disabled = 1; + } + } +} + +static struct tcp_rate_set * +rt_setup_new_rs(struct ifnet *ifp, int *error) +{ + struct tcp_rate_set *rs; + const uint64_t *rate_table_act; + uint64_t lentim, res; + size_t sz; + uint32_t hash_type; + int i; + struct if_ratelimit_query_results rl; + struct sysctl_oid *rl_sysctl_root; + /* + * We expect to enter with the + * mutex locked. + */ + + if (ifp->if_ratelimit_query == NULL) { + /* + * We can do nothing if we cannot + * get a query back from the driver. + */ + return (NULL); + } + rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO); + if (rs == NULL) { + if (error) + *error = ENOMEM; + return (NULL); + } + rl.flags = RT_NOSUPPORT; + ifp->if_ratelimit_query(ifp, &rl); + if (rl.flags & RT_IS_UNUSABLE) { + /* + * The interface does not really support + * the rate-limiting. 
+ */ + memset(rs, 0, sizeof(struct tcp_rate_set)); + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_flags = RS_INTF_NO_SUP; + rs->rs_disable = 1; + rs_number_alive++; + sysctl_ctx_init(&rs->sysctl_ctx); + rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), + OID_AUTO, + rs->rs_ifp->if_xname, + CTLFLAG_RW, 0, + ""); + CK_LIST_INSERT_HEAD(&int_rs, rs, next); + /* Unlock to allow the sysctl stuff to allocate */ + mtx_unlock(&rs_mtx); + rl_add_syctl_entries(rl_sysctl_root, rs); + /* re-lock for our caller */ + mtx_lock(&rs_mtx); + return (rs); + } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) { + memset(rs, 0, sizeof(struct tcp_rate_set)); + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_flags = RS_IS_DEFF; + rs_number_alive++; + sysctl_ctx_init(&rs->sysctl_ctx); + rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl), + OID_AUTO, + rs->rs_ifp->if_xname, + CTLFLAG_RW, 0, + ""); + CK_LIST_INSERT_HEAD(&int_rs, rs, next); + /* Unlock to allow the sysctl stuff to allocate */ + mtx_unlock(&rs_mtx); + rl_add_syctl_entries(rl_sysctl_root, rs); + /* re-lock for our caller */ + mtx_lock(&rs_mtx); + return (rs); + } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) { + /* Mellanox most likely */ + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_rate_cnt = rl.number_of_rates; + rs->rs_min_seg = rl.min_segment_burst; + rs->rs_highest_valid = 0; + rs->rs_flow_limit = rl.max_flows; + rs->rs_flags = RS_IS_INTF | RS_NO_PRE; + rs->rs_disable = 0; + rate_table_act = rl.rate_table; + } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) { + /* Chelsio */ + rs->rs_ifp = ifp; + rs->rs_if_dunit = ifp->if_dunit; + rs->rs_rate_cnt = rl.number_of_rates; + rs->rs_min_seg = rl.min_segment_burst; + rs->rs_disable = 0; + rs->rs_flow_limit = rl.max_flows; + rate_table_act = desired_rates; + if ((rs->rs_rate_cnt > MAX_HDWR_RATES) && + (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) { + /* + * Our desired table is not big + * enough, do what we can. + */ + rs->rs_rate_cnt = MAX_HDWR_RATES; + } + if (rs->rs_rate_cnt <= RS_ORDERED_COUNT) + rs->rs_flags = RS_IS_INTF; + else + rs->rs_flags = RS_IS_INTF | RS_INT_TBL; + if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) + rs->rs_rate_cnt = ALL_HARDWARE_RATES; + } else { + printf("Interface:%s unit:%d not one known to have rate-limits\n", + ifp->if_dname, + ifp->if_dunit); + free(rs, M_TCPPACE); + return (NULL); + } + sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt; + rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT); + if (rs->rs_rlt == NULL) { + if (error) + *error = ENOMEM; +bail: + free(rs, M_TCPPACE); + return (NULL); + } + if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) { + /* + * The interface supports all + * the rates we could possibly want. + */ + uint64_t rat; + + rs->rs_rlt[0].rate = 12500; /* 100k */ + rs->rs_rlt[1].rate = 25000; /* 200k */ + rs->rs_rlt[2].rate = 62500; /* 500k */ + /* Note 125000 == 1Megabit + * populate 1Meg - 1000meg. + */ + for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) { + rs->rs_rlt[i].rate = rat; + rat += 125000; + } + rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000; + } else if (rs->rs_flags & RS_INT_TBL) { + /* We populate this in a special way */ + populate_canned_table(rs, rate_table_act); + } else { + /* *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
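
To summarize the new API: a consumer asks the driver for its pacing capabilities through ifp->if_ratelimit_query and then branches on the returned flags, much as the (truncated) rt_setup_new_rs() above does. The following is a minimal, self-contained sketch, not the committed tcp_ratelimit.c code; the function name example_classify_pacing and the chosen return values are assumptions for illustration, while the flags and structure fields are the ones added to if_var.h in this commit.

/*
 * Consumer-side sketch (illustrative only, not the committed code):
 * query an ifnet's hardware pacing capability through the new hook
 * and classify the answer using the flags from if_var.h.
 */
#include <sys/types.h>
#include <sys/errno.h>
#include <net/if.h>
#include <net/if_var.h>

static int
example_classify_pacing(struct ifnet *ifp)
{
	struct if_ratelimit_query_results rl;

	if (ifp->if_ratelimit_query == NULL)
		return (EOPNOTSUPP);		/* driver predates the API */

	rl.flags = RT_NOSUPPORT;		/* safe default before the call */
	ifp->if_ratelimit_query(ifp, &rl);

	if (rl.flags & RT_IS_UNUSABLE)
		return (EOPNOTSUPP);		/* interface opted out (if_dead style) */
	if (rl.flags & RT_IS_INDIRECT)
		return (EAGAIN);		/* lagg style: query the member port instead */
	if (rl.flags & RT_IS_FIXED_TABLE) {
		/* rl.rate_table holds rl.number_of_rates preset rates (mlx5 style). */
		return (0);
	}
	if (rl.flags & RT_IS_SELECTABLE) {
		/*
		 * No table: up to rl.number_of_rates caller-chosen rates can be
		 * programmed (cxgbe style), limited to rl.max_flows paced flows.
		 */
		return (0);
	}
	return (EOPNOTSUPP);			/* RT_NOSUPPORT */
}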
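On the driver side, a NIC that exposes a small fixed table of hardware rates fills in the same structure, following the mlx5 hunk above. The sketch below is hypothetical: the eight rates, the table size and the burst value are invented for illustration, and only the structure fields and the RT_IS_FIXED_TABLE flag come from this commit.

/*
 * Driver-side sketch for a hypothetical NIC with a fixed table of
 * hardware pacing rates.  Rates are expressed in bytes per second,
 * matching the tables used by the drivers in this commit.
 */
#include <sys/types.h>
#include <net/if.h>
#include <net/if_var.h>

#define EXAMPLE_NUM_RATES	8
static const uint64_t example_rates[EXAMPLE_NUM_RATES] = {
	125000,		/* 1 Mbps */
	250000,		/* 2 Mbps */
	625000,		/* 5 Mbps */
	1250000,	/* 10 Mbps */
	2500000,	/* 20 Mbps */
	6250000,	/* 50 Mbps */
	12500000,	/* 100 Mbps */
	125000000	/* 1 Gbps */
};

static void
example_ratelimit_query(struct ifnet *ifp __unused,
    struct if_ratelimit_query_results *q)
{
	q->rate_table = example_rates;	/* fixed, pre-programmed rates */
	q->flags = RT_IS_FIXED_TABLE;
	q->max_flows = 0;		/* 0 = no limit on paced flows */
	q->number_of_rates = EXAMPLE_NUM_RATES;
	q->min_segment_burst = 1;	/* frames the hardware emits per burst */
}

/* Registered at attach time alongside the send-tag hooks: */
/*	ifp->if_ratelimit_query = example_ratelimit_query;	*/

A selectable-rate adapter (cxgbe) would instead leave rate_table NULL and set RT_IS_SELECTABLE together with a max_flows bound, as in the t4_sched.c hunk above.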