Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 18 Jan 2017 13:31:17 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r312379 - in head: lib/libc/sys sbin/ifconfig sys/conf sys/kern sys/modules/if_lagg sys/modules/if_vlan sys/net sys/netinet sys/netinet6 sys/sys
Message-ID:  <201701181331.v0IDVHWf048428@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Wed Jan 18 13:31:17 2017
New Revision: 312379
URL: https://svnweb.freebsd.org/changeset/base/312379

Log:
  Implement kernel support for hardware rate limited sockets.
  
  - Add RATELIMIT kernel configuration keyword which must be set to
  enable the new functionality.
  
  - Add support for hardware driven, Receive Side Scaling, RSS aware, rate
  limited sendqueues and expose the functionality through the already
  established SO_MAX_PACING_RATE setsockopt(). The API supports rates in
  the range from 1 byte/s to 4 Gbytes/s, which are suitable for regular TCP
  and UDP streams. The setsockopt(2) manual page has been updated.
  
  - Add rate limit function callback API to "struct ifnet" which supports
  the following operations: if_snd_tag_alloc(), if_snd_tag_modify(),
  if_snd_tag_query() and if_snd_tag_free().
  
  - Add support to ifconfig to view, set and clear the IFCAP_TXRTLMT
  flag, which tells if a network driver supports rate limiting or not.
  
  - This patch also adds support for rate limiting through VLAN and LAGG
  intermediate network devices.
  
  - How rate limiting works:
  
  1) The userspace application calls setsockopt() after accepting or
  making a new connection to set the rate which is then stored in the
  socket structure in the kernel. Later on when packets are transmitted
  a check is made in the transmit path for rate changes. A rate change
  implies a non-blocking ifp->if_snd_tag_alloc() call will be made to the
  destination network interface, which then sets up a custom sendqueue
  with the given rate limitation parameter. A "struct m_snd_tag" pointer is
  returned which serves as a "snd_tag" hint in the m_pkthdr for the
  subsequently transmitted mbufs.
  
  2) When the network driver sees that "m->m_pkthdr.snd_tag" is not
  NULL, it will move the packets into a designated rate limited sendqueue
  given by the snd_tag pointer. It is up to the individual drivers how the
  traffic in that sendqueue is rate limited.
  
  3) Route changes are detected by the NIC drivers in the ifp->if_transmit()
  routine when the ifnet pointer in the incoming snd_tag mismatches the
  one of the network interface. The network adapter frees the mbuf and
  returns EAGAIN, which causes ip_output() to release and clear the send
  tag. On the next ip_output() call, a new "snd_tag" allocation will be
  attempted.
  
  4) When the PCB is detached the custom sendqueue will be released by a
  non-blocking ifp->if_snd_tag_free() call to the currently bound network
  interface.
  
  Reviewed by:		wblock (manpages), adrian, gallatin, scottl (network)
  Differential Revision:	https://reviews.freebsd.org/D3687
  Sponsored by:		Mellanox Technologies
  MFC after:		3 months

Modified:
  head/lib/libc/sys/getsockopt.2
  head/sbin/ifconfig/ifconfig.8
  head/sbin/ifconfig/ifconfig.c
  head/sys/conf/NOTES
  head/sys/conf/config.mk
  head/sys/conf/kern.opts.mk
  head/sys/conf/options
  head/sys/kern/uipc_socket.c
  head/sys/modules/if_lagg/Makefile
  head/sys/modules/if_vlan/Makefile
  head/sys/net/ieee8023ad_lacp.c
  head/sys/net/ieee8023ad_lacp.h
  head/sys/net/if.h
  head/sys/net/if_dead.c
  head/sys/net/if_lagg.c
  head/sys/net/if_var.h
  head/sys/net/if_vlan.c
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_output.c
  head/sys/netinet6/ip6_output.c
  head/sys/sys/mbuf.h
  head/sys/sys/socket.h
  head/sys/sys/socketvar.h

Modified: head/lib/libc/sys/getsockopt.2
==============================================================================
--- head/lib/libc/sys/getsockopt.2	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/lib/libc/sys/getsockopt.2	Wed Jan 18 13:31:17 2017	(r312379)
@@ -28,7 +28,7 @@
 .\"     @(#)getsockopt.2	8.4 (Berkeley) 5/2/95
 .\" $FreeBSD$
 .\"
-.Dd April 5, 2013
+.Dd January 18, 2017
 .Dt GETSOCKOPT 2
 .Os
 .Sh NAME
@@ -188,6 +188,7 @@ The following options are recognized in
 .It Dv SO_LISTENINCQLEN Ta "get incomplete queue length of the socket (get only)"
 .It Dv SO_USER_COOKIE Ta "set the 'so_user_cookie' value for the socket (uint32_t, set only)"
 .It Dv SO_TS_CLOCK Ta "set specific format of timestamp returned by SO_TIMESTAMP"
+.It Dv SO_MAX_PACING_RATE Ta "set the maximum transmit rate in bytes per second for the socket"
 .El
 .Pp
 .Dv SO_DEBUG
@@ -515,6 +516,10 @@ returns the maximal number of queued con
 returns the number of unaccepted complete connections.
 .Dv SO_LISTENINCQLEN
 returns the number of unaccepted incomplete connections.
+.Pp
+.Dv SO_MAX_PACING_RATE
+instructs the socket and underlying network adapter layers to limit the
+transfer rate to the given unsigned 32-bit value in bytes per second.
 .Sh RETURN VALUES
 .Rv -std
 .Sh ERRORS

Modified: head/sbin/ifconfig/ifconfig.8
==============================================================================
--- head/sbin/ifconfig/ifconfig.8	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sbin/ifconfig/ifconfig.8	Wed Jan 18 13:31:17 2017	(r312379)
@@ -28,7 +28,7 @@
 .\"     From: @(#)ifconfig.8	8.3 (Berkeley) 1/5/94
 .\" $FreeBSD$
 .\"
-.Dd September 17, 2016
+.Dd January 18, 2017
 .Dt IFCONFIG 8
 .Os
 .Sh NAME
@@ -460,6 +460,8 @@ this directive is used to select between
 and 802.11g
 .Pq Cm 11g
 operating modes.
+.It Cm txrtlmt
+Set if the driver supports TX rate limiting.
 .It Cm inst Ar minst , Cm instance Ar minst
 Set the media instance to
 .Ar minst .

Modified: head/sbin/ifconfig/ifconfig.c
==============================================================================
--- head/sbin/ifconfig/ifconfig.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sbin/ifconfig/ifconfig.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -1145,7 +1145,7 @@ unsetifdescr(const char *val, int value,
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
 
 /*
  * Print the status of the interface.  If an address family was
@@ -1453,6 +1453,8 @@ static struct cmd basic_cmds[] = {
 	DEF_CMD("-wol_mcast",	-IFCAP_WOL_MCAST,	setifcap),
 	DEF_CMD("wol_magic",	IFCAP_WOL_MAGIC,	setifcap),
 	DEF_CMD("-wol_magic",	-IFCAP_WOL_MAGIC,	setifcap),
+	DEF_CMD("txrtlmt",	IFCAP_TXRTLMT,	setifcap),
+	DEF_CMD("-txrtlmt",	-IFCAP_TXRTLMT,	setifcap),
 	DEF_CMD("normal",	-IFF_LINK0,	setifflags),
 	DEF_CMD("compress",	IFF_LINK0,	setifflags),
 	DEF_CMD("noicmp",	IFF_LINK1,	setifflags),

Modified: head/sys/conf/NOTES
==============================================================================
--- head/sys/conf/NOTES	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/conf/NOTES	Wed Jan 18 13:31:17 2017	(r312379)
@@ -619,6 +619,8 @@ options 	HWPMC_HOOKS		# Other necessary 
 options 	INET			#Internet communications protocols
 options 	INET6			#IPv6 communications protocols
 
+options		RATELIMIT		# TX rate limiting support
+
 options 	ROUTETABLES=2		# allocated fibs up to 65536. default is 1.
 					# but that would be a bad idea as they are large.
 

Modified: head/sys/conf/config.mk
==============================================================================
--- head/sys/conf/config.mk	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/conf/config.mk	Wed Jan 18 13:31:17 2017	(r312379)
@@ -19,6 +19,10 @@ opt_inet.h:
 opt_inet6.h:
 	@echo "#define INET6 1" > ${.TARGET}
 .endif
+.if ${MK_RATELIMIT} != "no"
+opt_ratelimit.h:
+	@echo "#define RATELIMIT 1" > ${.TARGET}
+.endif
 .if ${MK_EISA} != "no"
 opt_eisa.h:
 	@echo "#define DEV_EISA 1" > ${.TARGET}

Modified: head/sys/conf/kern.opts.mk
==============================================================================
--- head/sys/conf/kern.opts.mk	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/conf/kern.opts.mk	Wed Jan 18 13:31:17 2017	(r312379)
@@ -48,6 +48,7 @@ __DEFAULT_NO_OPTIONS = \
     EXTRA_TCP_STACKS \
     NAND \
     OFED \
+    RATELIMIT \
     REPRODUCIBLE_BUILD
 
 # Some options are totally broken on some architectures. We disable

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/conf/options	Wed Jan 18 13:31:17 2017	(r312379)
@@ -412,6 +412,7 @@ BOOTP_NFSV3		opt_bootp.h
 BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DUMMYNET		opt_ipdn.h
+RATELIMIT		opt_ratelimit.h
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/kern/uipc_socket.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -2699,6 +2699,14 @@ sosetopt(struct socket *so, struct socko
 			so->so_ts_clock = optval;
 			break;
 
+		case SO_MAX_PACING_RATE:
+			error = sooptcopyin(sopt, &val32, sizeof(val32),
+			    sizeof(val32));
+			if (error)
+				goto bad;
+			so->so_max_pacing_rate = val32;
+			break;
+
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
@@ -2890,6 +2898,10 @@ integer:
 			optval = so->so_ts_clock;
 			goto integer;
 
+		case SO_MAX_PACING_RATE:
+			optval = so->so_max_pacing_rate;
+			goto integer;
+
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,

Modified: head/sys/modules/if_lagg/Makefile
==============================================================================
--- head/sys/modules/if_lagg/Makefile	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/modules/if_lagg/Makefile	Wed Jan 18 13:31:17 2017	(r312379)
@@ -2,6 +2,6 @@
 
 .PATH:	${.CURDIR}/../../net
 KMOD=	if_lagg
-SRCS=	if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h
+SRCS=	if_lagg.c ieee8023ad_lacp.c opt_inet.h opt_inet6.h opt_ratelimit.h
 
 .include <bsd.kmod.mk>

Modified: head/sys/modules/if_vlan/Makefile
==============================================================================
--- head/sys/modules/if_vlan/Makefile	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/modules/if_vlan/Makefile	Wed Jan 18 13:31:17 2017	(r312379)
@@ -4,6 +4,6 @@
 
 KMOD=	if_vlan
 SRCS=	if_vlan.c
-SRCS+=	opt_inet.h opt_vlan.h
+SRCS+=	opt_inet.h opt_vlan.h opt_ratelimit.h
 
 .include <bsd.kmod.mk>

Modified: head/sys/net/ieee8023ad_lacp.c
==============================================================================
--- head/sys/net/ieee8023ad_lacp.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/ieee8023ad_lacp.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -30,6 +30,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ratelimit.h"
+
 #include <sys/param.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
@@ -853,6 +855,35 @@ lacp_select_tx_port(struct lagg_softc *s
 
 	return (lp->lp_lagg);
 }
+
+#ifdef RATELIMIT
+struct lagg_port *
+lacp_select_tx_port_by_hash(struct lagg_softc *sc, uint32_t flowid)
+{
+	struct lacp_softc *lsc = LACP_SOFTC(sc);
+	struct lacp_portmap *pm;
+	struct lacp_port *lp;
+	uint32_t hash;
+
+	if (__predict_false(lsc->lsc_suppress_distributing)) {
+		LACP_DPRINTF((NULL, "%s: waiting transit\n", __func__));
+		return (NULL);
+	}
+
+	pm = &lsc->lsc_pmap[lsc->lsc_activemap];
+	if (pm->pm_count == 0) {
+		LACP_DPRINTF((NULL, "%s: no active aggregator\n", __func__));
+		return (NULL);
+	}
+
+	hash = flowid >> sc->flowid_shift;
+	hash %= pm->pm_count;
+	lp = pm->pm_map[hash];
+
+	return (lp->lp_lagg);
+}
+#endif
+
 /*
  * lacp_suppress_distributing: drop transmit packets for a while
  * to preserve packet ordering.

Modified: head/sys/net/ieee8023ad_lacp.h
==============================================================================
--- head/sys/net/ieee8023ad_lacp.h	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/ieee8023ad_lacp.h	Wed Jan 18 13:31:17 2017	(r312379)
@@ -284,6 +284,9 @@ struct lacp_softc {
 
 struct mbuf	*lacp_input(struct lagg_port *, struct mbuf *);
 struct lagg_port *lacp_select_tx_port(struct lagg_softc *, struct mbuf *);
+#ifdef RATELIMIT
+struct lagg_port *lacp_select_tx_port_by_hash(struct lagg_softc *, uint32_t);
+#endif
 void		lacp_attach(struct lagg_softc *);
 void		lacp_detach(void *);
 void		lacp_init(struct lagg_softc *);

Modified: head/sys/net/if.h
==============================================================================
--- head/sys/net/if.h	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/if.h	Wed Jan 18 13:31:17 2017	(r312379)
@@ -239,6 +239,7 @@ struct if_data {
 #define	IFCAP_RXCSUM_IPV6	0x200000  /* can offload checksum on IPv6 RX */
 #define	IFCAP_TXCSUM_IPV6	0x400000  /* can offload checksum on IPv6 TX */
 #define	IFCAP_HWSTATS		0x800000 /* manages counters internally */
+#define	IFCAP_TXRTLMT		0x1000000 /* hardware supports TX rate limiting */
 
 #define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 

Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/if_dead.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -100,6 +100,30 @@ ifdead_get_counter(struct ifnet *ifp, if
 	return (0);
 }
 
+static int
+ifdead_snd_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+ifdead_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
+{
+	return (EOPNOTSUPP);
+}
+
+static void
+ifdead_snd_tag_free(struct m_snd_tag *pmt)
+{
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -112,4 +136,8 @@ if_dead(struct ifnet *ifp)
 	ifp->if_qflush = ifdead_qflush;
 	ifp->if_transmit = ifdead_transmit;
 	ifp->if_get_counter = ifdead_get_counter;
+	ifp->if_snd_tag_alloc = ifdead_snd_tag_alloc;
+	ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
+	ifp->if_snd_tag_query = ifdead_snd_tag_query;
+	ifp->if_snd_tag_free = ifdead_snd_tag_free;
 }

Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/if_lagg.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -23,6 +23,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -118,6 +119,11 @@ static void	lagg_port2req(struct lagg_po
 static void	lagg_init(void *);
 static void	lagg_stop(struct lagg_softc *);
 static int	lagg_ioctl(struct ifnet *, u_long, caddr_t);
+#ifdef RATELIMIT
+static int	lagg_snd_tag_alloc(struct ifnet *,
+		    union if_snd_tag_alloc_params *,
+		    struct m_snd_tag **);
+#endif
 static int	lagg_ether_setmulti(struct lagg_softc *);
 static int	lagg_ether_cmdmulti(struct lagg_port *, int);
 static	int	lagg_setflag(struct lagg_port *, int, int,
@@ -503,7 +509,12 @@ lagg_clone_create(struct if_clone *ifc, 
 	ifp->if_ioctl = lagg_ioctl;
 	ifp->if_get_counter = lagg_get_counter;
 	ifp->if_flags = IFF_SIMPLEX | IFF_BROADCAST | IFF_MULTICAST;
+#ifdef RATELIMIT
+	ifp->if_snd_tag_alloc = lagg_snd_tag_alloc;
+	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS | IFCAP_TXRTLMT;
+#else
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
+#endif
 
 	/*
 	 * Attach as an ordinary ethernet device, children will be attached
@@ -1549,6 +1560,52 @@ lagg_ioctl(struct ifnet *ifp, u_long cmd
 	return (error);
 }
 
+#ifdef RATELIMIT
+static int
+lagg_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+	struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc;
+	struct lagg_port *lp;
+	struct lagg_lb *lb;
+	uint32_t p;
+
+	switch (sc->sc_proto) {
+	case LAGG_PROTO_FAILOVER:
+		lp = lagg_link_active(sc, sc->sc_primary);
+		break;
+	case LAGG_PROTO_LOADBALANCE:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		p = params->hdr.flowid >> sc->flowid_shift;
+		p %= sc->sc_count;
+		lb = (struct lagg_lb *)sc->sc_psc;
+		lp = lb->lb_ports[p];
+		lp = lagg_link_active(sc, lp);
+		break;
+	case LAGG_PROTO_LACP:
+		if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) == 0 ||
+		    params->hdr.flowtype == M_HASHTYPE_NONE)
+			return (EOPNOTSUPP);
+		lp = lacp_select_tx_port_by_hash(sc, params->hdr.flowid);
+		break;
+	default:
+		return (EOPNOTSUPP);
+	}
+	if (lp == NULL)
+		return (EOPNOTSUPP);
+	ifp = lp->lp_ifp;
+	if (ifp == NULL || ifp->if_snd_tag_alloc == NULL ||
+	    (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif
+
 static int
 lagg_ether_setmulti(struct lagg_softc *sc)
 {

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/if_var.h	Wed Jan 18 13:31:17 2017	(r312379)
@@ -175,6 +175,49 @@ struct if_encap_req {
 
 #define	IFENCAP_FLAG_BROADCAST	0x02	/* Destination is broadcast */
 
+/*
+ * Network interface send tag support. The storage of "struct
+ * m_snd_tag" comes from the network driver and it is free to allocate
+ * as much additional space as it wants for its own use.
+ */
+struct m_snd_tag;
+
+#define	IF_SND_TAG_TYPE_RATE_LIMIT 0
+#define	IF_SND_TAG_TYPE_MAX 1
+
+struct if_snd_tag_alloc_header {
+	uint32_t type;		/* send tag type, see IF_SND_TAG_XXX */
+	uint32_t flowid;	/* mbuf hash value */
+	uint32_t flowtype;	/* mbuf hash type */
+};
+
+struct if_snd_tag_alloc_rate_limit {
+	struct if_snd_tag_alloc_header hdr;
+	uint64_t max_rate;	/* in bytes/s */
+};
+
+struct if_snd_tag_rate_limit_params {
+	uint64_t max_rate;	/* in bytes/s */
+};
+
+union if_snd_tag_alloc_params {
+	struct if_snd_tag_alloc_header hdr;
+	struct if_snd_tag_alloc_rate_limit rate_limit;
+};
+
+union if_snd_tag_modify_params {
+	struct if_snd_tag_rate_limit_params rate_limit;
+};
+
+union if_snd_tag_query_params {
+	struct if_snd_tag_rate_limit_params rate_limit;
+};
+
+typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
+    struct m_snd_tag **);
+typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
+typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
+typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
 
 /*
  * Structure defining a network interface.
@@ -304,12 +347,19 @@ struct ifnet {
 	u_int	if_hw_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 
 	/*
+	 * Network adapter send tag support:
+	 */
+	if_snd_tag_alloc_t *if_snd_tag_alloc;
+	if_snd_tag_modify_t *if_snd_tag_modify;
+	if_snd_tag_query_t *if_snd_tag_query;
+	if_snd_tag_free_t *if_snd_tag_free;
+
+	/*
 	 * Spare fields to be added before branching a stable branch, so
 	 * that structure can be enhanced without changing the kernel
 	 * binary interface.
 	 */
-	void	*if_pspare[4];		/* packet pacing / general use */
-	int	if_ispare[4];		/* packet pacing / general use */
+	int	if_ispare[4];		/* general use */
 };
 
 /* for compatibility with other BSDs */

Modified: head/sys/net/if_vlan.c
==============================================================================
--- head/sys/net/if_vlan.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/net/if_vlan.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_vlan.h"
+#include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
@@ -212,6 +213,10 @@ static	void trunk_destroy(struct ifvlant
 static	void vlan_init(void *foo);
 static	void vlan_input(struct ifnet *ifp, struct mbuf *m);
 static	int vlan_ioctl(struct ifnet *ifp, u_long cmd, caddr_t addr);
+#ifdef RATELIMIT
+static	int vlan_snd_tag_alloc(struct ifnet *,
+    union if_snd_tag_alloc_params *, struct m_snd_tag **);
+#endif
 static	void vlan_qflush(struct ifnet *ifp);
 static	int vlan_setflag(struct ifnet *ifp, int flag, int status,
     int (*func)(struct ifnet *, int));
@@ -971,6 +976,9 @@ vlan_clone_create(struct if_clone *ifc, 
 	ifp->if_transmit = vlan_transmit;
 	ifp->if_qflush = vlan_qflush;
 	ifp->if_ioctl = vlan_ioctl;
+#ifdef RATELIMIT
+	ifp->if_snd_tag_alloc = vlan_snd_tag_alloc;
+#endif
 	ifp->if_flags = VLAN_IFFLAGS;
 	ether_ifattach(ifp, eaddr);
 	/* Now undo some of the damage... */
@@ -1591,6 +1599,15 @@ vlan_capabilities(struct ifvlan *ifv)
 		TOEDEV(ifp) = TOEDEV(p);
 		ifp->if_capenable |= p->if_capenable & IFCAP_TOE;
 	}
+
+#ifdef RATELIMIT
+	/*
+	 * If the parent interface supports ratelimiting, so does the
+	 * VLAN interface.
+	 */
+	ifp->if_capabilities |= (p->if_capabilities & IFCAP_TXRTLMT);
+	ifp->if_capenable |= (p->if_capenable & IFCAP_TXRTLMT);
+#endif
 }
 
 static void
@@ -1801,3 +1818,19 @@ vlan_ioctl(struct ifnet *ifp, u_long cmd
 
 	return (error);
 }
+
+#ifdef RATELIMIT
+static int
+vlan_snd_tag_alloc(struct ifnet *ifp,
+    union if_snd_tag_alloc_params *params,
+    struct m_snd_tag **ppmt)
+{
+
+	/* get trunk device */
+	ifp = vlan_trunkdev(ifp);
+	if (ifp == NULL || (ifp->if_capenable & IFCAP_TXRTLMT) == 0)
+		return (EOPNOTSUPP);
+	/* forward allocation request */
+	return (ifp->if_snd_tag_alloc(ifp, params, ppmt));
+}
+#endif

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/netinet/in_pcb.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 #include "opt_pcbgroup.h"
 #include "opt_rss.h"
 
@@ -57,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/sockio.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
@@ -1140,6 +1142,10 @@ in_pcbdetach(struct inpcb *inp)
 
 	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 
+#ifdef RATELIMIT
+	if (inp->inp_snd_tag != NULL)
+		in_pcbdetach_txrtlmt(inp);
+#endif
 	inp->inp_socket->so_pcb = NULL;
 	inp->inp_socket = NULL;
 }
@@ -2677,3 +2683,253 @@ DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 	db_print_inpcb(inp, "inpcb", 0);
 }
 #endif /* DDB */
+
+#ifdef RATELIMIT
+/*
+ * Modify TX rate limit based on the existing "inp->inp_snd_tag",
+ * if any.
+ */
+int
+in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_modify_params params = {
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_modify == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_modify(mst, &params);
+	}
+	return (error);
+}
+
+/*
+ * Query existing TX rate limit based on the existing
+ * "inp->inp_snd_tag", if any.
+ */
+int
+in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
+{
+	union if_snd_tag_query_params params = { };
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+	int error;
+
+	mst = inp->inp_snd_tag;
+	if (mst == NULL)
+		return (EINVAL);
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_query == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_query(mst, &params);
+		if (error == 0 &&  p_max_pacing_rate != NULL)
+			*p_max_pacing_rate = params.rate_limit.max_rate;
+	}
+	return (error);
+}
+
+/*
+ * Allocate a new TX rate limit send tag from the network interface
+ * given by the "ifp" argument and save it in "inp->inp_snd_tag":
+ */
+int
+in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+{
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = max_pacing_rate,
+	};
+	int error;
+
+	INP_WLOCK_ASSERT(inp);
+
+	if (inp->inp_snd_tag != NULL)
+		return (EINVAL);
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+		/*
+		 * At success increment the refcount on
+		 * the send tag's network interface:
+		 */
+		if (error == 0)
+			if_ref(inp->inp_snd_tag->ifp);
+	}
+	return (error);
+}
+
+/*
+ * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
+ * if any:
+ */
+void
+in_pcbdetach_txrtlmt(struct inpcb *inp)
+{
+	struct m_snd_tag *mst;
+	struct ifnet *ifp;
+
+	INP_WLOCK_ASSERT(inp);
+
+	mst = inp->inp_snd_tag;
+	inp->inp_snd_tag = NULL;
+
+	if (mst == NULL)
+		return;
+
+	ifp = mst->ifp;
+	if (ifp == NULL)
+		return;
+
+	/*
+	 * If the device was detached while we still had reference(s)
+	 * on the ifp, we assume if_snd_tag_free() was replaced with
+	 * stubs.
+	 */
+	ifp->if_snd_tag_free(mst);
+
+	/* release reference count on network interface */
+	if_rele(ifp);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+	struct socket *socket;
+	uint32_t max_pacing_rate;
+	bool did_upgrade;
+	int error;
+
+	if (inp == NULL)
+		return;
+
+	socket = inp->inp_socket;
+	if (socket == NULL)
+		return;
+
+	if (!INP_WLOCKED(inp)) {
+		/*
+		 * NOTE: If the write locking fails, we need to bail
+		 * out and use the non-ratelimited ring for the
+		 * transmit until there is a new chance to get the
+		 * write lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp))
+			return;
+		did_upgrade = 1;
+	} else {
+		did_upgrade = 0;
+	}
+
+	/*
+	 * NOTE: The so_max_pacing_rate value is read unlocked,
+	 * because atomic updates are not required since the variable
+	 * is checked at every mbuf we send. It is assumed that the
+	 * variable read itself will be atomic.
+	 */
+	max_pacing_rate = socket->so_max_pacing_rate;
+
+	/*
+	 * NOTE: When attaching to a network interface a reference is
+	 * made to ensure the network interface doesn't go away until
+	 * all ratelimit connections are gone. The network interface
+	 * pointers compared below represent valid network interfaces,
+	 * except when comparing towards NULL.
+	 */
+	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
+		error = 0;
+	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
+		if (inp->inp_snd_tag != NULL)
+			in_pcbdetach_txrtlmt(inp);
+		error = 0;
+	} else if (inp->inp_snd_tag == NULL) {
+		/*
+		 * In order to utilize packet pacing with RSS, we need
+		 * to wait until there is a valid RSS hash before we
+		 * can proceed:
+		 */
+		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
+			error = EAGAIN;
+		} else {
+			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
+			    mb->m_pkthdr.flowid, max_pacing_rate);
+		}
+	} else {
+		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
+	}
+	if (error == 0 || error == EOPNOTSUPP)
+		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+	if (did_upgrade)
+		INP_DOWNGRADE(inp);
+}
+
+/*
+ * Track route changes for TX rate limiting.
+ */
+void
+in_pcboutput_eagain(struct inpcb *inp)
+{
+	struct socket *socket;
+	bool did_upgrade;
+
+	if (inp == NULL)
+		return;
+
+	socket = inp->inp_socket;
+	if (socket == NULL)
+		return;
+
+	if (inp->inp_snd_tag == NULL)
+		return;
+
+	if (!INP_WLOCKED(inp)) {
+		/*
+		 * NOTE: If the write locking fails, we need to bail
+		 * out and use the non-ratelimited ring for the
+		 * transmit until there is a new chance to get the
+		 * write lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp))
+			return;
+		did_upgrade = 1;
+	} else {
+		did_upgrade = 0;
+	}
+
+	/* detach rate limiting */
+	in_pcbdetach_txrtlmt(inp);
+
+	/* make sure new mbuf send tag allocation is made */
+	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+
+	if (did_upgrade)
+		INP_DOWNGRADE(inp);
+}
+#endif /* RATELIMIT */

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/netinet/in_pcb.h	Wed Jan 18 13:31:17 2017	(r312379)
@@ -181,6 +181,7 @@ struct	icmp6_filter;
  * read-lock usage during modification, this model can be applied to other
  * protocols (especially SCTP).
  */
+struct m_snd_tag;
 struct inpcb {
 	LIST_ENTRY(inpcb) inp_hash;	/* (h/i) hash list */
 	LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
@@ -202,11 +203,11 @@ struct inpcb {
 	u_char	inp_ip_minttl;		/* (i) minimum TTL or drop */
 	uint32_t inp_flowid;		/* (x) flow id / queue id */
 	u_int	inp_refcount;		/* (i) refcount */
-	void	*inp_pspare[5];		/* (x) packet pacing / general use */
+	struct m_snd_tag *inp_snd_tag;	/* (i) send tag for outgoing mbufs */
+	void	*inp_pspare[4];		/* (x) general use */
 	uint32_t inp_flowtype;		/* (x) M_HASHTYPE value */
 	uint32_t inp_rss_listen_bucket;	/* (x) overridden RSS listen bucket */
-	u_int	inp_ispare[4];		/* (x) packet pacing / user cookie /
-					 *     general use */
+	u_int	inp_ispare[4];		/* (x) user cookie / general use */
 
 	/* Local and foreign ports, local and foreign addr. */
 	struct	in_conninfo inp_inc;	/* (i) list for PCB's local port */
@@ -616,6 +617,7 @@ short	inp_so_options(const struct inpcb 
 #define	INP_RSS_BUCKET_SET	0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
 #define	INP_RECVFLOWID		0x00000100 /* populate recv datagram with flow info */
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
+#define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -736,6 +738,14 @@ int	in_getsockaddr(struct socket *so, st
 struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
+#ifdef RATELIMIT
+int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+void	in_pcbdetach_txrtlmt(struct inpcb *);
+int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
+int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
+void	in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
+void	in_pcboutput_eagain(struct inpcb *);
+#endif
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IN_PCB_H_ */

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/netinet/ip_output.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -33,6 +33,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
+#include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_mpath.h"
@@ -661,8 +662,23 @@ sendit:
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
+#ifdef RATELIMIT
+		if (inp != NULL) {
+			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+				in_pcboutput_txrtlmt(inp, ifp, m);
+			/* stamp send tag on mbuf */
+			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+		} else {
+			m->m_pkthdr.snd_tag = NULL;
+		}
+#endif
 		error = (*ifp->if_output)(ifp, m,
 		    (const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+		/* check for route change */
+		if (error == EAGAIN)
+			in_pcboutput_eagain(inp);
+#endif
 		goto done;
 	}
 
@@ -698,8 +714,23 @@ sendit:
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
+#ifdef RATELIMIT
+			if (inp != NULL) {
+				if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+					in_pcboutput_txrtlmt(inp, ifp, m);
+				/* stamp send tag on mbuf */
+				m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+			} else {
+				m->m_pkthdr.snd_tag = NULL;
+			}
+#endif
 			error = (*ifp->if_output)(ifp, m,
 			    (const struct sockaddr *)gw, ro);
+#ifdef RATELIMIT
+			/* check for route change */
+			if (error == EAGAIN)
+				in_pcboutput_eagain(inp);
+#endif
 		} else
 			m_freem(m);
 	}
@@ -974,6 +1005,16 @@ ip_ctloutput(struct socket *so, struct s
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
+			case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+				INP_WLOCK(inp);
+				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+				INP_WUNLOCK(inp);
+				error = 0;
+#else
+				error = EOPNOTSUPP;
+#endif
+				break;
 			default:
 				break;
 			}

Modified: head/sys/netinet6/ip6_output.c
==============================================================================
--- head/sys/netinet6/ip6_output.c	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/netinet6/ip6_output.c	Wed Jan 18 13:31:17 2017	(r312379)
@@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_ratelimit.h"
 #include "opt_ipsec.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
@@ -954,8 +955,23 @@ passout:
 			    m->m_pkthdr.len);
 			ifa_free(&ia6->ia_ifa);
 		}
+#ifdef RATELIMIT
+		if (inp != NULL) {
+			if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+				in_pcboutput_txrtlmt(inp, ifp, m);
+			/* stamp send tag on mbuf */
+			m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+		} else {
+			m->m_pkthdr.snd_tag = NULL;
+		}
+#endif
 		error = nd6_output_ifp(ifp, origifp, m, dst,
 		    (struct route *)ro);
+#ifdef RATELIMIT
+		/* check for route change */
+		if (error == EAGAIN)
+			in_pcboutput_eagain(inp);
+#endif
 		goto done;
 	}
 
@@ -1054,8 +1070,23 @@ sendorfree:
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
+#ifdef RATELIMIT
+			if (inp != NULL) {
+				if (inp->inp_flags2 & INP_RATE_LIMIT_CHANGED)
+					in_pcboutput_txrtlmt(inp, ifp, m);
+				/* stamp send tag on mbuf */
+				m->m_pkthdr.snd_tag = inp->inp_snd_tag;
+			} else {
+				m->m_pkthdr.snd_tag = NULL;
+			}
+#endif
 			error = nd6_output_ifp(ifp, origifp, m, dst,
 			    (struct route *)ro);
+#ifdef RATELIMIT
+			/* check for route change */
+			if (error == EAGAIN)
+				in_pcboutput_eagain(inp);
+#endif
 		} else
 			m_freem(m);
 	}
@@ -1441,6 +1472,16 @@ ip6_ctloutput(struct socket *so, struct 
 				INP_WUNLOCK(in6p);
 				error = 0;
 				break;
+			case SO_MAX_PACING_RATE:
+#ifdef RATELIMIT
+				INP_WLOCK(in6p);
+				in6p->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
+				INP_WUNLOCK(in6p);
+				error = 0;
+#else
+				error = EOPNOTSUPP;
+#endif
+				break;
 			default:
 				break;
 			}

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h	Wed Jan 18 13:27:24 2017	(r312378)
+++ head/sys/sys/mbuf.h	Wed Jan 18 13:31:17 2017	(r312379)
@@ -130,6 +130,14 @@ struct m_tag {
 };
 
 /*
+ * Static network interface owned tag.
+ * Allocated through ifp->if_snd_tag_alloc().
+ */
+struct m_snd_tag {
+	struct ifnet *ifp;		/* network interface tag belongs to */
+};
+
+/*
  * Record/packet header in first mbuf of chain; valid only if M_PKTHDR is set.
  * Size ILP32: 48
  *	 LP64: 56

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201701181331.v0IDVHWf048428>