FreeBSD Mail Archives

Date:      Tue, 24 Sep 2019 18:18:11 +0000 (UTC)
From:      Randall Stewart <rrs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r352657 - in head/sys: conf kern modules/tcp modules/tcp/bbr netinet netinet/tcp_stacks sys
Message-ID:  <201909241818.x8OIIBNr039667@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help

Author: rrs
Date: Tue Sep 24 18:18:11 2019
New Revision: 352657
URL: https://svnweb.freebsd.org/changeset/base/352657

Log:
  This commit adds BBR (Bottleneck Bandwidth and RTT) congestion control. This
  is a completely separate TCP stack (tcp_bbr.ko) that will be built only if
  you add the make option WITH_EXTRA_TCP_STACKS=1 and also include the option
  TCPHPTS. You can also include the RATELIMIT option if you have a NIC that
  supports hardware pacing; BBR understands how to use such a feature.
  
  Note that this commit also adds a general-purpose time filter, which
  allows you to have a min-filter or max-filter. A filter allows you to
  have a low (or high) value for some period of time and degrade slowly
  to another value as time passes. You can find out the details of
  BBR by looking at the original paper at:
  
  https://queue.acm.org/detail.cfm?id=3022184
  
  or consult the many other resources you can find on the web by
  searching for "BBR congestion control". It should be noted that
  BBRv1 (which this is) does tend toward unfairness on paths with small
  buffers, and it will usually get less bandwidth on large-BDP paths
  (when competing with New Reno or CUBIC flows). BBR is still an
  active research area, and we do plan on implementing V2 of BBR to
  see if it is an improvement over V1.
  
  Sponsored by:	Netflix Inc.
  Differential Revision:	https://reviews.freebsd.org/D21582

Added:
  head/sys/kern/subr_filter.c   (contents, props changed)
  head/sys/modules/tcp/bbr/
  head/sys/modules/tcp/bbr/Makefile   (contents, props changed)
  head/sys/netinet/tcp_stacks/bbr.c   (contents, props changed)
  head/sys/netinet/tcp_stacks/tcp_bbr.h   (contents, props changed)
  head/sys/sys/tim_filter.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/modules/tcp/Makefile
  head/sys/netinet/ip_output.c
  head/sys/netinet/ip_var.h
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_stacks/rack.c
  head/sys/netinet/tcp_stacks/rack_bbr_common.c
  head/sys/netinet/tcp_stacks/rack_bbr_common.h
  head/sys/netinet/tcp_stacks/sack_filter.c
  head/sys/netinet/tcp_stacks/sack_filter.h
  head/sys/netinet/tcp_stacks/tcp_rack.h
  head/sys/sys/mbuf.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Tue Sep 24 17:06:32 2019	(r352656)
+++ head/sys/conf/files	Tue Sep 24 18:18:11 2019	(r352657)
@@ -3808,6 +3808,7 @@ kern/subr_epoch.c		standard
 kern/subr_eventhandler.c	standard
 kern/subr_fattime.c		standard
 kern/subr_firmware.c		optional firmware
+kern/subr_filter.c              standard
 kern/subr_gtaskqueue.c		standard
 kern/subr_hash.c		standard
 kern/subr_hints.c		standard

Added: head/sys/kern/subr_filter.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/kern/subr_filter.c	Tue Sep 24 18:18:11 2019	(r352657)
@@ -0,0 +1,482 @@
+/*-
+ * Copyright (c) 2016-2019 Netflix, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/errno.h>
+#include <sys/tim_filter.h>
+
+void
+reset_time(struct time_filter *tf, uint32_t time_len)
+{
+	tf->cur_time_limit = time_len;
+}
+
+void
+reset_time_small(struct time_filter_small *tf, uint32_t time_len)
+{
+	tf->cur_time_limit = time_len;
+}
+
+/*
+ * A time filter can be a filter for MIN or MAX. 
+ * You call setup_time_filter() with the pointer to
+ * the filter structure, the type (FILTER_TYPE_MIN/MAX) and
+ * the time length. You can optionally reset the time length
+ * later with reset_time().
+ *
+ * You generally call apply_filter_xxx() to apply the new value
+ * to the filter. You also provide a time (now). The filter will
+ * age out entries based on the time now and your time limit
+ * so that you are always maintaining the min or max in that
+ * window of time. Time is a relative thing: it might be ticks
+ * in milliseconds, it might be round trip times; it's really
+ * up to you to decide what it is.
+ *
+ * To access the current filtered value you can use the macro
+ * get_filter_value() which returns the correct entry that
+ * has the "current" value in the filter.
+ *
+ * One thing that used to be here is a single apply_filter(). But
+ * this meant that we then had to store the type of filter in
+ * the time_filter structure. In order to keep the structure at
+ * a cache line size, I split it into two functions.
+ *
+ */
+int
+setup_time_filter(struct time_filter *tf, int fil_type, uint32_t time_len)
+{
+	uint64_t set_val;
+	int i;
+	
+	/* 
+	 * You must specify either a MIN or MAX filter,
+	 * though its up to the user to use the correct
+	 * apply.
+	 */
+	if ((fil_type != FILTER_TYPE_MIN) &&
+	    (fil_type != FILTER_TYPE_MAX))
+		return(EINVAL);
+
+	if (time_len < NUM_FILTER_ENTRIES)
+		return(EINVAL);
+		       
+	if (fil_type == FILTER_TYPE_MIN)
+		set_val = 0xffffffffffffffff;
+	else
+		set_val = 0;
+
+	for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = set_val;
+		tf->entries[i].time_up = 0;
+	}
+	tf->cur_time_limit = time_len;
+	return(0);
+}
+
+int
+setup_time_filter_small(struct time_filter_small *tf, int fil_type, uint32_t time_len)
+{
+	uint32_t set_val;
+	int i;
+	
+	/* 
+	 * You must specify either a MIN or MAX filter,
+	 * though its up to the user to use the correct
+	 * apply.
+	 */
+	if ((fil_type != FILTER_TYPE_MIN) &&
+	    (fil_type != FILTER_TYPE_MAX))
+		return(EINVAL);
+
+	if (time_len < NUM_FILTER_ENTRIES)
+		return(EINVAL);
+		       
+	if (fil_type == FILTER_TYPE_MIN)
+		set_val = 0xffffffff;
+	else
+		set_val = 0;
+
+	for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = set_val;
+		tf->entries[i].time_up = 0;
+	}
+	tf->cur_time_limit = time_len;
+	return(0);
+}
+
+
+static void
+check_update_times(struct time_filter *tf, uint64_t value, uint32_t now)
+{
+	int i, j, fnd;
+	uint32_t tim;
+	uint32_t time_limit;
+	for(i=0; i<(NUM_FILTER_ENTRIES-1); i++) {
+		tim = now - tf->entries[i].time_up;
+		time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+		if (tim >= time_limit) {
+			fnd = 0;
+			for(j=(i+1); j<NUM_FILTER_ENTRIES; j++) {
+				if (tf->entries[i].time_up < tf->entries[j].time_up) {
+					tf->entries[i].value = tf->entries[j].value;
+					tf->entries[i].time_up = tf->entries[j].time_up;
+					fnd = 1;
+					break;
+				}
+			}
+			if (fnd == 0) {
+				/* Nothing but the same old entry */
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+		}
+	}
+	i = NUM_FILTER_ENTRIES-1;
+	tim = now - tf->entries[i].time_up;
+	time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+	if (tim >= time_limit) {
+		tf->entries[i].value = value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+static void
+check_update_times_small(struct time_filter_small *tf, uint32_t value, uint32_t now)
+{
+	int i, j, fnd;
+	uint32_t tim;
+	uint32_t time_limit;
+	for(i=0; i<(NUM_FILTER_ENTRIES-1); i++) {
+		tim = now - tf->entries[i].time_up;
+		time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+		if (tim >= time_limit) {
+			fnd = 0;
+			for(j=(i+1); j<NUM_FILTER_ENTRIES; j++) {
+				if (tf->entries[i].time_up < tf->entries[j].time_up) {
+					tf->entries[i].value = tf->entries[j].value;
+					tf->entries[i].time_up = tf->entries[j].time_up;
+					fnd = 1;
+					break;
+				}
+			}
+			if (fnd == 0) {
+				/* Nothing but the same old entry */
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+		}
+	}
+	i = NUM_FILTER_ENTRIES-1;
+	tim = now - tf->entries[i].time_up;
+	time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+	if (tim >= time_limit) {
+		tf->entries[i].value = value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+
+
+void
+filter_reduce_by(struct time_filter *tf, uint64_t reduce_by, uint32_t now)
+{
+	int i;
+	/* 
+	 * Reduce our filter main by reduce_by and
+	 * update its time. Then walk other's and
+	 * make them the new value too.
+	 */
+	if (reduce_by < tf->entries[0].value)
+		tf->entries[0].value -= reduce_by;
+	else
+		tf->entries[0].value = 0;
+	tf->entries[0].time_up = now;
+	for(i=1; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = tf->entries[0].value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+void
+filter_reduce_by_small(struct time_filter_small *tf, uint32_t reduce_by, uint32_t now)
+{
+	int i;
+	/* 
+	 * Reduce our filter main by reduce_by and
+	 * update its time. Then walk other's and
+	 * make them the new value too.
+	 */
+	if (reduce_by < tf->entries[0].value)
+		tf->entries[0].value -= reduce_by;
+	else
+		tf->entries[0].value = 0;
+	tf->entries[0].time_up = now;
+	for(i=1; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = tf->entries[0].value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+void
+filter_increase_by(struct time_filter *tf, uint64_t incr_by, uint32_t now)
+{
+	int i;
+	/* 
+	 * Increase our filter main by incr_by and
+	 * update its time. Then walk other's and
+	 * make them the new value too.
+	 */
+	tf->entries[0].value += incr_by;
+	tf->entries[0].time_up = now;
+	for(i=1; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = tf->entries[0].value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+void
+filter_increase_by_small(struct time_filter_small *tf, uint32_t incr_by, uint32_t now)
+{
+	int i;
+	/* 
+	 * Increase our filter main by incr_by and
+	 * update its time. Then walk other's and
+	 * make them the new value too.
+	 */
+	tf->entries[0].value += incr_by;
+	tf->entries[0].time_up = now;
+	for(i=1; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].value = tf->entries[0].value;
+		tf->entries[i].time_up = now;
+	}
+}
+
+void
+forward_filter_clock(struct time_filter *tf, uint32_t ticks_forward)
+{
+	/*
+	 * Bring forward all time values by N ticks. This
+	 * postpones expiring slots by that amount.
+	 */
+	int i;
+
+	for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].time_up += ticks_forward;
+	}
+}
+
+
+void
+forward_filter_clock_small(struct time_filter_small *tf, uint32_t ticks_forward)
+{
+	/*
+	 * Bring forward all time values by N ticks. This
+	 * postpones expiring slots by that amount.
+	 */
+	int i;
+
+	for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+		tf->entries[i].time_up += ticks_forward;
+	}
+}
+
+
+void
+tick_filter_clock(struct time_filter *tf, uint32_t now)
+{
+	int i;
+	uint32_t tim, time_limit;
+
+	/*
+	 * We start at two positions back. This
+	 * is because the oldest worst value is
+	 * preserved always, i.e. it can't expire
+	 * due to clock ticking with no updated value.
+	 *
+	 * The other choice would be to fill it in with
+	 * zero, but I don't like that option since
+	 * some measurement is better than none (even
+	 * if it's your oldest measurement).
+	 */
+	for(i=(NUM_FILTER_ENTRIES-2); i>=0 ; i--) {
+		tim = now - tf->entries[i].time_up;
+		time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+		if (tim >= time_limit) {
+			/* 
+			 * This entry is expired, pull down
+			 * the next one up.
+			 */
+			tf->entries[i].value = tf->entries[(i+1)].value;
+			tf->entries[i].time_up = tf->entries[(i+1)].time_up;
+		}
+
+	}
+}
+
+void
+tick_filter_clock_small(struct time_filter_small *tf, uint32_t now)
+{
+	int i;
+	uint32_t tim, time_limit;
+
+	/*
+	 * We start at two positions back. This
+	 * is because the oldest worst value is
+	 * preserved always, i.e. it can't expire
+	 * due to clock ticking with no updated value.
+	 *
+	 * The other choice would be to fill it in with
+	 * zero, but I don't like that option since
+	 * some measurement is better than none (even
+	 * if it's your oldest measurement).
+	 */
+	for(i=(NUM_FILTER_ENTRIES-2); i>=0 ; i--) {
+		tim = now - tf->entries[i].time_up;
+		time_limit = (tf->cur_time_limit * (NUM_FILTER_ENTRIES-i))/NUM_FILTER_ENTRIES;
+		if (tim >= time_limit) {
+			/* 
+			 * This entry is expired, pull down
+			 * the next one up.
+			 */
+			tf->entries[i].value = tf->entries[(i+1)].value;
+			tf->entries[i].time_up = tf->entries[(i+1)].time_up;
+		}
+
+	}
+}
+
+uint32_t
+apply_filter_min(struct time_filter *tf, uint64_t value, uint32_t now)
+{
+	int i, j;
+	
+	if (value <= tf->entries[0].value) {
+		/* Zap them all */
+		for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+			tf->entries[i].value = value;
+			tf->entries[i].time_up = now;
+		}
+		return (tf->entries[0].value);
+	}
+	for (j=1; j<NUM_FILTER_ENTRIES; j++) {
+		if (value <= tf->entries[j].value) {
+			for(i=j; i<NUM_FILTER_ENTRIES; i++) {
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+			break;
+		}
+	}
+	check_update_times(tf, value, now);
+	return (tf->entries[0].value);
+}
+
+uint32_t
+apply_filter_min_small(struct time_filter_small *tf,
+		       uint32_t value, uint32_t now)
+{
+	int i, j;
+	
+	if (value <= tf->entries[0].value) {
+		/* Zap them all */
+		for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+			tf->entries[i].value = value;
+			tf->entries[i].time_up = now;
+		}
+		return (tf->entries[0].value);
+	}
+	for (j=1; j<NUM_FILTER_ENTRIES; j++) {
+		if (value <= tf->entries[j].value) {
+			for(i=j; i<NUM_FILTER_ENTRIES; i++) {
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+			break;
+		}
+	}
+	check_update_times_small(tf, value, now);
+	return (tf->entries[0].value);
+}
+
+uint32_t
+apply_filter_max(struct time_filter *tf, uint64_t value, uint32_t now)
+{
+	int i, j;
+	
+	if (value >= tf->entries[0].value) {
+		/* Zap them all */
+		for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+			tf->entries[i].value = value;
+			tf->entries[i].time_up = now;
+		}
+		return (tf->entries[0].value);
+	}
+	for (j=1; j<NUM_FILTER_ENTRIES; j++) {
+		if (value >= tf->entries[j].value) {
+			for(i=j; i<NUM_FILTER_ENTRIES; i++) {
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+			break;
+		}
+	}
+	check_update_times(tf, value, now);
+	return (tf->entries[0].value);
+}
+
+
+uint32_t
+apply_filter_max_small(struct time_filter_small *tf,
+		       uint32_t value, uint32_t now)
+{
+	int i, j;
+	
+	if (value >= tf->entries[0].value) {
+		/* Zap them all */
+		for(i=0; i<NUM_FILTER_ENTRIES; i++) {
+			tf->entries[i].value = value;
+			tf->entries[i].time_up = now;
+		}
+		return (tf->entries[0].value);
+	}
+	for (j=1; j<NUM_FILTER_ENTRIES; j++) {
+		if (value >= tf->entries[j].value) {
+			for(i=j; i<NUM_FILTER_ENTRIES; i++) {
+				tf->entries[i].value = value;
+				tf->entries[i].time_up = now;
+			}
+			break;
+		}
+	}
+	check_update_times_small(tf, value, now);
+	return (tf->entries[0].value);
+}

Modified: head/sys/modules/tcp/Makefile
==============================================================================
--- head/sys/modules/tcp/Makefile	Tue Sep 24 17:06:32 2019	(r352656)
+++ head/sys/modules/tcp/Makefile	Tue Sep 24 18:18:11 2019	(r352657)
@@ -6,10 +6,12 @@ SYSDIR?=${SRCTOP}/sys
 .include "${SYSDIR}/conf/kern.opts.mk"
 
 SUBDIR=	\
+        ${_tcp_bbr} \
         ${_tcp_rack} \
 	${_tcpmd5} \
 
 .if ${MK_EXTRA_TCP_STACKS} != "no" || defined(ALL_MODULES)
+_tcp_bbr= 	bbr
 _tcp_rack= 	rack
 .endif
 

Added: head/sys/modules/tcp/bbr/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/modules/tcp/bbr/Makefile	Tue Sep 24 18:18:11 2019	(r352657)
@@ -0,0 +1,23 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+STACKNAME=	bbr
+KMOD=	tcp_${STACKNAME}
+SRCS=	bbr.c sack_filter.c rack_bbr_common.c
+
+SRCS+=	opt_inet.h opt_inet6.h opt_ipsec.h
+SRCS+=	opt_tcpdebug.h
+SRCS+=	opt_kern_tls.h
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+CFLAGS+=	-DMODNAME=${KMOD}
+CFLAGS+=	-DSTACKNAME=${STACKNAME}
+
+.include <bsd.kmod.mk>

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c	Tue Sep 24 17:06:32 2019	(r352656)
+++ head/sys/netinet/ip_output.c	Tue Sep 24 18:18:11 2019	(r352657)
@@ -212,7 +212,7 @@ ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, in
 
 static int
 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
-    const struct sockaddr_in *gw, struct route *ro)
+    const struct sockaddr_in *gw, struct route *ro, bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
@@ -256,7 +256,7 @@ ip_output_send(struct inpcb *inp, struct ifnet *ifp, s
 			mst = inp->inp_snd_tag;
 	}
 #endif
-	if (mst != NULL) {
+	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		if (mst->ifp != ifp) {
@@ -791,7 +791,8 @@ sendit:
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
-		error = ip_output_send(inp, ifp, m, gw, ro);
+		error = ip_output_send(inp, ifp, m, gw, ro,
+		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
@@ -827,7 +828,7 @@ sendit:
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
-			error = ip_output_send(inp, ifp, m, gw, ro);
+			error = ip_output_send(inp, ifp, m, gw, ro, true);
 		} else
 			m_freem(m);
 	}

Modified: head/sys/netinet/ip_var.h
==============================================================================
--- head/sys/netinet/ip_var.h	Tue Sep 24 17:06:32 2019	(r352656)
+++ head/sys/netinet/ip_var.h	Tue Sep 24 18:18:11 2019	(r352657)
@@ -166,6 +166,7 @@ void	kmod_ipstat_dec(int statnum);
 #define IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
 #define	IP_NODEFAULTFLOWID	0x40		/* Don't set the flowid from inp */
+#define IP_NO_SND_TAG_RL	0x80		/* Don't send down the ratelimit tag */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define IP_HDR_ALIGNED_P(ip)	1

Modified: head/sys/netinet/tcp.h
==============================================================================
--- head/sys/netinet/tcp.h	Tue Sep 24 17:06:32 2019	(r352656)
+++ head/sys/netinet/tcp.h	Tue Sep 24 18:18:11 2019	(r352657)
@@ -239,6 +239,7 @@ struct tcphdr {
 #define TCP_BBR_ACK_COMP_ALG   1096 	/* Not used */
 #define TCP_BBR_TMR_PACE_OH    1096	/* Recycled in 4.2 */
 #define TCP_BBR_EXTRA_GAIN     1097
+#define TCP_RACK_DO_DETECTION  1097	/* Recycle of extra gain for rack, attack detection */
 #define TCP_BBR_RACK_RTT_USE   1098	/* what RTT should we use 0, 1, or 2? */
 #define TCP_BBR_RETRAN_WTSO    1099
 #define TCP_DATA_AFTER_CLOSE   1100

Added: head/sys/netinet/tcp_stacks/bbr.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/tcp_stacks/bbr.c	Tue Sep 24 18:18:11 2019	(r352657)
@@ -0,0 +1,15189 @@
+/*-
+ * Copyright (c) 2016-2019
+ *	Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/ktls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/qmath.h>
+#include <sys/tree.h>
+#ifdef NETFLIX_STATS
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
+#endif
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/eventhandler.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/tim_filter.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define	TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_ratelimit.h>
+#include <netinet/tcp_lro.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif				/* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/ethernet.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif				/* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+
+#include "sack_filter.h"
+#include "tcp_bbr.h"
+#include "rack_bbr_common.h"
+uma_zone_t bbr_zone;
+uma_zone_t bbr_pcb_zone;
+
+struct sysctl_ctx_list bbr_sysctl_ctx;
+struct sysctl_oid *bbr_sysctl_root;
+
+#define	TCPT_RANGESET_NOSLOP(tv, value, tvmin, tvmax) do { \
+	(tv) = (value); \
+	if ((u_long)(tv) < (u_long)(tvmin)) \
+		(tv) = (tvmin); \
+	if ((u_long)(tv) > (u_long)(tvmax)) \
+		(tv) = (tvmax); \
+} while(0)
+
+/*#define BBR_INVARIANT 1*/
+
+/*
+ * initial window
+ */
+static uint32_t bbr_def_init_win = 10;
+static int32_t bbr_persist_min = 250000;	/* 250ms */
+static int32_t bbr_persist_max = 1000000;	/* 1 Second */
+static int32_t bbr_cwnd_may_shrink = 0;
+static int32_t bbr_cwndtarget_rtt_touse = BBR_RTT_PROP;
+static int32_t bbr_num_pktepo_for_del_limit = BBR_NUM_RTTS_FOR_DEL_LIMIT;
+static int32_t bbr_hardware_pacing_limit = 8000;
+static int32_t bbr_quanta = 3;	/* How much extra quanta do we get? */
+static int32_t bbr_no_retran = 0;
+static int32_t bbr_tcp_map_entries_limit = 1500;
+static int32_t bbr_tcp_map_split_limit = 256;
+
+static int32_t bbr_error_base_paceout = 10000; /* usec to pace */
+static int32_t bbr_max_net_error_cnt = 10;
+/* Should the following be dynamic too -- loss wise */
+static int32_t bbr_rtt_gain_thresh = 0;
+/* Measurement controls */
+static int32_t bbr_use_google_algo = 1;
+static int32_t bbr_ts_limiting = 1;
+static int32_t bbr_ts_can_raise = 0;
+static int32_t bbr_do_red = 600;
+static int32_t bbr_red_scale = 20000;
+static int32_t bbr_red_mul = 1;
+static int32_t bbr_red_div = 2;
+static int32_t bbr_red_growth_restrict = 1;
+static int32_t  bbr_target_is_bbunit = 0;
+static int32_t bbr_drop_limit = 0;
+/*
+ * How much gain do we need to see to
+ * stay in startup?
+ */
+static int32_t bbr_marks_rxt_sack_passed = 0;
+static int32_t bbr_start_exit = 25;
+static int32_t bbr_low_start_exit = 25;	/* When we are in reduced gain */
+static int32_t bbr_startup_loss_thresh = 2000;	/* 20.00% loss */
+static int32_t bbr_hptsi_max_mul = 1;	/* These two mul/div assure a min pacing */
+static int32_t bbr_hptsi_max_div = 2;	/* time, 0 means turned off. We need this
+					 * if we go back ever to where the pacer
+					 * has priority over timers.
+					 */
+static int32_t bbr_policer_call_from_rack_to = 0;
+static int32_t bbr_policer_detection_enabled = 1;
+static int32_t bbr_min_measurements_req = 1;	/* We need at least 2
+						 * measurements before we are
+						 * "good"; note that 2 == 1.
+						 * This is because we use a >
+						 * comparison. This means if
+						 * min_measure was 0, it takes
+						 * num-measures > min(0) and
+						 * you get 1 measurement and
+						 * you are good. Set to 1, you
+						 * have to have two
+						 * measurements (this is done
+						 * to prevent it from being ok
+						 * to have no measurements). */
+static int32_t bbr_no_pacing_until = 4;
+						 
+static int32_t bbr_min_usec_delta = 20000;	/* 20,000 usecs */
+static int32_t bbr_min_peer_delta = 20;		/* 20 units */
+static int32_t bbr_delta_percent = 150;		/* 15.0 % */
+
+static int32_t bbr_target_cwnd_mult_limit = 8;
+/*
+ * bbr_cwnd_min_val is the number of
+ * segments we hold to in the RTT probe
+ * state typically 4.
+ */
+static int32_t bbr_cwnd_min_val = BBR_PROBERTT_NUM_MSS;
+
+
+static int32_t bbr_cwnd_min_val_hs = BBR_HIGHSPEED_NUM_MSS;
+
+static int32_t bbr_gain_to_target = 1;
+static int32_t bbr_gain_gets_extra_too = 1;
+/*
+ * bbr_high_gain is the 2/ln(2) value we need
+ * to double the sending rate in startup. This
+ * is used for both cwnd and hptsi gains.
+ */
+static int32_t bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1;
+static int32_t bbr_startup_lower = BBR_UNIT * 1500 / 1000 + 1;
+static int32_t bbr_use_lower_gain_in_startup = 1;
+
+/* thresholds for reduction on drain in sub-states/drain */
+static int32_t bbr_drain_rtt = BBR_SRTT;
+static int32_t bbr_drain_floor = 88;
+static int32_t google_allow_early_out = 1;
+static int32_t google_consider_lost = 1;
+static int32_t bbr_drain_drop_mul = 4;
+static int32_t bbr_drain_drop_div = 5;
+static int32_t bbr_rand_ot = 50;
+static int32_t bbr_can_force_probertt = 0;
+static int32_t bbr_can_adjust_probertt = 1;
+static int32_t bbr_probertt_sets_rtt = 0;
+static int32_t bbr_can_use_ts_for_rtt = 1;
+static int32_t bbr_is_ratio = 0;
+static int32_t bbr_sub_drain_app_limit = 1;
+static int32_t bbr_prtt_slam_cwnd = 1;
+static int32_t bbr_sub_drain_slam_cwnd = 1;
+static int32_t bbr_slam_cwnd_in_main_drain = 1;
+static int32_t bbr_filter_len_sec = 6;	/* How long does the rttProp filter
+					 * hold */
+static uint32_t bbr_rtt_probe_limit = (USECS_IN_SECOND * 4);
+/*
+ * bbr_drain_gain is the reverse of the high_gain,
+ * designed to drain back out the standing queue
+ * that is formed in startup by causing a larger
+ * hptsi gain and thus draining the packets
+ * in flight.
+ */
+static int32_t bbr_drain_gain = BBR_UNIT * 1000 / 2885;
+static int32_t bbr_rttprobe_gain = 192;
+
+/*
+ * The cwnd_gain is the default cwnd gain applied when
+ * calculating a target cwnd. Note that the cwnd is
+ * a secondary factor in the way BBR works (see the
+ * paper and think about it, it will take some time).
+ * Basically the hptsi_gain spreads the packets out
+ * so you never get more than BDP to the peer even
+ * if the cwnd is high. In our implementation that
+ * means in non-recovery/retransmission scenarios
+ * cwnd will never be reached by the flight-size.
+ */
+static int32_t bbr_cwnd_gain = BBR_UNIT * 2;
+static int32_t bbr_tlp_type_to_use = BBR_SRTT;
+static int32_t bbr_delack_time = 100000;	/* 100ms in useconds */
+static int32_t bbr_sack_not_required = 0;	/* set to one to allow non-sack to use bbr */
+static int32_t bbr_initial_bw_bps = 62500;	/* 500kbps in bytes ps */
+static int32_t bbr_ignore_data_after_close = 1;
+static int16_t bbr_hptsi_gain[] = {
+	(BBR_UNIT *5 / 4),
+	(BBR_UNIT * 3 / 4),
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT,
+	BBR_UNIT
+};
+int32_t bbr_use_rack_resend_cheat = 1;
+int32_t bbr_sends_full_iwnd = 1;
+
+#define BBR_HPTSI_GAIN_MAX 8
+/*
+ * The BBR module incorporates a number of
+ * TCP ideas that have been put out into the IETF
+ * over the last few years:
+ * - Yuchung Cheng's RACK TCP (for which it's named) that
+ *    will stop us using the number of dup acks and instead
+ *    use time as the gauge of when we retransmit.
+ * - Reorder Detection of RFC4737 and the Tail-Loss probe draft
+ *    of Dukkipati et al.
+ * - Van Jacobson et al.'s BBR.
+ *
+ * RACK depends on SACK, so if an endpoint arrives that
+ * cannot do SACK the state machine below will shuttle the
+ * connection back to using the "default" TCP stack that is
+ * in FreeBSD.
+ *
+ * To implement BBR and RACK the original TCP stack was first decomposed
+ * into a functional state machine with individual states
+ * for each of the possible TCP connection states. The do_segment
+ * function's role in life is to mandate the connection supports SACK
+ * initially and then assure that the RACK state matches the connection
+ * state before calling the state's do_segment function. Data processing
+ * of inbound segments also now happens in the hpts_do_segment in general
+ * with only one exception. This is so we can keep the connection on
+ * a single CPU.
+ *
+ * Each state is simplified due to the fact that the original do_segment
+ * has been decomposed and we *know* what state we are in (no
+ * switches on the state) and all tests for SACK are gone. This
+ * greatly simplifies what each state does.
+ *
+ * TCP output is also over-written with a new version since it
+ * must maintain the new rack scoreboard and has had hptsi
+ * integrated as a requirement. Still to do is to eliminate the
+ * use of the callout_() system and use the hpts for all
+ * timers as well.
+ */
+static uint32_t bbr_rtt_probe_time = 200000;	/* 200ms in micro seconds */
+static uint32_t bbr_rtt_probe_cwndtarg = 4;	/* How many mss's outstanding */
+static const int32_t bbr_min_req_free = 2;	/* The min we must have on the
+						 * free list */
+static int32_t bbr_tlp_thresh = 1;
+static int32_t bbr_reorder_thresh = 2;
+static int32_t bbr_reorder_fade = 60000000;	/* 0 - never fade, def
+						 * 60,000,000 - 60 seconds */
+static int32_t bbr_pkt_delay = 1000;
+static int32_t bbr_min_to = 1000;	/* Number of usec's minimum timeout */
+static int32_t bbr_incr_timers = 1;
+
+static int32_t bbr_tlp_min = 10000;	/* 10ms in usecs */
+static int32_t bbr_delayed_ack_time = 200000;	/* 200ms in usecs */
+static int32_t bbr_exit_startup_at_loss = 1;
+
+/*
+ * bbr_lt_bw_ratio is 1/8th
+ * bbr_lt_bw_diff is  < 4 Kbit/sec
+ */
+static uint64_t bbr_lt_bw_diff = 4000 / 8;	/* In bytes per second */
+static uint64_t bbr_lt_bw_ratio = 8;	/* For 1/8th */
+static uint32_t bbr_lt_bw_max_rtts = 48;	/* How many rtt's do we use
+						 * the lt_bw for */
+static uint32_t bbr_lt_intvl_min_rtts = 4;	/* Min num of RTT's to measure
+						 * lt_bw */
+static int32_t bbr_lt_intvl_fp = 0;		/* False positive epoch diff */
+static int32_t bbr_lt_loss_thresh = 196;	/* Lost vs delivered % */
+static int32_t bbr_lt_fd_thresh = 100;		/* false detection % */
+
+static int32_t bbr_verbose_logging = 0;
+/*
+ * Currently regular TCP has an rto_min of 30ms;
+ * the backoff goes 12 times, so that ends up
+ * being a total of 122.850 seconds before a
+ * connection is killed.
+ */
+static int32_t bbr_rto_min_ms = 30;	/* 30ms same as main freebsd */
+static int32_t bbr_rto_max_sec = 4;	/* 4 seconds */
+
+/****************************************************/
+/* DEFAULT TSO SIZING  (cpu performance impacting)  */
+/****************************************************/
+/* What amount is our formula using to get TSO size */
+static int32_t bbr_hptsi_per_second = 1000;
+
+/*
+ * For hptsi on connections under bbr_cross_over, what is the
+ * delay target? 7ms (in usec) combined with a seg_max of 2
+ * gets us close to identical Google behavior in
+ * TSO size selection (possibly more 1MSS sends).
+ */
+static int32_t bbr_hptsi_segments_delay_tar = 7000;
+
+/* Does pacing delay include overheads in its time calculations? */
+static int32_t bbr_include_enet_oh = 0;
+static int32_t bbr_include_ip_oh = 1;
+static int32_t bbr_include_tcp_oh = 1;
+static int32_t bbr_google_discount = 10;
+
+/* Do we use (nf mode) pkt-epoch to drive us or rttProp? */
+static int32_t bbr_state_is_pkt_epoch = 0;
+static int32_t bbr_state_drain_2_tar = 1;
+/* What is the max the 0 - bbr_cross_over MBPS TSO target
+ * can reach using our delay target. Note that this
+ * value becomes the floor for the cross over
+ * algorithm.

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201909241818.x8OIIBNr039667>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation