Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 16 Dec 2015 00:56:45 +0000 (UTC)
From:      Randall Stewart <rrs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r292309 - in head/sys: modules modules/tcp modules/tcp/fastpath netinet netinet/tcp_stacks
Message-ID:  <201512160056.tBG0ujqA067178@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: rrs
Date: Wed Dec 16 00:56:45 2015
New Revision: 292309
URL: https://svnweb.freebsd.org/changeset/base/292309

Log:
  First cut of the modularization of our TCP stack. Still
  to do is to clean up the timer handling using the async-drain.
  Other optimizations may be coming to go with this. What's here
  will allow different TCP implementations (one included).
  Reviewed by:	jtl, hiren, transports
  Sponsored by:	Netflix Inc.
  Differential Revision:	D4055

Added:
  head/sys/modules/tcp/
  head/sys/modules/tcp/fastpath/
  head/sys/modules/tcp/fastpath/Makefile   (contents, props changed)
  head/sys/netinet/tcp_stacks/
  head/sys/netinet/tcp_stacks/fastpath.c   (contents, props changed)
Modified:
  head/sys/modules/Makefile
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_sack.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/netinet/toecore.c

Modified: head/sys/modules/Makefile
==============================================================================
--- head/sys/modules/Makefile	Wed Dec 16 00:56:38 2015	(r292308)
+++ head/sys/modules/Makefile	Wed Dec 16 00:56:45 2015	(r292309)
@@ -346,6 +346,7 @@ SUBDIR=	\
 	${_syscons} \
 	sysvipc \
 	${_ti} \
+	tcp/fastpath \
 	tests/framework \
 	tests/callout_test \
 	tl \

Added: head/sys/modules/tcp/fastpath/Makefile
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/modules/tcp/fastpath/Makefile	Wed Dec 16 00:56:45 2015	(r292309)
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+KMOD=	fastpath
+SRCS=	fastpath.c
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+.include <bsd.kmod.mk>

Modified: head/sys/netinet/tcp.h
==============================================================================
--- head/sys/netinet/tcp.h	Wed Dec 16 00:56:38 2015	(r292308)
+++ head/sys/netinet/tcp.h	Wed Dec 16 00:56:45 2015	(r292309)
@@ -167,7 +167,7 @@ struct tcphdr {
 #define	TCP_KEEPCNT	1024	/* L,N number of keepalives before close */
 #define	TCP_PCAP_OUT	2048	/* number of output packets to keep */
 #define	TCP_PCAP_IN	4096	/* number of input packets to keep */
-
+#define TCP_FUNCTION_BLK 8192	/* Set the tcp function pointers to the specified stack */
 /* Start of reserved space for third-party user-settable options. */
 #define	TCP_VENDOR	SO_VENDOR
 
@@ -245,5 +245,11 @@ struct tcp_info {
 	u_int32_t	__tcpi_pad[26];		/* Padding. */
 };
 #endif
+#define TCP_FUNCTION_NAME_LEN_MAX 32
+
+struct tcp_function_set {
+	char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
+	uint32_t pcbcnt;
+};
 
 #endif /* !_NETINET_TCP_H_ */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Wed Dec 16 00:56:38 2015	(r292308)
+++ head/sys/netinet/tcp_input.c	Wed Dec 16 00:56:45 2015	(r292309)
@@ -230,23 +230,6 @@ VNET_DEFINE(struct inpcbhead, tcb);
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
-static void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
-		     struct socket *, struct tcpcb *, int, int, uint8_t,
-		     int);
-static void	 tcp_dropwithreset(struct mbuf *, struct tcphdr *,
-		     struct tcpcb *, int, int);
-static void	 tcp_pulloutofband(struct socket *,
-		     struct tcphdr *, struct mbuf *, int);
-static void	 tcp_xmit_timer(struct tcpcb *, int);
-static void	 tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
-			    uint16_t type);
-static void inline	cc_conn_init(struct tcpcb *tp);
-static void inline	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline	hhook_run_tcp_est_in(struct tcpcb *tp,
-			    struct tcphdr *th, struct tcpopt *to);
-
 /*
  * TCP statistics are stored in an "array" of counter(9)s.
  */
@@ -272,7 +255,7 @@ kmod_tcpstat_inc(int statnum)
 /*
  * Wrapper for the TCP established input helper hook.
  */
-static void inline
+void
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
 	struct tcp_hhook_data hhook_data;
@@ -290,7 +273,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, s
 /*
  * CC wrapper hook functions
  */
-static void inline
+void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -322,7 +305,7 @@ cc_ack_received(struct tcpcb *tp, struct
 	}
 }
 
-static void inline
+void 
 cc_conn_init(struct tcpcb *tp)
 {
 	struct hc_metrics_lite metrics;
@@ -446,7 +429,7 @@ cc_cong_signal(struct tcpcb *tp, struct 
 	}
 }
 
-static void inline
+void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -601,9 +584,6 @@ tcp_input(struct mbuf **mp, int *offp, i
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
 	int ti_locked;
-#define	TI_UNLOCKED	1
-#define	TI_RLOCKED	2
-
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
@@ -1175,7 +1155,7 @@ relocked:
 			 * contains.  tcp_do_segment() consumes
 			 * the mbuf chain and unlocks the inpcb.
 			 */
-			tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+			tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos, ti_locked);
 			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 			return (IPPROTO_DONE);
@@ -1421,7 +1401,7 @@ relocked:
 	 * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 	 * the inpcb, and unlocks pcbinfo.
 	 */
-	tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+	tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 	return (IPPROTO_DONE);
 
@@ -1476,7 +1456,7 @@ drop:
 	return (IPPROTO_DONE);
 }
 
-static void
+void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
@@ -1788,7 +1768,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 						      tp->t_rxtcur);
 				sowwakeup(so);
 				if (sbavail(&so->so_snd))
-					(void) tcp_output(tp);
+					(void) tp->t_fb->tfb_tcp_output(tp);
 				goto check_delack;
 			}
 		} else if (th->th_ack == tp->snd_una &&
@@ -1907,7 +1887,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 				tp->t_flags |= TF_DELACK;
 			} else {
 				tp->t_flags |= TF_ACKNOW;
-				tcp_output(tp);
+				tp->t_fb->tfb_tcp_output(tp);
 			}
 			goto check_delack;
 		}
@@ -2522,7 +2502,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 						}
 					} else
 						tp->snd_cwnd += tp->t_maxseg;
-					(void) tcp_output(tp);
+					(void) tp->t_fb->tfb_tcp_output(tp);
 					goto drop;
 				} else if (tp->t_dupacks == tcprexmtthresh) {
 					tcp_seq onxt = tp->snd_nxt;
@@ -2556,12 +2536,12 @@ tcp_do_segment(struct mbuf *m, struct tc
 						    tcps_sack_recovery_episode);
 						tp->sack_newdata = tp->snd_nxt;
 						tp->snd_cwnd = tp->t_maxseg;
-						(void) tcp_output(tp);
+						(void) tp->t_fb->tfb_tcp_output(tp);
 						goto drop;
 					}
 					tp->snd_nxt = th->th_ack;
 					tp->snd_cwnd = tp->t_maxseg;
-					(void) tcp_output(tp);
+					(void) tp->t_fb->tfb_tcp_output(tp);
 					KASSERT(tp->snd_limited <= 2,
 					    ("%s: tp->snd_limited too big",
 					    __func__));
@@ -2608,7 +2588,7 @@ tcp_do_segment(struct mbuf *m, struct tc
 					    (tp->snd_nxt - tp->snd_una);
 					SOCKBUF_UNLOCK(&so->so_snd);
 					if (avail > 0)
-						(void) tcp_output(tp);
+						(void) tp->t_fb->tfb_tcp_output(tp);
 					sent = tp->snd_max - oldsndmax;
 					if (sent > tp->t_maxseg) {
 						KASSERT((tp->t_dupacks == 2 &&
@@ -3074,7 +3054,7 @@ dodata:							/* XXX */
 	 * Return any desired output.
 	 */
 	if (needoutput || (tp->t_flags & TF_ACKNOW))
-		(void) tcp_output(tp);
+		(void) tp->t_fb->tfb_tcp_output(tp);
 
 check_delack:
 	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3122,7 +3102,7 @@ dropafterack:
 	ti_locked = TI_UNLOCKED;
 
 	tp->t_flags |= TF_ACKNOW;
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 	INP_WUNLOCK(tp->t_inpcb);
 	m_freem(m);
 	return;
@@ -3168,7 +3148,7 @@ drop:
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
-static void
+void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
@@ -3231,7 +3211,7 @@ drop:
 /*
  * Parse TCP options and place in tcpopt.
  */
-static void
+void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
 	int opt, optlen;
@@ -3325,7 +3305,7 @@ tcp_dooptions(struct tcpopt *to, u_char 
  * It is still reflected in the segment length for
  * sequencing purposes.
  */
-static void
+void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
@@ -3358,7 +3338,7 @@ tcp_pulloutofband(struct socket *so, str
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
-static void
+void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
 	int delta;
@@ -3738,7 +3718,7 @@ tcp_mssopt(struct in_conninfo *inc)
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
-static void
+void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
 	tcp_seq onxt = tp->snd_nxt;
@@ -3755,7 +3735,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp
 	 */
 	tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
 	tp->t_flags |= TF_ACKNOW;
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 	tp->snd_cwnd = ocwnd;
 	if (SEQ_GT(onxt, tp->snd_nxt))
 		tp->snd_nxt = onxt;

Modified: head/sys/netinet/tcp_sack.c
==============================================================================
--- head/sys/netinet/tcp_sack.c	Wed Dec 16 00:56:38 2015	(r292308)
+++ head/sys/netinet/tcp_sack.c	Wed Dec 16 00:56:45 2015	(r292309)
@@ -599,7 +599,7 @@ tcp_sack_partialack(struct tcpcb *tp, st
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
-	(void) tcp_output(tp);
+	(void) tp->t_fb->tfb_tcp_output(tp);
 }
 
 #if 0

Added: head/sys/netinet/tcp_stacks/fastpath.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/tcp_stacks/fastpath.c	Wed Dec 16 00:56:45 2015	(r292309)
@@ -0,0 +1,2486 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ *	Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * Copyright (c) 2015 Netflix Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart,
+ * James Healy and David Hayes, made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
+ * Portions of this software were developed by Randall R. Stewart while
+ * working for Netflix Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h"		/* for ipfw_fwd	*/
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_kdtrace.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>		/* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
+const int tcprexmtthresh;
+
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define	V_tcp_autorcvbuf_inc	VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define	V_tcp_autorcvbuf_max	VNET(tcp_autorcvbuf_max)
+VNET_DECLARE(int, tcp_do_rfc3042);
+#define	V_tcp_do_rfc3042	VNET(tcp_do_rfc3042)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define	V_tcp_do_autorcvbuf	VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_insecure_rst);
+#define	V_tcp_insecure_rst	VNET(tcp_insecure_rst)
+VNET_DECLARE(int, tcp_insecure_syn);
+#define	V_tcp_insecure_syn	VNET(tcp_insecure_syn)
+
+
+
+
+extern void	tcp_dooptions(struct tcpopt *, u_char *, int, int);
+extern void	tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+		     struct tcpcb *, int, int);
+extern void	tcp_pulloutofband(struct socket *,
+		     struct tcphdr *, struct mbuf *, int);
+extern void	tcp_xmit_timer(struct tcpcb *, int);
+extern void	tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+extern void	tcp_mss(struct tcpcb *tp, int offer);
+extern void 	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+				uint16_t type);
+extern void cc_conn_init(struct tcpcb *tp);
+extern void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+extern void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+extern void hhook_run_tcp_est_in(struct tcpcb *tp,
+				 struct tcphdr *th, struct tcpopt *to);
+
+extern void kmod_tcpstat_inc(int statnum);
+#ifdef TCP_SIGNATURE
+extern int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
+	     struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
+#endif
+
+static void	 tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
+			struct socket *, struct tcpcb *, int, int, uint8_t,
+			int);
+
+static void	 tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
+			struct socket *, struct tcpcb *, int, int, uint8_t,
+			int);
+
+/*
+ * Indicate whether this ack should be delayed.  We can delay the ack if
+ * following conditions are met:
+ *	- There is no delayed ack timer in progress.
+ *	- Our last ack wasn't a 0-sized window. We never want to delay
+ *	  the ack that opens up a 0-sized window.
+ *	- LRO wasn't used for this segment. We make sure by checking that the
+ *	  segment size is not larger than the MSS.
+ *	- Delayed acks are enabled or this is a half-synchronized T/TCP
+ *	  connection.
+ */
+#define DELAY_ACK(tp, tlen)						\
+	((!tcp_timer_active(tp, TT_DELACK) &&				\
+	    (tp->t_flags & TF_RXWIN0SENT) == 0) &&			\
+	    (tlen <= tp->t_maxopd) &&					\
+	    (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+
+/*
+ * So how is this faster than the normal fast ack?
+ * It basically allows us to also stay in the fastpath
+ * when a window-update ack also arrives. In testing
+ * we saw only 25-30% of connections doing fastpath 
+ * due to the fact that along with moving forward
+ * in sequence the window was also updated.
+ */
+static void
+tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+	       struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+	       int ti_locked, u_long tiwin)
+{
+	int acked;
+	int winup_only=0;
+#ifdef TCPDEBUG
+	/*
+	 * The size of tcp_saveipgen must be the size of the max ip header,
+	 * now IPv6.
+	 */
+	u_char tcp_saveipgen[IP6_HDR_LEN];
+	struct tcphdr tcp_savetcp;
+	short ostate = 0;
+#endif
+        /*
+	 * The following if statement will be true if
+	 * we are doing the win_up_in_fp <and>
+	 * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
+	 * - No more new data, but we have an ack for new data
+	 *   (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
+	 * - No more new data, the same ack point but the window grew
+	 *   (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
+	 */
+	if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
+	     (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+					    (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+		/* keep track of pure window updates */
+		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+			winup_only = 1;
+			TCPSTAT_INC(tcps_rcvwinupd);
+		}
+		tp->snd_wnd = tiwin;
+		tp->snd_wl1 = th->th_seq;
+		tp->snd_wl2 = th->th_ack;
+		if (tp->snd_wnd > tp->max_sndwnd)
+			tp->max_sndwnd = tp->snd_wnd;
+	}
+	/*
+	 * If last ACK falls within this segment's sequence numbers,
+	 * record the timestamp.
+	 * NOTE that the test is modified according to the latest
+	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+	 */
+	if ((to->to_flags & TOF_TS) != 0 &&
+	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+		tp->ts_recent_age = tcp_ts_getticks();
+		tp->ts_recent = to->to_tsval;
+	}
+	/*
+	 * This is a pure ack for outstanding data.
+	 */
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+	ti_locked = TI_UNLOCKED;
+
+	TCPSTAT_INC(tcps_predack);
+
+	/*
+	 * "bad retransmit" recovery.
+	 */
+	if (tp->t_rxtshift == 1 &&
+	    tp->t_flags & TF_PREVVALID &&
+	    (int)(ticks - tp->t_badrxtwin) < 0) {
+		cc_cong_signal(tp, th, CC_RTO_ERR);
+	}
+
+	/*
+	 * Recalculate the transmit timer / rtt.
+	 *
+	 * Some boxes send broken timestamp replies
+	 * during the SYN+ACK phase, ignore
+	 * timestamps of 0 or we could calculate a
+	 * huge RTT and blow up the retransmit timer.
+	 */
+	if ((to->to_flags & TOF_TS) != 0 &&
+	    to->to_tsecr) {
+		u_int t;
+
+		t = tcp_ts_getticks() - to->to_tsecr;
+		if (!tp->t_rttlow || tp->t_rttlow > t)
+			tp->t_rttlow = t;
+		tcp_xmit_timer(tp,
+			       TCP_TS_TO_TICKS(t) + 1);
+	} else if (tp->t_rtttime &&
+		   SEQ_GT(th->th_ack, tp->t_rtseq)) {
+		if (!tp->t_rttlow ||
+		    tp->t_rttlow > ticks - tp->t_rtttime)
+			tp->t_rttlow = ticks - tp->t_rtttime;
+		tcp_xmit_timer(tp,
+			       ticks - tp->t_rtttime);
+	}
+	if (winup_only == 0) {
+		acked = BYTES_THIS_ACK(tp, th);
+
+		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+		hhook_run_tcp_est_in(tp, th, to);
+
+		TCPSTAT_ADD(tcps_rcvackbyte, acked);
+		sbdrop(&so->so_snd, acked);
+		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+		    SEQ_LEQ(th->th_ack, tp->snd_recover))
+			tp->snd_recover = th->th_ack - 1;
+				
+		/*
+		 * Let the congestion control algorithm update
+		 * congestion control related information. This
+		 * typically means increasing the congestion
+		 * window.
+		 */
+		cc_ack_received(tp, th, CC_ACK);
+
+		tp->snd_una = th->th_ack;
+		/*
+		 * Pull snd_wl2 up to prevent seq wrap relative
+		 * to th_ack.
+		 */
+		tp->snd_wl2 = th->th_ack;
+		tp->t_dupacks = 0;
+		m_freem(m);
+
+		/*
+		 * If all outstanding data are acked, stop
+		 * retransmit timer, otherwise restart timer
+		 * using current (possibly backed-off) value.
+		 * If process is waiting for space,
+		 * wakeup/selwakeup/signal.  If data
+		 * are ready to send, let tcp_output
+		 * decide between more output or persist.
+		 */
+#ifdef TCPDEBUG
+		if (so->so_options & SO_DEBUG)
+			tcp_trace(TA_INPUT, ostate, tp,
+				  (void *)tcp_saveipgen,
+				  &tcp_savetcp, 0);
+#endif
+		if (tp->snd_una == tp->snd_max)
+			tcp_timer_activate(tp, TT_REXMT, 0);
+		else if (!tcp_timer_active(tp, TT_PERSIST))
+			tcp_timer_activate(tp, TT_REXMT,
+					   tp->t_rxtcur);
+	} else {
+		/* 
+		 * Window update only, just free the mbufs and
+		 * send out whatever we can.
+		 */
+		m_freem(m);
+	}
+	sowwakeup(so);
+	if (sbavail(&so->so_snd))
+		(void) tcp_output(tp);
+	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+					    __func__, ti_locked));
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	if (tp->t_flags & TF_DELACK) {
+		tp->t_flags &= ~TF_DELACK;
+		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+	}
+	INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * Here nothing is really faster, its just that we
+ * have broken out the fast-data path also just like
+ * the fast-ack. 
+ */
+static void
+tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+		   struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+		   int ti_locked, u_long tiwin)
+{
+	int newsize = 0;	/* automatic sockbuf scaling */
+#ifdef TCPDEBUG
+	/*
+	 * The size of tcp_saveipgen must be the size of the max ip header,
+	 * now IPv6.
+	 */
+	u_char tcp_saveipgen[IP6_HDR_LEN];
+	struct tcphdr tcp_savetcp;
+	short ostate = 0;
+#endif
+	/*
+	 * If last ACK falls within this segment's sequence numbers,
+	 * record the timestamp.
+	 * NOTE that the test is modified according to the latest
+	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+	 */
+	if ((to->to_flags & TOF_TS) != 0 &&
+	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+		tp->ts_recent_age = tcp_ts_getticks();
+		tp->ts_recent = to->to_tsval;
+	}
+
+	/*
+	 * This is a pure, in-sequence data packet with
+	 * nothing on the reassembly queue and we have enough
+	 * buffer space to take it.
+	 */
+	if (ti_locked == TI_RLOCKED) {
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+	}
+	ti_locked = TI_UNLOCKED;
+
+	/* Clean receiver SACK report if present */
+	if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
+		tcp_clean_sackreport(tp);
+	TCPSTAT_INC(tcps_preddat);
+	tp->rcv_nxt += tlen;
+	/*
+	 * Pull snd_wl1 up to prevent seq wrap relative to
+	 * th_seq.
+	 */
+	tp->snd_wl1 = th->th_seq;
+	/*
+	 * Pull rcv_up up to prevent seq wrap relative to
+	 * rcv_nxt.
+	 */
+	tp->rcv_up = tp->rcv_nxt;
+	TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+	if (so->so_options & SO_DEBUG)
+		tcp_trace(TA_INPUT, ostate, tp,
+			  (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+	/*
+	 * Automatic sizing of receive socket buffer.  Often the send
+	 * buffer size is not optimally adjusted to the actual network
+	 * conditions at hand (delay bandwidth product).  Setting the
+	 * buffer size too small limits throughput on links with high
+	 * bandwidth and high delay (eg. trans-continental/oceanic links).
+	 *
+	 * On the receive side the socket buffer memory is only rarely
+	 * used to any significant extent.  This allows us to be much
+	 * more aggressive in scaling the receive socket buffer.  For
+	 * the case that the buffer space is actually used to a large
+	 * extent and we run out of kernel memory we can simply drop
+	 * the new segments; TCP on the sender will just retransmit it
+	 * later.  Setting the buffer size too big may only consume too
+	 * much kernel memory if the application doesn't read() from
+	 * the socket or packet loss or reordering makes use of the
+	 * reassembly queue.
+	 *
+	 * The criteria to step up the receive buffer one notch are:
+	 *  1. Application has not set receive buffer size with
+	 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+	 *  2. the number of bytes received during the time it takes
+	 *     one timestamp to be reflected back to us (the RTT);
+	 *  3. received bytes per RTT is within seven eighth of the
+	 *     current socket buffer size;
+	 *  4. receive buffer size has not hit maximal automatic size;
+	 *
+	 * This algorithm does one step per RTT at most and only if
+	 * we receive a bulk stream w/o packet losses or reorderings.
+	 * Shrinking the buffer during idle times is not necessary as
+	 * it doesn't consume any memory when idle.
+	 *
+	 * TODO: Only step up if the application is actually serving
+	 * the buffer to better manage the socket buffer resources.
+	 */
+	if (V_tcp_do_autorcvbuf &&
+	    (to->to_flags & TOF_TS) &&
+	    to->to_tsecr &&
+	    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+		if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
+		    to->to_tsecr - tp->rfbuf_ts < hz) {
+			if (tp->rfbuf_cnt >
+			    (so->so_rcv.sb_hiwat / 8 * 7) &&
+			    so->so_rcv.sb_hiwat <
+			    V_tcp_autorcvbuf_max) {
+				newsize =
+					min(so->so_rcv.sb_hiwat +
+					    V_tcp_autorcvbuf_inc,
+					    V_tcp_autorcvbuf_max);
+			}
+			/* Start over with next RTT. */
+			tp->rfbuf_ts = 0;
+			tp->rfbuf_cnt = 0;
+		} else
+			tp->rfbuf_cnt += tlen;	/* add up */
+	}
+
+	/* Add data to socket buffer. */
+	SOCKBUF_LOCK(&so->so_rcv);
+	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+		m_freem(m);
+	} else {
+		/*
+		 * Set new socket buffer size.
+		 * Give up when limit is reached.
+		 */
+		if (newsize)
+			if (!sbreserve_locked(&so->so_rcv,
+					      newsize, so, NULL))
+				so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+		m_adj(m, drop_hdrlen);	/* delayed header drop */
+		sbappendstream_locked(&so->so_rcv, m, 0);
+	}
+	/* NB: sorwakeup_locked() does an implicit unlock. */
+	sorwakeup_locked(so);
+	if (DELAY_ACK(tp, tlen)) {
+		tp->t_flags |= TF_DELACK;
+	} else {
+		tp->t_flags |= TF_ACKNOW;
+		tcp_output(tp);
+	}
+	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+					    __func__, ti_locked));
+	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+
+	if (tp->t_flags & TF_DELACK) {
+		tp->t_flags &= ~TF_DELACK;
+		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+	}
+	INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * The slow-path is the clone of the long long part
+ * of tcp_do_segment past all the fast-path stuff. We
+ * use it here by two different callers, the fast/slow and
+ * the fastack only.
+ */
+static void
+tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
+		struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+		int ti_locked, u_long tiwin, int thflags)
+{
+	int  acked, ourfinisacked, needoutput = 0;
+	int rstreason, todrop, win;
+	char *s;
+	struct in_conninfo *inc;
+	struct mbuf *mfree = NULL;
+#ifdef TCPDEBUG
+	/*
+	 * The size of tcp_saveipgen must be the size of the max ip header,
+	 * now IPv6.
+	 */
+	u_char tcp_saveipgen[IP6_HDR_LEN];
+	struct tcphdr tcp_savetcp;
+	short ostate = 0;
+#endif
+	/*
+	 * Calculate amount of space in receive window,
+	 * and then do TCP input processing.
+	 * Receive window is amount of space in rcv queue,
+	 * but not less than advertised window.
+	 */
+	inc = &tp->t_inpcb->inp_inc;
+	win = sbspace(&so->so_rcv);
+	if (win < 0)
+		win = 0;
+	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+
+	/* Reset receive buffer auto scaling when not in bulk receive mode. */
+	tp->rfbuf_ts = 0;
+	tp->rfbuf_cnt = 0;
+
+	switch (tp->t_state) {
+
+	/*
+	 * If the state is SYN_RECEIVED:
+	 *	if seg contains an ACK, but not for our SYN/ACK, send a RST.
+	 */
+	case TCPS_SYN_RECEIVED:
+		if ((thflags & TH_ACK) &&
+		    (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+		     SEQ_GT(th->th_ack, tp->snd_max))) {
+				rstreason = BANDLIM_RST_OPENPORT;
+				goto dropwithreset;
+		}
+		break;
+
+	/*
+	 * If the state is SYN_SENT:
+	 *	if seg contains an ACK, but not for our SYN, drop the input.
+	 *	if seg contains a RST, then drop the connection.
+	 *	if seg does not contain SYN, then drop it.
+	 * Otherwise this is an acceptable SYN segment
+	 *	initialize tp->rcv_nxt and tp->irs
+	 *	if seg contains ack then advance tp->snd_una
+	 *	if seg contains an ECE and ECN support is enabled, the stream
+	 *	    is ECN capable.
+	 *	if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+	 *	arrange for segment to be acked (eventually)
+	 *	continue processing rest of data/controls, beginning with URG
+	 */
+	case TCPS_SYN_SENT:
+		if ((thflags & TH_ACK) &&
+		    (SEQ_LEQ(th->th_ack, tp->iss) ||
+		     SEQ_GT(th->th_ack, tp->snd_max))) {
+			rstreason = BANDLIM_UNLIMITED;
+			goto dropwithreset;
+		}
+		if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+			TCP_PROBE5(connect__refused, NULL, tp,
+			    mtod(m, const char *), tp, th);
+			tp = tcp_drop(tp, ECONNREFUSED);
+		}
+		if (thflags & TH_RST)
+			goto drop;
+		if (!(thflags & TH_SYN))
+			goto drop;
+
+		tp->irs = th->th_seq;
+		tcp_rcvseqinit(tp);
+		if (thflags & TH_ACK) {
+			TCPSTAT_INC(tcps_connects);
+			soisconnected(so);
+#ifdef MAC
+			mac_socketpeer_set_from_mbuf(m, so);
+#endif
+			/* Do window scaling on this connection? */
+			if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+				(TF_RCVD_SCALE|TF_REQ_SCALE)) {
+				tp->rcv_scale = tp->request_r_scale;
+			}
+			tp->rcv_adv += imin(tp->rcv_wnd,
+			    TCP_MAXWIN << tp->rcv_scale);
+			tp->snd_una++;		/* SYN is acked */
+			/*
+			 * If there's data, delay ACK; if there's also a FIN
+			 * ACKNOW will be turned on later.
+			 */
+			if (DELAY_ACK(tp, tlen) && tlen != 0)
+				tcp_timer_activate(tp, TT_DELACK,
+				    tcp_delacktime);
+			else
+				tp->t_flags |= TF_ACKNOW;
+
+			if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+				tp->t_flags |= TF_ECN_PERMIT;
+				TCPSTAT_INC(tcps_ecn_shs);
+			}
+			
+			/*
+			 * Received <SYN,ACK> in SYN_SENT[*] state.
+			 * Transitions:
+			 *	SYN_SENT  --> ESTABLISHED
+			 *	SYN_SENT* --> FIN_WAIT_1
+			 */
+			tp->t_starttime = ticks;
+			if (tp->t_flags & TF_NEEDFIN) {
+				tcp_state_change(tp, TCPS_FIN_WAIT_1);
+				tp->t_flags &= ~TF_NEEDFIN;
+				thflags &= ~TH_SYN;
+			} else {
+				tcp_state_change(tp, TCPS_ESTABLISHED);
+				TCP_PROBE5(connect__established, NULL, tp,
+				    mtod(m, const char *), tp, th);
+				cc_conn_init(tp);
+				tcp_timer_activate(tp, TT_KEEP,
+				    TP_KEEPIDLE(tp));
+			}
+		} else {
+			/*
+			 * Received initial SYN in SYN-SENT[*] state =>
+			 * simultaneous open.
+			 * If it succeeds, connection is half-synchronized.
+			 * Otherwise, do 3-way handshake:
+			 *        SYN-SENT -> SYN-RECEIVED
+			 *        SYN-SENT* -> SYN-RECEIVED*
+			 */
+			tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+			tcp_timer_activate(tp, TT_REXMT, 0);
+			tcp_state_change(tp, TCPS_SYN_RECEIVED);
+		}
+
+		KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
+		    "ti_locked %d", __func__, ti_locked));
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		INP_WLOCK_ASSERT(tp->t_inpcb);
+
+		/*
+		 * Advance th->th_seq to correspond to first data byte.
+		 * If data, trim to stay within window,
+		 * dropping FIN if necessary.
+		 */
+		th->th_seq++;
+		if (tlen > tp->rcv_wnd) {
+			todrop = tlen - tp->rcv_wnd;
+			m_adj(m, -todrop);
+			tlen = tp->rcv_wnd;
+			thflags &= ~TH_FIN;
+			TCPSTAT_INC(tcps_rcvpackafterwin);
+			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+		}
+		tp->snd_wl1 = th->th_seq - 1;
+		tp->rcv_up = th->th_seq;
+		/*
+		 * Client side of transaction: already sent SYN and data.
+		 * If the remote host used T/TCP to validate the SYN,
+		 * our data will be ACK'd; if so, enter normal data segment
+		 * processing in the middle of step 5, ack processing.
+		 * Otherwise, goto step 6.
+		 */
+		if (thflags & TH_ACK)
+			goto process_ACK;
+
+		goto step6;
+
+	/*
+	 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+	 *      do normal processing.
+	 *
+	 * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
+	 */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201512160056.tBG0ujqA067178>