Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 5 Feb 2011 00:36:21 -0800
From:      Weongyo Jeong <weongyo.jeong@gmail.com>
To:        freebsd-current@freebsd.org
Cc:        freebsd-net@freebsd.org
Subject:   [CFR] Forward RTO recovery algorithm (rfc5682) patch
Message-ID:  <20110205083620.GA60200@weongyo>

next in thread | raw e-mail | index | archive | help

--XsQoSWH+UP9D9v3l
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

This patch is *VERY* experimental patch to implement rfc5862 which is on
the IETF standard tracks so it could be completely wrong or has a lot of
bugs on it or wrong approaches because I'm a really newbie on TCP stack.

This patch includes two features to support `Basic FRTO algorithm' and
`SACK-Enhanced FRTO algorithm' but not including a feature to support
things mentioned at `Appendix A'.

I'm looking for reviewers teaching me where it's wrong or what the
approach was bad on line by line.  However any comments are welcome.

The patch is available at:

        http://people.freebsd.org/~weongyo/patch_tcpfrto_20110205.diff

regards,
Weongyo Jeong


--XsQoSWH+UP9D9v3l
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="patch_tcpfrto_20110205.diff"

Index: netinet/tcp_input.c
===================================================================
--- netinet/tcp_input.c	(revision 218148)
+++ netinet/tcp_input.c	(working copy)
@@ -161,6 +161,11 @@
     &VNET_NAME(tcp_abc_l_var), 2,
     "Cap the max cwnd increment during slow-start to this number of segments");
 
+VNET_DEFINE(int, tcp_do_rfc5682) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc5682, CTLFLAG_RW,
+    &VNET_NAME(tcp_do_rfc5682), 0,
+    "Enable RFC 5682 (Forward RTO-Recovery: F-RTO)");
+
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
 
 VNET_DEFINE(int, tcp_do_ecn) = 0;
@@ -254,6 +259,212 @@
 	}
 }
 
+/* revert to the conventional RTO recovery. */
+#define	TCP_FRTO_REVERT(tp)	do {	\
+	(tp)->frto_flags = 0;		\
+} while (0)
+
+static int
+tcp_frto_send2mss(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
+{
+	struct inpcb *inp = tp->t_inpcb;
+	struct socket *so = inp->inp_socket;
+	u_long oldcwnd;
+	tcp_seq onxt;
+
+	/*
+	 * XXXWG If the TCP sender does not have any new data to send, OR the
+	 * advertised window prohibits new transmissions, the recommended
+	 * action is to skip step 3 of this algorithm and continue with
+	 * slow-start retransmissions, following the conventional RTO recovery
+	 * algorithm.  However, alternative ways of handling the window-limited
+	 * cases that could result in better performance are discussed in
+	 * Appendix A.
+	 */
+	if (so->so_snd.sb_cc == 0 || tp->snd_wnd == 0)
+		/* XXXWG skip step 3 OR do alternative ways.  */
+		return (1);
+	oldcwnd = tp->snd_cwnd;
+	onxt = tp->snd_nxt;
+	/*
+	 * XXXWG transmit up to two new (previously unsent) segments and enter
+	 * step 3 of this algorithm.  If the TCP sender does not have enough
+	 * unsent data, it can send only one segment.
+	 */
+	tp->snd_cwnd = 2 * tp->t_maxseg;
+	tp->snd_nxt = tp->snd_max;
+	/*
+	 * XXXWG in addition, the TCP sender MAY override the Nagle algorithm.
+	 */
+	tp->t_flags |= TF_ACKNOW;
+	(void) tcp_output(tp);
+	tp->snd_nxt = onxt;
+	tp->snd_cwnd = oldcwnd;
+	return (0);
+}
+
+static void inline
+tcp_frto_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
+{
+
+	/* SACK-Enhanced F-RTO */
+
+	if (tp->t_flags & TF_SACK_PERMIT) {
+		if ((tp->frto_flags & FRTO_INSTEP1) != 0 &&
+		    (tp->frto_flags & FRTO_CANGOSTEP2) != 0) {
+			/* SACK-Enhanced F-RTO - step 2 */
+			if (type == CC_DUPACK)
+				/*
+				 * 2) If duplicate ACKs arrive before the
+				 * cumulative acknowledgment for retransmitted
+				 * data, adjust the scoreboard according to
+				 * the incoming SACK information.  Stay in
+				 * step 2 and wait for the next new
+				 * acknowledgment.
+				 */
+				return;
+			KASSERT(type == CC_ACK,
+			    ("%s: expected ACK (but %d)", __func__, type));
+
+			/*
+			 * 2) When a new acknowledgment arrives, set variable
+			 * "RecoveryPoint" to indicate the highest sequence
+			 * number transmitted so far.
+			 */
+			tp->frto_flags |= FRTO_INSTEP2;
+			tp->snd_recover = tp->snd_max;
+
+			/*
+			 * 2a) If the Cumulative Acknowledgement field covers
+			 * "RecoveryPoint" but not more than
+			 * "RecoveryPoint", revert to the conventional RTO
+			 * recovery and set the congestion window to no
+			 * more than 2 * MSS, like a regular TCP would do.
+			 * Do not enter step 3 of this algorithm.
+			 */
+			if (th->th_ack == tp->snd_recover) {
+				tp->snd_cwnd = 2 * tp->t_maxseg;
+				TCP_FRTO_REVERT(tp);
+				return;
+			/*
+			 * 2b) If the Cumulative Acknowledgment field does not
+			 * cover "RecoveryPoint" but is larger than
+			 * SND.UNA.
+			 */
+			} else if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+			    SEQ_GT(th->th_ack, tp->snd_una) &&
+			    !tcp_frto_send2mss(tp, th, type)) {
+				tp->frto_flags |= FRTO_CANGOSTEP3;
+				tp->frto_fack = tp->snd_fack;
+			}
+		}
+		if ((tp->frto_flags & FRTO_INSTEP2) != 0 &&
+		    (tp->frto_flags & FRTO_CANGOSTEP3) != 0) {
+			struct sackhole *q = TAILQ_FIRST(&tp->snd_holes);
+			/* the Cumulative Acknowledgment */
+			tcp_seq cack = SEQ_MAX(th->th_ack, tp->snd_una);
+
+			/*
+			 * XXXWG 3a) If the Cumulative Acknowledgment field or
+			 * the SACK information covers more than
+			 * "RecoveryPoint".
+			 */
+			if (SEQ_GEQ(cack, tp->snd_recover) ||
+			    (q != NULL && SEQ_GT(q->start, tp->snd_recover)))
+				goto frto_revert;
+			/*
+			 * XXXWG 3a) take this branch also when the
+			 * acknowledgment is a duplicate ACK and it does not
+			 * acknowledge any new, previously unacknowledged
+			 * data below "RecoveryPoint" in the SACK information.
+			 *
+			 * FIXME At this implementation there is a limitation
+			 * that it doesn't traverse all SACK information below
+			 * "RecoveryPoint".  So needs to implement a simple way
+			 * to check whether SACK information is updated or not
+			 * after receiving this ACK.
+			 */
+			if (type == CC_DUPACK &&
+			    SEQ_LEQ(tp->snd_fack, tp->snd_recover) &&
+			    SEQ_LEQ(tp->snd_fack, tp->frto_fack)) {
+frto_revert:
+				tp->snd_cwnd = 3 * tp->t_maxseg;
+				TCP_FRTO_REVERT(tp);
+				return;
+			}
+			/*
+			 * XXXWG 3b) If the Cumulative Acknowledgment field or a
+			 * SACK information in the ACK does not cover more than
+			 * "RecoveryPoint" AND it acknowledges data that was
+			 * not acknowledged earlier, declare the timeout
+			 * spurious.
+			 */
+			if (type == CC_ACK &&
+			    (SEQ_LEQ(cack, tp->snd_recover) ||
+			     (q != NULL &&
+			      SEQ_LT(q->start, tp->snd_recover))))
+				/* SPUR_TO: the timeout is spurious */
+				tp->snd_recover = tp->snd_una;
+		}
+		return;
+	}
+
+	/*
+	 * Basic F-RTO algorithm
+	 */
+
+	if ((tp->frto_flags & FRTO_INSTEP1) != 0 &&
+	    (tp->frto_flags & FRTO_CANGOSTEP2) != 0) {
+		/* Basic F-RTO - step 2 */
+		tp->frto_flags |= FRTO_INSTEP2;
+		tp->snd_recover = tp->snd_max;
+
+		if (type == CC_DUPACK ||
+		    /*
+		     * XXXWG The acknowledgment field covers "recover" but not
+		     * more than "recover"
+		     */
+		    th->th_ack == tp->snd_recover ||
+		    /*
+		     * XXXWG acknowledgement does not acknowledge all of
+		     * the data that was retransmitted in step 1.
+		     */
+		    SEQ_LEQ(th->th_ack, tp->snd_una))
+			TCP_FRTO_REVERT(tp);
+		else if (type == CC_ACK &&
+		    /*
+		     * XXXWG Acknowledgement field does not cover
+		     * "recover"
+		     */
+		    SEQ_LT(th->th_ack, tp->snd_recover) &&
+		    !tcp_frto_send2mss(tp, th, type))
+			tp->frto_flags |= FRTO_CANGOSTEP3;
+		else
+			TCP_FRTO_REVERT(tp);
+	}
+	if ((tp->frto_flags & FRTO_INSTEP2) != 0 &&
+	    (tp->frto_flags & FRTO_CANGOSTEP3) != 0) {
+		/* Basic F-RTO - step 3 */
+		if (type == CC_DUPACK) {
+			/*
+			 * XXXWG set the congestion window to no more
+			 * than 3 * MSS (where MSS indicates Maximum
+			 * Segment Size), and continue with the
+			 * slow-start algorithm retransmitting
+			 * unacknowledged segments.  The congestion
+			 * window can be set to 3 * MSS, because two
+			 * round-trip times have elapsed since the TRO,
+			 * and a conventional TCP sender would have
+			 * increased cwnd to 3 during the same time.
+			 */
+			tp->snd_cwnd = 3 * tp->t_maxseg;
+			TCP_FRTO_REVERT(tp);
+		} else if (type == CC_ACK)
+			/* SPUR_TO: the timeout is spurious */
+			tp->snd_recover = tp->snd_una;
+	}
+}
+
 /*
  * CC wrapper hook functions
  */
@@ -262,6 +473,9 @@
 {
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+	if (V_tcp_do_rfc5682)
+		tcp_frto_ack_received(tp, th, type);
+
 	tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
 	if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd))
 		tp->ccv->flags |= CCF_CWND_LIMITED;
Index: netinet/tcp_timer.c
===================================================================
--- netinet/tcp_timer.c	(revision 218148)
+++ netinet/tcp_timer.c	(working copy)
@@ -60,6 +60,7 @@
 #endif
 #include <netinet/ip_var.h>
 #include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcpip.h>
@@ -556,8 +557,64 @@
 		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
 		tp->t_srtt = 0;
 	}
+	if (V_tcp_do_rfc5682) {
+		/*
+		 * If the retrasmission timer expires again during
+		 * the execution of the F-RTO algorithm, the TCP sender MUST
+		 * re-start the algorithm processing from step 1.
+		 */
+		tp->frto_flags = FRTO_INSTEP1;
+
+		/*
+		 * If the sender implements some loss recovery algorithm
+		 * other than Reno or NewReno [FHH04], the F-RTO algorithm
+		 * SHOULD NOT be entered when earlier fast recovery is underway.
+		 */
+		if (CC_ALGO(tp) != &newreno_cc_algo &&
+		    IN_RECOVERY(tp->t_flags))
+			goto no_rfc5682;
+		/*
+		 * SACK-Enhanced F-RTO
+		 * This algorithm SHOULD NOT be applied if the TCP sender is
+		 * already in loss recovery when a retransmission timeout
+		 * occurs.
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) != 0 &&
+		    IN_RECOVERY(tp->t_flags))
+			goto no_rfc5682;
+
+		/*
+		 * Basic F-RTO algorithm - step 1: if the TCP sender is already
+		 * in RTO recovery AND "recover" is larger than or equal to
+		 * SND.UNA (the oldest unacknowledged sequence number [Pos81]),
+		 * do not enter step 2 of this algorithm.  Instead, store the
+		 * highest sequence number transmitted so far in variable
+		 * "recover".
+		 */
+		if ((tp->t_flags & TF_SACK_PERMIT) == 0 && 
+		    tp->t_rxtshift > 0 &&
+		    SEQ_GEQ(tp->snd_recover, tp->snd_una))
+			/*
+			 * continue with slow-start retransmissions following
+			 * the conventional RTO recovery algorithm.
+			 */
+			goto no_rfc5682;
+		/*
+		 * SACK-Enhanced F-RTO - step 1: If "RecoveryPoint" is larger
+		 * than or equal to SND.UNA, do not enter step 2 of this
+		 * algorithm.
+		 */
+		else if ((tp->t_flags & TF_SACK_PERMIT) != 0 &&
+		    SEQ_GEQ(tp->snd_recover, tp->snd_una))
+			goto no_rfc5682;
+		else
+			tp->frto_flags |= FRTO_CANGOSTEP2;
+	} else {
+no_rfc5682:
+		tp->snd_recover = tp->snd_max;
+	}
 	tp->snd_nxt = tp->snd_una;
-	tp->snd_recover = tp->snd_max;
+
 	/*
 	 * Force a segment to be sent.
 	 */
Index: netinet/tcp_var.h
===================================================================
--- netinet/tcp_var.h	(revision 218148)
+++ netinet/tcp_var.h	(working copy)
@@ -202,6 +202,12 @@
 	struct cc_algo	*cc_algo;	/* congestion control algorithm */
 	struct cc_var	*ccv;		/* congestion control specific vars */
 	struct osd	*osd;		/* storage for Khelp module data */
+	u_int	frto_flags;		/* step flags for RFC5682 where it's */
+#define	FRTO_INSTEP1	(1 << 0)
+#define	FRTO_CANGOSTEP2	(1 << 1)
+#define	FRTO_INSTEP2	(1 << 2)
+#define	FRTO_CANGOSTEP3	(1 << 3)
+	tcp_seq	frto_fack;		/* previous snd_fack at step 1 */
 
 	int	t_ispare;		/* explicit pad for 64bit alignment */
 	void	*t_pspare2[4];		/* 4 TBD */
@@ -602,6 +608,7 @@
 VNET_DECLARE(int, ss_fltsz_local);
 VNET_DECLARE(int, tcp_do_rfc3465);
 VNET_DECLARE(int, tcp_abc_l_var);
+VNET_DECLARE(int, tcp_do_rfc5682);
 #define	V_tcb			VNET(tcb)
 #define	V_tcbinfo		VNET(tcbinfo)
 #define	V_tcpstat		VNET(tcpstat)
@@ -614,6 +621,7 @@
 #define	V_ss_fltsz_local	VNET(ss_fltsz_local)
 #define	V_tcp_do_rfc3465	VNET(tcp_do_rfc3465)
 #define	V_tcp_abc_l_var		VNET(tcp_abc_l_var)
+#define	V_tcp_do_rfc5682	VNET(tcp_do_rfc5682)
 
 VNET_DECLARE(int, tcp_do_sack);			/* SACK enabled/disabled */
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);	/* RST on sock alloc failure */

--XsQoSWH+UP9D9v3l--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20110205083620.GA60200>