Date: Wed, 27 Aug 2014 07:23:18 +0000 From: "Eggert, Lars" <lars@netapp.com> To: Adrian Chadd <adrian@freebsd.org> Cc: Tom Jones <jones@sdf.org>, FreeBSD Net <freebsd-net@freebsd.org> Subject: Re: Patches for RFC6937 and draft-ietf-tcpm-newcwv-00 Message-ID: <76D986F7-72A8-4ABE-8731-064C6C77A56F@netapp.com> In-Reply-To: <CAJ-Vmo=TsqAKUrV3BRAk1bX9E1zKq7j5og5CHv4PEz-9sqXpAA@mail.gmail.com> References: <259C9434-C6FE-42EA-823D-ECB024DBF3D7@netapp.com> <B7145157-9A03-4053-BFCC-627633E20122@neville-neil.com> <814E0886-1B6B-4316-8BAB-684DAFDE1983@netapp.com> <20140826145517.GD12732@gmail.com> <CAJ-Vmo=TsqAKUrV3BRAk1bX9E1zKq7j5og5CHv4PEz-9sqXpAA@mail.gmail.com>
next in thread | previous in thread | raw e-mail | index | archive | help
--Apple-Mail=_3746A6C5-7BF1-4796-97F0-9AC400CD96D2
Content-Type: multipart/mixed;
boundary="Apple-Mail=_B62A7D06-AE4C-4A19-904D-CCD75DCF6A4A"
--Apple-Mail=_B62A7D06-AE4C-4A19-904D-CCD75DCF6A4A
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
charset=us-ascii
It would be great if people could also review Aris' PRR patch - RFC6937 =
has been out for a while.
Lars
--Apple-Mail=_B62A7D06-AE4C-4A19-904D-CCD75DCF6A4A
Content-Disposition: attachment;
filename=prr.patch
Content-Type: application/octet-stream;
name="prr.patch"
Content-Transfer-Encoding: 7bit
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 75609fd..70c29a8 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -145,6 +145,18 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
&VNET_NAME(drop_synfin), 0,
"Drop TCP packets with SYN+FIN set");
+VNET_DEFINE(int, tcp_do_prr_conservative) = 0;
+#define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_prr_conservative), 0,
+ "Do conservative PRR");
+
+VNET_DEFINE(int, tcp_do_prr) = 0;
+#define V_tcp_do_prr VNET(tcp_do_prr)
+SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_RW,
+ &VNET_NAME(tcp_do_prr), 0,
+ "Do the Proportional Rate Reduction Algorithm");
+
VNET_DEFINE(int, tcp_do_rfc3042) = 1;
#define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
@@ -229,6 +241,7 @@ static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+static void tcp_prr_partial_ack(struct tcpcb *, struct tcphdr *);
static void inline tcp_fields_to_host(struct tcphdr *);
#ifdef TCP_SIGNATURE
static void inline tcp_fields_to_net(struct tcphdr *);
@@ -2460,7 +2473,50 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
else if (++tp->t_dupacks > tcprexmtthresh ||
IN_FASTRECOVERY(tp->t_flags)) {
cc_ack_received(tp, th, CC_DUPACK);
- if ((tp->t_flags & TF_SACK_PERMIT) &&
+ if (V_tcp_do_prr &&
+ IN_FASTRECOVERY(tp->t_flags) &&
+ (tp->t_flags & TF_SACK_PERMIT)) {
+ long snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
+ /*
+ *In a duplicate ACK del_data is only the
+ *diff_in_sack. If no SACK is used del_data will be 0.
+ *Pipe is the amount of data we estimate to be
+ *in the network.
+ */
+ del_data = tp->diff_in_sack;
+ pipe = (tp->snd_nxt - tp->snd_fack) +
+ tp->sackhint.sack_bytes_rexmit;
+ tp->prr_delivered += del_data;
+ if (pipe > tp->snd_ssthresh)
+ snd_cnt = (tp->prr_delivered * tp->snd_ssthresh /
+ tp->recover_fs) + 1 - tp->prr_out;
+ else {
+ if (V_tcp_do_prr_conservative)
+ limit = tp->prr_delivered - tp->prr_out;
+ else
+ if ((tp->prr_delivered - tp->prr_out) > del_data)
+ limit = tp->prr_delivered - tp->prr_out +
+ tp->t_maxseg;
+ else
+ limit = del_data + tp->t_maxseg;
+ if ((tp->snd_ssthresh - pipe) < limit)
+ snd_cnt = tp->snd_ssthresh - pipe;
+ else
+ snd_cnt = limit;
+ }
+ snd_cnt = (snd_cnt / tp->t_maxseg);
+ if (snd_cnt < 0)
+ snd_cnt = 0;
+ /*
+ * Send snd_cnt new data into the network in
+ * response to this ack.If there is gonna be a
+ * SACK retransmission, adjust snd_cwnd
+ * accordingly.
+ */
+ tp->snd_cwnd = tp->snd_nxt - tp->sack_newdata +
+ tp->sackhint.sack_bytes_rexmit + (snd_cnt*tp->t_maxseg);
+ }
+ else if ((tp->t_flags & TF_SACK_PERMIT) &&
IN_FASTRECOVERY(tp->t_flags)) {
int awnd;
@@ -2495,12 +2551,18 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_seq onxt = tp->snd_nxt;
/*
- * If we're doing sack, check to
- * see if we're already in sack
+ * If we're doing sack or prr, check to
+ * see if we're already in
* recovery. If we're not doing sack,
* check to see if we're in newreno
* recovery.
*/
+ if (V_tcp_do_prr) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ tp->t_dupacks = 0;
+ break;
+ }
+ }
if (tp->t_flags & TF_SACK_PERMIT) {
if (IN_FASTRECOVERY(tp->t_flags)) {
tp->t_dupacks = 0;
@@ -2518,6 +2580,15 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
cc_ack_received(tp, th, CC_DUPACK);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
+ if (V_tcp_do_prr) {
+ /*
+ * snd_ssthresh is already updated by cc_cong_signal.
+ */
+ tp->prr_delivered = 0;
+ tp->prr_out = 0;
+ if(!(tp->recover_fs = tp->snd_nxt - tp->snd_una))
+ tp->recover_fs = 1;
+ }
if (tp->t_flags & TF_SACK_PERMIT) {
TCPSTAT_INC(
tcps_sack_recovery_episode);
@@ -2614,7 +2685,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (IN_FASTRECOVERY(tp->t_flags)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- if (tp->t_flags & TF_SACK_PERMIT)
+ if (V_tcp_do_prr && (tp->t_flags & TF_SACK_PERMIT))
+ tcp_prr_partial_ack(tp, th);
+ else if (tp->t_flags & TF_SACK_PERMIT)
tcp_sack_partialack(tp, th);
else
tcp_newreno_partial_ack(tp, th);
@@ -3692,6 +3765,57 @@ tcp_mssopt(struct in_conninfo *inc)
return (mss);
}
+static void
+tcp_prr_partial_ack(struct tcpcb *tp, struct tcphdr *th)
+{
+ long snd_cnt = 0, limit = 0, del_data = 0, pipe = 0;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tcp_timer_activate(tp, TT_REXMT, 0);
+ tp->t_rtttime = 0;
+ /*
+ * Compute amount of data that this ACK is indicating (del_data)
+ * and an estimate of how many bytes are in the network.
+ */
+ if (SEQ_GEQ(th->th_ack,tp->snd_una))
+ del_data = BYTES_THIS_ACK(tp, th);
+ del_data += tp->diff_in_sack;
+ pipe = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
+ tp->prr_delivered += del_data;
+ /*
+ * Proportional Rate Reduction
+ */
+ if (pipe > tp->snd_ssthresh)
+ snd_cnt = (tp->prr_delivered * tp->snd_ssthresh / tp->recover_fs) -
+ tp->prr_out;
+ else {
+ if (V_tcp_do_prr_conservative)
+ limit = tp->prr_delivered - tp->prr_out;
+ else
+ if ((tp->prr_delivered - tp->prr_out) > del_data)
+ limit = tp->prr_delivered - tp->prr_out + tp->t_maxseg;
+ else
+ limit = del_data + tp->t_maxseg;
+ if ((tp->snd_ssthresh - pipe) < limit)
+ snd_cnt = tp->snd_ssthresh - pipe;
+ else
+ snd_cnt = limit;
+ }
+ snd_cnt = (snd_cnt / tp->t_maxseg);
+ if (snd_cnt < 0)
+ snd_cnt = 0;
+ /*
+ * Send snd_cnt new data into the network
+ * in response to this ack.
+ * If there is gonna be a SACK retransmission,
+ * adjust snd_cwnd accordingly.
+ */
+ tp->snd_cwnd = tp->snd_nxt - tp->sack_newdata +
+ tp->sackhint.sack_bytes_rexmit + (snd_cnt * tp->t_maxseg);
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+}
/*
* On a partial ack arrives, force the retransmission of the
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 00d5415..7b4936d 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1194,6 +1194,8 @@ send:
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0),
NULL, NULL, tp->t_inpcb);
+ if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+ tp->prr_out += len;
if (error == EMSGSIZE && ro.ro_rt != NULL)
mtu = ro.ro_rt->rt_rmx.rmx_mtu;
RO_RTFREE(&ro);
@@ -1232,6 +1234,8 @@ send:
((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
tp->t_inpcb);
+ if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+ tp->prr_out += len;
if (error == EMSGSIZE && ro.ro_rt != NULL)
mtu = ro.ro_rt->rt_rmx.rmx_mtu;
RO_RTFREE(&ro);
@@ -1323,6 +1327,8 @@ timer:
* XXX: It is a POLA question whether calling tcp_drop right
* away would be the really correct behavior instead.
*/
+ if (V_tcp_do_prr && IN_FASTRECOVERY(tp->t_flags))
+ tp->prr_out -= len;
if (((tp->t_flags & TF_FORCEDATA) == 0 ||
!tcp_timer_active(tp, TT_PERSIST)) &&
((flags & TH_SYN) == 0) &&
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 440bd64..800df2f 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -348,9 +348,10 @@ tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
void
tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
{
- struct sackhole *cur, *temp;
+ struct sackhole *cur, *temp, *temp1;
struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
int i, j, num_sack_blks;
+ tcp_seq old = 0, new = 0;
INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -382,13 +383,25 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
sack_blocks[num_sack_blks++] = sack;
}
}
+ if (TAILQ_EMPTY(&tp->snd_holes))
+ /*
+ * Empty scoreboard. Need to initialize snd_fack (it may be
+ * uninitialized or have a bogus value). Scoreboard holes
+ * (from the sack blocks received) are created later below
+ * (in the logic that adds holes to the tail of the
+ * scoreboard).
+ */
+ tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
/*
* Return if SND.UNA is not advanced and no valid SACK block is
- * received.
+ * received.If no new valid SACK block the scoreboard remains
+ * the same, i.e. the difference is 0.
*/
- if (num_sack_blks == 0)
+ if (num_sack_blks == 0){
+ if (V_tcp_do_prr)
+ tp->diff_in_sack = 0;
return;
-
+ }
/*
* Sort the SACK blocks so we can update the scoreboard with just one
* pass. The overhead of sorting upto 4+1 elements is less than
@@ -403,15 +416,14 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
}
}
}
- if (TAILQ_EMPTY(&tp->snd_holes))
- /*
- * Empty scoreboard. Need to initialize snd_fack (it may be
- * uninitialized or have a bogus value). Scoreboard holes
- * (from the sack blocks received) are created later below
- * (in the logic that adds holes to the tail of the
- * scoreboard).
- */
- tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
+ if (V_tcp_do_prr)
+ if(!TAILQ_EMPTY(&tp->snd_holes))
+ TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
+ if ((temp1 = TAILQ_NEXT(temp, scblink)) != NULL)
+ old += temp1->start - temp->end;
+ else if (SEQ_GT(tp->snd_fack, temp->end))
+ old += tp->snd_fack - temp->end;
+ }
/*
* In the while-loop below, incoming SACK blocks (sack_blocks[]) and
* SACK holes (snd_holes) are traversed from their tails with just
@@ -540,6 +552,19 @@ tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
else
sblkp--;
}
+ /*
+ * Calculate number of bytes in the scoreboard.
+ */
+ if (V_tcp_do_prr)
+ if (!TAILQ_EMPTY(&tp->snd_holes))
+ TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
+ if ((temp1 = TAILQ_NEXT(temp, scblink)) != NULL)
+ new += temp1->start - temp->end;
+ else if (SEQ_GT(tp->snd_fack, temp->end))
+ new += tp->snd_fack - temp->end;
+ }
+ /* Change in the scoreboard in # of bytes */
+ tp->diff_in_sack = new - old;
}
/*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 5d37b50..089d8c6 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -801,6 +801,7 @@ tcp_newtcpcb(struct inpcb *inp)
tp->t_rxtcur = TCPTV_RTOBASE;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+ tp->diff_in_sack = 0;
tp->t_rcvtime = ticks;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index aaaa4a4..fe1507e 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -161,6 +161,11 @@ struct tcpcb {
u_long t_rttupdated; /* number of times rtt sampled */
u_long max_sndwnd; /* largest window peer has offered */
+ tcp_seq prr_delivered; /* Total bytes delivered during PRR recovery */
+ tcp_seq prr_out; /* Total bytes sent during PRR recovery */
+ tcp_seq recover_fs; /* FlightSize at the start of PRR recovery */
+ tcp_seq diff_in_sack; /* (Signed) Difference of data in scoreboard due to the current ACK */
+
int t_softerror; /* possible error not yet reported */
/* out-of-band data */
char t_oobflags; /* have some */
@@ -174,6 +179,7 @@ struct tcpcb {
u_int32_t ts_offset; /* our timestamp offset */
tcp_seq last_ack_sent;
+
/* experimental */
u_long snd_cwnd_prev; /* cwnd prior to retransmit */
u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */
@@ -627,8 +633,10 @@ VNET_DECLARE(int, tcp_abc_l_var);
#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */
+VNET_DECLARE(int, tcp_do_prr); /* PRR enabled/disabled */
VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */
#define V_tcp_do_sack VNET(tcp_do_sack)
+#define V_tcp_do_prr VNET(tcp_do_prr)
#define V_tcp_sc_rst_sock_fail VNET(tcp_sc_rst_sock_fail)
VNET_DECLARE(int, tcp_do_ecn); /* TCP ECN enabled/disabled */
--Apple-Mail=_B62A7D06-AE4C-4A19-904D-CCD75DCF6A4A
Content-Transfer-Encoding: quoted-printable
Content-Type: text/plain;
charset=us-ascii
On 2014-8-26, at 20:09, Adrian Chadd <adrian@freebsd.org> wrote:
> Hi!
>=20
> I'm going to merge Tom's work in a week unless someone gives me a
> really good reason not to.
>=20
> I think there's been enough work and discussion about it since the
> first post from Lars in Feburary and enough review opportunity.
>=20
>=20
> -a
>=20
>=20
> On 26 August 2014 07:55, Tom Jones <jones@sdf.org> wrote:
>> On Tue, Aug 26, 2014 at 02:43:49PM +0000, Eggert, Lars wrote:
>>> Hi,
>>>=20
>>> the newcwv patch is probably stale now with Tom Jones' recent patch =
based on
>>> a more up-to-date version of the Internet-Draft, but the PRR patch =
should
>>> still be useful?
>>=20
>> My newcwv patch is much more up to date than Aris's, but it is =
slightly
>> different in implementation. I have had a few suggestions from =
Adrian, but he
>> couldn't comment on how it relates to the tcp internals.
>>=20
>> There is a PR: =
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=3D191520
>>=20
>> The biggest difference in structure between mine and Aris's patch is =
the use of
>> tcp timers. It would be good to hear if my approach or Aris's is =
prefered.
>>=20
>>> On 2014-6-19, at 23:35, George Neville-Neil <gnn@neville-neil.com> =
wrote:
>>>=20
>>>> On 4 Feb 2014, at 1:38, Eggert, Lars wrote:
>>>>=20
>>>>> Hi,
>>>>>=20
>>>>> below are two patches that implement RFC6937 ("Proportional Rate =
Reduction for TCP") and draft-ietf-tcpm-newcwv-00 ("Updating TCP to =
support Rate-Limited Traffic"). They were done by Aris =
Angelogiannopoulos for his MS thesis, which is at =
https://eggert.org/students/angelogiannopoulos-thesis.pdf.
>>>>>=20
>>>>> The patches should apply to -CURRENT as of Sep 17, 2013. (Sorry =
for the delay in sending them, we'd been trying to get some feedback =
from committers first, without luck.)
>>>>>=20
>>>>> Please note that newcwv is still a work in progress in the IETF, =
and the patch has some limitations with regards to the "pipeACK Sampling =
Period" mentioned in the Internet-Draft. Aris says this in his thesis =
about what exactly he implemented:
>>>>>=20
>>>>> "The second implementation choice, is in regards with the =
measurement of pipeACK. This variable is the most important introduced =
by the method and is used to compute the phase that the sender currently =
lies in. In order to compute pipeACK the approach suggested by the =
Internet Draft (ID) is followed [ncwv]. During initialization, pipeACK =
is set to the maximum possible value. A helper variable prevHighACK is =
introduced that is initialized to the initial sequence number (iss). =
prevHighACK holds the value of the highest acknowledged byte so far. =
pipeACK is measured once per RTT meaning that when an ACK covering =
prevHighACK is received, pipeACK becomes the difference between the =
current ACK and prevHighACK. This is called a pipeACK sample. A newer =
version of the draft suggests that multiple pipeACK samples can be used =
during the pipeACK sampling period."
>>>>>=20
>>>>> Lars
>>>>>=20
>>>>>=20
>>>>> [prr.patch]
>>>>>=20
>>>>> [newcwv.patch]
>>>>=20
>>>> Apologies for not looking at this as yet. It is now closer to the =
top of my list.
>>>>=20
>>>> Best,
>>>> George
>>>=20
>>=20
>>=20
>>=20
>> --
>> Tom
>> @adventureloop
>> adventurist.me
>>=20
>> :wq
>> _______________________________________________
>> freebsd-net@freebsd.org mailing list
>> http://lists.freebsd.org/mailman/listinfo/freebsd-net
>> To unsubscribe, send any mail to =
"freebsd-net-unsubscribe@freebsd.org"
--Apple-Mail=_B62A7D06-AE4C-4A19-904D-CCD75DCF6A4A--
--Apple-Mail=_3746A6C5-7BF1-4796-97F0-9AC400CD96D2
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="signature.asc"
Content-Type: application/pgp-signature; name="signature.asc"
Content-Description: Message signed with OpenPGP using GPGMail
-----BEGIN PGP SIGNATURE-----
iQCVAwUBU/2HedZcnpRveo1xAQKWOQP8CAI7w0vJ9ttPJxLIGF9p77PsZgcoV8wU
qbmH4RfkDYGzdHhEXqYQkIBYOCsrDVCGHYLjVzmBMRGbY5+s3Jz/mGI8m2IXh/n/
rs7kkp1AORY5BsBCHjObfXWGicNSw8btpQSDkoTPfrk4zTHtWQESwFAB/ud1M/31
V+USeVu/pV0=
=eWB1
-----END PGP SIGNATURE-----
--Apple-Mail=_3746A6C5-7BF1-4796-97F0-9AC400CD96D2--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?76D986F7-72A8-4ABE-8731-064C6C77A56F>
