Date: Thu, 11 Feb 2010 07:00:21 +0000 (UTC)
From: Lawrence Stewart <lstewart@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject: svn commit: r203771 - projects/tcp_cc_head/sys/netinet
Message-ID: <201002110700.o1B70Lrw099941@svn.freebsd.org>
index | next in thread | raw e-mail
Author: lstewart Date: Thu Feb 11 07:00:21 2010 New Revision: 203771 URL: http://svn.freebsd.org/changeset/base/203771 Log: - Import David's integrated ertt helper. - Add a version number to helpers so they can be depended on in a sensible way. Added: projects/tcp_cc_head/sys/netinet/h_ertt.h (contents, props changed) Modified: projects/tcp_cc_head/sys/netinet/h_ertt.c projects/tcp_cc_head/sys/netinet/helper_module.h projects/tcp_cc_head/sys/netinet/tcp_var.h Modified: projects/tcp_cc_head/sys/netinet/h_ertt.c ============================================================================== --- projects/tcp_cc_head/sys/netinet/h_ertt.c Thu Feb 11 06:42:08 2010 (r203770) +++ projects/tcp_cc_head/sys/netinet/h_ertt.c Thu Feb 11 07:00:21 2010 (r203771) @@ -43,25 +43,53 @@ __FBSDID("$FreeBSD$"); #include <net/if.h> #include <net/pfil.h> +#include <net/vnet.h> +#include <netinet/h_ertt.h> #include <netinet/helper.h> #include <netinet/helper_module.h> #include <netinet/hhooks.h> #include <netinet/in.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> +#include <netinet/tcp_seq.h> #include <netinet/tcp_var.h> -void ertt_tcpest_hook(void *udata, void *ctx_data, void *dblock); +#include <vm/uma.h> + +static VNET_DEFINE(uma_zone_t, txseginfo_zone); +#define V_txseginfo_zone VNET(txseginfo_zone) + +#define DLYACK_SMOOTH 5 /* smoothing factor for delayed ack guess */ +#define MAX_TS_ERR 10 /* maximum number of time stamp errors allowed in a session */ + +void ertt_packet_measurement_hook(void *udata, void *ctx_data, void *dblock); +void ertt_add_tx_segment_info_hook(void *udata, void *ctx_data, void *dblock); int ertt_mod_init(void); int ertt_mod_destroy(void); int ertt_uma_ctor(void *mem, int size, void *arg, int flags); void ertt_uma_dtor(void *mem, int size, void *arg); -struct ertt { - int test; +/* Structure contains the information about the + sent segment, for comparison with the corresponding ack */ +struct txseginfo { + TAILQ_ENTRY(txseginfo) txsegi_lnk; + /* 
segment sequence number */ + tcp_seq seq; + long len; + /* time stamp indicating when the packet was sent */ + u_int32_t tx_ts; + /* Last received receiver ts (if the tcp option is used). */ + u_int32_t rx_ts; + /* flags for operation */ + u_int flags; }; +/* txseginfo flags */ +#define TXSI_TSO 0x01 /* TSO was used for this entry */ +#define TXSI_RTT_MEASURE_START 0x02 /* a rate measure starts here based on this txsi's rtt */ +#define TXSI_RX_MEASURE_END 0x04 /* measure the received rate until this txsi */ + struct helper ertt_helper = { .mod_init = ertt_mod_init, .mod_destroy = ertt_mod_destroy, @@ -69,37 +97,267 @@ struct helper ertt_helper = { .class = HELPER_CLASS_TCP }; +#define MULTI_ACK 1 + static void inline +marked_packet_rtt(struct txseginfo *txsi, struct ertt *e_t, struct tcpcb *tp, struct tcphdr *th, + u_int32_t *pmeasurenext, int multiack) +{ + /* if we can't measure this one properly due to delayed acking */ + /* adjust byte counters and flag to measure next txsi. */ + /* Note that since the marked packet's tx and rx bytes are measured */ + /* we need to subtract the tx, and not add the rx. 
*/ + /* Then pretend the next txsi was marked */ + if (multiack && e_t->dlyack_rx && !*pmeasurenext) { + *pmeasurenext=txsi->tx_ts; + } else { + if (*pmeasurenext) + e_t->markedpkt_rtt = ticks - *pmeasurenext + 1; + else + e_t->markedpkt_rtt = ticks - txsi->tx_ts + 1; + e_t->bytes_tx_in_marked_rtt=e_t->bytes_tx_in_rtt; + e_t->marked_snd_cwnd=tp->snd_cwnd; + + /* set flags */ + e_t->flags &= ~ERTT_MEASUREMENT_IN_PROGRESS; /* Not measuring - indicates to add_tx_segment_info + a new measurment needs to be started */ + e_t->flags |= ERTT_NEW_MEASUREMENT; /* indicates to the CC that a new marked RTT measurement has been taken */ + if (tp->t_flags & TF_TSO) { + tp->t_flags &= ~TF_TSO; /* temporarily disable TSO to aid in a new measurment */ + e_t->flags |= ERTT_TSO_DISABLED; /* note that I've done it so I can renable it later */ + } + } +} -void -ertt_tcpest_hook(void *udata, void *ctx_data, void *dblock) + + +/* packet_measurements use state kept on each packet sent to more accurately and more + * securely measure the round trip time. The resulting measurement is used for + * congestion control algorithms which require a more accurate time. 
+*/ + void +ertt_packet_measurement_hook(void *udata, void *ctx_data, void *dblock) { //struct ertt *e = (struct ertt *)(((struct tcpcb *)inp->inp_ppcb)->helper_data[0]); - //struct ertt *e = (struct ertt *)dblock; - //printf("In the hook with errt->test: %d, ctx_data: %p, curack = %u\n", - //e->test, ctx_data, ((struct tcp_hhook_data *)ctx_data)->curack); + struct tcpcb *tp = ((struct tcp_hhook_data *)ctx_data)->tp; + struct tcphdr *th = ((struct tcp_hhook_data *)ctx_data)->th; + struct tcpopt *to = ((struct tcp_hhook_data *)ctx_data)->to; + int new_sacked_bytes = ((struct tcp_hhook_data *)ctx_data)->new_sacked_bytes; + struct ertt *e_t = (struct ertt *)dblock; + + printf("In the hook with e_t->rtt: %d, ctx_data: %p, curack = %u\n", + e_t->rtt, ctx_data, th->th_ack); + + struct txseginfo *txsi; + u_int32_t rts=0; + u_int32_t measurenext=0; + tcp_seq ack; + int multiack=0; + + + INP_WLOCK_ASSERT(tp->t_inpcb); + int acked = th->th_ack - tp->snd_una; + /* Packet has provided new acknowledgements */ + if (acked > 0 || new_sacked_bytes) { + if (acked == 0 && new_sacked_bytes) { + /* no delayed acks at the moment, + use packets being acknowledged with sack instead of th_ack*/ + ack = tp->sackhint.last_sack_ack; + } else + ack = th->th_ack; + + + txsi = TAILQ_FIRST(&e_t->txsegi_q); + while(txsi != NULL) { + rts = 0; + + + if (SEQ_GT(ack, txsi->seq+txsi->len)) { /* acking more than this txsi */ + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, MULTI_ACK); + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(V_txseginfo_zone, txsi); + txsi = TAILQ_FIRST(&e_t->txsegi_q); + continue; + } + + + /* Guess if delayed acks are being used by the receiver */ + if (!new_sacked_bytes) { + if (acked > tp->t_maxseg) { + e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH) ? 1 : 0; + multiack=1; + } else if (acked > txsi->len) { + multiack=1; + e_t->dlyack_rx += (e_t->dlyack_rx < DLYACK_SMOOTH) ? 
1 : 0; + } else if (acked == tp->t_maxseg || acked == txsi->len) + e_t->dlyack_rx -= (e_t->dlyack_rx > 0) ? 1 : 0; + /* otherwise leave dlyack_rx the way it was */ + } + + /* Time stamps are only used to help identify packets */ + if (e_t->timestamp_errors < MAX_TS_ERR && + (to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + /* Note: All packets sent with the offload will have the same time stamp. + If we are sending on a fast interface, and the t_maxseg is much + smaller than one tick, this will be fine. The time stamp would be + the same whether we were using tso or not. However, if the interface + is slow, this will cause problems with the calculations. If the interface + is slow, there is not reason to be using tso, and it should be turned off. */ + /* If there are too many time stamp errors, time stamps won't be trusted */ + rts = to->to_tsecr; + if (!e_t->dlyack_rx && TSTMP_LT(rts,txsi->tx_ts)) /*before this packet */ + /* When delayed acking is used, the reflected time stamp + is of the first packet, and thus may be before txsi->tx_ts*/ + break; + if (TSTMP_GT(rts,txsi->tx_ts)) { + /* if reflected time stamp is later than tx_tsi, then this txsi is old */ + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(V_txseginfo_zone, txsi); + txsi = TAILQ_FIRST(&e_t->txsegi_q); + continue; + } + if (rts == txsi->tx_ts && TSTMP_LT(to->to_tsval,txsi->rx_ts)) { + /* rx before sent!!! 
something wrong with rx timestamping + process without timestamps */ + e_t->timestamp_errors++; + } + } + + /* old txsi that may have had the same seq numbers (rtx) should have been + removed if time stamps are being used */ + if (SEQ_LEQ(ack,txsi->seq)) + break; /* before first packet in txsi */ + + /* only ack > txsi->seq and ack <= txsi->seq+txsi->len past this point */ + + + if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { + e_t->rtt = ticks - txsi->tx_ts + 1; /* new measurement */ + if (e_t->rtt < e_t->minrtt || e_t->minrtt==0) + e_t->minrtt=e_t->rtt; + } + + if (txsi->flags & TXSI_RTT_MEASURE_START || measurenext) + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + + if (txsi->flags & TXSI_TSO) { + txsi->len -= acked; + if (txsi->len > 0) { + /* this presumes ack for first bytes in txsi, + this may not be true but it shouldn't + cause problems for the timing */ + txsi->seq += acked; + txsi->flags &= ~TXSI_RTT_MEASURE_START; /* reset measure flag */ + break; /* still more data to be acked with this tso transmission */ + } + } + TAILQ_REMOVE(&e_t->txsegi_q, txsi, txsegi_lnk); + uma_zfree(V_txseginfo_zone, txsi); + break; + } /* end while */ + if (measurenext) /* need to do a tx rate measurement, won't be the best if I'm doing it here */ + marked_packet_rtt(txsi, e_t, tp, th, &measurenext, 0); + } } +/* add transmitted segment info to the list */ +void +ertt_add_tx_segment_info_hook(void *udata, void *ctx_data, void *dblock) +{ + struct tcpcb *tp = ((struct tcp_hhook_data *)ctx_data)->tp; + struct tcphdr *th = ((struct tcp_hhook_data *)ctx_data)->th; + struct tcpopt *to = ((struct tcp_hhook_data *)ctx_data)->to; + long len = ((struct tcp_hhook_data *)ctx_data)->len; + int tso = ((struct tcp_hhook_data *)ctx_data)->tso; + + + INP_WLOCK_ASSERT(tp->t_inpcb); + + + if(len > 0) { + struct txseginfo *txsi; + txsi = (struct txseginfo *) uma_zalloc(V_txseginfo_zone, M_NOWAIT); + if (txsi != NULL) { + struct ertt *e_t= (struct ertt *)dblock; + txsi->flags=0; 
/* needs to be initialised */ + txsi->seq = ntohl(th->th_seq); + txsi->len = len; + if (tso) + txsi->flags |= TXSI_TSO; + else + if (e_t->flags & ERTT_TSO_DISABLED) { + tp->t_flags |= TF_TSO; + e_t->flags &= ~ERTT_TSO_DISABLED; + } + if (e_t->flags & ERTT_MEASUREMENT_IN_PROGRESS) { + e_t->bytes_tx_in_rtt += len; + } else { + txsi->flags |= TXSI_RTT_MEASURE_START; + e_t->flags |= ERTT_MEASUREMENT_IN_PROGRESS; + e_t->bytes_tx_in_rtt = len; + } + if (((tp->t_flags & TF_NOOPT) == 0) && (to->to_flags & TOF_TS)) { + txsi->tx_ts = ntohl(to->to_tsval) - tp->ts_offset; + txsi->rx_ts = ntohl(to->to_tsecr); + } else { + txsi->tx_ts = ticks; + txsi->rx_ts = 0; /* no received time stamp */ + } + TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); + } + printf("** A %u %ld %d\n", ntohl(th->th_seq), len, tso); + } +} int ertt_mod_init(void) { - return register_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED, - &ertt_helper, &ertt_tcpest_hook, NULL, HHOOK_WAITOK); + int ret; + + V_txseginfo_zone = uma_zcreate("txseginfo", sizeof(struct txseginfo), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + + ret = register_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN, + &ertt_helper, &ertt_packet_measurement_hook, NULL, HHOOK_WAITOK); + if (ret) + return (ret); + + return register_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT, + &ertt_helper, & ertt_add_tx_segment_info_hook, NULL, HHOOK_WAITOK); } int ertt_mod_destroy(void) { - return deregister_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED, - &ertt_tcpest_hook, NULL, 0); + int ret; + ret = deregister_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_IN, + &ertt_packet_measurement_hook, NULL, 0); + ret += deregister_hhook(HHOOK_TYPE_TCP, HHOOK_TCP_ESTABLISHED_OUT, + &ertt_add_tx_segment_info_hook, NULL, 0); + + uma_zdestroy(V_txseginfo_zone); + + return (ret); } int ertt_uma_ctor(void *mem, int size, void *arg, int flags) { - printf("Creating ertt block %p\n", mem); + struct ertt *e_t = (struct ertt *)mem; + + 
TAILQ_INIT(&e_t->txsegi_q); + e_t->timestamp_errors=0; + e_t->minrtt = 0; + e_t->maxrtt = 0; + e_t->rtt = 0; + e_t->flags=0; + e_t->dlyack_rx = 0; + e_t->bytes_tx_in_rtt = 0; + e_t->markedpkt_rtt = 0; - ((struct ertt *)mem)->test = 5; + printf("Creating ertt block %p\n", mem); return (0); } @@ -107,7 +365,17 @@ ertt_uma_ctor(void *mem, int size, void void ertt_uma_dtor(void *mem, int size, void *arg) { + struct ertt *e_t = (struct ertt *)mem; + struct txseginfo *txsi, *n_txsi; + + txsi = TAILQ_FIRST(&e_t->txsegi_q); + while (txsi != NULL) { + n_txsi = TAILQ_NEXT(txsi, txsegi_lnk); + uma_zfree(V_txseginfo_zone, txsi); + txsi = n_txsi; + } + printf("Destroying ertt block %p\n", mem); } -DECLARE_HELPER_UMA(ertt, &ertt_helper, sizeof(struct ertt), ertt_uma_ctor, ertt_uma_dtor); +DECLARE_HELPER_UMA(ertt, &ertt_helper, 1, sizeof(struct ertt), ertt_uma_ctor, ertt_uma_dtor); Added: projects/tcp_cc_head/sys/netinet/h_ertt.h ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ projects/tcp_cc_head/sys/netinet/h_ertt.h Thu Feb 11 07:00:21 2010 (r203771) @@ -0,0 +1,65 @@ +/*- + * Copyright (c) 2009-2010 + * Swinburne University of Technology, Melbourne, Australia + * Copyright (c) 2010 Lawrence Stewart <lstewart@freebsd.org> + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University, by David Hayes and Lawrence Stewart, + * made possible in part by a grant from the FreeBSD Foundation and + * Cisco University Research Program Fund at Community Foundation + * Silicon Valley. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _NETINET_H_ERTT_ +#define _NETINET_H_ERTT_ + +/* Structure contains the information about the + sent segment, for comparison with the corresponding ack */ +struct txseginfo; + +/* Structure used as the ertt data block. 
*/ +struct ertt { + /* information about transmitted segments to aid in + RTT calculation for delay/rate based CC */ + TAILQ_HEAD(txseginfo_head, txseginfo) txsegi_q; + int rtt; /* per packet measured round trip time */ + int maxrtt; /* maximum seen rtt */ + int minrtt; /* minimum seen rtt */ + int dlyack_rx; /* guess if the receiver is using delayed acknowledgements.*/ + int timestamp_errors; /* for keeping track of inconsistencies in packet timestamps */ + int markedpkt_rtt; /* rtt for a marked packet */ + long bytes_tx_in_rtt; /* bytes tx so far in marked rtt */ + long bytes_tx_in_marked_rtt;/* final version of above */ + u_long marked_snd_cwnd; /* cwnd for marked rtt */ + int flags; /* flags*/ +}; + +#define ERTT_NEW_MEASUREMENT 0x01 /* new measurement */ +#define ERTT_MEASUREMENT_IN_PROGRESS 0x02 /* measuring marked RTT */ +#define ERTT_TSO_DISABLED 0x04 /* indicates TSO has been temporarily disabled */ + +#endif /* _NETINET_H_ERTT_ */ Modified: projects/tcp_cc_head/sys/netinet/helper_module.h ============================================================================== --- projects/tcp_cc_head/sys/netinet/helper_module.h Thu Feb 11 06:42:08 2010 (r203770) +++ projects/tcp_cc_head/sys/netinet/helper_module.h Thu Feb 11 07:00:21 2010 (r203771) @@ -41,7 +41,7 @@ struct helper_modevent_data { uma_dtor umadtor; }; -#define DECLARE_HELPER(hname, hdata) \ +#define DECLARE_HELPER(hname, hdata, version) \ static struct helper_modevent_data hmd_##hname = { \ .name = #hname, \ .helper = hdata \ @@ -52,9 +52,10 @@ struct helper_modevent_data { .priv = &hmd_##hname \ }; \ DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN, \ - SI_ORDER_ANY) + SI_ORDER_ANY); \ + MODULE_VERSION(hname, version); -#define DECLARE_HELPER_UMA(hname, hdata, size, ctor, dtor) \ +#define DECLARE_HELPER_UMA(hname, hdata, version, size, ctor, dtor) \ static struct helper_modevent_data hmd_##hname = { \ .name = #hname, \ .helper = hdata, \ @@ -68,7 +69,8 @@ struct helper_modevent_data { 
.priv = &hmd_##hname \ }; \ DECLARE_MODULE(hname, h_##hname, SI_SUB_PROTO_IFATTACHDOMAIN, \ - SI_ORDER_ANY) + SI_ORDER_ANY); \ + MODULE_VERSION(hname, version); int helper_modevent(module_t mod, int type, void *data); Modified: projects/tcp_cc_head/sys/netinet/tcp_var.h ============================================================================== --- projects/tcp_cc_head/sys/netinet/tcp_var.h Thu Feb 11 06:42:08 2010 (r203770) +++ projects/tcp_cc_head/sys/netinet/tcp_var.h Thu Feb 11 07:00:21 2010 (r203771) @@ -76,7 +76,7 @@ struct sackhole { struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; - + tcp_seq last_sack_ack; /* Last sack block acked with current pkt - used for enhanced RTT calculations*/ int ispare; /* explicit pad for 64bit alignment */ uint64_t _pad[2]; /* 1 sacked_bytes, 1 TBD */ }; @@ -247,12 +247,19 @@ struct tcpcb { #define BYTES_ACKED(tp, th) (th->th_ack - tp->snd_una) /* - * TCP specific helper hook point identifiers + * TCP specific helper hook point identifiers. */ -#define HHOOK_TCP_ESTABLISHED 1 +#define HHOOK_TCP_ESTABLISHED_IN 1 +#define HHOOK_TCP_ESTABLISHED_OUT 2 struct tcp_hhook_data { - tcp_seq curack; + struct tcpcb *tp; + struct tcphdr *th; + struct tcpopt *to; + long len; + int tso; + tcp_seq curack; + int new_sacked_bytes; }; /*help
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201002110700.o1B70Lrw099941>
