Date: Wed, 10 Jul 2019 20:40:40 +0000 (UTC) From: Randall Stewart <rrs@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r349893 - in head/sys: modules/tcp/rack netinet netinet/tcp_stacks sys Message-ID: <201907102040.x6AKeern006731@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: rrs Date: Wed Jul 10 20:40:39 2019 New Revision: 349893 URL: https://svnweb.freebsd.org/changeset/base/349893 Log: This commit updates rack to what is basically being used at NF as well as sets in some of the groundwork for committing BBR. The hpts system is updated as well as some other needed utilities for the entrance of BBR. This is actually part 1 of 3 more needed commits which will finally complete with BBRv1 being added as a new tcp stack. Sponsored by: Netflix Inc. Differential Revision: https://reviews.freebsd.org/D20834 Added: head/sys/netinet/tcp_stacks/rack_bbr_common.c (contents, props changed) Modified: head/sys/modules/tcp/rack/Makefile head/sys/netinet/in_pcb.h head/sys/netinet/tcp.h head/sys/netinet/tcp_hpts.c head/sys/netinet/tcp_hpts.h head/sys/netinet/tcp_log_buf.h head/sys/netinet/tcp_stacks/rack.c head/sys/netinet/tcp_stacks/rack_bbr_common.h head/sys/netinet/tcp_var.h head/sys/sys/mbuf.h Modified: head/sys/modules/tcp/rack/Makefile ============================================================================== --- head/sys/modules/tcp/rack/Makefile Wed Jul 10 19:57:48 2019 (r349892) +++ head/sys/modules/tcp/rack/Makefile Wed Jul 10 20:40:39 2019 (r349893) @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c +SRCS= rack.c sack_filter.c rack_bbr_common.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h Modified: head/sys/netinet/in_pcb.h ============================================================================== --- head/sys/netinet/in_pcb.h Wed Jul 10 19:57:48 2019 (r349892) +++ head/sys/netinet/in_pcb.h Wed Jul 10 20:40:39 2019 (r349893) @@ -759,7 +759,9 @@ int inp_so_options(const struct inpcb *inp); #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ Modified: head/sys/netinet/tcp.h ============================================================================== --- head/sys/netinet/tcp.h Wed Jul 10 19:57:48 2019 (r349892) +++ head/sys/netinet/tcp.h Wed Jul 10 20:40:39 2019 (r349893) @@ -201,9 +201,8 @@ struct tcphdr { #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ @@ -211,14 +210,18 @@ struct tcphdr { #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ +#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ +#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ +#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 @@ -227,17 +230,27 @@ struct tcphdr { #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ +#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ /* Start of reserved space for third-party user-settable options. */ Modified: head/sys/netinet/tcp_hpts.c ============================================================================== --- head/sys/netinet/tcp_hpts.c Wed Jul 10 19:57:48 2019 (r349892) +++ head/sys/netinet/tcp_hpts.c Wed Jul 10 20:40:39 2019 (r349893) @@ -37,7 +37,7 @@ __FBSDID("$FreeBSD$"); * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * - * First, and probably the main thing its used by Rack and BBR for, it can + * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The @@ -59,42 +59,57 @@ __FBSDID("$FreeBSD$"); * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. - * - * Now the tcp_hpts system will call tcp_output in one of two forms, - * it will first check to see if the stack as defined a - * tfb_tcp_output_wtime() function, if so that is the routine it - * will call, if that function is not defined then it will call the - * tfb_tcp_output() function. The only difference between these - * two calls is that the former passes the time in to the function - * so the function does not have to access the time (which tcp_hpts - * already has). What these functions do is of course totally up - * to the individual tcp stack. - * + * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort - * a connection (later) or process input on a connection. - * Why would you want to do this? To keep processor locality. + * a connection (later) or process input on a connection. + * Why would you want to do this? To keep processor locality + * and or not have to worry about untangling any recursive + * locks. The input function now is hooked to the new LRO + * system as well. * - * So in order to use the input redirection function the - * stack changes its tcp_do_segment() routine to instead - * of process the data call the function: + * In order to use the input redirection function the + * tcp stack must define an input function for + * tfb_do_queued_segments(). This function understands + * how to dequeue a array of packets that were input and + * knows how to call the correct processing routine. * - * tcp_queue_pkt_to_input() - * - * You will note that the arguments to this function look - * a lot like tcp_do_segments's arguments. This function - * will assure that the tcp_hpts system will - * call the functions tfb_tcp_hpts_do_segment() from the - * correct CPU. Note that multiple calls can get pushed - * into the tcp_hpts system this will be indicated by - * the next to last argument to tfb_tcp_hpts_do_segment() - * (nxt_pkt). If nxt_pkt is a 1 then another packet is - * coming. If nxt_pkt is a 0 then this is the last call - * that the tcp_hpts system has available for the tcp stack. + * Locking in this is important as well so most likely the + * stack will need to define the tfb_do_segment_nounlock() + * splitting tfb_do_segment() into two parts. The main processing + * part that does not unlock the INP and returns a value of 1 or 0. + * It returns 0 if all is well and the lock was not released. It + * returns 1 if we had to destroy the TCB (a reset received etc). + * The remains of tfb_do_segment() then become just a simple call + * to the tfb_do_segment_nounlock() function and check the return + * code and possibly unlock. * - * The other point of the input system is to be able to safely - * drop a tcp connection without worrying about the recursive - * locking that may be occuring on the INP_WLOCK. So if + * The stack must also set the flag on the INP that it supports this + * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recoginizes + * this flag as well and will queue packets when it is set. + * There are other flags as well INP_MBUF_QUEUE_READY and + * INP_DONT_SACK_QUEUE. The first flag tells the LRO code + * that we are in the pacer for output so there is no + * need to wake up the hpts system to get immediate + * input. The second tells the LRO code that its okay + * if a SACK arrives you can still defer input and let + * the current hpts timer run (this is usually set when + * a rack timer is up so we know SACK's are happening + * on the connection already and don't want to wakeup yet). + * + * There is a common functions within the rack_bbr_common code + * version i.e. ctf_do_queued_segments(). This function + * knows how to take the input queue of packets from + * tp->t_in_pkts and process them digging out + * all the arguments, calling any bpf tap and + * calling into tfb_do_segment_nounlock(). The common + * function (ctf_do_queued_segments()) requires that + * you have defined the tfb_do_segment_nounlock() as + * described above. + * + * The second feature of the input side of hpts is the + * dropping of a connection. This is due to the way that + * locking may have occured on the INP_WLOCK. So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) @@ -156,6 +171,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcpip.h> #include <netinet/cc/cc.h> #include <netinet/tcp_hpts.h> +#include <netinet/tcp_log_buf.h> #ifdef tcpdebug #include <netinet/tcp_debug.h> @@ -168,24 +184,19 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -#include <net/netisr.h> -#include <net/rss_config.h> static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 2; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); -static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; - -TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); - static struct tcp_hptsi tcp_pace; +static int hpts_does_tp_logging = 0; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); +static void tcp_hptsi(struct tcp_hpts_entry *hpts); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); @@ -204,8 +215,6 @@ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, } \ } while (0) -static int32_t logging_on = 0; -static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; struct hpts_domain_info { @@ -219,44 +228,75 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CT &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, - &logging_on, 0, - "Turn on logging if compiled in"); +counter_u64_t hpts_hopelessly_behind; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD, + &hpts_hopelessly_behind, + "Number of times hpts could not catch up and was behind hopelessly"); + counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); + counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); -static int32_t in_newts_every_tcb = 0; +counter_u64_t combined_wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, - &in_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for input"); -static int32_t in_ts_percision = 0; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, + &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, - &in_ts_percision, 0, - "Do we use percise timestamp for clients on input"); -static int32_t out_newts_every_tcb = 0; +counter_u64_t wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, - &out_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for output"); +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, + &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); + static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, + &hpts_does_tp_logging, 0, + "Do we add to any tp that has logging on pacer logs"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, +static int32_t max_pacer_loops = 10; +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW, + &max_pacer_loops, 10, + "What is the maximum number of times the pacer will loop trying to catch up"); + +#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2) + +static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED; + + +static int +sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = hpts_sleep_max; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if ((new < (NUM_OF_HPTSI_SLOTS / 4)) || + (new > HPTS_MAX_SLEEP_ALLOWED)) + error = EINVAL; + else + hpts_sleep_max = new; + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, + CTLTYPE_UINT | CTLFLAG_RW, &hpts_sleep_max, 0, - "The maximum time the hpts will sleep <1 - 254>"); + &sysctl_net_inet_tcp_hpts_max_sleep, "IU", + "Maximum time hpts will sleep"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, @@ -267,55 +307,35 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTL "Do we have the callout call directly to the hpts?"); static void -__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, - uint32_t ticknow, int32_t line) +tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, + int ticks_to_run, int idx) { - struct hpts_log *pl; - - HPTS_MTX_ASSERT(hpts); - if (hpts->p_log == NULL) - return; - pl = &hpts->p_log[hpts->p_log_at]; - hpts->p_log_at++; - if (hpts->p_log_at >= hpts->p_logsize) { - hpts->p_log_at = 0; - hpts->p_log_wrapped = 1; - } - pl->inp = inp; - if (inp) { - pl->t_paceslot = inp->inp_hptsslot; - pl->t_hptsreq = inp->inp_hpts_request; - pl->p_onhpts = inp->inp_in_hpts; - pl->p_oninput = inp->inp_in_input; - } else { - pl->t_paceslot = 0; - pl->t_hptsreq = 0; - pl->p_onhpts = 0; - pl->p_oninput = 0; - } - pl->is_notempty = 1; - pl->event = event; - pl->line = line; - pl->cts = tcp_get_usecs(NULL); - pl->p_curtick = hpts->p_curtick; - pl->p_prevtick = hpts->p_prevtick; - pl->p_on_queue_cnt = hpts->p_on_queue_cnt; - pl->ticknow = ticknow; - pl->slot_req = slot; - pl->p_nxt_slot = hpts->p_nxt_slot; - pl->p_cur_slot = hpts->p_cur_slot; - pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; - pl->p_flags = (hpts->p_cpu & 0x7f); - pl->p_flags <<= 7; - pl->p_flags |= (hpts->p_num & 0x7f); - pl->p_flags <<= 2; - if (hpts->p_hpts_active) { - pl->p_flags |= HPTS_HPTS_ACTIVE; - } + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.use_lt_bw = 1; + log.u_bbr.inflight = ticks_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.cur_del_rate = hpts->p_runningtick; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); } -#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) - static void hpts_timeout_swi(void *arg) { @@ -347,12 +367,6 @@ hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, str /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } - if (TAILQ_EMPTY(head) && - (hpts->p_on_queue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p hpts bucket empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_queue_cnt); - } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; @@ -456,58 +470,13 @@ hpts_sane_input_insert(struct tcp_hpts_entry *hpts, st in_pcbref(inp); } -static int -sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) -{ - struct tcp_hpts_entry *hpts; - size_t sz; - int32_t logging_was, i; - int32_t error = 0; - - /* - * HACK: Turn off logging so no locks are required this really needs - * a memory barrier :) - */ - logging_was = logging_on; - logging_on = 0; - if (!req->oldptr) { - /* How much? */ - sz = 0; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - sz += (sizeof(struct hpts_log) * hpts->p_logsize); - } - error = SYSCTL_OUT(req, 0, sz); - } else { - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - if (hpts->p_log_wrapped) - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - else - sz = (sizeof(struct hpts_log) * hpts->p_log_at); - error = SYSCTL_OUT(req, hpts->p_log, sz); - } - } - logging_on = logging_was; - return error; -} - -SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -515,10 +484,9 @@ static void tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -648,8 +616,8 @@ tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hp * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. - * Note that you can or both values together and get two - * actions. + * Note that you can use one or both values together + * and get two actions. */ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) @@ -670,53 +638,198 @@ __tcp_hpts_remove(struct inpcb *inp, int32_t flags, in } static inline int -hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) +hpts_tick(uint32_t wheel_tick, uint32_t plus) { - return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); + /* + * Given a slot on the wheel, what slot + * is that plus ticks out? + */ + KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); + return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); } +static inline int +tick_to_wheel(uint32_t cts_in_wticks) +{ + /* + * Given a timestamp in wheel ticks (10usec inc's) + * map it to our limited space wheel. + */ + return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +} + +static inline int +hpts_ticks_diff(int prev_tick, int tick_now) +{ + /* + * Given two ticks that are someplace + * on our wheel. How far are they apart? + */ + if (tick_now > prev_tick) + return (tick_now - prev_tick); + else if (tick_now == prev_tick) + /* + * Special case, same means we can go all of our + * wheel less one slot. + */ + return (NUM_OF_HPTSI_SLOTS - 1); + else + return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); +} + +/* + * Given a tick on the wheel that is the current time + * mapped to the wheel (wheel_tick), what is the maximum + * distance forward that can be obtained without + * wrapping past either prev_tick or running_tick + * depending on the htps state? Also if passed + * a uint32_t *, fill it with the tick location. + * + * Note if you do not give this function the current + * time (that you think it is) mapped to the wheel + * then the results will not be what you expect and + * could lead to invalid inserts. + */ +static inline int32_t +max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +{ + uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + + if ((hpts->p_hpts_active == 1) && + (hpts->p_wheel_complete == 0)) { + end_tick = hpts->p_runningtick; + /* Back up one tick */ + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + } else { + /* + * For the case where we are + * not active, or we have + * completed the pass over + * the wheel, we can use the + * prev tick and subtract one from it. This puts us + * as far out as possible on the wheel. + */ + end_tick = hpts->p_prev_slot; + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + /* + * Now we have close to the full wheel left minus the + * time it has been since the pacer went to sleep. Note + * that wheel_tick, passed in, should be the current time + * from the perspective of the caller, mapped to the wheel. + */ + if (hpts->p_prev_slot != wheel_tick) + dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + else + dis_to_travel = 1; + /* + * dis_to_travel in this case is the space from when the + * pacer stopped (p_prev_slot) and where our wheel_tick + * is now. To know how many slots we can put it in we + * subtract from the wheel size. We would not want + * to place something after p_prev_slot or it will + * get ran too soon. + */ + return (NUM_OF_HPTSI_SLOTS - dis_to_travel); + } + /* + * So how many slots are open between p_runningtick -> p_cur_slot + * that is what is currently un-available for insertion. Special + * case when we are at the last slot, this gets 1, so that + * the answer to how many slots are available is all but 1. + */ + if (hpts->p_runningtick == hpts->p_cur_slot) + dis_to_travel = 1; + else + dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + /* + * How long has the pacer been running? + */ + if (hpts->p_cur_slot != wheel_tick) { + /* The pacer is a bit late */ + pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + } else { + /* The pacer is right on time, now == pacers start time */ + pacer_to_now = 0; + } + /* + * To get the number left we can insert into we simply + * subract the distance the pacer has to run from how + * many slots there are. + */ + avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel; + /* + * Now how many of those we will eat due to the pacer's + * time (p_cur_slot) of start being behind the + * real time (wheel_tick)? + */ + if (avail_on_wheel <= pacer_to_now) { + /* + * Wheel wrap, we can't fit on the wheel, that + * is unusual the system must be way overloaded! + * Insert into the assured tick, and return special + * "0". + */ + counter_u64_add(combined_wheel_wrap, 1); + *target_tick = hpts->p_nxt_slot; + return (0); + } else { + /* + * We know how many slots are open + * on the wheel (the reverse of what + * is left to run. Take away the time + * the pacer started to now (wheel_tick) + * and that tells you how many slots are + * open that can be inserted into that won't + * be touched by the pacer until later. + */ + return (avail_on_wheel - pacer_to_now); + } +} + static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { - int32_t need_wake = 0; - uint32_t ticknow = 0; - + uint32_t need_wake = 0; + HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ - if (hpts->p_hpts_active == 0) { - /* A sleeping hpts we want in next slot to run */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, - hpts_tick(hpts, 1)); - } - inp->inp_hptsslot = hpts_tick(hpts, 1); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); - } - need_wake = 1; + inp->inp_hpts_request = 0; + if ((hpts->p_hpts_active == 0) || + (hpts->p_wheel_complete)) { + /* + * A sleeping hpts we want in next slot to run + * note that in this state p_prev_slot == p_cur_slot + */ + inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) + need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* + * The hpts system is running and the caller + * was awoken by the hpts system. * We can't allow you to go into the same slot we - * are in. We must put you out. + * are in (we don't want a loop :-D). */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_cur_slot; + inp->inp_hptsslot = hpts->p_runningtick; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); - } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } @@ -737,141 +850,129 @@ __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32 return (ret); } +#ifdef INVARIANTS static void -tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, - struct hpts_diag *diag, int32_t noref) +check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line) { - int32_t need_new_to = 0; - int32_t need_wakeup = 0; - uint32_t largest_slot; - uint32_t ticknow = 0; - uint32_t slot_calc; + /* + * Sanity checks for the pacer with invariants + * on insert. + */ + if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) + panic("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot); + if ((hpts->p_hpts_active) && + (hpts->p_wheel_complete == 0)) { + /* + * If the pacer is processing a arc + * of the wheel, we need to make + * sure we are not inserting within + * that arc. + */ + int distance, yet_to_run; + distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); + if (hpts->p_runningtick != hpts->p_cur_slot) + yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + else + yet_to_run = 0; /* processing last slot */ + if (yet_to_run > distance) { + panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningtick, hpts->p_cur_slot); + } + } +} +#endif + +static void +tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line, + struct hpts_diag *diag, struct timeval *tv) +{ + uint32_t need_new_to = 0; + uint32_t wheel_cts, last_tick; + int32_t wheel_tick, maxticks; + int8_t need_wakeup = 0; + HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; + diag->p_prev_slot = hpts->p_prev_slot; + diag->p_runningtick = hpts->p_runningtick; diag->p_nxt_slot = hpts->p_nxt_slot; diag->p_cur_slot = hpts->p_cur_slot; + diag->p_curtick = hpts->p_curtick; + diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; + diag->p_on_min_sleep = hpts->p_on_min_sleep; + diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((inp->inp_in_hpts == 0) || noref) { - inp->inp_hpts_request = slot; + if (inp->inp_in_hpts == 0) { if (slot == 0) { /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); return; } - if (hpts->p_hpts_active) { - /* - * Its slot - 1 since nxt_slot is the next tick that - * will go off since the hpts is awake - */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); - } - /* - * We want to make sure that we don't place a inp in - * the range of p_cur_slot <-> p_nxt_slot. If we - * take from p_nxt_slot to the end, plus p_cur_slot - * and then take away 2, we will know how many is - * the max slots we can use. - */ - if (hpts->p_nxt_slot > hpts->p_cur_slot) { - /* - * Non-wrap case nxt_slot <-> cur_slot we - * don't want to land in. So the diff gives - * us what is taken away from the number of - * slots. + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_tick = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); + if (diag) { + diag->wheel_tick = wheel_tick; + diag->maxticks = maxticks; + diag->wheel_cts = wheel_cts; + } + if (maxticks == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer + * if its true is so far behind we will be + * > 1second late calling anyway. */ - largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); - } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - } else { - /* - * Wrap case so the diff gives us the number - * of slots that we can land in. - */ - largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; + slot--; } - /* - * We take away two so we never have a problem (20 - * usec's) out of 1024000 usecs - */ - largest_slot -= 2; - if (inp->inp_hpts_request > largest_slot) { - /* - * Restrict max jump of slots and remember - * leftover - */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* This one will run when we hit it */ - inp->inp_hpts_request = 0; - } - if (hpts->p_nxt_slot == hpts->p_cur_slot) - slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; - else - slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; - if (slot_calc == hpts->p_cur_slot) { + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request = slot; + } else if (maxticks >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_tick(wheel_tick, slot); + } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxticks; + inp->inp_hptsslot = last_tick; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } #ifdef INVARIANTS - /* TSNH */ - panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", - hpts, slot_calc, slot, largest_slot); + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); #endif - if (slot_calc) - slot_calc--; - else - slot_calc = NUM_OF_HPTSI_SLOTS - 1; - } - inp->inp_hptsslot = slot_calc; - if (diag) { - diag->inp_hptsslot = inp->inp_hptsslot; - } - } else { + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { /* - * The hpts is sleeping, we need to figure out where + * The hpts is sleeping and not on a minimum + * sleep time, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; - uint32_t slot_now; - struct timeval tv; - ticknow = tcp_gethptstick(&tv); - slot_now = ticknow % NUM_OF_HPTSI_SLOTS; - /* - * The user wants to be inserted at (slot_now + - * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. - */ - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - if (inp->inp_hpts_request > largest_slot) { - /* Adjust the residual in inp_hpts_request */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* No residual it all fits */ - inp->inp_hpts_request = 0; - } - inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; - if (diag) { - diag->slot_now = slot_now; - diag->inp_hptsslot = inp->inp_hptsslot; - diag->p_on_min_sleep = hpts->p_on_min_sleep; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); - } /* Now do we need to restart the hpts's timer? */ - if (TSTMP_GT(ticknow, hpts->p_curtick)) - have_slept = ticknow - hpts->p_curtick; - else - have_slept = 0; - if (have_slept < hpts->p_hpts_sleep_time) { - /* This should be what happens */ + have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if (have_slept < hpts->p_hpts_sleep_time) yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - } else { + else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; @@ -879,29 +980,22 @@ tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, st if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; - diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * We need to reschedule the hptss time-out. + * We need to reschedule the hpts's time-out. */ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); - } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201907102040.x6AKeern006731>