Date: Wed, 7 Jul 2021 10:49:18 -0400 From: Andrew Gallatin <gallatin@cs.duke.edu> To: Mateusz Guzik <mjguzik@gmail.com>, Andrew Gallatin <gallatin@freebsd.org> Cc: src-committers@freebsd.org, dev-commits-src-all@freebsd.org, dev-commits-src-main@freebsd.org Subject: Re: git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS Message-ID: <8285ccb3-87d8-fe84-de27-0fd06409df70@cs.duke.edu> In-Reply-To: <CAGudoHEE-ZsmOHLpgagONTv1O3DJ-g5y%2BVGY9XWuC8BCoeAE1Q@mail.gmail.com> References: <202107061429.166ETTEn064437@gitrepo.freebsd.org> <CAGudoHEE-ZsmOHLpgagONTv1O3DJ-g5y%2BVGY9XWuC8BCoeAE1Q@mail.gmail.com>
next in thread | previous in thread | raw e-mail | index | archive | help
On 7/7/21 7:00 AM, Mateusz Guzik wrote: > This breaks NOIP kernel builds. Thanks for pointing this out, it should be fixed in 4150a5a87ed > On 7/6/21, Andrew Gallatin <gallatin@freebsd.org> wrote: >> The branch main has been updated by gallatin: >> >> URL: >> https://urldefense.com/v3/__https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OZj3iET-A$ >> >> commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35 >> Author: Andrew Gallatin <gallatin@FreeBSD.org> >> AuthorDate: 2021-07-06 14:17:33 +0000 >> Commit: Andrew Gallatin <gallatin@FreeBSD.org> >> CommitDate: 2021-07-06 14:28:32 +0000 >> >> ktls: auto-disable ifnet (inline hw) kTLS >> >> Ifnet (inline) hw kTLS NICs typically keep state within >> a TLS record, so that when transmitting in-order, >> they can continue encryption on each segment sent without >> DMA'ing extra state from the host. >> >> This breaks down when transmits are out of order (eg, >> TCP retransmits). In this case, the NIC must re-DMA >> the entire TLS record up to and including the segment >> being retransmitted. This means that when re-transmitting >> the last 1448 byte segment of a TLS record, the NIC will >> have to re-DMA the entire 16KB TLS record. This can lead >> to the NIC running out of PCIe bus bandwidth well before >> it saturates the network link if a lot of TCP connections have >> a high retransmoit rate. >> >> This change introduces a new sysctl >> (kern.ipc.tls.ifnet_max_rexmit_pct), >> where TCP connections with higher retransmit rate will be >> switched to SW kTLS so as to conserve PCIe bandwidth. >> >> Reviewed by: hselasky, markj, rrs >> Sponsored by: Netflix >> Differential Revision: https://urldefense.com/v3/__https://reviews.freebsd.org/D30908__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OYOYLaV0A$ >> --- >> sys/kern/uipc_ktls.c | 107 >> ++++++++++++++++++++++++++++++++++++++++++++++++++ >> sys/netinet/tcp_var.h | 13 +++++- >> sys/sys/ktls.h | 15 ++++++- >> 3 files changed, 133 insertions(+), 2 deletions(-) >> >> diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c >> index 7e87e7c740e3..88e29157289d 100644 >> --- a/sys/kern/uipc_ktls.c >> +++ b/sys/kern/uipc_ktls.c >> @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$"); >> >> #include "opt_inet.h" >> #include "opt_inet6.h" >> +#include "opt_kern_tls.h" >> #include "opt_ratelimit.h" >> #include "opt_rss.h" >> >> @@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, >> CTLFLAG_RD, >> &ktls_number_threads, 0, >> "Number of TLS threads in thread-pool"); >> >> +unsigned int ktls_ifnet_max_rexmit_pct = 2; >> +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN, >> + &ktls_ifnet_max_rexmit_pct, 2, >> + "Max percent bytes retransmitted before ifnet TLS is disabled"); >> + >> static bool ktls_offload_enable; >> SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN, >> &ktls_offload_enable, 0, >> @@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed); >> SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, >> CTLFLAG_RD, >> &ktls_switch_failed, "TLS sessions unable to switch between SW and >> ifnet"); >> >> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail); >> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, >> CTLFLAG_RD, >> + &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from >> ifnet"); >> + >> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok); >> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, >> CTLFLAG_RD, >> + &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from >> ifnet"); >> + >> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, >> "Software TLS session stats"); >> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, >> 0, >> @@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx) >> } >> } >> } >> + >> +static void >> +ktls_disable_ifnet_help(void *context, int pending __unused) >> +{ >> + struct ktls_session *tls; >> + struct inpcb *inp; >> + struct tcpcb *tp; >> + struct socket *so; >> + int err; >> + >> + tls = context; >> + inp = tls->inp; >> + if (inp == NULL) >> + return; >> + INP_WLOCK(inp); >> + so = inp->inp_socket; >> + MPASS(so != NULL); >> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || >> + (inp->inp_flags2 & INP_FREED)) { >> + goto out; >> + } >> + >> + if (so->so_snd.sb_tls_info != NULL) >> + err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW); >> + else >> + err = ENXIO; >> + if (err == 0) { >> + counter_u64_add(ktls_ifnet_disable_ok, 1); >> + /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ >> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && >> + (inp->inp_flags2 & INP_FREED) == 0 && >> + (tp = intotcpcb(inp)) != NULL && >> + tp->t_fb->tfb_hwtls_change != NULL) >> + (*tp->t_fb->tfb_hwtls_change)(tp, 0); >> + } else { >> + counter_u64_add(ktls_ifnet_disable_fail, 1); >> + } >> + >> +out: >> + SOCK_LOCK(so); >> + sorele(so); >> + if (!in_pcbrele_wlocked(inp)) >> + INP_WUNLOCK(inp); >> + ktls_free(tls); >> +} >> + >> +/* >> + * Called when re-transmits are becoming a substantial portion of the >> + * sends on this connection. When this happens, we transition the >> + * connection to software TLS. This is needed because most inline TLS >> + * NICs keep crypto state only for in-order transmits. This means >> + * that to handle a TCP rexmit (which is out-of-order), the NIC must >> + * re-DMA the entire TLS record up to and including the current >> + * segment. This means that when re-transmitting the last ~1448 byte >> + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order >> + * of magnitude more data than we are sending. This can cause the >> + * PCIe link to saturate well before the network, which can cause >> + * output drops, and a general loss of capacity. >> + */ >> +void >> +ktls_disable_ifnet(void *arg) >> +{ >> + struct tcpcb *tp; >> + struct inpcb *inp; >> + struct socket *so; >> + struct ktls_session *tls; >> + >> + tp = arg; >> + inp = tp->t_inpcb; >> + INP_WLOCK_ASSERT(inp); >> + so = inp->inp_socket; >> + SOCK_LOCK(so); >> + tls = so->so_snd.sb_tls_info; >> + if (tls->disable_ifnet_pending) { >> + SOCK_UNLOCK(so); >> + return; >> + } >> + >> + /* >> + * note that disable_ifnet_pending is never cleared; disabling >> + * ifnet can only be done once per session, so we never want >> + * to do it again >> + */ >> + >> + (void)ktls_hold(tls); >> + in_pcbref(inp); >> + soref(so); >> + tls->disable_ifnet_pending = true; >> + tls->inp = inp; >> + SOCK_UNLOCK(so); >> + TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls); >> + (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task); >> +} >> diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h >> index dd30f89896d2..3f72a821e71f 100644 >> --- a/sys/netinet/tcp_var.h >> +++ b/sys/netinet/tcp_var.h >> @@ -39,8 +39,10 @@ >> #include <netinet/tcp_fsm.h> >> >> #ifdef _KERNEL >> +#include "opt_kern_tls.h" >> #include <net/vnet.h> >> #include <sys/mbuf.h> >> +#include <sys/ktls.h> >> #endif >> >> #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information >> array" */ >> @@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th) >> >> static inline void >> tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, >> - uint8_t is_tlp, int hw_tls __unused) >> + uint8_t is_tlp, int hw_tls) >> { >> + uint64_t rexmit_percent; >> + >> if (is_tlp) { >> tp->t_sndtlppack++; >> tp->t_sndtlpbyte += len; >> @@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len, >> uint8_t is_rxt, >> tp->t_snd_rxt_bytes += len; >> else >> tp->t_sndbytes += len; >> + >> + if (hw_tls && is_rxt) { >> + rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * >> (tp->t_snd_rxt_bytes + tp->t_sndbytes)); >> + if (rexmit_percent > ktls_ifnet_max_rexmit_pct) >> + ktls_disable_ifnet(tp); >> + } >> + >> } >> #endif /* _KERNEL */ >> >> diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h >> index b28c94965c97..7fd8831878b4 100644 >> --- a/sys/sys/ktls.h >> +++ b/sys/sys/ktls.h >> @@ -189,10 +189,12 @@ struct ktls_session { >> u_int wq_index; >> volatile u_int refcount; >> int mode; >> - bool reset_pending; >> >> struct task reset_tag_task; >> + struct task disable_ifnet_task; >> struct inpcb *inp; >> + bool reset_pending; >> + bool disable_ifnet_pending; >> } __aligned(CACHE_LINE_SIZE); >> >> void ktls_check_rx(struct sockbuf *sb); >> @@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls) >> ktls_destroy(tls); >> } >> >> +#ifdef KERN_TLS >> +extern unsigned int ktls_ifnet_max_rexmit_pct; >> +void ktls_disable_ifnet(void *arg); >> +#else >> +#define ktls_ifnet_max_rexmit_pct 1 >> +inline void >> +ktls_disable_ifnet(void *arg __unused) >> +{ >> +} >> +#endif >> + >> #endif /* !_KERNEL */ >> #endif /* !_SYS_KTLS_H_ */ >> > >
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?8285ccb3-87d8-fe84-de27-0fd06409df70>