Date: Sun, 28 Apr 2019 06:51:59 +0000 (UTC) From: Navdeep Parhar <np@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org Subject: svn commit: r346805 - in stable/11: sys/dev/cxgbe sys/dev/cxgbe/tom usr.sbin/cxgbetool Message-ID: <201904280651.x3S6pxhY008312@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: np Date: Sun Apr 28 06:51:59 2019 New Revision: 346805 URL: https://svnweb.freebsd.org/changeset/base/346805 Log: MFC r317849 (partial), r332506, and r332787. r317849 (partial, required by r332506): cxgbe/t4_tom: Per-connection rate limiting for TCP sockets handled by the TOE. Sponsored by: Chelsio Communications r332506: cxgbe(4): Add support for Connection Offload Policy (aka COP). COP allows fine-grained control on whether to offload a TCP connection using t4_tom, and what settings to apply to a connection selected for offload. t4_tom must still be loaded and IFCAP_TOE must still be enabled for full TCP offload to take place on an interface. The difference is that IFCAP_TOE used to be the only knob and would enable TOE for all new connections on the inteface, but now the driver will also consult the COP, if any, before offloading to the hardware TOE. A policy is a plain text file with any number of rules, one per line. Each rule has a "match" part consisting of a socket-type (L = listen, A = active open, P = passive open, D = don't care) and a pcap-filter(7) expression, and a "settings" part that specifies whether to offload the connection or not and the parameters to use if so. The general format of a rule is: [socket-type] expr => settings Example. See cxgbetool(8) for more information. [L] ip && port http => offload [L] port 443 => !offload [L] port ssh => offload [P] src net 192.168/16 && dst port ssh => offload !nagle !timestamp cong newreno [P] dst port ssh => offload !nagle ecn cong tahoe [P] dst port http => offload [A] dst port 443 => offload tls [A] dst net 192.168/16 => offload !timestamp cong highspeed The driver processes the rules for each new listen, active open, or passive open and stops at the first match. There is an implicit rule at the end of every policy that prohibits offload when no rule in the policy matches: [D] all => !offload This is a reworked and expanded version of a patch submitted by Krishnamraju Eraparaju @ Chelsio. Sponsored by: Chelsio Communications r332787: cxgbe(4): Fix bugs in the handling of COP rules that match on VLAN tag. Retrieve the tag from the correct ifnet and use the provided tag (instead of hardcoded 0xffff, implying no tag) in the routines that process offload policy. Submitted by: Krishnamraju Eraparaju @ Chelsio Sponsored by: Chelsio Communications Modified: stable/11/sys/dev/cxgbe/adapter.h stable/11/sys/dev/cxgbe/offload.h stable/11/sys/dev/cxgbe/t4_ioctl.h stable/11/sys/dev/cxgbe/t4_main.c stable/11/sys/dev/cxgbe/t4_sge.c stable/11/sys/dev/cxgbe/tom/t4_connect.c stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c stable/11/sys/dev/cxgbe/tom/t4_listen.c stable/11/sys/dev/cxgbe/tom/t4_tom.c stable/11/sys/dev/cxgbe/tom/t4_tom.h stable/11/usr.sbin/cxgbetool/Makefile stable/11/usr.sbin/cxgbetool/cxgbetool.8 stable/11/usr.sbin/cxgbetool/cxgbetool.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/dev/cxgbe/adapter.h ============================================================================== --- stable/11/sys/dev/cxgbe/adapter.h Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/adapter.h Sun Apr 28 06:51:59 2019 (r346805) @@ -804,8 +804,11 @@ struct adapter { void *tom_softc; /* (struct tom_data *) */ struct tom_tunables tt; - struct iw_tunables iwt; + struct t4_offload_policy *policy; + struct rwlock policy_lock; + void *iwarp_softc; /* (struct c4iw_dev *) */ + struct iw_tunables iwt; void *iscsi_ulp_softc; /* (struct cxgbei_data *) */ void *ccr_softc; /* (struct ccr_softc *) */ struct l2t_data *l2t; /* L2 table */ Modified: stable/11/sys/dev/cxgbe/offload.h ============================================================================== --- stable/11/sys/dev/cxgbe/offload.h Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/offload.h Sun Apr 28 06:51:59 2019 (r346805) @@ -154,6 +154,7 @@ struct tom_tunables { int num_tls_rx_ports; int tx_align; int tx_zcopy; + int cop_managed_offloading; }; /* iWARP driver tunables */ struct iw_tunables { Modified: stable/11/sys/dev/cxgbe/t4_ioctl.h ============================================================================== --- stable/11/sys/dev/cxgbe/t4_ioctl.h Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/t4_ioctl.h Sun Apr 28 06:51:59 2019 (r346805) @@ -33,6 +33,7 @@ #include <sys/types.h> #include <net/ethernet.h> +#include <net/bpf.h> /* * Ioctl commands specific to this driver. @@ -342,6 +343,44 @@ struct t4_cudbg_dump { uint8_t *data; }; +enum { + OPEN_TYPE_LISTEN = 'L', + OPEN_TYPE_ACTIVE = 'A', + OPEN_TYPE_PASSIVE = 'P', + OPEN_TYPE_DONTCARE = 'D', +}; + +struct offload_settings { + int8_t offload; + int8_t rx_coalesce; + int8_t cong_algo; + int8_t sched_class; + int8_t tstamp; + int8_t sack; + int8_t nagle; + int8_t ecn; + int8_t ddp; + int8_t tls; + int16_t txq; + int16_t rxq; + int16_t mss; +}; + +struct offload_rule { + char open_type; + struct offload_settings settings; + struct bpf_program bpf_prog; /* compiled program/filter */ +}; + +/* + * An offload policy consists of a set of rules matched in sequence. The + * settings of the first rule that matches are applied to that connection. + */ +struct t4_offload_policy { + uint32_t nrules; + struct offload_rule *rule; +}; + #define CHELSIO_T4_GETREG _IOWR('f', T4_GETREG, struct t4_reg) #define CHELSIO_T4_SETREG _IOW('f', T4_SETREG, struct t4_reg) #define CHELSIO_T4_REGDUMP _IOWR('f', T4_REGDUMP, struct t4_regdump) @@ -366,4 +405,5 @@ struct t4_cudbg_dump { #define CHELSIO_T4_LOAD_BOOT _IOW('f', T4_LOAD_BOOT, struct t4_bootrom) #define CHELSIO_T4_LOAD_BOOTCFG _IOW('f', T4_LOAD_BOOTCFG, struct t4_data) #define CHELSIO_T4_CUDBG_DUMP _IOWR('f', T4_CUDBG_DUMP, struct t4_cudbg_dump) +#define CHELSIO_T4_SET_OFLD_POLICY _IOW('f', T4_SET_OFLD_POLICY, struct t4_offload_policy) #endif Modified: stable/11/sys/dev/cxgbe/t4_main.c ============================================================================== --- stable/11/sys/dev/cxgbe/t4_main.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/t4_main.c Sun Apr 28 06:51:59 2019 (r346805) @@ -543,6 +543,14 @@ SYSCTL_INT(_hw_cxgbe, OID_AUTO, pcie_relaxed_ordering, "PCIe Relaxed Ordering: 0 = disable, 1 = enable, 2 = leave alone"); +#ifdef TCP_OFFLOAD +/* + * TOE tunables. + */ +static int t4_cop_managed_offloading = 0; +TUNABLE_INT("hw.cxgbe.cop_managed_offloading", &t4_cop_managed_offloading); +#endif + /* Functions used by VIs to obtain unique MAC addresses for each VI. */ static int vi_mac_funcs[] = { FW_VI_FUNC_ETH, @@ -699,6 +707,8 @@ static int load_cfg(struct adapter *, struct t4_data * static int load_boot(struct adapter *, struct t4_bootrom *); static int load_bootcfg(struct adapter *, struct t4_data *); static int cudbg_dump(struct adapter *, struct t4_cudbg_dump *); +static void free_offload_policy(struct t4_offload_policy *); +static int set_offload_policy(struct adapter *, struct t4_offload_policy *); static int read_card_mem(struct adapter *, int, struct t4_mem_range *); static int read_i2c(struct adapter *, struct t4_i2c_data *); #ifdef TCP_OFFLOAD @@ -1000,6 +1010,9 @@ t4_attach(device_t dev) mtx_init(&sc->reg_lock, "indirect register access", 0, MTX_DEF); + sc->policy = NULL; + rw_init(&sc->policy_lock, "connection offload policy"); + rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ @@ -1541,6 +1554,14 @@ t4_detach_common(device_t dev) if (mtx_initialized(&sc->reg_lock)) mtx_destroy(&sc->reg_lock); + if (rw_initialized(&sc->policy_lock)) { + rw_destroy(&sc->policy_lock); +#ifdef TCP_OFFLOAD + if (sc->policy != NULL) + free_offload_policy(sc->policy); +#endif + } + for (i = 0; i < NUM_MEMWIN; i++) { struct memwin *mw = &sc->memwin[i]; @@ -5817,6 +5838,12 @@ t4_sysctls(struct adapter *sc) CTLFLAG_RW, &sc->tt.tx_zcopy, 0, "Enable zero-copy aio_write(2)"); + sc->tt.cop_managed_offloading = !!t4_cop_managed_offloading; + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "cop_managed_offloading", CTLFLAG_RW, + &sc->tt.cop_managed_offloading, 0, + "COP (Connection Offload Policy) controls all TOE offload"); + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "timer_tick", CTLTYPE_STRING | CTLFLAG_RD, sc, 0, sysctl_tp_tick, "A", "TP timer tick (us)"); @@ -9839,6 +9866,113 @@ done: return (rc); } +static void +free_offload_policy(struct t4_offload_policy *op) +{ + struct offload_rule *r; + int i; + + if (op == NULL) + return; + + r = &op->rule[0]; + for (i = 0; i < op->nrules; i++, r++) { + free(r->bpf_prog.bf_insns, M_CXGBE); + } + free(op->rule, M_CXGBE); + free(op, M_CXGBE); +} + +static int +set_offload_policy(struct adapter *sc, struct t4_offload_policy *uop) +{ + int i, rc, len; + struct t4_offload_policy *op, *old; + struct bpf_program *bf; + const struct offload_settings *s; + struct offload_rule *r; + void *u; + + if (!is_offload(sc)) + return (ENODEV); + + if (uop->nrules == 0) { + /* Delete installed policies. */ + op = NULL; + goto set_policy; + } if (uop->nrules > 256) { /* arbitrary */ + return (E2BIG); + } + + /* Copy userspace offload policy to kernel */ + op = malloc(sizeof(*op), M_CXGBE, M_ZERO | M_WAITOK); + op->nrules = uop->nrules; + len = op->nrules * sizeof(struct offload_rule); + op->rule = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); + rc = copyin(uop->rule, op->rule, len); + if (rc) { + free(op->rule, M_CXGBE); + free(op, M_CXGBE); + return (rc); + } + + r = &op->rule[0]; + for (i = 0; i < op->nrules; i++, r++) { + + /* Validate open_type */ + if (r->open_type != OPEN_TYPE_LISTEN && + r->open_type != OPEN_TYPE_ACTIVE && + r->open_type != OPEN_TYPE_PASSIVE && + r->open_type != OPEN_TYPE_DONTCARE) { +error: + /* + * Rules 0 to i have malloc'd filters that need to be + * freed. Rules i+1 to nrules have userspace pointers + * and should be left alone. + */ + op->nrules = i; + free_offload_policy(op); + return (rc); + } + + /* Validate settings */ + s = &r->settings; + if ((s->offload != 0 && s->offload != 1) || + s->cong_algo < -1 || s->cong_algo > CONG_ALG_HIGHSPEED || + s->sched_class < -1 || + s->sched_class >= sc->chip_params->nsched_cls) { + rc = EINVAL; + goto error; + } + + bf = &r->bpf_prog; + u = bf->bf_insns; /* userspace ptr */ + bf->bf_insns = NULL; + if (bf->bf_len == 0) { + /* legal, matches everything */ + continue; + } + len = bf->bf_len * sizeof(*bf->bf_insns); + bf->bf_insns = malloc(len, M_CXGBE, M_ZERO | M_WAITOK); + rc = copyin(u, bf->bf_insns, len); + if (rc != 0) + goto error; + + if (!bpf_validate(bf->bf_insns, bf->bf_len)) { + rc = EINVAL; + goto error; + } + } +set_policy: + rw_wlock(&sc->policy_lock); + old = sc->policy; + sc->policy = op; + rw_wunlock(&sc->policy_lock); + free_offload_policy(old); + + return (0); +} + #define MAX_READ_BUF_SIZE (128 * 1024) static int read_card_mem(struct adapter *sc, int win, struct t4_mem_range *mr) @@ -10204,6 +10338,9 @@ t4_ioctl(struct cdev *dev, unsigned long cmd, caddr_t break; case CHELSIO_T4_CUDBG_DUMP: rc = cudbg_dump(sc, (struct t4_cudbg_dump *)data); + break; + case CHELSIO_T4_SET_OFLD_POLICY: + rc = set_offload_policy(sc, (struct t4_offload_policy *)data); break; default: rc = ENOTTY; Modified: stable/11/sys/dev/cxgbe/t4_sge.c ============================================================================== --- stable/11/sys/dev/cxgbe/t4_sge.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/t4_sge.c Sun Apr 28 06:51:59 2019 (r346805) @@ -965,8 +965,10 @@ mtu_to_max_payload(struct adapter *sc, int mtu, const #ifdef TCP_OFFLOAD if (toe) { - payload = sc->tt.rx_coalesce ? - G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu; + int rxcs = G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)); + + /* Note that COP can set rx_coalesce on/off per connection. */ + payload = max(mtu, rxcs); } else { #endif /* large enough even when hw VLAN extraction is disabled */ Modified: stable/11/sys/dev/cxgbe/tom/t4_connect.c ============================================================================== --- stable/11/sys/dev/cxgbe/tom/t4_connect.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/tom/t4_connect.c Sun Apr 28 06:51:59 2019 (r346805) @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include <sys/domain.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/sysctl.h> #include <net/ethernet.h> #include <net/if.h> #include <net/if_types.h> @@ -53,6 +54,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_fsm.h> #include <netinet/tcp_var.h> #include <netinet/toecore.h> +#include <netinet/cc/cc.h> #include "common/common.h" #include "common/t4_msg.h" @@ -231,47 +233,85 @@ do_act_open_rpl(struct sge_iq *iq, const struct rss_he * Options2 for active open. */ static uint32_t -calc_opt2a(struct socket *so, struct toepcb *toep) +calc_opt2a(struct socket *so, struct toepcb *toep, + const struct offload_settings *s) { struct tcpcb *tp = so_sototcpcb(so); struct port_info *pi = toep->vi->pi; struct adapter *sc = pi->adapter; - uint32_t opt2; + uint32_t opt2 = 0; - opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) | - F_RSS_QUEUE_VALID | V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); + /* + * rx flow control, rx coalesce, congestion control, and tx pace are all + * explicitly set by the driver. On T5+ the ISS is also set by the + * driver to the value picked by the kernel. + */ + if (is_t4(sc)) { + opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; + opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; + } else { + opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ + opt2 |= F_T5_ISS; /* ISS provided in CPL */ + } - if (tp->t_flags & TF_SACK_PERMIT) + if (s->sack > 0 || (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT))) opt2 |= F_SACK_EN; - if (tp->t_flags & TF_REQ_TSTMP) + if (s->tstamp > 0 || (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP))) opt2 |= F_TSTAMPS_EN; if (tp->t_flags & TF_REQ_SCALE) opt2 |= F_WND_SCALE_EN; - if (V_tcp_do_ecn) + if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1)) opt2 |= F_CCTRL_ECN; - /* RX_COALESCE is always a valid value (M_RX_COALESCE). */ - if (is_t4(sc)) - opt2 |= F_RX_COALESCE_VALID; + /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + + /* These defaults are subject to ULP specific fixups later. */ + opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); + + opt2 |= V_PACE(0); + + if (s->cong_algo >= 0) + opt2 |= V_CONG_CNTRL(s->cong_algo); + else if (sc->tt.cong_algorithm >= 0) + opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { - opt2 |= F_T5_OPT_2_VALID; - opt2 |= F_T5_ISS; + struct cc_algo *cc = CC_ALGO(tp); + + if (strcasecmp(cc->name, "reno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); + else if (strcasecmp(cc->name, "tahoe") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + if (strcasecmp(cc->name, "newreno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + if (strcasecmp(cc->name, "highspeed") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); + else { + /* + * Use newreno in case the algorithm selected by the + * host stack is not supported by the hardware. + */ + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + } } - if (sc->tt.rx_coalesce) + + if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); - if (sc->tt.cong_algorithm != -1) - opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); + /* Note that ofld_rxq is already set according to s->rxq. */ + opt2 |= F_RSS_QUEUE_VALID; + opt2 |= V_RSS_QUEUE(toep->ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (toep->ulp_mode == ULP_MODE_TCPDDP) - opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; + opt2 |= F_RX_FC_DDP; #endif + if (toep->ulp_mode == ULP_MODE_TLS) { - opt2 |= F_RX_FC_VALID; opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } @@ -346,10 +386,12 @@ t4_connect(struct toedev *tod, struct socket *so, stru struct wrqe *wr = NULL; struct ifnet *rt_ifp = rt->rt_ifp; struct vi_info *vi; - int mtu_idx, rscale, qid_atid, rc, isipv6; + int mtu_idx, rscale, qid_atid, rc, isipv6, txqid, rxqid; struct inpcb *inp = sotoinpcb(so); struct tcpcb *tp = intotcpcb(inp); int reason; + struct offload_settings settings; + uint16_t vid = 0xffff; INP_WLOCK_ASSERT(inp); KASSERT(nam->sa_family == AF_INET || nam->sa_family == AF_INET6, @@ -361,12 +403,30 @@ t4_connect(struct toedev *tod, struct socket *so, stru struct ifnet *ifp = VLAN_COOKIE(rt_ifp); vi = ifp->if_softc; + VLAN_TAG(rt_ifp, &vid); } else if (rt_ifp->if_type == IFT_IEEE8023ADLAG) DONT_OFFLOAD_ACTIVE_OPEN(ENOSYS); /* XXX: implement lagg+TOE */ else DONT_OFFLOAD_ACTIVE_OPEN(ENOTSUP); - toep = alloc_toepcb(vi, -1, -1, M_NOWAIT | M_ZERO); + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_ACTIVE, NULL, vid, inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) + DONT_OFFLOAD_ACTIVE_OPEN(EPERM); + + if (settings.txq >= 0 && settings.txq < vi->nofldtxq) + txqid = settings.txq; + else + txqid = arc4random() % vi->nofldtxq; + txqid += vi->first_ofld_txq; + if (settings.rxq >= 0 && settings.rxq < vi->nofldrxq) + rxqid = settings.rxq; + else + rxqid = arc4random() % vi->nofldrxq; + rxqid += vi->first_ofld_rxq; + + toep = alloc_toepcb(vi, txqid, rxqid, M_NOWAIT | M_ZERO); if (toep == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); @@ -385,7 +445,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); toep->vnet = so->so_vnet; - set_ulp_mode(toep, select_ulp_mode(so, sc)); + set_ulp_mode(toep, select_ulp_mode(so, sc, &settings)); SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ toep->rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); @@ -400,7 +460,7 @@ t4_connect(struct toedev *tod, struct socket *so, stru rscale = tp->request_r_scale = select_rcv_wscale(); else rscale = 0; - mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, 0); + mtu_idx = find_best_mtu_idx(sc, &inp->inp_inc, &settings); qid_atid = (toep->ofld_rxq->iq.abs_id << 14) | toep->tid; if (isipv6) { @@ -441,8 +501,8 @@ t4_connect(struct toedev *tod, struct socket *so, stru cpl->peer_ip_hi = *(uint64_t *)&inp->in6p_faddr.s6_addr[0]; cpl->peer_ip_lo = *(uint64_t *)&inp->in6p_faddr.s6_addr[8]; cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode); - cpl->opt2 = calc_opt2a(so, toep); + toep->rx_credits, toep->ulp_mode, &settings); + cpl->opt2 = calc_opt2a(so, toep, &settings); } else { struct cpl_act_open_req *cpl = wrtod(wr); struct cpl_t5_act_open_req *cpl5 = (void *)cpl; @@ -470,8 +530,8 @@ t4_connect(struct toedev *tod, struct socket *so, stru inp_4tuple_get(inp, &cpl->local_ip, &cpl->local_port, &cpl->peer_ip, &cpl->peer_port); cpl->opt0 = calc_opt0(so, vi, toep->l2te, mtu_idx, rscale, - toep->rx_credits, toep->ulp_mode); - cpl->opt2 = calc_opt2a(so, toep); + toep->rx_credits, toep->ulp_mode, &settings); + cpl->opt2 = calc_opt2a(so, toep, &settings); } CTR5(KTR_CXGBE, "%s: atid %u (%s), toep %p, inp %p", __func__, Modified: stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c ============================================================================== --- stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/tom/t4_cpl_io.c Sun Apr 28 06:51:59 2019 (r346805) @@ -29,6 +29,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet.h" +#include "opt_inet6.h" #ifdef TCP_OFFLOAD #include <sys/param.h> @@ -130,6 +131,11 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_par nparams++; if (toep->tls.fcplenmax != 0) nparams++; + if (toep->tc_idx != -1) { + MPASS(toep->tc_idx >= 0 && + toep->tc_idx < sc->chip_params->nsched_cls); + nparams++; + } flowclen = sizeof(*flowc) + nparams * sizeof(struct fw_flowc_mnemval); @@ -181,6 +187,8 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_par FLOWC_PARAM(ULP_MODE, toep->ulp_mode); if (toep->tls.fcplenmax != 0) FLOWC_PARAM(TXDATAPLEN_MAX, toep->tls.fcplenmax); + if (toep->tc_idx != -1) + FLOWC_PARAM(SCHEDCLASS, toep->tc_idx); #undef FLOWC_PARAM KASSERT(paramidx == nparams, ("nparams mismatch")); @@ -198,6 +206,76 @@ send_flowc_wr(struct toepcb *toep, struct flowc_tx_par t4_wrq_tx(sc, wr); } +#ifdef RATELIMIT +/* + * Input is Bytes/second (so_max_pacing-rate), chip counts in Kilobits/second. + */ +static int +update_tx_rate_limit(struct adapter *sc, struct toepcb *toep, u_int Bps) +{ + int tc_idx, rc; + const u_int kbps = (u_int) (uint64_t)Bps * 8ULL / 1000; + const int port_id = toep->vi->pi->port_id; + + CTR3(KTR_CXGBE, "%s: tid %u, rate %uKbps", __func__, toep->tid, kbps); + + if (kbps == 0) { + /* unbind */ + tc_idx = -1; + } else { + rc = t4_reserve_cl_rl_kbps(sc, port_id, kbps, &tc_idx); + if (rc != 0) + return (rc); + MPASS(tc_idx >= 0 && tc_idx < sc->chip_params->nsched_cls); + } + + if (toep->tc_idx != tc_idx) { + struct wrqe *wr; + struct fw_flowc_wr *flowc; + int nparams = 1, flowclen, flowclen16; + struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; + + flowclen = sizeof(*flowc) + nparams * sizeof(struct + fw_flowc_mnemval); + flowclen16 = howmany(flowclen, 16); + if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0 || + (wr = alloc_wrqe(roundup2(flowclen, 16), toep->ofld_txq)) == NULL) { + if (tc_idx >= 0) + t4_release_cl_rl_kbps(sc, port_id, tc_idx); + return (ENOMEM); + } + + flowc = wrtod(wr); + memset(flowc, 0, wr->wr_len); + + flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) | + V_FW_FLOWC_WR_NPARAMS(nparams)); + flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) | + V_FW_WR_FLOWID(toep->tid)); + + flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; + if (tc_idx == -1) + flowc->mnemval[0].val = htobe32(0xff); + else + flowc->mnemval[0].val = htobe32(tc_idx); + + txsd->tx_credits = flowclen16; + txsd->plen = 0; + toep->tx_credits -= txsd->tx_credits; + if (__predict_false(++toep->txsd_pidx == toep->txsd_total)) + toep->txsd_pidx = 0; + toep->txsd_avail--; + t4_wrq_tx(sc, wr); + } + + if (toep->tc_idx >= 0) + t4_release_cl_rl_kbps(sc, port_id, toep->tc_idx); + toep->tc_idx = tc_idx; + + return (0); +} +#endif + void send_reset(struct adapter *sc, struct toepcb *toep, uint32_t snd_nxt) { @@ -272,19 +350,19 @@ assign_rxopt(struct tcpcb *tp, unsigned int opt) n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else n = sizeof(struct ip) + sizeof(struct tcphdr); - if (V_tcp_do_rfc1323) - n += TCPOLEN_TSTAMP_APPA; tp->t_maxseg = sc->params.mtus[G_TCPOPT_MSS(opt)] - n; - CTR4(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u)", __func__, toep->tid, - G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)]); - if (G_TCPOPT_TSTAMP(opt)) { tp->t_flags |= TF_RCVD_TSTMP; /* timestamps ok */ tp->ts_recent = 0; /* hmmm */ tp->ts_recent_age = tcp_ts_getticks(); + tp->t_maxseg -= TCPOLEN_TSTAMP_APPA; } + CTR5(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), mss %u", __func__, + toep->tid, G_TCPOPT_MSS(opt), sc->params.mtus[G_TCPOPT_MSS(opt)], + tp->t_maxseg); + if (G_TCPOPT_SACK(opt)) tp->t_flags |= TF_SACK_PERMIT; /* should already be set */ else @@ -661,7 +739,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep struct socket *so = inp->inp_socket; struct sockbuf *sb = &so->so_snd; int tx_credits, shove, compl, sowwakeup; - struct ofld_tx_sdesc *txsd = &toep->txsd[toep->txsd_pidx]; + struct ofld_tx_sdesc *txsd; bool aiotx_mbuf_seen; INP_WLOCK_ASSERT(inp); @@ -681,6 +759,13 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep if (__predict_false(toep->flags & TPF_ABORT_SHUTDOWN)) return; +#ifdef RATELIMIT + if (__predict_false(inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) && + (update_tx_rate_limit(sc, toep, so->so_max_pacing_rate) == 0)) { + inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; + } +#endif + /* * This function doesn't resume by itself. Someone else must clear the * flag and call this function. @@ -691,6 +776,7 @@ t4_push_frames(struct adapter *sc, struct toepcb *toep return; } + txsd = &toep->txsd[toep->txsd_pidx]; do { tx_credits = min(toep->tx_credits, MAX_OFLD_TX_CREDITS); max_imm = max_imm_payload(tx_credits); Modified: stable/11/sys/dev/cxgbe/tom/t4_listen.c ============================================================================== --- stable/11/sys/dev/cxgbe/tom/t4_listen.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/tom/t4_listen.c Sun Apr 28 06:51:59 2019 (r346805) @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include <sys/fnv_hash.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/sysctl.h> #include <net/ethernet.h> #include <net/if.h> #include <net/if_types.h> @@ -60,6 +61,7 @@ __FBSDID("$FreeBSD$"); #include <netinet/tcp_fsm.h> #include <netinet/tcp_var.h> #include <netinet/toecore.h> +#include <netinet/cc/cc.h> #include "common/common.h" #include "common/t4_msg.h" @@ -82,7 +84,8 @@ static struct listen_ctx *listen_hash_find(struct adap static struct listen_ctx *listen_hash_del(struct adapter *, struct inpcb *); static struct inpcb *release_lctx(struct adapter *, struct listen_ctx *); -static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *); +static inline void save_qids_in_mbuf(struct mbuf *, struct vi_info *, + struct offload_settings *); static inline void get_qids_from_mbuf(struct mbuf *m, int *, int *); static void send_reset_synqe(struct toedev *, struct synq_entry *); @@ -511,9 +514,17 @@ t4_listen_start(struct toedev *tod, struct tcpcb *tp) struct inpcb *inp = tp->t_inpcb; struct listen_ctx *lctx; int i, rc, v; + struct offload_settings settings; INP_WLOCK_ASSERT(inp); + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_LISTEN, NULL, 0xffff, + inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) + return (0); + /* Don't start a hardware listener for any loopback address. */ if (inp->inp_vflag & INP_IPV6 && IN6_IS_ADDR_LOOPBACK(&inp->in6p_laddr)) return (0); @@ -946,13 +957,23 @@ t4_offload_socket(struct toedev *tod, void *arg, struc } static inline void -save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi) +save_qids_in_mbuf(struct mbuf *m, struct vi_info *vi, + struct offload_settings *s) { uint32_t txqid, rxqid; - txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; - rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; + if (s->txq >= 0 && s->txq < vi->nofldtxq) + txqid = s->txq; + else + txqid = arc4random() % vi->nofldtxq; + txqid += vi->first_ofld_txq; + if (s->rxq >= 0 && s->rxq < vi->nofldrxq) + rxqid = s->rxq; + else + rxqid = arc4random() % vi->nofldrxq; + rxqid += vi->first_ofld_rxq; + m->m_pkthdr.flowid = (txqid << 16) | (rxqid & 0xffff); } @@ -1017,50 +1038,88 @@ t4opt_to_tcpopt(const struct tcp_options *t4opt, struc */ static uint32_t calc_opt2p(struct adapter *sc, struct port_info *pi, int rxqid, - const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode) + const struct tcp_options *tcpopt, struct tcphdr *th, int ulp_mode, + struct cc_algo *cc, const struct offload_settings *s) { struct sge_ofld_rxq *ofld_rxq = &sc->sge.ofld_rxq[rxqid]; - uint32_t opt2; + uint32_t opt2 = 0; - opt2 = V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]) | - F_RSS_QUEUE_VALID | V_RSS_QUEUE(ofld_rxq->iq.abs_id); - - if (V_tcp_do_rfc1323) { - if (tcpopt->tstamp) - opt2 |= F_TSTAMPS_EN; - if (tcpopt->sack) - opt2 |= F_SACK_EN; - if (tcpopt->wsf <= 14) - opt2 |= F_WND_SCALE_EN; + /* + * rx flow control, rx coalesce, congestion control, and tx pace are all + * explicitly set by the driver. On T5+ the ISS is also set by the + * driver to the value picked by the kernel. + */ + if (is_t4(sc)) { + opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID; + opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID; + } else { + opt2 |= F_T5_OPT_2_VALID; /* all 4 valid */ + opt2 |= F_T5_ISS; /* ISS provided in CPL */ } - if (V_tcp_do_ecn && th->th_flags & (TH_ECE | TH_CWR)) + if (tcpopt->sack && (s->sack > 0 || (s->sack < 0 && V_tcp_do_rfc1323))) + opt2 |= F_SACK_EN; + + if (tcpopt->tstamp && + (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323))) + opt2 |= F_TSTAMPS_EN; + + if (tcpopt->wsf < 15 && V_tcp_do_rfc1323) + opt2 |= F_WND_SCALE_EN; + + if (th->th_flags & (TH_ECE | TH_CWR) && + (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn))) opt2 |= F_CCTRL_ECN; - /* RX_COALESCE is always a valid value (0 or M_RX_COALESCE). */ - if (is_t4(sc)) - opt2 |= F_RX_COALESCE_VALID; + /* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */ + + opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]); + + /* These defaults are subject to ULP specific fixups later. */ + opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0); + + opt2 |= V_PACE(0); + + if (s->cong_algo >= 0) + opt2 |= V_CONG_CNTRL(s->cong_algo); + else if (sc->tt.cong_algorithm >= 0) + opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); else { - opt2 |= F_T5_OPT_2_VALID; - opt2 |= F_T5_ISS; + if (strcasecmp(cc->name, "reno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_RENO); + else if (strcasecmp(cc->name, "tahoe") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_TAHOE); + if (strcasecmp(cc->name, "newreno") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + if (strcasecmp(cc->name, "highspeed") == 0) + opt2 |= V_CONG_CNTRL(CONG_ALG_HIGHSPEED); + else { + /* + * Use newreno in case the algorithm selected by the + * host stack is not supported by the hardware. + */ + opt2 |= V_CONG_CNTRL(CONG_ALG_NEWRENO); + } } - if (sc->tt.rx_coalesce) + + if (s->rx_coalesce > 0 || (s->rx_coalesce < 0 && sc->tt.rx_coalesce)) opt2 |= V_RX_COALESCE(M_RX_COALESCE); - if (sc->tt.cong_algorithm != -1) - opt2 |= V_CONG_CNTRL(sc->tt.cong_algorithm & M_CONG_CNTRL); + /* Note that ofld_rxq is already set according to s->rxq. */ + opt2 |= F_RSS_QUEUE_VALID; + opt2 |= V_RSS_QUEUE(ofld_rxq->iq.abs_id); #ifdef USE_DDP_RX_FLOW_CONTROL if (ulp_mode == ULP_MODE_TCPDDP) - opt2 |= F_RX_FC_VALID | F_RX_FC_DDP; + opt2 |= F_RX_FC_DDP; #endif + if (ulp_mode == ULP_MODE_TLS) { - opt2 |= F_RX_FC_VALID; opt2 &= ~V_RX_COALESCE(M_RX_COALESCE); opt2 |= F_RX_FC_DISABLE; } - return htobe32(opt2); + return (htobe32(opt2)); } static void @@ -1196,6 +1255,7 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); #endif + struct offload_settings settings; KASSERT(opcode == CPL_PASS_ACCEPT_REQ, ("%s: unexpected opcode 0x%x", __func__, opcode)); @@ -1331,15 +1391,23 @@ found: REJECT_PASS_ACCEPT(); } so = inp->inp_socket; + rw_rlock(&sc->policy_lock); + settings = *lookup_offload_policy(sc, OPEN_TYPE_PASSIVE, m, 0xffff, inp); + rw_runlock(&sc->policy_lock); + if (!settings.offload) { + INP_WUNLOCK(inp); + free(wr, M_CXGBE); + REJECT_PASS_ACCEPT(); + } - mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss)); + mtu_idx = find_best_mtu_idx(sc, &inc, &settings); rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0; SOCKBUF_LOCK(&so->so_rcv); /* opt0 rcv_bufsiz initially, assumes its normal meaning later */ rx_credits = min(select_rcv_wnd(so) >> 10, M_RCV_BUFSIZ); SOCKBUF_UNLOCK(&so->so_rcv); - save_qids_in_mbuf(m, vi); + save_qids_in_mbuf(m, vi, &settings); get_qids_from_mbuf(m, NULL, &rxqid); if (is_t4(sc)) @@ -1349,7 +1417,7 @@ found: INIT_TP_WR_MIT_CPL(rpl5, CPL_PASS_ACCEPT_RPL, tid); } - ulp_mode = select_ulp_mode(so, sc); + ulp_mode = select_ulp_mode(so, sc, &settings); switch (ulp_mode) { case ULP_MODE_TCPDDP: synqe->flags |= TPF_SYNQE_TCPDDP; @@ -1358,8 +1426,10 @@ found: synqe->flags |= TPF_SYNQE_TLS; break; } - rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode); - rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode); + rpl->opt0 = calc_opt0(so, vi, e, mtu_idx, rscale, rx_credits, ulp_mode, + &settings); + rpl->opt2 = calc_opt2p(sc, pi, rxqid, &cpl->tcpopt, &th, ulp_mode, + CC_ALGO(intotcpcb(inp)), &settings); synqe->tid = tid; synqe->lctx = lctx; Modified: stable/11/sys/dev/cxgbe/tom/t4_tom.c ============================================================================== --- stable/11/sys/dev/cxgbe/tom/t4_tom.c Sun Apr 28 04:05:43 2019 (r346804) +++ stable/11/sys/dev/cxgbe/tom/t4_tom.c Sun Apr 28 06:51:59 2019 (r346805) @@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$"); #include <sys/taskqueue.h> #include <net/if.h> #include <net/if_var.h> +#include <net/if_types.h> +#include <net/if_vlan_var.h> #include <netinet/in.h> #include <netinet/in_pcb.h> #include <netinet/in_var.h> @@ -134,15 +136,11 @@ alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, txsd_total = tx_credits / howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16); - if (txqid < 0) - txqid = (arc4random() % vi->nofldtxq) + vi->first_ofld_txq; KASSERT(txqid >= vi->first_ofld_txq && txqid < vi->first_ofld_txq + vi->nofldtxq, ("%s: txqid %d for vi %p (first %d, n %d)", __func__, txqid, vi, vi->first_ofld_txq, vi->nofldtxq)); - if (rxqid < 0) - rxqid = (arc4random() % vi->nofldrxq) + vi->first_ofld_rxq; KASSERT(rxqid >= vi->first_ofld_rxq && rxqid < vi->first_ofld_rxq + vi->nofldrxq, ("%s: rxqid %d for vi %p (first %d, n %d)", __func__, rxqid, vi, @@ -158,6 +156,7 @@ alloc_toepcb(struct vi_info *vi, int txqid, int rxqid, refcount_init(&toep->refcount, 1); toep->td = sc->tom_softc; toep->vi = vi; + toep->tc_idx = -1; toep->tx_total = tx_credits; toep->tx_credits = tx_credits; toep->ofld_txq = &sc->sge.ofld_txq[txqid]; @@ -316,6 +315,10 @@ release_offload_resources(struct toepcb *toep) if (toep->ce) release_lip(td, toep->ce); +#ifdef RATELIMIT + if (toep->tc_idx != -1) + t4_release_cl_rl_kbps(sc, toep->vi->pi->port_id, toep->tc_idx); +#endif mtx_lock(&td->toep_list_lock); TAILQ_REMOVE(&td->toep_list, toep, link); mtx_unlock(&td->toep_list_lock); @@ -483,27 +486,28 @@ queue_tid_release(struct adapter *sc, int tid) } /* - * What mtu_idx to use, given a 4-tuple and/or an MSS cap + * What mtu_idx to use, given a 4-tuple. Note that both s->mss and tcp_mssopt + * have the MSS that we should advertise in our SYN. Advertised MSS doesn't + * account for any TCP options so the effective MSS (only payload, no headers or + * options) could be different. We fill up tp->t_maxseg with the effective MSS + * at the end of the 3-way handshake. */ int -find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, int pmss) +find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc, + struct offload_settings *s) { unsigned short *mtus = &sc->params.mtus[0]; - int i, mss, n; + int i, mss, mtu; - KASSERT(inc != NULL || pmss > 0, - ("%s: at least one of inc/pmss must be specified", __func__)); + MPASS(inc != NULL); - mss = inc ? tcp_mssopt(inc) : pmss; - if (pmss > 0 && mss > pmss) - mss = pmss; - + mss = s->mss > 0 ? s->mss : tcp_mssopt(inc); if (inc->inc_flags & INC_ISIPV6) - n = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr); else - n = sizeof(struct ip) + sizeof(struct tcphdr); + mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr); - for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mss + n; i++) + for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++) continue; return (i); @@ -546,34 +550,33 @@ select_rcv_wscale(void) */ uint64_t calc_opt0(struct socket *so, struct vi_info *vi, struct l2t_entry *e, - int mtu_idx, int rscale, int rx_credits, int ulp_mode) + int mtu_idx, int rscale, int rx_credits, int ulp_mode, + struct offload_settings *s) { + int keepalive; uint64_t opt0; *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201904280651.x3S6pxhY008312>