Date:      Fri, 18 Sep 2020 03:01:47 +0000 (UTC)
From:      Navdeep Parhar <np@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r365871 - in head: share/man/man4 sys/dev/cxgbe sys/dev/cxgbe/common sys/dev/cxgbe/firmware
Message-ID:  <202009180301.08I31lhU021051@repo.freebsd.org>

Author: np
Date: Fri Sep 18 03:01:47 2020
New Revision: 365871
URL: https://svnweb.freebsd.org/changeset/base/365871

Log:
  cxgbe(4): add support for stateless offloads for VXLAN traffic.
  
  Hardware assistance includes checksumming (tx and rx), TSO, and RSS on
  the inner traffic in a VXLAN tunnel.
  
  Relnotes:	Yes
  Sponsored by:	Chelsio Communications
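
  A VXLAN-encapsulated TCP segment carries two full header stacks, and the
  inner offloads listed here require the NIC to parse all of them (and, for
  TSO, to replicate and fix them up on every segment it emits).  A minimal
  sketch of the arithmetic involved, assuming option-free IPv4 headers
  throughout:

	#include <stdio.h>

	int
	main(void)
	{
		const int eth = 14, ip4 = 20, udp = 8, vxlan = 8, tcp = 20;
		const int outer = eth + ip4 + udp + vxlan;	/* 50 */
		const int inner = eth + ip4 + tcp;		/* 54 */

		printf("headers before payload: %d bytes\n", outer + inner);
		return (0);	/* prints 104 */
	}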

Modified:
  head/share/man/man4/cxgbe.4
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/common/common.h
  head/sys/dev/cxgbe/common/t4_hw.c
  head/sys/dev/cxgbe/firmware/t6fw_cfg.txt
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sge.c

Modified: head/share/man/man4/cxgbe.4
==============================================================================
--- head/share/man/man4/cxgbe.4	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/share/man/man4/cxgbe.4	Fri Sep 18 03:01:47 2020	(r365871)
@@ -31,7 +31,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd Dec 10, 2019
+.Dd September 17, 2020
 .Dt CXGBE 4
 .Os
 .Sh NAME
@@ -61,8 +61,8 @@ driver provides support for PCI Express Ethernet adapt
 the Chelsio Terminator 4, Terminator 5, and Terminator 6 ASICs (T4, T5, and T6).
 The driver supports Jumbo Frames, Transmit/Receive checksum offload,
 TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN
-tag insertion/extraction, VLAN checksum offload, VLAN TSO, and
-Receive Side Steering (RSS).
+tag insertion/extraction, VLAN checksum offload, VLAN TSO, VXLAN checksum
+offload, VXLAN TSO, and Receive Side Steering (RSS).
 For further hardware information and questions related to hardware
 requirements, see
 .Pa http://www.chelsio.com/ .

Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/adapter.h	Fri Sep 18 03:01:47 2020	(r365871)
@@ -119,6 +119,7 @@ enum {
 	TX_SGL_SEGS = 39,
 	TX_SGL_SEGS_TSO = 38,
 	TX_SGL_SEGS_EO_TSO = 30,	/* XXX: lower for IPv6. */
+	TX_SGL_SEGS_VXLAN_TSO = 37,
 	TX_WR_FLITS = SGE_MAX_WR_LEN / 8
 };
 
@@ -286,6 +287,7 @@ struct port_info {
 	int nvi;
 	int up_vis;
 	int uld_vis;
+	bool vxlan_tcam_entry;
 
 	struct tx_sched_params *sched_params;
 
@@ -593,6 +595,8 @@ struct sge_txq {
 	uint64_t txpkts0_pkts;	/* # of frames in type0 coalesced tx WRs */
 	uint64_t txpkts1_pkts;	/* # of frames in type1 coalesced tx WRs */
 	uint64_t raw_wrs;	/* # of raw work requests (alloc_wr_mbuf) */
+	uint64_t vxlan_tso_wrs;	/* # of VXLAN TSO work requests */
+	uint64_t vxlan_txcsum;	/* # of times hw assisted with inner csum (VXLAN) */
 
 	uint64_t kern_tls_records;
 	uint64_t kern_tls_short;
@@ -625,6 +629,7 @@ struct sge_rxq {
 
 	uint64_t rxcsum;	/* # of times hardware assisted with checksum */
 	uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
+	uint64_t vxlan_rxcsum;	/* # of times hw assisted with inner csum (VXLAN) */
 
 	/* stats for not-that-common events */
 
@@ -847,6 +852,11 @@ struct adapter {
 	struct sge sge;
 	int lro_timeout;
 	int sc_do_rxcopy;
+
+	int vxlan_port;		/* UDP port for VXLAN traffic */
+	u_int vxlan_refcount;	/* # of VXLAN starts seen for vxlan_port */
+	int rawf_base;		/* first raw MAC filter (from firmware) */
+	int nrawf;		/* # of raw MAC filters */
 
 	struct taskqueue *tq[MAX_NCHAN];	/* General purpose taskqueues */
 	struct task async_event_task;
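
The TX_SGL_SEGS* constants above can be sanity-checked against the len16
arithmetic in t4_sge.c (txpkt_len16(), further down).  A standalone sketch;
the 512-byte work-request limit and the struct sizes are assumptions from a
reading of the shared firmware headers, not values taken from this diff:

	#include <stdio.h>

	#define WR_HDR		16	/* fw_eth_tx_pkt_wr, assumed */
	#define CPL_PKT		16	/* cpl_tx_pkt_core, assumed */
	#define ULPTX_SGL	16	/* holds the first segment */
	#define LSO_CPL		16	/* cpl_tx_pkt_lso_core, assumed */
	#define TNL_LSO_CPL	32	/* cpl_tx_tnl_lso, assumed */
	#define MAX_WR_LEN	512	/* SGE_MAX_WR_LEN, assumed */

	static int
	wr_len(int nsegs, int extra)
	{
		nsegs--;	/* first segment lives in the ulptx_sgl */
		return (extra + WR_HDR + CPL_PKT + ULPTX_SGL +
		    8 * ((3 * nsegs) / 2 + (nsegs & 1)));
	}

	static int
	max_segs(int extra)
	{
		int n;

		for (n = 1; wr_len(n + 1, extra) <= MAX_WR_LEN; n++)
			continue;
		return (n);
	}

	int
	main(void)
	{
		printf("plain %d, TSO %d, VXLAN TSO %d\n", max_segs(0),
		    max_segs(LSO_CPL), max_segs(TNL_LSO_CPL));
		return (0);	/* prints 39, 38, 37 */
	}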

Modified: head/sys/dev/cxgbe/common/common.h
==============================================================================
--- head/sys/dev/cxgbe/common/common.h	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/common/common.h	Fri Sep 18 03:01:47 2020	(r365871)
@@ -249,7 +249,7 @@ struct tp_params {
 	uint32_t max_rx_pdu;
 	uint32_t max_tx_pdu;
 	uint64_t hash_filter_mask;
-	__be16 err_vec_mask;
+	bool rx_pkt_encap;
 
 	int8_t fcoe_shift;
 	int8_t port_shift;

Modified: head/sys/dev/cxgbe/common/t4_hw.c
==============================================================================
--- head/sys/dev/cxgbe/common/t4_hw.c	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/common/t4_hw.c	Fri Sep 18 03:01:47 2020	(r365871)
@@ -9647,19 +9647,11 @@ int t4_init_tp_params(struct adapter *adap, bool sleep
 
 	read_filter_mode_and_ingress_config(adap, sleep_ok);
 
-	/*
-	 * Cache a mask of the bits that represent the error vector portion of
-	 * rx_pkt.err_vec.  T6+ can use a compressed error vector to make room
-	 * for information about outer encapsulation (GENEVE/VXLAN/NVGRE).
-	 */
-	tpp->err_vec_mask = htobe16(0xffff);
 	if (chip_id(adap) > CHELSIO_T5) {
 		v = t4_read_reg(adap, A_TP_OUT_CONFIG);
-		if (v & F_CRXPKTENC) {
-			tpp->err_vec_mask =
-			    htobe16(V_T6_COMPR_RXERR_VEC(M_T6_COMPR_RXERR_VEC));
-		}
-	}
+		tpp->rx_pkt_encap = v & F_CRXPKTENC;
+	} else
+		tpp->rx_pkt_encap = false;
 
 	rx_len = t4_read_reg(adap, A_TP_PMM_RX_PAGE_SIZE);
 	tx_len = t4_read_reg(adap, A_TP_PMM_TX_PAGE_SIZE);
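
For readers without the hardware headers: on T6 the 16-bit err_vec field of
the rx CPL can be "compressed" so that tunnel metadata shares the word with
a narrower error field.  rx_pkt_encap records whether that mode is on, and
eth_rx() in t4_sge.c (below) decodes the word with the G_T6_* macros.  An
illustrative decode with made-up field positions (the real layout lives in
the hardware headers, not here):

	#include <stdio.h>
	#include <stdint.h>

	/* Hypothetical stand-ins for the G_T6_* extraction macros. */
	#define EX_ERR(x)	((x) & 0x3f)
	#define EX_TNL_TYPE(x)	(((x) >> 6) & 0x3)
	#define EX_TNL_LEN(x)	(((x) >> 8) & 0xff)

	int
	main(void)
	{
		/* 50-byte tunnel header, type 1 (say, VXLAN), no error. */
		uint16_t ev = (50 << 8) | (1 << 6) | 0;

		printf("err %u, tnl_type %u, tnlhdr_len %u\n",
		    EX_ERR(ev), EX_TNL_TYPE(ev), EX_TNL_LEN(ev));
		return (0);
	}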

Modified: head/sys/dev/cxgbe/firmware/t6fw_cfg.txt
==============================================================================
--- head/sys/dev/cxgbe/firmware/t6fw_cfg.txt	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/firmware/t6fw_cfg.txt	Fri Sep 18 03:01:47 2020	(r365871)
@@ -146,7 +146,8 @@
 	nethctrl = 1024
 	neq = 2048
 	nqpcq = 8192
-	nexactf = 456
+	nexactf = 454
+	nrawf = 2
 	cmask = all
 	pmask = all
 	ncrypto_lookaside = 16
@@ -272,7 +273,7 @@
 
 [fini]
 	version = 0x1
-	checksum = 0x13640470
+	checksum = 0xa92352a8
 #
 # $FreeBSD$
 #

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/t4_main.c	Fri Sep 18 03:01:47 2020	(r365871)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/priv.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
+#include <sys/eventhandler.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
@@ -1069,6 +1070,8 @@ t4_attach(device_t dev)
 	TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
 #endif
 
+	refcount_init(&sc->vxlan_refcount, 0);
+
 	rc = t4_map_bars_0_and_4(sc);
 	if (rc != 0)
 		goto done; /* error message displayed already */
@@ -1716,6 +1719,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 	struct ifnet *ifp;
 	struct sbuf *sb;
 	struct pfil_head_args pa;
+	struct adapter *sc = vi->adapter;
 
 	vi->xact_addr_filt = -1;
 	callout_init(&vi->tick, 1);
@@ -1749,28 +1753,36 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 
 	ifp->if_capabilities = T4_CAP;
 	ifp->if_capenable = T4_CAP_ENABLE;
+	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
+	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
+	if (chip_id(sc) >= CHELSIO_T6) {
+		ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+		ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+		ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
+		    CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+		    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN;
+	}
+
 #ifdef TCP_OFFLOAD
-	if (vi->nofldrxq != 0 && (vi->adapter->flags & KERN_TLS_OK) == 0)
+	if (vi->nofldrxq != 0 && (sc->flags & KERN_TLS_OK) == 0)
 		ifp->if_capabilities |= IFCAP_TOE;
 #endif
 #ifdef RATELIMIT
-	if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) {
+	if (is_ethoffload(sc) && vi->nofldtxq != 0) {
 		ifp->if_capabilities |= IFCAP_TXRTLMT;
 		ifp->if_capenable |= IFCAP_TXRTLMT;
 	}
 #endif
-	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
-	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
 
 	ifp->if_hw_tsomax = IP_MAXPACKET;
 	ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
 #ifdef RATELIMIT
-	if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0)
+	if (is_ethoffload(sc) && vi->nofldtxq != 0)
 		ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
 #endif
 	ifp->if_hw_tsomaxsegsize = 65536;
 #ifdef KERN_TLS
-	if (vi->adapter->flags & KERN_TLS_OK) {
+	if (sc->flags & KERN_TLS_OK) {
 		ifp->if_capabilities |= IFCAP_TXTLS;
 		ifp->if_capenable |= IFCAP_TXTLS;
 	}
@@ -2100,6 +2112,17 @@ cxgbe_ioctl(struct ifnet *ifp, unsigned long cmd, cadd
 		if (mask & IFCAP_TXTLS)
 			ifp->if_capenable ^= (mask & IFCAP_TXTLS);
 #endif
+		if (mask & IFCAP_VXLAN_HWCSUM) {
+			ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
+			ifp->if_hwassist ^= CSUM_INNER_IP6_UDP |
+			    CSUM_INNER_IP6_TCP | CSUM_INNER_IP |
+			    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP;
+		}
+		if (mask & IFCAP_VXLAN_HWTSO) {
+			ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
+			ifp->if_hwassist ^= CSUM_INNER_IP6_TSO |
+			    CSUM_INNER_IP_TSO;
+		}
 
 #ifdef VLAN_CAPABILITIES
 		VLAN_CAPABILITIES(ifp);
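
The toggle pattern above keeps if_capenable and if_hwassist in lockstep: the
same XOR is applied to the capability bit and to its CSUM_INNER_* flags, so
flipping a capability twice restores the original state, provided the two
start out consistent (which cxgbe_vi_attach() arranges).  A minimal model of
just that invariant, with stand-in bit values:

	#include <stdio.h>
	#include <stdint.h>

	#define CAP_VXLAN_HWCSUM	0x1	/* stand-in IFCAP bit */
	#define HWA_INNER_CSUM		0x6	/* stand-in CSUM_INNER_* set */

	int
	main(void)
	{
		uint32_t capenable = CAP_VXLAN_HWCSUM;
		uint32_t hwassist = HWA_INNER_CSUM;
		uint32_t mask = CAP_VXLAN_HWCSUM;	/* requested ^ current */
		int i;

		for (i = 0; i < 2; i++) {
			if (mask & CAP_VXLAN_HWCSUM) {
				capenable ^= CAP_VXLAN_HWCSUM;
				hwassist ^= HWA_INNER_CSUM;
			}
			printf("capenable %#x, hwassist %#x\n",
			    capenable, hwassist);
		}
		return (0);	/* second toggle restores the first state */
	}

From userland this would presumably be exercised with ifconfig(8)'s
vxlanhwcsum/vxlanhwtso capability flags, assuming ifconfig exposes these
IFCAP bits under those names.
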
@@ -4411,6 +4434,19 @@ get_params__post_init(struct adapter *sc)
 			MPASS(sc->tids.hpftid_base == 0);
 			MPASS(sc->tids.tid_base == sc->tids.nhpftids);
 		}
+
+		param[0] = FW_PARAM_PFVF(RAWF_START);
+		param[1] = FW_PARAM_PFVF(RAWF_END);
+		rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
+		if (rc != 0) {
+			device_printf(sc->dev,
+			   "failed to query rawf parameters: %d.\n", rc);
+			return (rc);
+		}
+		if ((int)val[1] > (int)val[0]) {
+			sc->rawf_base = val[0];
+			sc->nrawf = val[1] - val[0] + 1;
+		}
 	}
 
 	/*
@@ -5142,6 +5178,7 @@ update_mac_settings(struct ifnet *ifp, int flags)
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
+	uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 	KASSERT(flags, ("%s: not told what to update.", __func__));
@@ -5215,7 +5252,7 @@ update_mac_settings(struct ifnet *ifp, int flags)
 				rc = -rc;
 				for (j = 0; j < ctx.i; j++) {
 					if_printf(ifp,
-					    "failed to add mc address"
+					    "failed to add mcast address"
 					    " %02x:%02x:%02x:"
 					    "%02x:%02x:%02x rc=%d\n",
 					    ctx.mcaddr[j][0], ctx.mcaddr[j][1],
@@ -5225,14 +5262,36 @@ update_mac_settings(struct ifnet *ifp, int flags)
 				}
 				return (rc);
 			}
+			ctx.del = 0;
 		} else
 			NET_EPOCH_EXIT(et);
 
 		rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0);
 		if (rc != 0)
-			if_printf(ifp, "failed to set mc address hash: %d", rc);
+			if_printf(ifp, "failed to set mcast address hash: %d\n",
+			    rc);
+		if (ctx.del == 0) {
+			/* We clobbered the VXLAN entry if there was one. */
+			pi->vxlan_tcam_entry = false;
+		}
 	}
 
+	if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 &&
+	    pi->vxlan_tcam_entry == false) {
+		rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac,
+		    match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
+		    true);
+		if (rc < 0) {
+			rc = -rc;
+			if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n",
+			    rc);
+		} else {
+			MPASS(rc == sc->rawf_base + pi->port_id);
+			rc = 0;
+			pi->vxlan_tcam_entry = true;
+		}
+	}
+
 	return (rc);
 }
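
A note on the match-all entry: t4_alloc_raw_mac_filt() is passed an
all-zeros address and an all-zeros mask, which under the usual value/mask
TCAM convention matches every destination MAC.  A minimal model of that
convention (the real lookup is in hardware, and its exact mask semantics are
an assumption here):

	#include <stdio.h>

	#define ETHER_ADDR_LEN	6

	static int
	tcam_match(const unsigned char *addr, const unsigned char *value,
	    const unsigned char *mask)
	{
		int i;

		for (i = 0; i < ETHER_ADDR_LEN; i++) {
			if ((addr[i] & mask[i]) != (value[i] & mask[i]))
				return (0);
		}
		return (1);
	}

	int
	main(void)
	{
		unsigned char zero[ETHER_ADDR_LEN] = { 0 };
		unsigned char mac[ETHER_ADDR_LEN] =
		    { 0x00, 0x07, 0x43, 0x12, 0x34, 0x56 };

		/* All-zeros value and mask match any address. */
		printf("%d\n", tcam_match(mac, zero, zero));	/* 1 */
		return (0);
	}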
 
@@ -10407,6 +10466,7 @@ clear_stats(struct adapter *sc, u_int port_id)
 #endif
 				rxq->rxcsum = 0;
 				rxq->vlan_extraction = 0;
+				rxq->vxlan_rxcsum = 0;
 
 				rxq->fl.cl_allocated = 0;
 				rxq->fl.cl_recycled = 0;
@@ -10425,6 +10485,8 @@ clear_stats(struct adapter *sc, u_int port_id)
 				txq->txpkts0_pkts = 0;
 				txq->txpkts1_pkts = 0;
 				txq->raw_wrs = 0;
+				txq->vxlan_tso_wrs = 0;
+				txq->vxlan_txcsum = 0;
 				txq->kern_tls_records = 0;
 				txq->kern_tls_short = 0;
 				txq->kern_tls_partial = 0;
@@ -11235,6 +11297,116 @@ DB_FUNC(tcb, db_show_t4tcb, db_t4_table, CS_OWN, NULL)
 }
 #endif
 
+static eventhandler_tag vxlan_start_evtag;
+static eventhandler_tag vxlan_stop_evtag;
+
+struct vxlan_evargs {
+	struct ifnet *ifp;
+	uint16_t port;
+};
+
+static void
+t4_vxlan_start(struct adapter *sc, void *arg)
+{
+	struct vxlan_evargs *v = arg;
+	struct port_info *pi;
+	uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
+	int i, rc;
+
+	if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
+		return;
+	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0)
+		return;
+
+	if (sc->vxlan_refcount == 0) {
+		sc->vxlan_port = v->port;
+		sc->vxlan_refcount = 1;
+		t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE,
+		    V_VXLAN(v->port) | F_VXLAN_EN);
+		for_each_port(sc, i) {
+			pi = sc->port[i];
+			if (pi->vxlan_tcam_entry == true)
+				continue;
+			rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid,
+			    match_all_mac, match_all_mac,
+			    sc->rawf_base + pi->port_id, 1, pi->port_id, true);
+			if (rc < 0) {
+				rc = -rc;
+				log(LOG_ERR,
+				    "%s: failed to add VXLAN TCAM entry: %d.\n",
+				    device_get_name(pi->vi[0].dev), rc);
+			} else {
+				MPASS(rc == sc->rawf_base + pi->port_id);
+				rc = 0;
+				pi->vxlan_tcam_entry = true;
+			}
+		}
+	} else if (sc->vxlan_port == v->port) {
+		sc->vxlan_refcount++;
+	} else {
+		log(LOG_ERR, "%s: VXLAN already configured on port %d; "
+		    "ignoring attempt to configure it on port %d\n",
+		    device_get_nameunit(sc->dev), sc->vxlan_port, v->port);
+	}
+	end_synchronized_op(sc, 0);
+}
+
+static void
+t4_vxlan_stop(struct adapter *sc, void *arg)
+{
+	struct vxlan_evargs *v = arg;
+
+	if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
+		return;
+	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0)
+		return;
+
+	/*
+	 * VXLANs may have been configured before the driver was loaded so we
+	 * may see more stops than starts.  This is not handled cleanly but at
+	 * least we keep the refcount sane.
+	 */
+	if (sc->vxlan_port != v->port)
+		goto done;
+	if (sc->vxlan_refcount == 0) {
+		log(LOG_ERR,
+		    "%s: VXLAN operation on port %d was stopped earlier; "
+		    "ignoring attempt to stop it again.\n",
+		    device_get_nameunit(sc->dev), sc->vxlan_port);
+	} else if (--sc->vxlan_refcount == 0) {
+		t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0);
+	}
+done:
+	end_synchronized_op(sc, 0);
+}
+
+static void
+t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp,
+    sa_family_t family, u_int port)
+{
+	struct vxlan_evargs v;
+
+	MPASS(family == AF_INET || family == AF_INET6);
+	v.ifp = ifp;
+	v.port = port;
+
+	t4_iterate(t4_vxlan_start, &v);
+}
+
+static void
+t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family,
+    u_int port)
+{
+	struct vxlan_evargs v;
+
+	MPASS(family == AF_INET || family == AF_INET6);
+	v.ifp = ifp;
+	v.port = port;
+
+	t4_iterate(t4_vxlan_stop, &v);
+}
+
+
 static struct sx mlu;	/* mod load unload */
 SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
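
The handlers above implement a small refcounted state machine per adapter:
the first start for a port programs A_MPS_RX_VXLAN_TYPE and installs the
match-all TCAM entries, later starts for the same port only bump the count,
a second distinct port is rejected, and stray stops (see the comment in
t4_vxlan_stop()) are absorbed without driving the count negative.  A
userland model of just the accounting:

	#include <stdio.h>

	static int vxlan_port, vxlan_refcount;

	static void
	start(int port)
	{
		if (vxlan_refcount == 0) {
			vxlan_port = port;
			vxlan_refcount = 1;	/* would program the hw here */
		} else if (vxlan_port == port)
			vxlan_refcount++;
		/* else: a second VXLAN port is logged and ignored */
	}

	static void
	stop(int port)
	{
		if (vxlan_port != port)
			return;
		if (vxlan_refcount > 0 && --vxlan_refcount == 0)
			;	/* would clear F_VXLAN_EN here */
	}

	int
	main(void)
	{
		stop(4789);	/* stray stop: ignored */
		start(4789);
		start(4789);
		stop(4789);
		printf("refcount %d, port %d\n", vxlan_refcount, vxlan_port);
		return (0);	/* prints "refcount 1, port 4789" */
	}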
 
@@ -11278,6 +11450,14 @@ mod_event(module_t mod, int cmd, void *arg)
 #endif
 			t4_tracer_modload();
 			tweak_tunables();
+			vxlan_start_evtag =
+			    EVENTHANDLER_REGISTER(vxlan_start,
+				t4_vxlan_start_handler, NULL,
+				EVENTHANDLER_PRI_ANY);
+			vxlan_stop_evtag =
+			    EVENTHANDLER_REGISTER(vxlan_stop,
+				t4_vxlan_stop_handler, NULL,
+				EVENTHANDLER_PRI_ANY);
 		}
 		sx_xunlock(&mlu);
 		break;
@@ -11314,6 +11494,10 @@ mod_event(module_t mod, int cmd, void *arg)
 			sx_sunlock(&t4_list_lock);
 
 			if (t4_sge_extfree_refs() == 0) {
+				EVENTHANDLER_DEREGISTER(vxlan_start,
+				    vxlan_start_evtag);
+				EVENTHANDLER_DEREGISTER(vxlan_stop,
+				    vxlan_stop_evtag);
 				t4_tracer_modunload();
 #ifdef KERN_TLS
 				t6_ktls_modunload();

Modified: head/sys/dev/cxgbe/t4_sge.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sge.c	Fri Sep 18 02:37:57 2020	(r365870)
+++ head/sys/dev/cxgbe/t4_sge.c	Fri Sep 18 03:01:47 2020	(r365871)
@@ -55,6 +55,7 @@ __FBSDID("$FreeBSD$");
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
+#include <net/if_vxlan.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
@@ -266,8 +267,9 @@ static int find_refill_source(struct adapter *, int, b
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
-static inline u_int txpkt_len16(u_int, u_int);
-static inline u_int txpkt_vm_len16(u_int, u_int);
+static inline u_int txpkt_len16(u_int, const u_int);
+static inline u_int txpkt_vm_len16(u_int, const u_int);
+static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
@@ -1917,13 +1919,42 @@ eth_rx(struct adapter *sc, struct sge_rxq *rxq, const 
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
+	uint16_t err_vec, tnl_type, tnlhdr_len;
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
+	static const int sw_csum_flags[2][2] = {
+		{
+			/* IP, inner IP */
+			CSUM_ENCAP_VXLAN |
+			    CSUM_L3_CALC | CSUM_L3_VALID |
+			    CSUM_L4_CALC | CSUM_L4_VALID |
+			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
 
+			/* IP, inner IP6 */
+			CSUM_ENCAP_VXLAN |
+			    CSUM_L3_CALC | CSUM_L3_VALID |
+			    CSUM_L4_CALC | CSUM_L4_VALID |
+			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+		},
+		{
+			/* IP6, inner IP */
+			CSUM_ENCAP_VXLAN |
+			    CSUM_L4_CALC | CSUM_L4_VALID |
+			    CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+
+			/* IP6, inner IP6 */
+			CSUM_ENCAP_VXLAN |
+			    CSUM_L4_CALC | CSUM_L4_VALID |
+			    CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+		},
+	};
+
 	MPASS(plen > sc->params.sge.fl_pktshift);
 	if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
 	    __predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
@@ -1963,23 +1994,73 @@ have_mbuf:
 	m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
 
 	cpl = (const void *)(&d->rss + 1);
-	if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
-		if (ifp->if_capenable & IFCAP_RXCSUM &&
-		    cpl->l2info & htobe32(F_RXF_IP)) {
-			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
-			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+	if (sc->params.tp.rx_pkt_encap) {
+		const uint16_t ev = be16toh(cpl->err_vec);
+
+		err_vec = G_T6_COMPR_RXERR_VEC(ev);
+		tnl_type = G_T6_RX_TNL_TYPE(ev);
+		tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
+	} else {
+		err_vec = be16toh(cpl->err_vec);
+		tnl_type = 0;
+		tnlhdr_len = 0;
+	}
+	if (cpl->csum_calc && err_vec == 0) {
+		int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
+
+		/* checksum(s) calculated and found to be correct. */
+
+		MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
+		    (cpl->l2info & htobe32(F_RXF_IP6)));
+		m0->m_pkthdr.csum_data = be16toh(cpl->csum);
+		if (tnl_type == 0) {
+			if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
+				m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+				    CSUM_L3_VALID | CSUM_L4_CALC |
+				    CSUM_L4_VALID;
+			} else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+				m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+				    CSUM_L4_VALID;
+			}
 			rxq->rxcsum++;
-		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
-		    cpl->l2info & htobe32(F_RXF_IP6)) {
-			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
-			    CSUM_PSEUDO_HDR);
-			rxq->rxcsum++;
-		}
+		} else {
+			MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
+			if (__predict_false(cpl->ip_frag)) {
+				/*
+				 * csum_data is for the inner frame (which is an
+				 * IP fragment) and is not 0xffff.  There is no
+				 * way to pass the inner csum_data to the stack.
+				 * We don't want the stack to use the inner
+				 * csum_data to validate the outer frame or it
+				 * will get rejected.  So we fix csum_data here
+				 * and let sw do the checksum of inner IP
+				 * fragments.
+				 *
+				 * XXX: Need 32b for csum_data2 in an rx mbuf.
+				 * Maybe stuff it into rcv_tstmp?
+				 */
+				m0->m_pkthdr.csum_data = 0xffff;
+				if (ipv6) {
+					m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+					    CSUM_L4_VALID;
+				} else {
+					m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+					    CSUM_L3_VALID | CSUM_L4_CALC |
+					    CSUM_L4_VALID;
+				}
+			} else {
+				int outer_ipv6;
 
-		if (__predict_false(cpl->ip_frag))
-			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
-		else
-			m0->m_pkthdr.csum_data = 0xffff;
+				MPASS(m0->m_pkthdr.csum_data == 0xffff);
+
+				outer_ipv6 = tnlhdr_len >=
+				    sizeof(struct ether_header) +
+				    sizeof(struct ip6_hdr);
+				m0->m_pkthdr.csum_flags =
+				    sw_csum_flags[outer_ipv6][ipv6];
+			}
+			rxq->vxlan_rxcsum++;
+		}
 	}
 
 	if (cpl->vlan_ex) {
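
One detail worth unpacking in the block above: an option-free outer IPv4
tunnel header (Ethernet + IP + UDP + VXLAN) is 14 + 20 + 8 + 8 = 50 bytes
while the IPv6 equivalent is 70, so comparing tnlhdr_len against
sizeof(struct ether_header) + sizeof(struct ip6_hdr) = 54 is what separates
the two outer address families before indexing sw_csum_flags.  A sketch of
the 2x2 selection, with stand-in table entries:

	#include <stdio.h>

	#define ETHER_HDR	14
	#define IP6_HDR		40

	static const char *sw_csum_flags[2][2] = {
		{ "outer IP, inner IP",  "outer IP, inner IP6" },
		{ "outer IP6, inner IP", "outer IP6, inner IP6" },
	};

	int
	main(void)
	{
		int tnlhdr_len = 70;	/* as decoded from the rx CPL */
		int ipv6 = 0;		/* inner, from F_RXF_IP6 */
		int outer_ipv6 = tnlhdr_len >= ETHER_HDR + IP6_HDR;

		printf("%s\n", sw_csum_flags[outer_ipv6][ipv6]);
		return (0);	/* prints "outer IP6, inner IP" */
	}
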
@@ -2007,7 +2088,7 @@ have_mbuf:
 	m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
 #endif
 #if defined(INET) || defined(INET6)
-	if (rxq->iq.flags & IQ_LRO_ENABLED &&
+	if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
 	    (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
 	    M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
 		if (sort_before_lro(lro)) {
@@ -2179,10 +2260,10 @@ mbuf_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
-	KASSERT(m->m_pkthdr.l5hlen > 0,
+	KASSERT(m->m_pkthdr.inner_l5hlen > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
-	return (m->m_pkthdr.l5hlen);
+	return (m->m_pkthdr.inner_l5hlen);
 }
 
 static inline void
@@ -2190,7 +2271,7 @@ set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
-	m->m_pkthdr.l5hlen = nsegs;
+	m->m_pkthdr.inner_l5hlen = nsegs;
 }
 
 static inline int
@@ -2316,63 +2397,108 @@ alloc_wr_mbuf(int len, int how)
 	return (m);
 }
 
-static inline int
+static inline bool
 needs_hwcsum(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
+	    CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+	    CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
+	    CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
+	    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
-	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP |
-	    CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
+static inline bool
 needs_tso(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
+	    CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
-	return (m->m_pkthdr.csum_flags & CSUM_TSO);
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
+static inline bool
+needs_vxlan_csum(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+
+	return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);
+}
+
+static inline bool
+needs_vxlan_tso(struct mbuf *m)
+{
+	const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
+	    CSUM_INNER_IP6_TSO;
+
+	M_ASSERTPKTHDR(m);
+
+	return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
+	    (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
+}
+
+static inline bool
+needs_inner_tcp_csum(struct mbuf *m)
+{
+	const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
+
+	M_ASSERTPKTHDR(m);
+
+	return (m->m_pkthdr.csum_flags & csum_flags);
+}
+
+static inline bool
 needs_l3_csum(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
+	    CSUM_INNER_IP_TSO;
 
 	M_ASSERTPKTHDR(m);
 
-	return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
-needs_tcp_csum(struct mbuf *m)
+static inline bool
+needs_outer_tcp_csum(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
+	    CSUM_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
-	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
+
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
 #ifdef RATELIMIT
-static inline int
-needs_l4_csum(struct mbuf *m)
+static inline bool
+needs_outer_l4_csum(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
+	    CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
 
 	M_ASSERTPKTHDR(m);
 
-	return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
-	    CSUM_TCP_IPV6 | CSUM_TSO));
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 
-static inline int
-needs_udp_csum(struct mbuf *m)
+static inline bool
+needs_outer_udp_csum(struct mbuf *m)
 {
+	const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
 
 	M_ASSERTPKTHDR(m);
-	return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
+
+	return (m->m_pkthdr.csum_flags & csum_flags);
 }
 #endif
 
-static inline int
+static inline bool
 needs_vlan_insertion(struct mbuf *m)
 {
 
@@ -2513,6 +2639,23 @@ count_mbuf_nsegs(struct mbuf *m, int skip, uint8_t *cf
 }
 
 /*
+ * The maximum number of segments that can fit in a WR.
+ */
+static int
+max_nsegs_allowed(struct mbuf *m)
+{
+
+	if (needs_tso(m)) {
+		if (needs_vxlan_tso(m))
+			return (TX_SGL_SEGS_VXLAN_TSO);
+		else
+			return (TX_SGL_SEGS_TSO);
+	}
+
+	return (TX_SGL_SEGS);
+}
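
max_nsegs_allowed() leans on the shape of the predicates above:
needs_vxlan_tso() is true only when CSUM_ENCAP_VXLAN is set together with at
least one inner-TSO flag, so checksum-only VXLAN traffic keeps the full
TX_SGL_SEGS budget.  A quick check of that logic with stand-in flag values:

	#include <stdio.h>
	#include <stdint.h>

	#define ENCAP_VXLAN	0x1	/* stand-in CSUM_ENCAP_VXLAN */
	#define INNER_IP_TSO	0x2	/* stand-in CSUM_INNER_IP_TSO */
	#define INNER_IP6_TSO	0x4	/* stand-in CSUM_INNER_IP6_TSO */

	static int
	needs_vxlan_tso(uint32_t flags)
	{
		const uint32_t f = ENCAP_VXLAN | INNER_IP_TSO | INNER_IP6_TSO;

		return ((flags & f) != 0 && (flags & f) != ENCAP_VXLAN);
	}

	int
	main(void)
	{
		printf("%d\n", needs_vxlan_tso(ENCAP_VXLAN));		/* 0 */
		printf("%d\n", needs_vxlan_tso(ENCAP_VXLAN |
		    INNER_IP_TSO));					/* 1 */
		return (0);
	}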
+
+/*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
@@ -2570,7 +2713,7 @@ restart:
 		return (0);
 	}
 #endif
-	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
+	if (nsegs > max_nsegs_allowed(m0)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
@@ -2592,18 +2735,15 @@ restart:
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	set_mbuf_cflags(m0, cflags);
-	if (sc->flags & IS_VF)
-		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
-	else
-		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
+	calculate_mbuf_len16(sc, m0);
 
 #ifdef RATELIMIT
 	/*
 	 * Ethofld is limited to TCP and UDP for now, and only when L4 hw
-	 * checksumming is enabled.  needs_l4_csum happens to check for all the
-	 * right things.
+	 * checksumming is enabled.  needs_outer_l4_csum happens to check for
+	 * all the right things.
 	 */
-	if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
+	if (__predict_false(needs_eo(cst) && !needs_outer_l4_csum(m0))) {
 		m_snd_tag_rele(m0->m_pkthdr.snd_tag);
 		m0->m_pkthdr.snd_tag = NULL;
 		m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
@@ -2635,21 +2775,27 @@ restart:
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
-	{
-		struct ip6_hdr *ip6 = l3hdr;
-
-		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
-
-		m0->m_pkthdr.l3hlen = sizeof(*ip6);
+		m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
 		break;
-	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
-		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
+		if (needs_vxlan_csum(m0)) {
+			/* Driver will do the outer IP hdr checksum. */
+			ip->ip_sum = 0;
+			if (needs_vxlan_tso(m0)) {
+				const uint16_t ipl = ip->ip_len;
+
+				ip->ip_len = 0;
+				ip->ip_sum = ~in_cksum_hdr(ip);
+				ip->ip_len = ipl;
+			} else
+				ip->ip_sum = in_cksum_hdr(ip);
+		}
+		m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
 		break;
 	}
 #endif
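
The ip_len juggling above relies on the Internet checksum being linear under
one's-complement addition: a sum computed with ip_len = 0 can be finalized
later by folding in the real length, which is presumably what consumes the
stored ~in_cksum_hdr() value in the TSO path (an assumption; only the
zeroing itself is visible in this diff).  A standalone demonstration of the
RFC 1071 arithmetic:

	#include <stdio.h>
	#include <stdint.h>

	/* One's-complement sum of big-endian 16-bit words, not inverted. */
	static uint32_t
	ocsum(const uint8_t *p, int len)
	{
		uint32_t s = 0;
		int i;

		for (i = 0; i < len; i += 2)
			s += (uint32_t)p[i] << 8 | p[i + 1];
		while (s > 0xffff)
			s = (s & 0xffff) + (s >> 16);
		return (s);
	}

	int
	main(void)
	{
		/* IPv4 header: ip_len 0x05dc at bytes 2-3, ip_sum zeroed. */
		uint8_t h[20] = { 0x45, 0x00, 0x05, 0xdc, 0x12, 0x34,
		    0x40, 0x00, 0x40, 0x06, 0x00, 0x00, 0x0a, 0x00, 0x00,
		    0x01, 0x0a, 0x00, 0x00, 0x02 };
		uint16_t full = ~ocsum(h, 20) & 0xffff; /* in_cksum_hdr() */
		uint32_t s;

		h[2] = h[3] = 0;		/* compute with ip_len = 0 */
		s = ocsum(h, 20) + 0x05dc;	/* fold real length back in */
		while (s > 0xffff)
			s = (s & 0xffff) + (s >> 16);
		printf("%s\n", (uint16_t)(~s & 0xffff) == full ?
		    "match" : "mismatch");	/* prints "match" */
		return (0);
	}
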
@@ -2659,8 +2805,59 @@ restart:
 		    __func__, eh_type);
 	}
 
+	if (needs_vxlan_csum(m0)) {
+		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
+		m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
+
+		/* Inner headers. */
+		eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
+		    sizeof(struct udphdr) + sizeof(struct vxlan_header));
+		eh_type = ntohs(eh->ether_type);
+		if (eh_type == ETHERTYPE_VLAN) {
+			struct ether_vlan_header *evh = (void *)eh;
+
+			eh_type = ntohs(evh->evl_proto);
+			m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
+		} else
+			m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
+		l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
+
+		switch (eh_type) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+			m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
+			break;
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+		{
+			struct ip *ip = l3hdr;
+
+			m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
+			break;
+		}
+#endif
+		default:
+			panic("%s: VXLAN hw offload requested with unknown "
+			    "ethertype 0x%04x.  if_cxgbe must be compiled"
+			    " with the same INET/INET6 options as the kernel.",
+			    __func__, eh_type);
+		}
 #if defined(INET) || defined(INET6)
-	if (needs_tcp_csum(m0)) {
+		if (needs_inner_tcp_csum(m0)) {
+			tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
+			m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
+		}
+#endif
+		MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+		m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
+		    CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
+		    CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
+		    CSUM_ENCAP_VXLAN;
+	}
+
+#if defined(INET) || defined(INET6)
+	if (needs_outer_tcp_csum(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 #ifdef RATELIMIT
@@ -2670,7 +2867,7 @@ restart:
 			    V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
 		} else
 			set_mbuf_eo_tsclk_tsoff(m0, 0);
-	} else if (needs_udp_csum(m0)) {
+	} else if (needs_outer_udp_csum(m0)) {
 		m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
 #endif
 	}
@@ -3627,6 +3824,9 @@ alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
+	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum",
+	    CTLFLAG_RD, &rxq->vxlan_rxcsum,
+	    "# of times hardware assisted with inner checksums (VXLAN)");
 
 	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
@@ -4281,6 +4481,11 @@ alloc_txq(struct vi_info *vi, struct sge_txq *txq, int
 	    "# of frames tx'd using type1 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
 	    &txq->raw_wrs, "# of raw work requests (non-packets)");
+	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",
+	    CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
+	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum",
+	    CTLFLAG_RD, &txq->vxlan_txcsum,
+	    "# of times hardware assisted with inner checksums (VXLAN)");
 
 #ifdef KERN_TLS
 	if (sc->flags & KERN_TLS_OK) {
@@ -4570,27 +4775,25 @@ get_pkt_gl(struct mbuf *m, struct sglist *gl)
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
-	KASSERT(gl->sg_nseg > 0 &&
-	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
+	KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
-		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
+		gl->sg_nseg, max_nsegs_allowed(m)));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
-txpkt_len16(u_int nsegs, u_int tso)
+txpkt_len16(u_int nsegs, const u_int extra)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
-	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
+	n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
+	    sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
-	if (tso)
-		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
@@ -4600,22 +4803,43 @@ txpkt_len16(u_int nsegs, u_int tso)
  * request header.
  */
 static inline u_int
-txpkt_vm_len16(u_int nsegs, u_int tso)
+txpkt_vm_len16(u_int nsegs, const u_int extra)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
-	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
+	n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
-	if (tso)
-		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


