Date:      Thu, 24 Aug 2017 22:33:43 +0000 (UTC)
From:      David C Somayajulu <davidcs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r322864 - in stable/10/sys: dev/qlnx/qlnxe modules/qlnx/qlnxe
Message-ID:  <201708242233.v7OMXhOL022509@repo.freebsd.org>

Author: davidcs
Date: Thu Aug 24 22:33:42 2017
New Revision: 322864
URL: https://svnweb.freebsd.org/changeset/base/322864

Log:
  MFC r322408
  Performance enhancements to reduce CPU utilization for a large number of TCP
  connections (on the order of tens of thousands) with predominantly transmit
  traffic.
  
  Submitted by:	Vaishali.Kulkarni@cavium.com
  Approved by:	re(marius)

Modified:
  stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h
  stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c
  stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h
  stable/10/sys/modules/qlnx/qlnxe/Makefile
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h
==============================================================================
--- stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h	Thu Aug 24 22:11:10 2017	(r322863)
+++ stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h	Thu Aug 24 22:33:42 2017	(r322864)
@@ -50,9 +50,10 @@ struct qlnx_ivec {
 
 typedef struct qlnx_ivec qlnx_ivec_t;
 
-//#define QLNX_MAX_RSS	30
-#define QLNX_MAX_RSS	16
-#define QLNX_MAX_TC	1
+//#define QLNX_MAX_RSS		30
+#define QLNX_MAX_RSS		36
+#define QLNX_DEFAULT_RSS	16
+#define QLNX_MAX_TC		1
 
 enum QLNX_STATE {
         QLNX_STATE_CLOSED,
@@ -201,6 +202,17 @@ struct qlnx_fastpath {
 	uint64_t		tx_pkts_freed;
 	uint64_t		tx_pkts_transmitted;
 	uint64_t		tx_pkts_completed;
+	uint64_t		tx_tso_pkts;
+	uint64_t		tx_non_tso_pkts;
+
+#ifdef QLNX_TRACE_PERF_DATA
+	uint64_t		tx_pkts_trans_ctx;
+	uint64_t		tx_pkts_compl_ctx;
+	uint64_t		tx_pkts_trans_fp;
+	uint64_t		tx_pkts_compl_fp;
+	uint64_t		tx_pkts_compl_intr;
+#endif
+
 	uint64_t		tx_lso_wnd_min_len;
 	uint64_t		tx_defrag;
 	uint64_t		tx_nsegs_gt_elem_left;
@@ -209,6 +221,13 @@ struct qlnx_fastpath {
 	uint32_t		tx_tso_max_pkt_len;
 	uint32_t		tx_tso_min_pkt_len;
 	uint64_t		tx_pkts[QLNX_FP_MAX_SEGS];
+
+#ifdef QLNX_TRACE_PERF_DATA
+	uint64_t		tx_pkts_hist[QLNX_FP_MAX_SEGS];
+	uint64_t		tx_comInt[QLNX_FP_MAX_SEGS];
+	uint64_t		tx_pkts_q[QLNX_FP_MAX_SEGS];
+#endif
+
 	uint64_t		err_tx_nsegs_gt_elem_left;
         uint64_t                err_tx_dmamap_create;
         uint64_t                err_tx_defrag_dmamap_load;
@@ -301,8 +320,13 @@ typedef struct qlnx_link_output qlnx_link_output_t;
 #define QLNX_MFW_VERSION_LENGTH 32
 #define QLNX_STORMFW_VERSION_LENGTH 32
 
-#define QLNX_TX_ELEM_RESERVE	2
+#define QLNX_TX_ELEM_RESERVE		2
+#define QLNX_TX_ELEM_THRESH		128
+#define QLNX_TX_ELEM_MAX_THRESH		512
+#define QLNX_TX_ELEM_MIN_THRESH		32
+#define QLNX_TX_COMPL_THRESH		32
 
+
 #define QLNX_TPA_MAX_AGG_BUFFERS             (20)
 
 #define QLNX_MAX_NUM_MULTICAST_ADDRS	ECORE_MAX_MC_ADDRS
@@ -454,6 +478,7 @@ struct qlnx_host {
 	qlnx_storm_stats_t	storm_stats[QLNX_STORM_STATS_TOTAL];
 	uint32_t		storm_stats_index;
 	uint32_t		storm_stats_enable;
+	uint32_t		storm_stats_gather;
 
 	uint32_t		personality;
 };
@@ -470,8 +495,11 @@ typedef struct qlnx_host qlnx_host_t;
 
 #define QLNX_MAX_MTU			9000
 #define QLNX_MAX_SEGMENTS_NON_TSO	(ETH_TX_MAX_BDS_PER_NON_LSO_PACKET - 1)
-#define QLNX_MAX_TSO_FRAME_SIZE		((64 * 1024 - 1) + 22)
+//#define QLNX_MAX_TSO_FRAME_SIZE		((64 * 1024 - 1) + 22)
+#define QLNX_MAX_TSO_FRAME_SIZE		65536
+#define QLNX_MAX_TX_MBUF_SIZE		65536    /* bytes - bd_len = 16bits */
 
+
 #define QL_MAC_CMP(mac1, mac2)    \
         ((((*(uint32_t *) mac1) == (*(uint32_t *) mac2) && \
         (*(uint16_t *)(mac1 + 4)) == (*(uint16_t *)(mac2 + 4)))) ? 0 : 1)
@@ -702,6 +730,18 @@ extern void qlnx_fill_link(struct ecore_hwfn *hwfn,
 #define CQE_HAS_VLAN(flags) \
         ((flags) & (PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK \
                 << PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT))
+
+#if defined(__i386__) || defined(__amd64__)
+
+static __inline
+void prefetch(void *x)
+{
+        __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
+}
+
+#else
+#define prefetch(x)
+#endif
 
 
 #endif /* #ifndef _QLNX_DEF_H_ */

Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c
==============================================================================
--- stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c	Thu Aug 24 22:11:10 2017	(r322863)
+++ stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c	Thu Aug 24 22:33:42 2017	(r322864)
@@ -94,6 +94,8 @@ static int qlnx_get_ifq_snd_maxlen(qlnx_host_t *ha);
 static uint32_t qlnx_get_optics(qlnx_host_t *ha,
 			struct qlnx_link_output *if_link);
 static int qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp);
+static int qlnx_transmit_locked(struct ifnet *ifp, struct qlnx_fastpath *fp,
+		struct mbuf *mp);
 static void qlnx_qflush(struct ifnet *ifp);
 
 static int qlnx_alloc_parent_dma_tag(qlnx_host_t *ha);
@@ -133,6 +135,8 @@ static void qlnx_timer(void *arg);
 static int qlnx_alloc_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp);
 static void qlnx_free_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp);
 static void qlnx_trigger_dump(qlnx_host_t *ha);
+static uint16_t qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp,
+			struct qlnx_tx_queue *txq);
 static void qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
 		struct qlnx_tx_queue *txq);
 static int qlnx_rx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, int budget,
@@ -215,6 +219,12 @@ char qlnx_name_str[NAME_SIZE];
 #define QLOGIC_PCI_DEVICE_ID_8070	0x8070
 #endif
 
+SYSCTL_NODE(_hw, OID_AUTO, qlnxe, CTLFLAG_RD, 0, "qlnxe driver parameters");
+/* Number of Queues: 0 (Auto) or 1 to 32 (fixed queue number) */
+static int qlnxe_queue_count = QLNX_DEFAULT_RSS;
+SYSCTL_INT(_hw_qlnxe, OID_AUTO, queue_count, CTLFLAG_RDTUN,
+		&qlnxe_queue_count, 0, "Multi-Queue queue count");
+
 static int
 qlnx_valid_device(device_t dev)
 {
@@ -302,7 +312,26 @@ qlnx_pci_probe(device_t dev)
         return (BUS_PROBE_DEFAULT);
 }
 
+static uint16_t
+qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp,
+	struct qlnx_tx_queue *txq)
+{
+	u16 hw_bd_cons;
+	u16 ecore_cons_idx;
+	uint16_t diff;
 
+	hw_bd_cons = le16toh(*txq->hw_cons_ptr);
+
+	ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl);
+	if (hw_bd_cons < ecore_cons_idx) {
+		diff = (1 << 16) - (ecore_cons_idx - hw_bd_cons);
+	} else {
+		diff = hw_bd_cons - ecore_cons_idx;
+	}
+	return diff;
+}
+
+
 static void
 qlnx_sp_intr(void *arg)
 {
@@ -395,14 +424,11 @@ qlnx_fp_taskqueue(void *context, int pending)
         struct qlnx_fastpath	*fp;
         qlnx_host_t		*ha;
         struct ifnet		*ifp;
-        struct mbuf		*mp;
-        int			ret;
-	struct thread		*cthread;
 
 #ifdef QLNX_RCV_IN_TASKQ
 	int			lro_enable;
 	int			rx_int = 0, total_rx_count = 0;
-
+	struct thread		*cthread;
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
         fp = context;
@@ -410,6 +436,12 @@ qlnx_fp_taskqueue(void *context, int pending)
         if (fp == NULL)
                 return;
 
+        ha = (qlnx_host_t *)fp->edev;
+
+        ifp = ha->ifp;
+
+#ifdef QLNX_RCV_IN_TASKQ
+
 	cthread = curthread;
 
 	thread_lock(cthread);
@@ -419,112 +451,81 @@ qlnx_fp_taskqueue(void *context, int pending)
 
 	thread_unlock(cthread);
 
-        ha = (qlnx_host_t *)fp->edev;
+	lro_enable = ifp->if_capenable & IFCAP_LRO;
 
-        ifp = ha->ifp;
+	rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable);
 
-#ifdef QLNX_RCV_IN_TASKQ
-	{
-		lro_enable = ifp->if_capenable & IFCAP_LRO;
+	if (rx_int) {
+		fp->rx_pkts += rx_int;
+		total_rx_count += rx_int;
+	}
 
-		rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable);
-
-		if (rx_int) {
-			fp->rx_pkts += rx_int;
-			total_rx_count += rx_int;
-		}
-
 #ifdef QLNX_SOFT_LRO
-		{
-			struct lro_ctrl *lro;
-	
-			lro = &fp->rxq->lro;
+	{
+		struct lro_ctrl *lro;
 
-			if (lro_enable && total_rx_count) {
+		lro = &fp->rxq->lro;
 
+		if (lro_enable && total_rx_count) {
+
 #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO)
 
-				if (ha->dbg_trace_lro_cnt) {
-					if (lro->lro_mbuf_count & ~1023)
-						fp->lro_cnt_1024++;
-					else if (lro->lro_mbuf_count & ~511)
-						fp->lro_cnt_512++;
-					else if (lro->lro_mbuf_count & ~255)
-						fp->lro_cnt_256++;
-					else if (lro->lro_mbuf_count & ~127)
-						fp->lro_cnt_128++;
-					else if (lro->lro_mbuf_count & ~63)
-						fp->lro_cnt_64++;
-				}
-				tcp_lro_flush_all(lro);
+			if (ha->dbg_trace_lro_cnt) {
+				if (lro->lro_mbuf_count & ~1023)
+					fp->lro_cnt_1024++;
+				else if (lro->lro_mbuf_count & ~511)
+					fp->lro_cnt_512++;
+				else if (lro->lro_mbuf_count & ~255)
+					fp->lro_cnt_256++;
+				else if (lro->lro_mbuf_count & ~127)
+					fp->lro_cnt_128++;
+				else if (lro->lro_mbuf_count & ~63)
+					fp->lro_cnt_64++;
+			}
+			tcp_lro_flush_all(lro);
 
 #else
-				struct lro_entry *queued;
+			struct lro_entry *queued;
 
-				while ((!SLIST_EMPTY(&lro->lro_active))) {
-					queued = SLIST_FIRST(&lro->lro_active);
-					SLIST_REMOVE_HEAD(&lro->lro_active, next);
-					tcp_lro_flush(lro, queued);
-				}
-#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
+			while ((!SLIST_EMPTY(&lro->lro_active))) {
+				queued = SLIST_FIRST(&lro->lro_active);
+				SLIST_REMOVE_HEAD(&lro->lro_active, next);
+				tcp_lro_flush(lro, queued);
 			}
+#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */
 		}
+	}
 #endif /* #ifdef QLNX_SOFT_LRO */
 
-		ecore_sb_update_sb_idx(fp->sb_info);
-		rmb();
-	}
+	ecore_sb_update_sb_idx(fp->sb_info);
+	rmb();
 
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
-        mtx_lock(&fp->tx_mtx);
+        if(ifp->if_drv_flags & IFF_DRV_RUNNING) {
 
-        if (((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
-                IFF_DRV_RUNNING) || (!ha->link_up)) {
+                if (!drbr_empty(ifp, fp->tx_br)) {
 
-                mtx_unlock(&fp->tx_mtx);
-                goto qlnx_fp_taskqueue_exit;
-        }
+                        if(mtx_trylock(&fp->tx_mtx)) {
 
-        mp = drbr_peek(ifp, fp->tx_br);
+#ifdef QLNX_TRACE_PERF_DATA
+                                tx_pkts = fp->tx_pkts_transmitted;
+                                tx_compl = fp->tx_pkts_completed;
+#endif
 
-        while (mp != NULL) {
+                                qlnx_transmit_locked(ifp, fp, NULL);
 
-		if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-			ret = qlnx_send(ha, fp, &mp);
-		} else {
-			ret = -1;
-		}
-
-                if (ret) {
-
-                        if (mp != NULL) {
-                                drbr_putback(ifp, fp->tx_br, mp);
-                        } else {
-                                fp->tx_pkts_processed++;
-                                drbr_advance(ifp, fp->tx_br);
+#ifdef QLNX_TRACE_PERF_DATA
+                                fp->tx_pkts_trans_fp +=
+					(fp->tx_pkts_transmitted - tx_pkts);
+                                fp->tx_pkts_compl_fp +=
+					(fp->tx_pkts_completed - tx_compl);
+#endif
+                                mtx_unlock(&fp->tx_mtx);
                         }
-
-                        mtx_unlock(&fp->tx_mtx);
-
-                        goto qlnx_fp_taskqueue_exit;
-
-                } else {
-                        drbr_advance(ifp, fp->tx_br);
-                        fp->tx_pkts_transmitted++;
-                        fp->tx_pkts_processed++;
                 }
-
-		if (fp->tx_ring_full)
-			break;
-
-                mp = drbr_peek(ifp, fp->tx_br);
         }
 
-        mtx_unlock(&fp->tx_mtx);
-
-qlnx_fp_taskqueue_exit:
-
 #ifdef QLNX_RCV_IN_TASKQ
 	if (rx_int) {
 		if (fp->fp_taskqueue != NULL)
@@ -537,7 +538,7 @@ qlnx_fp_taskqueue_exit:
 	}
 #endif /* #ifdef QLNX_RCV_IN_TASKQ */
 
-        QL_DPRINT2(ha, "exit ret = %d\n", ret);
+        QL_DPRINT2(ha, "exit \n");
         return;
 }
 
@@ -611,6 +612,17 @@ qlnx_drain_fp_taskqueues(qlnx_host_t *ha)
 	return;
 }
 
+static void
+qlnx_get_params(qlnx_host_t *ha)
+{
+	if ((qlnxe_queue_count < 0) || (qlnxe_queue_count > QLNX_MAX_RSS)) {
+		device_printf(ha->pci_dev, "invalid queue_count value (%d)\n",
+			qlnxe_queue_count);
+		qlnxe_queue_count = 0;
+	}
+	return;
+}
+
 /*
  * Name:	qlnx_pci_attach
  * Function:	attaches the device to the operating system
@@ -706,10 +718,21 @@ qlnx_pci_attach(device_t dev)
 	if (qlnx_init_hw(ha) != 0)
 		goto qlnx_pci_attach_err;
 
+	qlnx_get_params(ha);
+
+	if((pci_get_device(dev) == QLOGIC_PCI_DEVICE_ID_1644) &&
+		(qlnxe_queue_count == QLNX_DEFAULT_RSS)) {
+		qlnxe_queue_count = QLNX_MAX_RSS;
+	}
+
 	/*
 	 * Allocate MSI-x vectors
 	 */
-	ha->num_rss = QLNX_MAX_RSS;
+	if(qlnxe_queue_count == 0)
+		ha->num_rss = QLNX_DEFAULT_RSS;
+	 else
+		ha->num_rss = qlnxe_queue_count;
+
 	ha->num_tc = QLNX_MAX_TC;
 
         ha->msix_count = pci_msix_count(dev);
@@ -1236,6 +1259,44 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
 			CTLFLAG_RD, &ha->fp_array[i].tx_pkts_completed,
 			"No. of transmit completions");
 
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_non_tso_pkts",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_non_tso_pkts,
+                        "No. of non-LSO transmitted packets");
+
+#ifdef QLNX_TRACE_PERF_DATA
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_trans_ctx",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_ctx,
+                        "No. of transmitted packets in transmit context");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_ctx",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_ctx,
+                        "No. of transmit completions in transmit context");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_trans_fp",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_fp,
+                        "No. of transmitted packets in taskqueue");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_fp",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_fp,
+                        "No. of transmit completions in taskqueue");
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_pkts_compl_intr",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_intr,
+                        "No. of transmit completions in interrupt ctx");
+#endif
+
+                SYSCTL_ADD_QUAD(ctx, node_children,
+                        OID_AUTO, "tx_tso_pkts",
+                        CTLFLAG_RD, &ha->fp_array[i].tx_tso_pkts,
+                        "No. of LSO transmitted packets");
+
 		SYSCTL_ADD_QUAD(ctx, node_children,
 			OID_AUTO, "tx_lso_wnd_min_len",
 			CTLFLAG_RD, &ha->fp_array[i].tx_lso_wnd_min_len,
@@ -1284,6 +1345,39 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha)
 				&ha->fp_array[i].tx_pkts[j], name_str);
 		}
 
+#ifdef QLNX_TRACE_PERF_DATA
+                for (j = 0; j < 18; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_pkts_hist_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_pkts_hist[j], name_str);
+                }
+                for (j = 0; j < 5; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_comInt_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_comInt[j], name_str);
+                }
+                for (j = 0; j < 18; j++) {
+
+                        bzero(name_str, (sizeof(uint8_t) * sizeof(name_str)));
+                        snprintf(name_str, sizeof(name_str),
+                                "tx_pkts_q_%02d", (j+1));
+
+                        SYSCTL_ADD_QUAD(ctx, node_children,
+                                OID_AUTO, name_str, CTLFLAG_RD,
+                                &ha->fp_array[i].tx_pkts_q[j], name_str);
+                }
+#endif
+
 		SYSCTL_ADD_QUAD(ctx, node_children,
 			OID_AUTO, "err_tx_nsegs_gt_elem_left",
 			CTLFLAG_RD, &ha->fp_array[i].err_tx_nsegs_gt_elem_left,
@@ -1979,6 +2073,12 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha)
 	ifp->if_capabilities |= IFCAP_TSO6;
 	ifp->if_capabilities |= IFCAP_LRO;
 
+	ifp->if_hw_tsomax =  QLNX_MAX_TSO_FRAME_SIZE -
+				(ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+	ifp->if_hw_tsomaxsegcount = QLNX_MAX_SEGMENTS - 1 /* hdr */;
+	ifp->if_hw_tsomaxsegsize = QLNX_MAX_TX_MBUF_SIZE;
+
+
         ifp->if_capenable = ifp->if_capabilities;
 
 	ifp->if_hwassist = CSUM_IP;
@@ -2543,6 +2643,7 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
 	u16 hw_bd_cons;
 	u16 ecore_cons_idx;
 	uint16_t diff;
+	uint16_t idx, idx2;
 
 	hw_bd_cons = le16toh(*txq->hw_cons_ptr);
 
@@ -2580,6 +2681,11 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
 			qlnx_trigger_dump(ha);
 		}
 
+		idx = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1);
+		idx2 = (txq->sw_tx_cons + 2) & (TX_RING_SIZE - 1);
+		prefetch(txq->sw_tx_ring[idx].mp);
+		prefetch(txq->sw_tx_ring[idx2].mp);
+
 		qlnx_free_tx_pkt(ha, fp, txq);
 
 		txq->sw_tx_cons = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1);
@@ -2588,12 +2694,71 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp,
 }
 
 static int
+qlnx_transmit_locked(struct ifnet *ifp,struct qlnx_fastpath  *fp, struct mbuf  *mp)
+{
+        int                     ret = 0;
+        struct qlnx_tx_queue    *txq;
+        qlnx_host_t *           ha;
+        uint16_t elem_left;
+
+        txq = fp->txq[0];
+        ha = (qlnx_host_t *)fp->edev;
+
+
+        if ((!(ifp->if_drv_flags & IFF_DRV_RUNNING)) || (!ha->link_up)) {
+                if(mp != NULL)
+                        ret = drbr_enqueue(ifp, fp->tx_br, mp);
+                return (ret);
+        }
+
+        if(mp != NULL)
+                ret  = drbr_enqueue(ifp, fp->tx_br, mp);
+
+        mp = drbr_peek(ifp, fp->tx_br);
+
+        while (mp != NULL) {
+
+                if (qlnx_send(ha, fp, &mp)) {
+
+                        if (mp != NULL) {
+                                drbr_putback(ifp, fp->tx_br, mp);
+                        } else {
+                                fp->tx_pkts_processed++;
+                                drbr_advance(ifp, fp->tx_br);
+                        }
+                        goto qlnx_transmit_locked_exit;
+
+                } else {
+                        drbr_advance(ifp, fp->tx_br);
+                        fp->tx_pkts_transmitted++;
+                        fp->tx_pkts_processed++;
+                }
+
+                mp = drbr_peek(ifp, fp->tx_br);
+        }
+
+qlnx_transmit_locked_exit:
+        if((qlnx_num_tx_compl(ha,fp, fp->txq[0]) > QLNX_TX_COMPL_THRESH) ||
+                ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))
+                                        < QLNX_TX_ELEM_MAX_THRESH))
+                (void)qlnx_tx_int(ha, fp, fp->txq[0]);
+
+        QL_DPRINT2(ha, "%s: exit ret = %d\n", __func__, ret);
+        return ret;
+}
+
+
+static int
 qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp)
 {
         qlnx_host_t		*ha = (qlnx_host_t *)ifp->if_softc;
         struct qlnx_fastpath	*fp;
         int			rss_id = 0, ret = 0;
 
+#ifdef QLNX_TRACE_PERF_DATA
+        uint64_t tx_pkts = 0, tx_compl = 0;
+#endif
+
         QL_DPRINT2(ha, "enter\n");
 
 #if __FreeBSD_version >= 1100000
@@ -2611,15 +2776,27 @@ qlnx_transmit(struct ifnet *ifp, struct mbuf  *mp)
                 goto qlnx_transmit_exit;
         }
 
-        if (mp != NULL) {
-                ret = drbr_enqueue(ifp, fp->tx_br, mp);
-        }
+        if (mtx_trylock(&fp->tx_mtx)) {
 
-        if (fp->fp_taskqueue != NULL)
-                taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
+#ifdef QLNX_TRACE_PERF_DATA
+                        tx_pkts = fp->tx_pkts_transmitted;
+                        tx_compl = fp->tx_pkts_completed;
+#endif
 
-        ret = 0;
+                        ret = qlnx_transmit_locked(ifp, fp, mp);
 
+#ifdef QLNX_TRACE_PERF_DATA
+                        fp->tx_pkts_trans_ctx += (fp->tx_pkts_transmitted - tx_pkts);
+                        fp->tx_pkts_compl_ctx += (fp->tx_pkts_completed - tx_compl);
+#endif
+                        mtx_unlock(&fp->tx_mtx);
+        } else {
+                if (mp != NULL && (fp->fp_taskqueue != NULL)) {
+                        ret = drbr_enqueue(ifp, fp->tx_br, mp);
+                        taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
+                }
+        }
+
 qlnx_transmit_exit:
 
         QL_DPRINT2(ha, "exit ret = %d\n", ret);
@@ -2799,6 +2976,10 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 	uint32_t		nbds_in_hdr = 0;
 	uint32_t		offset = 0;
 
+#ifdef QLNX_TRACE_PERF_DATA
+        uint16_t                bd_used;
+#endif
+
 	QL_DPRINT8(ha, "enter\n");
 
 	if (!ha->link_up)
@@ -2811,15 +2992,15 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 
 	txq = fp->txq[0];
 
-	if (fp->tx_ring_full) {
-		elem_left = ecore_chain_get_elem_left(&txq->tx_pbl);
+        if ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) <
+		QLNX_TX_ELEM_MIN_THRESH) {
 
-		if (elem_left < (TX_RING_SIZE >> 4)) 
-			return (-1);
-		else 
-			fp->tx_ring_full = 0;
-	}
+                fp->tx_nsegs_gt_elem_left++;
+                fp->err_tx_nsegs_gt_elem_left++;
 
+                return (ENOBUFS);
+        }
+
 	idx = txq->sw_tx_prod;
 
 	map = txq->sw_tx_ring[idx].map;
@@ -2829,14 +3010,18 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 			BUS_DMA_NOWAIT);
 
 	if (ha->dbg_trace_tso_pkt_len) {
-		if (!fp->tx_tso_min_pkt_len) {
-			fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-			fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-		} else {
-			if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
+		if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+			if (!fp->tx_tso_min_pkt_len) {
 				fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
-			if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
-				fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len;
+				fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len;
+			} else {
+				if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len)
+					fp->tx_tso_min_pkt_len =
+						m_head->m_pkthdr.len;
+				if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len)
+					fp->tx_tso_max_pkt_len =
+						m_head->m_pkthdr.len;
+			}
 		}
 	}
 
@@ -2923,6 +3108,105 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 			fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++; 
 	}
 
+#ifdef QLNX_TRACE_PERF_DATA
+        if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+                if(m_head->m_pkthdr.len <= 2048)
+                        fp->tx_pkts_hist[0]++;
+                else if((m_head->m_pkthdr.len > 2048) &&
+				(m_head->m_pkthdr.len <= 4096))
+                        fp->tx_pkts_hist[1]++;
+                else if((m_head->m_pkthdr.len > 4096) &&
+				(m_head->m_pkthdr.len <= 8192))
+                        fp->tx_pkts_hist[2]++;
+                else if((m_head->m_pkthdr.len > 8192) &&
+				(m_head->m_pkthdr.len <= 12288 ))
+                        fp->tx_pkts_hist[3]++;
+                else if((m_head->m_pkthdr.len > 12288) &&
+				(m_head->m_pkthdr.len <= 16384))
+                        fp->tx_pkts_hist[4]++;
+                else if((m_head->m_pkthdr.len > 16384) &&
+				(m_head->m_pkthdr.len <= 20480))
+                        fp->tx_pkts_hist[5]++;
+                else if((m_head->m_pkthdr.len > 20480) &&
+				(m_head->m_pkthdr.len <= 24576))
+                        fp->tx_pkts_hist[6]++;
+                else if((m_head->m_pkthdr.len > 24576) &&
+				(m_head->m_pkthdr.len <= 28672))
+                        fp->tx_pkts_hist[7]++;
+                else if((m_head->m_pkthdr.len > 28672) &&
+				(m_head->m_pkthdr.len <= 32768))
+                        fp->tx_pkts_hist[8]++;
+                else if((m_head->m_pkthdr.len > 32768) &&
+				(m_head->m_pkthdr.len <= 36864))
+                        fp->tx_pkts_hist[9]++;
+                else if((m_head->m_pkthdr.len > 36864) &&
+				(m_head->m_pkthdr.len <= 40960))
+                        fp->tx_pkts_hist[10]++;
+                else if((m_head->m_pkthdr.len > 40960) &&
+				(m_head->m_pkthdr.len <= 45056))
+                        fp->tx_pkts_hist[11]++;
+                else if((m_head->m_pkthdr.len > 45056) &&
+				(m_head->m_pkthdr.len <= 49152))
+                        fp->tx_pkts_hist[12]++;
+                else if((m_head->m_pkthdr.len > 49152) &&
+				(m_head->m_pkthdr.len <= 53248))
+                        fp->tx_pkts_hist[13]++;
+                else if((m_head->m_pkthdr.len > 53248) &&
+				(m_head->m_pkthdr.len <= 57344))
+                        fp->tx_pkts_hist[14]++;
+                else if((m_head->m_pkthdr.len > 53248) &&
+				(m_head->m_pkthdr.len <= 57344))
+                        fp->tx_pkts_hist[15]++;
+                else if((m_head->m_pkthdr.len > 57344) &&
+				(m_head->m_pkthdr.len <= 61440))
+                        fp->tx_pkts_hist[16]++;
+                else
+                        fp->tx_pkts_hist[17]++;
+        }
+
+        if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
+
+                elem_left =  ecore_chain_get_elem_left(&txq->tx_pbl);
+                bd_used = TX_RING_SIZE - elem_left;
+
+                if(bd_used <= 100)
+                        fp->tx_pkts_q[0]++;
+                else if((bd_used > 100) && (bd_used <= 500))
+                        fp->tx_pkts_q[1]++;
+                else if((bd_used > 500) && (bd_used <= 1000))
+                        fp->tx_pkts_q[2]++;
+                else if((bd_used > 1000) && (bd_used <= 2000))
+                        fp->tx_pkts_q[3]++;
+                else if((bd_used > 3000) && (bd_used <= 4000))
+                        fp->tx_pkts_q[4]++;
+                else if((bd_used > 4000) && (bd_used <= 5000))
+                        fp->tx_pkts_q[5]++;
+                else if((bd_used > 6000) && (bd_used <= 7000))
+                        fp->tx_pkts_q[6]++;
+                else if((bd_used > 7000) && (bd_used <= 8000))
+                        fp->tx_pkts_q[7]++;
+                else if((bd_used > 8000) && (bd_used <= 9000))
+                        fp->tx_pkts_q[8]++;
+                else if((bd_used > 9000) && (bd_used <= 10000))
+                        fp->tx_pkts_q[9]++;
+                else if((bd_used > 10000) && (bd_used <= 11000))
+                        fp->tx_pkts_q[10]++;
+                else if((bd_used > 11000) && (bd_used <= 12000))
+                        fp->tx_pkts_q[11]++;
+                else if((bd_used > 12000) && (bd_used <= 13000))
+                        fp->tx_pkts_q[12]++;
+                else if((bd_used > 13000) && (bd_used <= 14000))
+                        fp->tx_pkts_q[13]++;
+                else if((bd_used > 14000) && (bd_used <= 15000))
+                        fp->tx_pkts_q[14]++;
+                else if((bd_used > 15000) && (bd_used <= 16000))
+                        fp->tx_pkts_q[15]++;
+                else
+                        fp->tx_pkts_q[16]++;
+        }
+
+#endif /* end of QLNX_TRACE_PERF_DATA */
+
 	if ((nsegs + QLNX_TX_ELEM_RESERVE) >
 		(int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) {
 
@@ -2943,7 +3227,8 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 
 			fp->err_tx_nsegs_gt_elem_left++;
 			fp->tx_ring_full = 1;
-			ha->storm_stats_enable = 1;
+			if (ha->storm_stats_enable)
+				ha->storm_stats_gather = 1;
 			return (ENOBUFS);
 		}
 	}
@@ -3131,6 +3416,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 			third_bd->data.bitfields |=
 				(nbds_in_hdr<<ETH_TX_DATA_3RD_BD_HDR_NBD_SHIFT);
 		}
+		fp->tx_tso_pkts++;
 	} else {
 		segs++;
 		for (seg_idx = 1; seg_idx < nsegs; seg_idx++) {
@@ -3147,6 +3433,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s
 				 << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT;
 		first_bd->data.bitfields =
 			htole16(first_bd->data.bitfields);
+		fp->tx_non_tso_pkts++;
 	}
 
 
@@ -4303,8 +4590,10 @@ qlnx_fp_isr(void *arg)
 		if (fp->fp_taskqueue != NULL)
 			taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task);
 #else
-		int	rx_int = 0, total_rx_count = 0;
-		int 	lro_enable, tc;
+		int			rx_int = 0, total_rx_count = 0;
+		int 			lro_enable, tc;
+		struct qlnx_tx_queue	*txq;
+		uint16_t		elem_left;
 
 		lro_enable = ha->ifp->if_capenable & IFCAP_LRO;
 
@@ -4312,10 +4601,36 @@ qlnx_fp_isr(void *arg)
 
                 do {
                         for (tc = 0; tc < ha->num_tc; tc++) {
-                                if (mtx_trylock(&fp->tx_mtx)) {
-                                        qlnx_tx_int(ha, fp, fp->txq[tc]);
-                                        mtx_unlock(&fp->tx_mtx);
-                                }
+
+				txq = fp->txq[tc];
+
+				if((int)(elem_left =
+					ecore_chain_get_elem_left(&txq->tx_pbl)) <
+						QLNX_TX_ELEM_THRESH)  {
+
+                                	if (mtx_trylock(&fp->tx_mtx)) {
+#ifdef QLNX_TRACE_PERF_DATA
+						tx_compl = fp->tx_pkts_completed;
+#endif
+
+						qlnx_tx_int(ha, fp, fp->txq[tc]);
+#ifdef QLNX_TRACE_PERF_DATA
+						fp->tx_pkts_compl_intr +=
+							(fp->tx_pkts_completed - tx_compl);
+						if ((fp->tx_pkts_completed - tx_compl) <= 32)
+							fp->tx_comInt[0]++;
+						else if (((fp->tx_pkts_completed - tx_compl) > 32) &&
+							((fp->tx_pkts_completed - tx_compl) <= 64))
+							fp->tx_comInt[1]++;
+						else if(((fp->tx_pkts_completed - tx_compl) > 64) &&
+							((fp->tx_pkts_completed - tx_compl) <= 128))
+							fp->tx_comInt[2]++;
+						else if(((fp->tx_pkts_completed - tx_compl) > 128))
+							fp->tx_comInt[3]++;
+#endif
+						mtx_unlock(&fp->tx_mtx);
+					}
+				}
                         }
 
                         rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold,
@@ -4328,7 +4643,6 @@ qlnx_fp_isr(void *arg)
 
                 } while (rx_int);
 
-
 #ifdef QLNX_SOFT_LRO
                 {
                         struct lro_ctrl *lro;
@@ -4608,8 +4922,8 @@ qlnx_alloc_tx_dma_tag(qlnx_host_t *ha)
                 NULL, NULL,      /* filter, filterarg */
                 QLNX_MAX_TSO_FRAME_SIZE,     /* maxsize */
                 QLNX_MAX_SEGMENTS,        /* nsegments */
-                (PAGE_SIZE * 4),        /* maxsegsize */
-                BUS_DMA_ALLOCNOW,        /* flags */
+                QLNX_MAX_TX_MBUF_SIZE,	  /* maxsegsize */
+                0,        /* flags */
                 NULL,    /* lockfunc */
                 NULL,    /* lockfuncarg */
                 &ha->tx_tag)) {
@@ -4642,7 +4956,7 @@ qlnx_alloc_rx_dma_tag(qlnx_host_t *ha)
                         MJUM9BYTES,     /* maxsize */
                         1,        /* nsegments */
                         MJUM9BYTES,        /* maxsegsize */
-                        BUS_DMA_ALLOCNOW,        /* flags */
+                        0,        /* flags */
                         NULL,    /* lockfunc */
                         NULL,    /* lockfuncarg */
                         &ha->rx_tag)) {
@@ -5255,6 +5569,14 @@ qlnx_init_fp(qlnx_host_t *ha)
 		fp->tx_pkts_freed = 0;
 		fp->tx_pkts_transmitted = 0;
 		fp->tx_pkts_completed = 0;
+
+#ifdef QLNX_TRACE_PERF_DATA
+		fp->tx_pkts_trans_ctx = 0;
+		fp->tx_pkts_compl_ctx = 0;
+		fp->tx_pkts_trans_fp = 0;
+		fp->tx_pkts_compl_fp = 0;
+		fp->tx_pkts_compl_intr = 0;
+#endif
 		fp->tx_lso_wnd_min_len = 0;
 		fp->tx_defrag = 0;
 		fp->tx_nsegs_gt_elem_left = 0;
@@ -6606,7 +6928,7 @@ qlnx_timer(void *arg)
 
        	ecore_get_vport_stats(&ha->cdev, &ha->hw_stats);
 
-	if (ha->storm_stats_enable)
+	if (ha->storm_stats_gather)
 		qlnx_sample_storm_stats(ha);
 
 	callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha);
@@ -6855,7 +7177,7 @@ qlnx_sample_storm_stats(qlnx_host_t *ha)
         struct ecore_hwfn	*hwfn;
 
 	if (ha->storm_stats_index >= QLNX_STORM_STATS_SAMPLES_PER_HWFN) {
-		ha->storm_stats_enable = 0;
+		ha->storm_stats_gather = 0;
 		return;
 	}
 

Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h
==============================================================================
--- stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h	Thu Aug 24 22:11:10 2017	(r322863)
+++ stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h	Thu Aug 24 22:33:42 2017	(r322864)
@@ -39,5 +39,5 @@
 
 #define QLNX_VERSION_MAJOR      1
 #define QLNX_VERSION_MINOR      4
-#define QLNX_VERSION_BUILD      6
+#define QLNX_VERSION_BUILD      7
 

Modified: stable/10/sys/modules/qlnx/qlnxe/Makefile
==============================================================================
--- stable/10/sys/modules/qlnx/qlnxe/Makefile	Thu Aug 24 22:11:10 2017	(r322863)
+++ stable/10/sys/modules/qlnx/qlnxe/Makefile	Thu Aug 24 22:33:42 2017	(r322864)
@@ -52,7 +52,7 @@ SRCS+= pci_if.h
 
 CWARNEXTRA += -Wno-cast-qual
 
-CFLAGS += -DQLNX_DEBUG
+#CFLAGS += -DQLNX_DEBUG
 CFLAGS += -DECORE_PACKAGE
 CFLAGS += -DCONFIG_ECORE_L2
 CFLAGS += -DECORE_CONFIG_DIRECT_HWFN
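
For reference, a minimal standalone sketch of the 16-bit wrap-around arithmetic
behind the new qlnx_num_tx_compl() helper added to qlnx_os.c above. The index
handling mirrors the hunk; the standalone function name, main() and the sample
index values are illustrative assumptions only, not driver code.

#include <stdint.h>
#include <stdio.h>

/*
 * Pending transmit completions are the distance between the hardware
 * consumer index (hw_bd_cons) and the driver's chain consumer index,
 * computed modulo 2^16 because both indices are 16 bits wide and wrap.
 */
static uint16_t
num_tx_compl(uint16_t hw_bd_cons, uint16_t chain_cons_idx)
{
	if (hw_bd_cons < chain_cons_idx)
		return ((1 << 16) - (chain_cons_idx - hw_bd_cons));
	return (hw_bd_cons - chain_cons_idx);
}

int
main(void)
{
	/* No wrap: hardware is 40 descriptors ahead of the driver. */
	printf("%u\n", num_tx_compl(100, 60));		/* prints 40 */
	/* Wrap: hardware index rolled past 65535 back to 10. */
	printf("%u\n", num_tx_compl(10, 65500));	/* prints 46 */
	return (0);
}

In the driver, this count is compared against QLNX_TX_COMPL_THRESH (32) in
qlnx_transmit_locked() to decide whether to reap completions via qlnx_tx_int()
before returning.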


