From owner-svn-src-stable@freebsd.org Thu Aug 24 22:33:44 2017 Return-Path: Delivered-To: svn-src-stable@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 88C54DE9689; Thu, 24 Aug 2017 22:33:44 +0000 (UTC) (envelope-from davidcs@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 4D3CA688D9; Thu, 24 Aug 2017 22:33:44 +0000 (UTC) (envelope-from davidcs@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id v7OMXhrw022513; Thu, 24 Aug 2017 22:33:43 GMT (envelope-from davidcs@FreeBSD.org) Received: (from davidcs@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id v7OMXhOL022509; Thu, 24 Aug 2017 22:33:43 GMT (envelope-from davidcs@FreeBSD.org) Message-Id: <201708242233.v7OMXhOL022509@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: davidcs set sender to davidcs@FreeBSD.org using -f From: David C Somayajulu Date: Thu, 24 Aug 2017 22:33:43 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r322864 - in stable/10/sys: dev/qlnx/qlnxe modules/qlnx/qlnxe X-SVN-Group: stable-10 X-SVN-Commit-Author: davidcs X-SVN-Commit-Paths: in stable/10/sys: dev/qlnx/qlnxe modules/qlnx/qlnxe X-SVN-Commit-Revision: 322864 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable@freebsd.org X-Mailman-Version: 2.1.23 Precedence: list List-Id: SVN commit messages for all the -stable branches of the src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 24 Aug 2017 22:33:44 -0000 Author: 
davidcs Date: Thu Aug 24 22:33:42 2017 New Revision: 322864 URL: https://svnweb.freebsd.org/changeset/base/322864 Log: MFC r322408 Performance enhancements to reduce CPU utilization for large number of TCP connections (order of tens of thousands), with predominantly Transmits. Submitted by: Vaishali.Kulkarni@cavium.com Approved by: re(marius) Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h stable/10/sys/modules/qlnx/qlnxe/Makefile Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h ============================================================================== --- stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h Thu Aug 24 22:11:10 2017 (r322863) +++ stable/10/sys/dev/qlnx/qlnxe/qlnx_def.h Thu Aug 24 22:33:42 2017 (r322864) @@ -50,9 +50,10 @@ struct qlnx_ivec { typedef struct qlnx_ivec qlnx_ivec_t; -//#define QLNX_MAX_RSS 30 -#define QLNX_MAX_RSS 16 -#define QLNX_MAX_TC 1 +//#define QLNX_MAX_RSS 30 +#define QLNX_MAX_RSS 36 +#define QLNX_DEFAULT_RSS 16 +#define QLNX_MAX_TC 1 enum QLNX_STATE { QLNX_STATE_CLOSED, @@ -201,6 +202,17 @@ struct qlnx_fastpath { uint64_t tx_pkts_freed; uint64_t tx_pkts_transmitted; uint64_t tx_pkts_completed; + uint64_t tx_tso_pkts; + uint64_t tx_non_tso_pkts; + +#ifdef QLNX_TRACE_PERF_DATA + uint64_t tx_pkts_trans_ctx; + uint64_t tx_pkts_compl_ctx; + uint64_t tx_pkts_trans_fp; + uint64_t tx_pkts_compl_fp; + uint64_t tx_pkts_compl_intr; +#endif + uint64_t tx_lso_wnd_min_len; uint64_t tx_defrag; uint64_t tx_nsegs_gt_elem_left; @@ -209,6 +221,13 @@ struct qlnx_fastpath { uint32_t tx_tso_max_pkt_len; uint32_t tx_tso_min_pkt_len; uint64_t tx_pkts[QLNX_FP_MAX_SEGS]; + +#ifdef QLNX_TRACE_PERF_DATA + uint64_t tx_pkts_hist[QLNX_FP_MAX_SEGS]; + uint64_t tx_comInt[QLNX_FP_MAX_SEGS]; + uint64_t tx_pkts_q[QLNX_FP_MAX_SEGS]; +#endif + uint64_t err_tx_nsegs_gt_elem_left; uint64_t err_tx_dmamap_create; uint64_t err_tx_defrag_dmamap_load; 
@@ -301,8 +320,13 @@ typedef struct qlnx_link_output qlnx_link_output_t; #define QLNX_MFW_VERSION_LENGTH 32 #define QLNX_STORMFW_VERSION_LENGTH 32 -#define QLNX_TX_ELEM_RESERVE 2 +#define QLNX_TX_ELEM_RESERVE 2 +#define QLNX_TX_ELEM_THRESH 128 +#define QLNX_TX_ELEM_MAX_THRESH 512 +#define QLNX_TX_ELEM_MIN_THRESH 32 +#define QLNX_TX_COMPL_THRESH 32 + #define QLNX_TPA_MAX_AGG_BUFFERS (20) #define QLNX_MAX_NUM_MULTICAST_ADDRS ECORE_MAX_MC_ADDRS @@ -454,6 +478,7 @@ struct qlnx_host { qlnx_storm_stats_t storm_stats[QLNX_STORM_STATS_TOTAL]; uint32_t storm_stats_index; uint32_t storm_stats_enable; + uint32_t storm_stats_gather; uint32_t personality; }; @@ -470,8 +495,11 @@ typedef struct qlnx_host qlnx_host_t; #define QLNX_MAX_MTU 9000 #define QLNX_MAX_SEGMENTS_NON_TSO (ETH_TX_MAX_BDS_PER_NON_LSO_PACKET - 1) -#define QLNX_MAX_TSO_FRAME_SIZE ((64 * 1024 - 1) + 22) +//#define QLNX_MAX_TSO_FRAME_SIZE ((64 * 1024 - 1) + 22) +#define QLNX_MAX_TSO_FRAME_SIZE 65536 +#define QLNX_MAX_TX_MBUF_SIZE 65536 /* bytes - bd_len = 16bits */ + #define QL_MAC_CMP(mac1, mac2) \ ((((*(uint32_t *) mac1) == (*(uint32_t *) mac2) && \ (*(uint16_t *)(mac1 + 4)) == (*(uint16_t *)(mac2 + 4)))) ? 
0 : 1) @@ -702,6 +730,18 @@ extern void qlnx_fill_link(struct ecore_hwfn *hwfn, #define CQE_HAS_VLAN(flags) \ ((flags) & (PARSING_AND_ERR_FLAGS_TAG8021QEXIST_MASK \ << PARSING_AND_ERR_FLAGS_TAG8021QEXIST_SHIFT)) + +#if defined(__i386__) || defined(__amd64__) + +static __inline +void prefetch(void *x) +{ + __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); +} + +#else +#define prefetch(x) +#endif #endif /* #ifndef _QLNX_DEF_H_ */ Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c ============================================================================== --- stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c Thu Aug 24 22:11:10 2017 (r322863) +++ stable/10/sys/dev/qlnx/qlnxe/qlnx_os.c Thu Aug 24 22:33:42 2017 (r322864) @@ -94,6 +94,8 @@ static int qlnx_get_ifq_snd_maxlen(qlnx_host_t *ha); static uint32_t qlnx_get_optics(qlnx_host_t *ha, struct qlnx_link_output *if_link); static int qlnx_transmit(struct ifnet *ifp, struct mbuf *mp); +static int qlnx_transmit_locked(struct ifnet *ifp, struct qlnx_fastpath *fp, + struct mbuf *mp); static void qlnx_qflush(struct ifnet *ifp); static int qlnx_alloc_parent_dma_tag(qlnx_host_t *ha); @@ -133,6 +135,8 @@ static void qlnx_timer(void *arg); static int qlnx_alloc_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp); static void qlnx_free_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp); static void qlnx_trigger_dump(qlnx_host_t *ha); +static uint16_t qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp, + struct qlnx_tx_queue *txq); static void qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq); static int qlnx_rx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, int budget, @@ -215,6 +219,12 @@ char qlnx_name_str[NAME_SIZE]; #define QLOGIC_PCI_DEVICE_ID_8070 0x8070 #endif +SYSCTL_NODE(_hw, OID_AUTO, qlnxe, CTLFLAG_RD, 0, "qlnxe driver parameters"); +/* Number of Queues: 0 (Auto) or 1 to 32 (fixed queue number) */ +static int qlnxe_queue_count = QLNX_DEFAULT_RSS; +SYSCTL_INT(_hw_qlnxe, OID_AUTO, 
queue_count, CTLFLAG_RDTUN, + &qlnxe_queue_count, 0, "Multi-Queue queue count"); + static int qlnx_valid_device(device_t dev) { @@ -302,7 +312,26 @@ qlnx_pci_probe(device_t dev) return (BUS_PROBE_DEFAULT); } +static uint16_t +qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp, + struct qlnx_tx_queue *txq) +{ + u16 hw_bd_cons; + u16 ecore_cons_idx; + uint16_t diff; + hw_bd_cons = le16toh(*txq->hw_cons_ptr); + + ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl); + if (hw_bd_cons < ecore_cons_idx) { + diff = (1 << 16) - (ecore_cons_idx - hw_bd_cons); + } else { + diff = hw_bd_cons - ecore_cons_idx; + } + return diff; +} + + static void qlnx_sp_intr(void *arg) { @@ -395,14 +424,11 @@ qlnx_fp_taskqueue(void *context, int pending) struct qlnx_fastpath *fp; qlnx_host_t *ha; struct ifnet *ifp; - struct mbuf *mp; - int ret; - struct thread *cthread; #ifdef QLNX_RCV_IN_TASKQ int lro_enable; int rx_int = 0, total_rx_count = 0; - + struct thread *cthread; #endif /* #ifdef QLNX_RCV_IN_TASKQ */ fp = context; @@ -410,6 +436,12 @@ qlnx_fp_taskqueue(void *context, int pending) if (fp == NULL) return; + ha = (qlnx_host_t *)fp->edev; + + ifp = ha->ifp; + +#ifdef QLNX_RCV_IN_TASKQ + cthread = curthread; thread_lock(cthread); @@ -419,112 +451,81 @@ qlnx_fp_taskqueue(void *context, int pending) thread_unlock(cthread); - ha = (qlnx_host_t *)fp->edev; + lro_enable = ifp->if_capenable & IFCAP_LRO; - ifp = ha->ifp; + rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable); -#ifdef QLNX_RCV_IN_TASKQ - { - lro_enable = ifp->if_capenable & IFCAP_LRO; + if (rx_int) { + fp->rx_pkts += rx_int; + total_rx_count += rx_int; + } - rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable); - - if (rx_int) { - fp->rx_pkts += rx_int; - total_rx_count += rx_int; - } - #ifdef QLNX_SOFT_LRO - { - struct lro_ctrl *lro; - - lro = &fp->rxq->lro; + { + struct lro_ctrl *lro; - if (lro_enable && total_rx_count) { + lro = &fp->rxq->lro; + if (lro_enable && total_rx_count) { + #if 
(__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) - if (ha->dbg_trace_lro_cnt) { - if (lro->lro_mbuf_count & ~1023) - fp->lro_cnt_1024++; - else if (lro->lro_mbuf_count & ~511) - fp->lro_cnt_512++; - else if (lro->lro_mbuf_count & ~255) - fp->lro_cnt_256++; - else if (lro->lro_mbuf_count & ~127) - fp->lro_cnt_128++; - else if (lro->lro_mbuf_count & ~63) - fp->lro_cnt_64++; - } - tcp_lro_flush_all(lro); + if (ha->dbg_trace_lro_cnt) { + if (lro->lro_mbuf_count & ~1023) + fp->lro_cnt_1024++; + else if (lro->lro_mbuf_count & ~511) + fp->lro_cnt_512++; + else if (lro->lro_mbuf_count & ~255) + fp->lro_cnt_256++; + else if (lro->lro_mbuf_count & ~127) + fp->lro_cnt_128++; + else if (lro->lro_mbuf_count & ~63) + fp->lro_cnt_64++; + } + tcp_lro_flush_all(lro); #else - struct lro_entry *queued; + struct lro_entry *queued; - while ((!SLIST_EMPTY(&lro->lro_active))) { - queued = SLIST_FIRST(&lro->lro_active); - SLIST_REMOVE_HEAD(&lro->lro_active, next); - tcp_lro_flush(lro, queued); - } -#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ + while ((!SLIST_EMPTY(&lro->lro_active))) { + queued = SLIST_FIRST(&lro->lro_active); + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); } +#endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ } + } #endif /* #ifdef QLNX_SOFT_LRO */ - ecore_sb_update_sb_idx(fp->sb_info); - rmb(); - } + ecore_sb_update_sb_idx(fp->sb_info); + rmb(); #endif /* #ifdef QLNX_RCV_IN_TASKQ */ - mtx_lock(&fp->tx_mtx); + if(ifp->if_drv_flags & IFF_DRV_RUNNING) { - if (((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != - IFF_DRV_RUNNING) || (!ha->link_up)) { + if (!drbr_empty(ifp, fp->tx_br)) { - mtx_unlock(&fp->tx_mtx); - goto qlnx_fp_taskqueue_exit; - } + if(mtx_trylock(&fp->tx_mtx)) { - mp = drbr_peek(ifp, fp->tx_br); +#ifdef QLNX_TRACE_PERF_DATA + tx_pkts = fp->tx_pkts_transmitted; + tx_compl = fp->tx_pkts_completed; +#endif - while (mp != NULL) { + qlnx_transmit_locked(ifp, 
fp, NULL); - if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - ret = qlnx_send(ha, fp, &mp); - } else { - ret = -1; - } - - if (ret) { - - if (mp != NULL) { - drbr_putback(ifp, fp->tx_br, mp); - } else { - fp->tx_pkts_processed++; - drbr_advance(ifp, fp->tx_br); +#ifdef QLNX_TRACE_PERF_DATA + fp->tx_pkts_trans_fp += + (fp->tx_pkts_transmitted - tx_pkts); + fp->tx_pkts_compl_fp += + (fp->tx_pkts_completed - tx_compl); +#endif + mtx_unlock(&fp->tx_mtx); } - - mtx_unlock(&fp->tx_mtx); - - goto qlnx_fp_taskqueue_exit; - - } else { - drbr_advance(ifp, fp->tx_br); - fp->tx_pkts_transmitted++; - fp->tx_pkts_processed++; } - - if (fp->tx_ring_full) - break; - - mp = drbr_peek(ifp, fp->tx_br); } - mtx_unlock(&fp->tx_mtx); - -qlnx_fp_taskqueue_exit: - #ifdef QLNX_RCV_IN_TASKQ if (rx_int) { if (fp->fp_taskqueue != NULL) @@ -537,7 +538,7 @@ qlnx_fp_taskqueue_exit: } #endif /* #ifdef QLNX_RCV_IN_TASKQ */ - QL_DPRINT2(ha, "exit ret = %d\n", ret); + QL_DPRINT2(ha, "exit \n"); return; } @@ -611,6 +612,17 @@ qlnx_drain_fp_taskqueues(qlnx_host_t *ha) return; } +static void +qlnx_get_params(qlnx_host_t *ha) +{ + if ((qlnxe_queue_count < 0) || (qlnxe_queue_count > QLNX_MAX_RSS)) { + device_printf(ha->pci_dev, "invalid queue_count value (%d)\n", + qlnxe_queue_count); + qlnxe_queue_count = 0; + } + return; +} + /* * Name: qlnx_pci_attach * Function: attaches the device to the operating system @@ -706,10 +718,21 @@ qlnx_pci_attach(device_t dev) if (qlnx_init_hw(ha) != 0) goto qlnx_pci_attach_err; + qlnx_get_params(ha); + + if((pci_get_device(dev) == QLOGIC_PCI_DEVICE_ID_1644) && + (qlnxe_queue_count == QLNX_DEFAULT_RSS)) { + qlnxe_queue_count = QLNX_MAX_RSS; + } + /* * Allocate MSI-x vectors */ - ha->num_rss = QLNX_MAX_RSS; + if(qlnxe_queue_count == 0) + ha->num_rss = QLNX_DEFAULT_RSS; + else + ha->num_rss = qlnxe_queue_count; + ha->num_tc = QLNX_MAX_TC; ha->msix_count = pci_msix_count(dev); @@ -1236,6 +1259,44 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha) CTLFLAG_RD, 
&ha->fp_array[i].tx_pkts_completed, "No. of transmit completions"); + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_non_tso_pkts", + CTLFLAG_RD, &ha->fp_array[i].tx_non_tso_pkts, + "No. of non LSO transmited packets"); + +#ifdef QLNX_TRACE_PERF_DATA + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_pkts_trans_ctx", + CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_ctx, + "No. of transmitted packets in transmit context"); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_pkts_compl_ctx", + CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_ctx, + "No. of transmit completions in transmit context"); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_pkts_trans_fp", + CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_fp, + "No. of transmitted packets in taskqueue"); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_pkts_compl_fp", + CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_fp, + "No. of transmit completions in taskqueue"); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_pkts_compl_intr", + CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_intr, + "No. of transmit completions in interrupt ctx"); +#endif + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, "tx_tso_pkts", + CTLFLAG_RD, &ha->fp_array[i].tx_tso_pkts, + "No. 
of LSO transmited packets"); + SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_lso_wnd_min_len", CTLFLAG_RD, &ha->fp_array[i].tx_lso_wnd_min_len, @@ -1284,6 +1345,39 @@ qlnx_add_fp_stats_sysctls(qlnx_host_t *ha) &ha->fp_array[i].tx_pkts[j], name_str); } +#ifdef QLNX_TRACE_PERF_DATA + for (j = 0; j < 18; j++) { + + bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); + snprintf(name_str, sizeof(name_str), + "tx_pkts_hist_%02d", (j+1)); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, name_str, CTLFLAG_RD, + &ha->fp_array[i].tx_pkts_hist[j], name_str); + } + for (j = 0; j < 5; j++) { + + bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); + snprintf(name_str, sizeof(name_str), + "tx_comInt_%02d", (j+1)); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, name_str, CTLFLAG_RD, + &ha->fp_array[i].tx_comInt[j], name_str); + } + for (j = 0; j < 18; j++) { + + bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); + snprintf(name_str, sizeof(name_str), + "tx_pkts_q_%02d", (j+1)); + + SYSCTL_ADD_QUAD(ctx, node_children, + OID_AUTO, name_str, CTLFLAG_RD, + &ha->fp_array[i].tx_pkts_q[j], name_str); + } +#endif + SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_nsegs_gt_elem_left", CTLFLAG_RD, &ha->fp_array[i].err_tx_nsegs_gt_elem_left, @@ -1979,6 +2073,12 @@ qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) ifp->if_capabilities |= IFCAP_TSO6; ifp->if_capabilities |= IFCAP_LRO; + ifp->if_hw_tsomax = QLNX_MAX_TSO_FRAME_SIZE - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ifp->if_hw_tsomaxsegcount = QLNX_MAX_SEGMENTS - 1 /* hdr */; + ifp->if_hw_tsomaxsegsize = QLNX_MAX_TX_MBUF_SIZE; + + ifp->if_capenable = ifp->if_capabilities; ifp->if_hwassist = CSUM_IP; @@ -2543,6 +2643,7 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, u16 hw_bd_cons; u16 ecore_cons_idx; uint16_t diff; + uint16_t idx, idx2; hw_bd_cons = le16toh(*txq->hw_cons_ptr); @@ -2580,6 +2681,11 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, qlnx_trigger_dump(ha); } + idx = 
(txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1); + idx2 = (txq->sw_tx_cons + 2) & (TX_RING_SIZE - 1); + prefetch(txq->sw_tx_ring[idx].mp); + prefetch(txq->sw_tx_ring[idx2].mp); + qlnx_free_tx_pkt(ha, fp, txq); txq->sw_tx_cons = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1); @@ -2588,12 +2694,71 @@ qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, } static int +qlnx_transmit_locked(struct ifnet *ifp,struct qlnx_fastpath *fp, struct mbuf *mp) +{ + int ret = 0; + struct qlnx_tx_queue *txq; + qlnx_host_t * ha; + uint16_t elem_left; + + txq = fp->txq[0]; + ha = (qlnx_host_t *)fp->edev; + + + if ((!(ifp->if_drv_flags & IFF_DRV_RUNNING)) || (!ha->link_up)) { + if(mp != NULL) + ret = drbr_enqueue(ifp, fp->tx_br, mp); + return (ret); + } + + if(mp != NULL) + ret = drbr_enqueue(ifp, fp->tx_br, mp); + + mp = drbr_peek(ifp, fp->tx_br); + + while (mp != NULL) { + + if (qlnx_send(ha, fp, &mp)) { + + if (mp != NULL) { + drbr_putback(ifp, fp->tx_br, mp); + } else { + fp->tx_pkts_processed++; + drbr_advance(ifp, fp->tx_br); + } + goto qlnx_transmit_locked_exit; + + } else { + drbr_advance(ifp, fp->tx_br); + fp->tx_pkts_transmitted++; + fp->tx_pkts_processed++; + } + + mp = drbr_peek(ifp, fp->tx_br); + } + +qlnx_transmit_locked_exit: + if((qlnx_num_tx_compl(ha,fp, fp->txq[0]) > QLNX_TX_COMPL_THRESH) || + ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) + < QLNX_TX_ELEM_MAX_THRESH)) + (void)qlnx_tx_int(ha, fp, fp->txq[0]); + + QL_DPRINT2(ha, "%s: exit ret = %d\n", __func__, ret); + return ret; +} + + +static int qlnx_transmit(struct ifnet *ifp, struct mbuf *mp) { qlnx_host_t *ha = (qlnx_host_t *)ifp->if_softc; struct qlnx_fastpath *fp; int rss_id = 0, ret = 0; +#ifdef QLNX_TRACEPERF_DATA + uint64_t tx_pkts = 0, tx_compl = 0; +#endif + QL_DPRINT2(ha, "enter\n"); #if __FreeBSD_version >= 1100000 @@ -2611,15 +2776,27 @@ qlnx_transmit(struct ifnet *ifp, struct mbuf *mp) goto qlnx_transmit_exit; } - if (mp != NULL) { - ret = drbr_enqueue(ifp, fp->tx_br, mp); - } + if 
(mtx_trylock(&fp->tx_mtx)) { - if (fp->fp_taskqueue != NULL) - taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task); +#ifdef QLNX_TRACEPERF_DATA + tx_pkts = fp->tx_pkts_transmitted; + tx_compl = fp->tx_pkts_completed; +#endif - ret = 0; + ret = qlnx_transmit_locked(ifp, fp, mp); +#ifdef QLNX_TRACEPERF_DATA + fp->tx_pkts_trans_ctx += (fp->tx_pkts_transmitted - tx_pkts); + fp->tx_pkts_compl_ctx += (fp->tx_pkts_completed - tx_compl); +#endif + mtx_unlock(&fp->tx_mtx); + } else { + if (mp != NULL && (fp->fp_taskqueue != NULL)) { + ret = drbr_enqueue(ifp, fp->tx_br, mp); + taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task); + } + } + qlnx_transmit_exit: QL_DPRINT2(ha, "exit ret = %d\n", ret); @@ -2799,6 +2976,10 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s uint32_t nbds_in_hdr = 0; uint32_t offset = 0; +#ifdef QLNX_TRACE_PERF_DATA + uint16_t bd_used; +#endif + QL_DPRINT8(ha, "enter\n"); if (!ha->link_up) @@ -2811,15 +2992,15 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s txq = fp->txq[0]; - if (fp->tx_ring_full) { - elem_left = ecore_chain_get_elem_left(&txq->tx_pbl); + if ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) < + QLNX_TX_ELEM_MIN_THRESH) { - if (elem_left < (TX_RING_SIZE >> 4)) - return (-1); - else - fp->tx_ring_full = 0; - } + fp->tx_nsegs_gt_elem_left++; + fp->err_tx_nsegs_gt_elem_left++; + return (ENOBUFS); + } + idx = txq->sw_tx_prod; map = txq->sw_tx_ring[idx].map; @@ -2829,14 +3010,18 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s BUS_DMA_NOWAIT); if (ha->dbg_trace_tso_pkt_len) { - if (!fp->tx_tso_min_pkt_len) { - fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; - fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; - } else { - if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len) + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + if (!fp->tx_tso_min_pkt_len) { fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; - if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len) - fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len; + 
fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; + } else { + if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len) + fp->tx_tso_min_pkt_len = + m_head->m_pkthdr.len; + if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len) + fp->tx_tso_max_pkt_len = + m_head->m_pkthdr.len; + } } } @@ -2923,6 +3108,105 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++; } +#ifdef QLNX_TRACE_PERF_DATA + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + if(m_head->m_pkthdr.len <= 2048) + fp->tx_pkts_hist[0]++; + else if((m_head->m_pkthdr.len > 2048) && + (m_head->m_pkthdr.len <= 4096)) + fp->tx_pkts_hist[1]++; + else if((m_head->m_pkthdr.len > 4096) && + (m_head->m_pkthdr.len <= 8192)) + fp->tx_pkts_hist[2]++; + else if((m_head->m_pkthdr.len > 8192) && + (m_head->m_pkthdr.len <= 12288 )) + fp->tx_pkts_hist[3]++; + else if((m_head->m_pkthdr.len > 11288) && + (m_head->m_pkthdr.len <= 16394)) + fp->tx_pkts_hist[4]++; + else if((m_head->m_pkthdr.len > 16384) && + (m_head->m_pkthdr.len <= 20480)) + fp->tx_pkts_hist[5]++; + else if((m_head->m_pkthdr.len > 20480) && + (m_head->m_pkthdr.len <= 24576)) + fp->tx_pkts_hist[6]++; + else if((m_head->m_pkthdr.len > 24576) && + (m_head->m_pkthdr.len <= 28672)) + fp->tx_pkts_hist[7]++; + else if((m_head->m_pkthdr.len > 28762) && + (m_head->m_pkthdr.len <= 32768)) + fp->tx_pkts_hist[8]++; + else if((m_head->m_pkthdr.len > 32768) && + (m_head->m_pkthdr.len <= 36864)) + fp->tx_pkts_hist[9]++; + else if((m_head->m_pkthdr.len > 36864) && + (m_head->m_pkthdr.len <= 40960)) + fp->tx_pkts_hist[10]++; + else if((m_head->m_pkthdr.len > 40960) && + (m_head->m_pkthdr.len <= 45056)) + fp->tx_pkts_hist[11]++; + else if((m_head->m_pkthdr.len > 45056) && + (m_head->m_pkthdr.len <= 49152)) + fp->tx_pkts_hist[12]++; + else if((m_head->m_pkthdr.len > 49512) && + m_head->m_pkthdr.len <= 53248)) + fp->tx_pkts_hist[13]++; + else if((m_head->m_pkthdr.len > 53248) && + (m_head->m_pkthdr.len <= 57344)) + fp->tx_pkts_hist[14]++; + else 
if((m_head->m_pkthdr.len > 53248) && + (m_head->m_pkthdr.len <= 57344)) + fp->tx_pkts_hist[15]++; + else if((m_head->m_pkthdr.len > 57344) && + (m_head->m_pkthdr.len <= 61440)) + fp->tx_pkts_hist[16]++; + else + fp->tx_pkts_hist[17]++; + } + + if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { + + elem_left = ecore_chain_get_elem_left(&txq->tx_pbl); + bd_used = TX_RING_SIZE - elem_left; + + if(bd_used <= 100) + fp->tx_pkts_q[0]++; + else if((bd_used > 100) && (bd_used <= 500)) + fp->tx_pkts_q[1]++; + else if((bd_used > 500) && (bd_used <= 1000)) + fp->tx_pkts_q[2]++; + else if((bd_used > 1000) && (bd_used <= 2000)) + fp->tx_pkts_q[3]++; + else if((bd_used > 3000) && (bd_used <= 4000)) + fp->tx_pkts_q[4]++; + else if((bd_used > 4000) && (bd_used <= 5000)) + fp->tx_pkts_q[5]++; + else if((bd_used > 6000) && (bd_used <= 7000)) + fp->tx_pkts_q[6]++; + else if((bd_used > 7000) && (bd_used <= 8000)) + fp->tx_pkts_q[7]++; + else if((bd_used > 8000) && (bd_used <= 9000)) + fp->tx_pkts_q[8]++; + else if((bd_used > 9000) && (bd_used <= 10000)) + fp->tx_pkts_q[9]++; + else if((bd_used > 10000) && (bd_used <= 11000)) + fp->tx_pkts_q[10]++; + else if((bd_used > 11000) && (bd_used <= 12000)) + fp->tx_pkts_q[11]++; + else if((bd_used > 12000) && (bd_used <= 13000)) + fp->tx_pkts_q[12]++; + else if((bd_used > 13000) && (bd_used <= 14000)) + fp->tx_pkts_q[13]++; + else if((bd_used > 14000) && (bd_used <= 15000)) + fp->tx_pkts_q[14]++; + else if((bd_used > 15000) && (bd_used <= 16000)) + fp->tx_pkts_q[15]++; + else + fp->tx_pkts_q[16]++; + } + +#endif /* end of QLNX_TRACE_PERF_DATA */ + if ((nsegs + QLNX_TX_ELEM_RESERVE) > (int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) { @@ -2943,7 +3227,8 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s fp->err_tx_nsegs_gt_elem_left++; fp->tx_ring_full = 1; - ha->storm_stats_enable = 1; + if (ha->storm_stats_enable) + ha->storm_stats_gather = 1; return (ENOBUFS); } } @@ -3131,6 +3416,7 @@ qlnx_send(qlnx_host_t *ha, struct 
qlnx_fastpath *fp, s third_bd->data.bitfields |= (nbds_in_hdr<tx_tso_pkts++; } else { segs++; for (seg_idx = 1; seg_idx < nsegs; seg_idx++) { @@ -3147,6 +3433,7 @@ qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, s << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT; first_bd->data.bitfields = htole16(first_bd->data.bitfields); + fp->tx_non_tso_pkts++; } @@ -4303,8 +4590,10 @@ qlnx_fp_isr(void *arg) if (fp->fp_taskqueue != NULL) taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task); #else - int rx_int = 0, total_rx_count = 0; - int lro_enable, tc; + int rx_int = 0, total_rx_count = 0; + int lro_enable, tc; + struct qlnx_tx_queue *txq; + uint16_t elem_left; lro_enable = ha->ifp->if_capenable & IFCAP_LRO; @@ -4312,10 +4601,36 @@ qlnx_fp_isr(void *arg) do { for (tc = 0; tc < ha->num_tc; tc++) { - if (mtx_trylock(&fp->tx_mtx)) { - qlnx_tx_int(ha, fp, fp->txq[tc]); - mtx_unlock(&fp->tx_mtx); - } + + txq = fp->txq[tc]; + + if((int)(elem_left = + ecore_chain_get_elem_left(&txq->tx_pbl)) < + QLNX_TX_ELEM_THRESH) { + + if (mtx_trylock(&fp->tx_mtx)) { +#ifdef QLNX_TRACE_PERF_DATA + tx_compl = fp->tx_pkts_completed; +#endif + + qlnx_tx_int(ha, fp, fp->txq[tc]); +#ifdef QLNX_TRACE_PERF_DATA + fp->tx_pkts_compl_intr += + (fp->tx_pkts_completed - tx_compl); + if ((fp->tx_pkts_completed - tx_compl) <= 32) + fp->tx_comInt[0]++; + else if (((fp->tx_pkts_completed - tx_compl) > 32) && + ((fp->tx_pkts_completed - tx_compl) <= 64)) + fp->tx_comInt[1]++; + else if(((fp->tx_pkts_completed - tx_compl) > 64) && + ((fp->tx_pkts_completed - tx_compl) <= 128)) + fp->tx_comInt[2]++; + else if(((fp->tx_pkts_completed - tx_compl) > 128)) + fp->tx_comInt[3]++; +#endif + mtx_unlock(&fp->tx_mtx); + } + } } rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, @@ -4328,7 +4643,6 @@ qlnx_fp_isr(void *arg) } while (rx_int); - #ifdef QLNX_SOFT_LRO { struct lro_ctrl *lro; @@ -4608,8 +4922,8 @@ qlnx_alloc_tx_dma_tag(qlnx_host_t *ha) NULL, NULL, /* filter, filterarg */ QLNX_MAX_TSO_FRAME_SIZE, /* maxsize */ 
QLNX_MAX_SEGMENTS, /* nsegments */ - (PAGE_SIZE * 4), /* maxsegsize */ - BUS_DMA_ALLOCNOW, /* flags */ + QLNX_MAX_TX_MBUF_SIZE, /* maxsegsize */ + 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->tx_tag)) { @@ -4642,7 +4956,7 @@ qlnx_alloc_rx_dma_tag(qlnx_host_t *ha) MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ - BUS_DMA_ALLOCNOW, /* flags */ + 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->rx_tag)) { @@ -5255,6 +5569,14 @@ qlnx_init_fp(qlnx_host_t *ha) fp->tx_pkts_freed = 0; fp->tx_pkts_transmitted = 0; fp->tx_pkts_completed = 0; + +#ifdef QLNX_TRACE_PERF_DATA + fp->tx_pkts_trans_ctx = 0; + fp->tx_pkts_compl_ctx = 0; + fp->tx_pkts_trans_fp = 0; + fp->tx_pkts_compl_fp = 0; + fp->tx_pkts_compl_intr = 0; +#endif fp->tx_lso_wnd_min_len = 0; fp->tx_defrag = 0; fp->tx_nsegs_gt_elem_left = 0; @@ -6606,7 +6928,7 @@ qlnx_timer(void *arg) ecore_get_vport_stats(&ha->cdev, &ha->hw_stats); - if (ha->storm_stats_enable) + if (ha->storm_stats_gather) qlnx_sample_storm_stats(ha); callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha); @@ -6855,7 +7177,7 @@ qlnx_sample_storm_stats(qlnx_host_t *ha) struct ecore_hwfn *hwfn; if (ha->storm_stats_index >= QLNX_STORM_STATS_SAMPLES_PER_HWFN) { - ha->storm_stats_enable = 0; + ha->storm_stats_gather = 0; return; } Modified: stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h ============================================================================== --- stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h Thu Aug 24 22:11:10 2017 (r322863) +++ stable/10/sys/dev/qlnx/qlnxe/qlnx_ver.h Thu Aug 24 22:33:42 2017 (r322864) @@ -39,5 +39,5 @@ #define QLNX_VERSION_MAJOR 1 #define QLNX_VERSION_MINOR 4 -#define QLNX_VERSION_BUILD 6 +#define QLNX_VERSION_BUILD 7 Modified: stable/10/sys/modules/qlnx/qlnxe/Makefile ============================================================================== --- stable/10/sys/modules/qlnx/qlnxe/Makefile Thu Aug 24 22:11:10 2017 (r322863) +++ 
stable/10/sys/modules/qlnx/qlnxe/Makefile Thu Aug 24 22:33:42 2017 (r322864) @@ -52,7 +52,7 @@ SRCS+= pci_if.h CWARNEXTRA += -Wno-cast-qual -CFLAGS += -DQLNX_DEBUG +#CFLAGS += -DQLNX_DEBUG CFLAGS += -DECORE_PACKAGE CFLAGS += -DCONFIG_ECORE_L2 CFLAGS += -DECORE_CONFIG_DIRECT_HWFN