Date: Thu, 30 May 2019 13:16:56 +0000 (UTC)
From: Marcin Wojtas <mw@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r348393 - head/sys/dev/ena
Message-ID: <201905301316.x4UDGuli061146@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mw
Date: Thu May 30 13:16:56 2019
New Revision: 348393
URL: https://svnweb.freebsd.org/changeset/base/348393

Log:
  Check for missing MSI-x and Tx completions in ENA

  If the first MSI-x won't be executed, then the timer service will
  detect that and trigger device reset.

  The checking for missing Tx completion was reworked, so it will also
  check for missing interrupts. Checking number of missing Tx completions
  can be performed after loop, instead of checking it every iteration.

  Submitted by: Michal Krawczyk <mk@semihalf.com>
  Obtained from: Semihalf
  Sponsored by: Amazon, Inc.

Modified:
  head/sys/dev/ena/ena.c
  head/sys/dev/ena/ena.h

Modified: head/sys/dev/ena/ena.c
==============================================================================
--- head/sys/dev/ena/ena.c	Thu May 30 13:15:38 2019	(r348392)
+++ head/sys/dev/ena/ena.c	Thu May 30 13:16:56 2019	(r348393)
@@ -405,6 +405,8 @@ ena_init_io_rings_common(struct ena_adapter *adapter,
 	ring->qid = qid;
 	ring->adapter = adapter;
 	ring->ena_dev = adapter->ena_dev;
+	ring->first_interrupt = false;
+	ring->no_interrupt_event_cnt = 0;
 }
 
 static void
@@ -1773,6 +1775,9 @@ ena_handle_msix(void *arg)
 	ena_qid = ENA_IO_TXQ_IDX(qid);
 	io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
 
+	tx_ring->first_interrupt = true;
+	rx_ring->first_interrupt = true;
+
 	for (i = 0; i < CLEAN_BUDGET; ++i) {
 		/*
 		 * If lock cannot be acquired, then deferred cleanup task was
@@ -3329,13 +3334,37 @@ static void check_for_admin_com_state(struct ena_adapt
 }
 
 static int
-check_missing_comp_in_queue(struct ena_adapter *adapter,
+check_for_rx_interrupt_queue(struct ena_adapter *adapter,
+    struct ena_ring *rx_ring)
+{
+	if (likely(rx_ring->first_interrupt))
+		return (0);
+
+	if (ena_com_cq_empty(rx_ring->ena_com_io_cq))
+		return (0);
+
+	rx_ring->no_interrupt_event_cnt++;
+
+	if (rx_ring->no_interrupt_event_cnt == ENA_MAX_NO_INTERRUPT_ITERATIONS) {
+		device_printf(adapter->pdev, "Potential MSIX issue on Rx side "
+		    "Queue = %d. Reset the device\n", rx_ring->qid);
+		adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+		adapter->trigger_reset = true;
+		return (EIO);
+	}
+
+	return (0);
+}
+
+static int
+check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
     struct ena_ring *tx_ring)
 {
 	struct bintime curtime, time;
 	struct ena_tx_buffer *tx_buf;
+	sbintime_t time_offset;
 	uint32_t missed_tx = 0;
-	int i;
+	int i, rc = 0;
 
 	getbinuptime(&curtime);
 
@@ -3347,9 +3376,24 @@ check_missing_comp_in_queue(struct ena_adapter *adapte
 
 		time = curtime;
 		bintime_sub(&time, &tx_buf->timestamp);
+		time_offset = bttosbt(time);
 
+		if (unlikely(!tx_ring->first_interrupt &&
+		    time_offset > 2 * adapter->missing_tx_timeout)) {
+			/*
+			 * If after graceful period interrupt is still not
+			 * received, we schedule a reset.
+			 */
+			device_printf(adapter->pdev,
+			    "Potential MSIX issue on Tx side Queue = %d. "
+			    "Reset the device\n", tx_ring->qid);
+			adapter->reset_reason = ENA_REGS_RESET_MISS_INTERRUPT;
+			adapter->trigger_reset = true;
+			return (EIO);
+		}
+
 		/* Check again if packet is still waiting */
-		if (unlikely(bttosbt(time) > adapter->missing_tx_timeout)) {
+		if (unlikely(time_offset > adapter->missing_tx_timeout)) {
 
 			if (!tx_buf->print_once)
 				ena_trace(ENA_WARNING, "Found a Tx that wasn't "
@@ -3358,24 +3402,22 @@ check_missing_comp_in_queue(struct ena_adapter *adapte
 
 			tx_buf->print_once = true;
 			missed_tx++;
-			counter_u64_add(tx_ring->tx_stats.missing_tx_comp, 1);
-
-			if (unlikely(missed_tx >
-			    adapter->missing_tx_threshold)) {
-				device_printf(adapter->pdev,
-				    "The number of lost tx completion "
-				    "is above the threshold (%d > %d). "
-				    "Reset the device\n",
-				    missed_tx, adapter->missing_tx_threshold);
-				adapter->reset_reason =
-				    ENA_REGS_RESET_MISS_TX_CMPL;
-				adapter->trigger_reset = true;
-				return (EIO);
-			}
 		}
 	}
 
-	return (0);
+	if (unlikely(missed_tx > adapter->missing_tx_threshold)) {
+		device_printf(adapter->pdev,
+		    "The number of lost tx completion is above the threshold "
+		    "(%d > %d). Reset the device\n",
+		    missed_tx, adapter->missing_tx_threshold);
+		adapter->reset_reason = ENA_REGS_RESET_MISS_TX_CMPL;
+		adapter->trigger_reset = true;
+		rc = EIO;
+	}
+
+	counter_u64_add(tx_ring->tx_stats.missing_tx_comp, missed_tx);
+
+	return (rc);
 }
 
 /*
@@ -3385,9 +3427,10 @@ check_missing_comp_in_queue(struct ena_adapter *adapte
  * transactions exceeds "missing_tx_threshold".
  */
 static void
-check_for_missing_tx_completions(struct ena_adapter *adapter)
+check_for_missing_completions(struct ena_adapter *adapter)
 {
 	struct ena_ring *tx_ring;
+	struct ena_ring *rx_ring;
 	int i, budget, rc;
 
 	/* Make sure the driver doesn't turn the device in other process */
@@ -3406,11 +3449,16 @@ check_for_missing_tx_completions(struct ena_adapter *a
 
 	for (i = adapter->next_monitored_tx_qid; i < adapter->num_queues; i++) {
 		tx_ring = &adapter->tx_ring[i];
+		rx_ring = &adapter->rx_ring[i];
 
-		rc = check_missing_comp_in_queue(adapter, tx_ring);
+		rc = check_missing_comp_in_tx_queue(adapter, tx_ring);
 		if (unlikely(rc != 0))
 			return;
 
+		rc = check_for_rx_interrupt_queue(adapter, rx_ring);
+		if (unlikely(rc != 0))
+			return;
+
 		budget--;
 		if (budget == 0) {
 			i++;
@@ -3516,7 +3564,7 @@ ena_timer_service(void *data)
 
 	check_for_admin_com_state(adapter);
 
-	check_for_missing_tx_completions(adapter);
+	check_for_missing_completions(adapter);
 
 	check_for_empty_rx_ring(adapter);
 

Modified: head/sys/dev/ena/ena.h
==============================================================================
--- head/sys/dev/ena/ena.h	Thu May 30 13:15:38 2019	(r348392)
+++ head/sys/dev/ena/ena.h	Thu May 30 13:16:56 2019	(r348393)
@@ -120,6 +120,8 @@
 #define	ENA_IO_IRQ_FIRST_IDX		1
 #define	ENA_IO_IRQ_IDX(q)		(ENA_IO_IRQ_FIRST_IDX + (q))
 
+#define	ENA_MAX_NO_INTERRUPT_ITERATIONS	3
+
 /*
  * ENA device should send keep alive msg every 1 sec.
  * We wait for 6 sec just to be on the safe side.
@@ -240,6 +242,9 @@ struct ena_ring {
 	enum ena_admin_placement_policy_type tx_mem_queue_type;
 	/* The maximum length the driver can push to the device (For LLQ) */
 	uint8_t tx_max_header_size;
+
+	bool first_interrupt;
+	uint16_t no_interrupt_event_cnt;
 
 	struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS];
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201905301316.x4UDGuli061146>