Date: Tue, 20 May 2025 23:51:16 GMT From: Mark Johnston <markj@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: 3d2957336c7d - main - gve: Add callout to detect and handle TX timeouts Message-ID: <202505202351.54KNpGV4059126@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by markj: URL: https://cgit.FreeBSD.org/src/commit/?id=3d2957336c7ddaa0a29cf60cfd458c07df1f5be9 commit 3d2957336c7ddaa0a29cf60cfd458c07df1f5be9 Author: Jasper Tran O'Leary <jtranoleary@google.com> AuthorDate: 2025-05-20 23:34:45 +0000 Commit: Mark Johnston <markj@FreeBSD.org> CommitDate: 2025-05-20 23:50:59 +0000 gve: Add callout to detect and handle TX timeouts A TX timeout occurs when the driver allocates resources on a TX queue for a packet to be sent, prompts the hardware to send the packet, but does not receive a completion for the packet within a given timeout period. An accumulation of TX timeouts can cause one or more queues to run out of space and cause the entire driver to become stuck. This commit adds a lockless timer service that runs periodically and checks queues for timed out packets. In the event we detect a timeout, we prompt the completion phase taskqueue to process completions. Upon the next inspection of the queue we still detect timed out packets, if the last "kick" occurred within a fixed cooldown window, we opt to reset the driver, even if the prior kick successfully freed timed out packets. Signed-off-by: Jasper Tran O'Leary <jtranoleary@google.com> Reviewed by: markj, ziaee MFC after: 2 weeks Differential Revision: https://reviews.freebsd.org/D50385 --- share/man/man4/gve.4 | 8 ++++- sys/dev/gve/gve.h | 47 +++++++++++++++++++++++++++++ sys/dev/gve/gve_main.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++-- sys/dev/gve/gve_sysctl.c | 6 +++- sys/dev/gve/gve_tx.c | 32 ++++++++++++++++++++ sys/dev/gve/gve_tx_dqo.c | 32 +++++++++++++++++++- sys/dev/gve/gve_utils.c | 43 ++++++++++++++++++++++++++ 7 files changed, 241 insertions(+), 5 deletions(-) diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 index 754071e2fad8..924a01a06d08 100644 --- a/share/man/man4/gve.4 +++ b/share/man/man4/gve.4 @@ -26,7 +26,7 @@ .\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS .\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Dd October 14, 2024 +.Dd May 20, 2025 .Dt GVE 4 .Os .Sh NAME @@ -180,6 +180,12 @@ These messages are seen if any admin queue command fails: .It "Unknown AQ command opcode %d" .El .Pp +These messages appear if a TX timeout is detected: +.Bl -diag +.It "Found %d timed out packet(s) on txq%d, kicking it for completions" +.It "Found %d timed out packet(s) on txq%d with its last kick %ld sec ago which is less than the cooldown period %d. Resetting device" +.El +.Pp These messages are recorded when the device is being reset due to an error: .Bl -diag .It "Scheduling reset task!" diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index 5b298b889ed6..48e9a371df21 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -47,6 +47,21 @@ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 +#define GVE_TX_TIMEOUT_PKT_SEC 5 +#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5 +/* + * If the driver finds timed out packets on a tx queue it first kicks it and + * records the time. If the driver again finds a timeout on the same queue + * before the end of the cooldown period, only then will it reset. Thus, for a + * reset to be able to occur at all, the cooldown must be at least as long + * as the tx timeout checking cadence multiplied by the number of queues. + */ +#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16 +#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \ + (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES) + +#define GVE_TIMESTAMP_INVALID -1 + #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 @@ -337,6 +352,14 @@ struct gve_tx_fifo { struct gve_tx_buffer_state { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; @@ -357,12 +380,21 @@ struct gve_txq_stats { counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; + counter_u64_t tx_timeout; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + union { /* RDA */ bus_dmamap_t dmamap; @@ -396,6 +428,8 @@ struct gve_tx_ring { uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ + int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */ + union { /* GQI specific stuff */ struct { @@ -594,6 +628,11 @@ struct gve_priv { struct gve_state_flags state_flags; struct sx gve_iface_lock; + + struct callout tx_timeout_service; + /* The index of tx queue that the timer service will check on its next invocation */ + uint16_t check_tx_queue_idx; + }; static inline bool @@ -652,6 +691,7 @@ int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_ void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); @@ -662,6 +702,7 @@ void gve_tx_cleanup_tq(void *arg, int pending); int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); +int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); @@ -697,6 +738,12 @@ int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); +/* Miscellaneous functions defined in gve_utils.c */ +void gve_invalidate_timestamp(int64_t *timestamp_sec); +int64_t gve_seconds_since(int64_t *timestamp_sec); +void gve_set_timestamp(int64_t *timestamp_sec); +bool gve_timestamp_valid(int64_t *timestamp_sec); + /* Systcl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index 8a00deedef36..2abd9d1aa698 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -32,10 +32,10 @@ #include "gve_adminq.h" #include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.3\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.4\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 -#define GVE_VERSION_SUB 3 +#define GVE_VERSION_SUB 4 #define GVE_DEFAULT_RX_COPYBREAK 256 @@ -50,6 +50,9 @@ static struct gve_dev { struct sx gve_global_lock; +static void gve_start_tx_timeout_service(struct gve_priv *priv); +static void gve_stop_tx_timeout_service(struct gve_priv *priv); + static int gve_verify_driver_compatibility(struct gve_priv *priv) { @@ -99,6 +102,72 @@ gve_verify_driver_compatibility(struct gve_priv *priv) return (err); } +static void +gve_handle_tx_timeout(struct gve_priv *priv, struct gve_tx_ring *tx, + int num_timeout_pkts) +{ + int64_t time_since_last_kick; + + counter_u64_add_protected(tx->stats.tx_timeout, 1); + + /* last_kicked is never GVE_TIMESTAMP_INVALID so we can skip checking */ + time_since_last_kick = gve_seconds_since(&tx->last_kicked); + + /* Try kicking first in case the timeout is due to a missed interrupt */ + if (time_since_last_kick > GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d, kicking it for completions\n", + num_timeout_pkts, tx->com.id); + gve_set_timestamp(&tx->last_kicked); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } else { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d with its last kick %jd sec ago which is less than the cooldown period %d. Resetting device\n", + num_timeout_pkts, tx->com.id, + (intmax_t)time_since_last_kick, + GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC); + gve_schedule_reset(priv); + } +} + +static void +gve_tx_timeout_service_callback(void *data) +{ + struct gve_priv *priv = (struct gve_priv *)data; + struct gve_tx_ring *tx; + uint16_t num_timeout_pkts; + + tx = &priv->tx[priv->check_tx_queue_idx]; + + num_timeout_pkts = gve_is_gqi(priv) ? + gve_check_tx_timeout_gqi(priv, tx) : + gve_check_tx_timeout_dqo(priv, tx); + if (num_timeout_pkts) + gve_handle_tx_timeout(priv, tx, num_timeout_pkts); + + priv->check_tx_queue_idx = (priv->check_tx_queue_idx + 1) % + priv->tx_cfg.num_queues; + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_start_tx_timeout_service(struct gve_priv *priv) +{ + priv->check_tx_queue_idx = 0; + callout_init(&priv->tx_timeout_service, true); + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_stop_tx_timeout_service(struct gve_priv *priv) +{ + callout_drain(&priv->tx_timeout_service); +} + static int gve_up(struct gve_priv *priv) { @@ -149,6 +218,9 @@ gve_up(struct gve_priv *priv) gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; + + gve_start_tx_timeout_service(priv); + return (0); reset: @@ -164,6 +236,8 @@ gve_down(struct gve_priv *priv) if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; + gve_stop_tx_timeout_service(priv); + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c index f7c7b5803865..661f61918853 100644 --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -168,7 +168,7 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, &stats->tx_delayed_pkt_tsoerr, "TSO packets delayed due to err in prep errors"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, - "tx_mbuf_collpase", CTLFLAG_RD, + "tx_mbuf_collapse", CTLFLAG_RD, &stats->tx_mbuf_collapse, "tx mbufs that had to be collapsed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, @@ -187,6 +187,10 @@ gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, "tx_mbuf_dmamap_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_err, "tx mbufs that could not be dma-mapped"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_timeout", CTLFLAG_RD, + &stats->tx_timeout, + "detections of timed out packets on tx queues"); } static void diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index b667df4ca06e..84e3a4c4eb9f 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -173,6 +173,8 @@ gve_tx_alloc_ring(struct gve_priv *priv, int i) } com->q_resources = com->q_resources_mem.cpu_addr; + tx->last_kicked = 0; + return (0); abort: @@ -218,6 +220,7 @@ gve_tx_clear_desc_ring(struct gve_tx_ring *tx) for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; + gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec); } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, @@ -344,6 +347,30 @@ gve_destroy_tx_rings(struct gve_priv *priv) return (0); } +int +gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_buffer_state *info; + uint32_t pkt_idx; + int num_timeouts; + + num_timeouts = 0; + + for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) { + info = &tx->info[pkt_idx]; + + if (!gve_timestamp_valid(&info->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&info->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr(void *arg) { @@ -396,7 +423,10 @@ gve_tx_cleanup_tq(void *arg, int pending) if (mbuf == NULL) continue; + gve_invalidate_timestamp(&info->enqueue_time_sec); + info->mbuf = NULL; + counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); @@ -685,6 +715,8 @@ gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; + gve_set_timestamp(&info->enqueue_time_sec); + /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c index 8a1993c3e712..bbf2ee1b0324 100644 --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -527,6 +527,8 @@ gve_alloc_pending_packet(struct gve_tx_ring *tx) tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + gve_set_timestamp(&pending_pkt->enqueue_time_sec); + return (pending_pkt); } @@ -539,6 +541,8 @@ gve_free_pending_packet(struct gve_tx_ring *tx, pending_pkt->state = GVE_PACKET_STATE_FREE; + gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec); + /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); @@ -939,6 +943,29 @@ gve_handle_packet_completion(struct gve_priv *priv, return (pkt_len); } +int +gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int num_timeouts; + uint16_t pkt_idx; + + num_timeouts = 0; + for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) { + pending_pkt = &tx->dqo.pending_pkts[pkt_idx]; + + if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&pending_pkt->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr_dqo(void *arg) { @@ -960,8 +987,11 @@ gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) struct gve_ring_com *com = &tx->com; int i; - for (i = 0; i < com->priv->tx_desc_cnt; i++) + for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + gve_invalidate_timestamp( + &tx->dqo.pending_pkts[i].enqueue_time_sec); + } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c index 4e9dd4625e2f..707b8f039d88 100644 --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -439,3 +439,46 @@ gve_mask_all_queue_irqs(struct gve_priv *priv) gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); } } + +/* + * In some cases, such as tracking timeout events, we must mark a timestamp as + * invalid when we do not want to consider its value. Such timestamps must be + * checked for validity before reading them. + */ +void +gve_invalidate_timestamp(int64_t *timestamp_sec) +{ + atomic_store_64(timestamp_sec, GVE_TIMESTAMP_INVALID); +} + +/* + * Returns 0 if the timestamp is invalid, otherwise returns the elapsed seconds + * since the timestamp was set. + */ +int64_t +gve_seconds_since(int64_t *timestamp_sec) +{ + struct bintime curr_time; + int64_t enqueued_time; + + getbintime(&curr_time); + enqueued_time = atomic_load_64(timestamp_sec); + if (enqueued_time == GVE_TIMESTAMP_INVALID) + return (0); + return ((int64_t)(curr_time.sec - enqueued_time)); +} + +void +gve_set_timestamp(int64_t *timestamp_sec) +{ + struct bintime curr_time; + + getbintime(&curr_time); + atomic_store_64(timestamp_sec, curr_time.sec); +} + +bool +gve_timestamp_valid(int64_t *timestamp_sec) +{ + return (atomic_load_64(timestamp_sec) != GVE_TIMESTAMP_INVALID); +}
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202505202351.54KNpGV4059126>