FreeBSD Mail Archives

Date:      Mon, 21 Jul 2025 22:21:57 -0600
From:      Warner Losh <imp@bsdimp.com>
To:        FreeBSD Current <freebsd-current@freebsd.org>
Subject:   Fwd: git: b21e67875bf0 - main - nvme: Move AER processing into a task thread
Message-ID:  <CANCZdfrvnG%2BcrC_ovPuqq=Wx=Rqrvvsy-UhFKyDfUe1t0rF1HA@mail.gmail.com>
In-Reply-To: <202507220400.56M40qpk032441@gitrepo.freebsd.org>

index | next in thread | previous in thread | raw e-mail


This is a change to the NVMe error handling. It should fix some
crashes that people have seen when there's a NVMe data integrity
failure or other SMART issue since I added devd reporting for those.
However, it's lightly tested. I have an nvme card that does similar
smart failures, but I only had one failure test (and it may have been
inconclusive). This failure is hard to test because usually it's a
one-time event on an NVMe card.

I was going to run this in the Netflix world for a few months to see
if we avoided the crash or three we get a month from this issue, but
it 'leaked' out of my tree due to an oversight on my part when I
pushed a separate change. I'd thought this was in a branch, but was in
main. Double apologies for pushing this during STAB week. Catching up
with my email I just noticed we were there...

If there's any issues, I'll back this out asap, but will let it mellow if not.

Warner

---------- Forwarded message ---------
From: Warner Losh <imp@freebsd.org>
Date: Mon, Jul 21, 2025 at 10:01 PM
Subject: git: b21e67875bf0 - main - nvme: Move AER processing into a task thread
To: <src-committers@freebsd.org>, <dev-commits-src-all@freebsd.org>,
<dev-commits-src-main@freebsd.org>


The branch main has been updated by imp:

URL: https://cgit.FreeBSD.org/src/commit/?id=b21e67875bf0c4b1e7933090b80307abb24b7d03

commit b21e67875bf0c4b1e7933090b80307abb24b7d03
Author:     Warner Losh <imp@FreeBSD.org>
AuthorDate: 2025-07-18 22:17:35 +0000
Commit:     Warner Losh <imp@FreeBSD.org>
CommitDate: 2025-07-22 04:00:33 +0000

    nvme: Move AER processing into a task thread

    Move the AER processing into the taskqueue thread. We do memory
    allocations and such burried deep in things we call, so this just makes
    all that simpler and doesn't stall the completion thread. It fixes a few
    panics if you get a reliability failure from the drive at the wrong
    time.

    Sponsored by:           Netflix
    MFC After:              2 weeks
---
 sys/dev/nvme/nvme_ctrlr.c   | 289 +++++++++++++++++++++++++-------------------
 sys/dev/nvme/nvme_private.h |   2 +
 2 files changed, 170 insertions(+), 121 deletions(-)

diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index c86316337e93..fd7f00ced14b 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -48,7 +48,7 @@
 #define B4_CHK_RDY_DELAY_MS    2300            /* work around controller bug */

 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
-                                               struct
nvme_async_event_request *aer);
+    struct nvme_async_event_request *aer);

 static void
 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
@@ -679,96 +679,6 @@ nvme_ctrlr_log_critical_warnings(struct
nvme_controller *ctrlr,
        nvme_ctrlr_devctl(ctrlr, "critical", "SMART_ERROR",
"state=0x%02x", state);
 }

-static void
-nvme_ctrlr_async_event_log_page_cb(void *arg, const struct
nvme_completion *cpl)
-{
-       struct nvme_async_event_request         *aer = arg;
-       struct nvme_health_information_page     *health_info;
-       struct nvme_ns_list                     *nsl;
-       struct nvme_error_information_entry     *err;
-       int i;
-
-       /*
-        * If the log page fetch for some reason completed with an error,
-        *  don't pass log page data to the consumers.  In practice, this case
-        *  should never happen.
-        */
-       if (nvme_completion_is_error(cpl))
-               nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-                   aer->log_page_id, NULL, 0);
-       else {
-               /* Convert data to host endian */
-               switch (aer->log_page_id) {
-               case NVME_LOG_ERROR:
-                       err = (struct nvme_error_information_entry
*)aer->log_page_buffer;
-                       for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
-                               nvme_error_information_entry_swapbytes(err++);
-                       break;
-               case NVME_LOG_HEALTH_INFORMATION:
-                       nvme_health_information_page_swapbytes(
-                           (struct nvme_health_information_page
*)aer->log_page_buffer);
-                       break;
-               case NVME_LOG_CHANGED_NAMESPACE:
-                       nvme_ns_list_swapbytes(
-                           (struct nvme_ns_list *)aer->log_page_buffer);
-                       break;
-               case NVME_LOG_COMMAND_EFFECT:
-                       nvme_command_effects_page_swapbytes(
-                           (struct nvme_command_effects_page
*)aer->log_page_buffer);
-                       break;
-               case NVME_LOG_RES_NOTIFICATION:
-                       nvme_res_notification_page_swapbytes(
-                           (struct nvme_res_notification_page
*)aer->log_page_buffer);
-                       break;
-               case NVME_LOG_SANITIZE_STATUS:
-                       nvme_sanitize_status_page_swapbytes(
-                           (struct nvme_sanitize_status_page
*)aer->log_page_buffer);
-                       break;
-               default:
-                       break;
-               }
-
-               if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
-                       health_info = (struct nvme_health_information_page *)
-                           aer->log_page_buffer;
-                       nvme_ctrlr_log_critical_warnings(aer->ctrlr,
-                           health_info->critical_warning);
-                       /*
-                        * Critical warnings reported through the
-                        *  SMART/health log page are persistent, so
-                        *  clear the associated bits in the async event
-                        *  config so that we do not receive repeated
-                        *  notifications for the same event.
-                        */
-                       aer->ctrlr->async_event_config &=
-                           ~health_info->critical_warning;
-                       nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
-                           aer->ctrlr->async_event_config, NULL, NULL);
-               } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
-                   !nvme_use_nvd) {
-                       nsl = (struct nvme_ns_list *)aer->log_page_buffer;
-                       for (i = 0; i < nitems(nsl->ns) && nsl->ns[i]
!= 0; i++) {
-                               if (nsl->ns[i] > NVME_MAX_NAMESPACES)
-                                       break;
-                               nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
-                       }
-               }
-
-               /*
-                * Pass the cpl data from the original async event completion,
-                *  not the log page fetch.
-                */
-               nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
-                   aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
-       }
-
-       /*
-        * Repost another asynchronous event request to replace the one
-        *  that just completed.
-        */
-       nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-}
-
 static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
@@ -784,33 +694,18 @@ nvme_ctrlr_async_event_cb(void *arg, const
struct nvme_completion *cpl)
                return;
        }

-       /* Associated log page is in bits 23:16 of completion entry dw0. */
+       /*
+        * Save the completion status and associated log page is in bits 23:16
+        * of completion entry dw0. Print a message and queue it for further
+        * processing.
+        */
+       memcpy(&aer->cpl, cpl, sizeof(*cpl));
        aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0);
-
        nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
            " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0),
            NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0),
            aer->log_page_id);
-
-       if (is_log_page_id_valid(aer->log_page_id)) {
-               aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
-                   aer->log_page_id);
-               memcpy(&aer->cpl, cpl, sizeof(*cpl));
-               nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
-                   NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
-                   aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
-                   aer);
-               /* Wait to notify consumers until after log page is fetched. */
-       } else {
-               nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
-                   NULL, 0);
-
-               /*
-                * Repost another asynchronous event request to replace the one
-                *  that just completed.
-                */
-               nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
-       }
+       taskqueue_enqueue(aer->ctrlr->taskqueue, &aer->task);
 }

 static void
@@ -819,15 +714,21 @@ nvme_ctrlr_construct_and_submit_aer(struct
nvme_controller *ctrlr,
 {
        struct nvme_request *req;

-       aer->ctrlr = ctrlr;
        /*
-        * XXX-MJ this should be M_WAITOK but we might be in a non-sleepable
-        * callback context.  AER completions should be handled on a dedicated
-        * thread.
+        * We're racing the reset thread, so let that process submit this again.
+        * XXX does this really solve that race? And is that race even possible
+        * since we only reset when we've no theard from the card in a long
+        * time. Why would we get an AER in the middle of that just before we
+        * kick off the reset?
         */
-       req = nvme_allocate_request_null(M_NOWAIT, nvme_ctrlr_async_event_cb,
+       if (ctrlr->is_resetting)
+               return;
+
+       aer->ctrlr = ctrlr;
+       req = nvme_allocate_request_null(M_WAITOK, nvme_ctrlr_async_event_cb,
            aer);
        aer->req = req;
+       aer->log_page_id = 0;           /* Not a valid page */

        /*
         * Disable timeout here, since asynchronous event requests should by
@@ -1203,6 +1104,140 @@ nvme_ctrlr_reset_task(void *arg, int pending)
        atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }

+static void
+nvme_ctrlr_aer_done(void *arg,  const struct nvme_completion *cpl)
+{
+       struct nvme_async_event_request *aer = arg;
+
+       mtx_lock(&aer->mtx);
+       if (nvme_completion_is_error(cpl))
+               aer->log_page_size = (uint32_t)-1;
+       else
+               aer->log_page_size = nvme_ctrlr_get_log_page_size(
+                   aer->ctrlr, aer->log_page_id);
+       wakeup(aer);
+       mtx_unlock(&aer->mtx);
+}
+
+static void
+nvme_ctrlr_aer_task(void *arg, int pending)
+{
+       struct nvme_async_event_request *aer = arg;
+       struct nvme_controller  *ctrlr = aer->ctrlr;
+       uint32_t len;
+
+       /*
+        * We're resetting, so just punt.
+        */
+       if (ctrlr->is_resetting)
+               return;
+
+       if (!is_log_page_id_valid(aer->log_page_id)) {
+               /*
+                * Repost another asynchronous event request to replace the one
+                * that just completed.
+                */
+               nvme_notify_async_consumers(ctrlr, &aer->cpl, aer->log_page_id,
+                   NULL, 0);
+               nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+               goto out;
+       }
+
+       aer->log_page_size = 0;
+       len = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id);
+       nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
+           NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, len,
+           nvme_ctrlr_aer_done, aer);
+       mtx_lock(&aer->mtx);
+       while (aer->log_page_size == 0)
+               mtx_sleep(aer, &aer->mtx, PRIBIO, "nvme_pt", 0);
+       mtx_unlock(&aer->mtx);
+
+       if (aer->log_page_size != (uint32_t)-1) {
+               /*
+                * If the log page fetch for some reason completed with an
+                * error, don't pass log page data to the consumers.  In
+                * practice, this case should never happen.
+                */
+               nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+                   aer->log_page_id, NULL, 0);
+               goto out;
+       }
+
+       /* Convert data to host endian */
+       switch (aer->log_page_id) {
+       case NVME_LOG_ERROR: {
+               struct nvme_error_information_entry *err =
+                   (struct nvme_error_information_entry *)aer->log_page_buffer;
+               for (int i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
+                       nvme_error_information_entry_swapbytes(err++);
+               break;
+       }
+       case NVME_LOG_HEALTH_INFORMATION:
+               nvme_health_information_page_swapbytes(
+                       (struct nvme_health_information_page
*)aer->log_page_buffer);
+               break;
+       case NVME_LOG_CHANGED_NAMESPACE:
+               nvme_ns_list_swapbytes(
+                       (struct nvme_ns_list *)aer->log_page_buffer);
+               break;
+       case NVME_LOG_COMMAND_EFFECT:
+               nvme_command_effects_page_swapbytes(
+                       (struct nvme_command_effects_page
*)aer->log_page_buffer);
+               break;
+       case NVME_LOG_RES_NOTIFICATION:
+               nvme_res_notification_page_swapbytes(
+                       (struct nvme_res_notification_page
*)aer->log_page_buffer);
+               break;
+       case NVME_LOG_SANITIZE_STATUS:
+               nvme_sanitize_status_page_swapbytes(
+                       (struct nvme_sanitize_status_page
*)aer->log_page_buffer);
+               break;
+       default:
+               break;
+       }
+
+       if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
+               struct nvme_health_information_page *health_info =
+                   (struct nvme_health_information_page *)aer->log_page_buffer;
+
+               /*
+                * Critical warnings reported through the SMART/health log page
+                * are persistent, so clear the associated bits in the async
+                * event config so that we do not receive repeated notifications
+                * for the same event.
+                */
+               nvme_ctrlr_log_critical_warnings(aer->ctrlr,
+                   health_info->critical_warning);
+               aer->ctrlr->async_event_config &=
+                   ~health_info->critical_warning;
+               nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
+                   aer->ctrlr->async_event_config, NULL, NULL);
+       } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE) {
+               struct nvme_ns_list *nsl =
+                   (struct nvme_ns_list *)aer->log_page_buffer;
+               for (int i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
+                       if (nsl->ns[i] > NVME_MAX_NAMESPACES)
+                               break;
+                       nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
+               }
+       }
+
+       /*
+        * Pass the cpl data from the original async event completion, not the
+        * log page fetch.
+        */
+       nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
+           aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
+
+       /*
+        * Repost another asynchronous event request to replace the one
+        *  that just completed.
+        */
+out:
+       nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
+}
+
 /*
  * Poll all the queues enabled on the device for completion.
  */
@@ -1574,8 +1609,8 @@ nvme_ctrlr_construct(struct nvme_controller
*ctrlr, device_t dev)
        /*
         * Create 2 threads for the taskqueue. The reset thread will block when
         * it detects that the controller has failed until all I/O has been
-        * failed up the stack. The second thread used to be for failing
-        * requests.
+        * failed up the stack. The second thread is used for AER events, which
+        * can block, but only briefly for memory and log page fetching.
         */
        ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
            taskqueue_thread_enqueue, &ctrlr->taskqueue);
@@ -1585,6 +1620,12 @@ nvme_ctrlr_construct(struct nvme_controller
*ctrlr, device_t dev)
        ctrlr->is_initialized = false;
        ctrlr->notification_sent = 0;
        TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
+       for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+               struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+               TASK_INIT(&aer->task, 0, nvme_ctrlr_aer_task, aer);
+               mtx_init(&aer->mtx, "AER mutex", NULL, MTX_DEF);
+       }
        ctrlr->is_failed = false;

        make_dev_args_init(&md_args);
@@ -1672,8 +1713,14 @@ nvme_ctrlr_destruct(struct nvme_controller
*ctrlr, device_t dev)
        }

 noadminq:
-       if (ctrlr->taskqueue)
+       if (ctrlr->taskqueue) {
                taskqueue_free(ctrlr->taskqueue);
+               for (int i = 0; i < NVME_MAX_ASYNC_EVENTS; i++) {
+                       struct nvme_async_event_request *aer = &ctrlr->aer[i];
+
+                       mtx_destroy(&aer->mtx);
+               }
+       }

        if (ctrlr->tag)
                bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
index 0340b13b7de5..36f00fedc48e 100644
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -123,6 +123,8 @@ struct nvme_request {
 struct nvme_async_event_request {
        struct nvme_controller          *ctrlr;
        struct nvme_request             *req;
+       struct task                     task;
+       struct mtx                      mtx;
        struct nvme_completion          cpl;
        uint32_t                        log_page_id;
        uint32_t                        log_page_size;

home | help

Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?CANCZdfrvnG%2BcrC_ovPuqq=Wx=Rqrvvsy-UhFKyDfUe1t0rF1HA>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation