Date: Mon, 29 Jun 2020 00:31:24 +0000 (UTC)
From: Chuck Tuffli <chuck@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r362748 - head/usr.sbin/bhyve
Message-ID: <202006290031.05T0VOlI047065@repo.freebsd.org>
Author: chuck
Date: Mon Jun 29 00:31:24 2020
New Revision: 362748

URL: https://svnweb.freebsd.org/changeset/base/362748

Log:
  bhyve: add locks around NVMe queue accesses

  The NVMe code attempted to ensure thread safety through a combination
  of atomics and a "busy" flag, but this approach leads to unavoidable
  race conditions. The fix is to use per-queue mutex locks to ensure
  thread safety within the queue processing code.

  While in the neighborhood, move all the queue initialization code to
  a common function.

  Tested by:    Jason Tubnor
  MFC after:    2 weeks
  Differential Revision:    https://reviews.freebsd.org/D19841

Modified:
  head/usr.sbin/bhyve/pci_nvme.c

Modified: head/usr.sbin/bhyve/pci_nvme.c
==============================================================================
--- head/usr.sbin/bhyve/pci_nvme.c    Mon Jun 29 00:31:20 2020    (r362747)
+++ head/usr.sbin/bhyve/pci_nvme.c    Mon Jun 29 00:31:24 2020    (r362748)
@@ -153,21 +153,21 @@ enum nvme_copy_dir {
 
 struct nvme_completion_queue {
     struct nvme_completion *qbase;
+    pthread_mutex_t mtx;
     uint32_t    size;
     uint16_t    tail; /* nvme progress */
     uint16_t    head; /* guest progress */
     uint16_t    intr_vec;
     uint32_t    intr_en;
-    pthread_mutex_t mtx;
 };
 
 struct nvme_submission_queue {
     struct nvme_command *qbase;
+    pthread_mutex_t mtx;
     uint32_t    size;
     uint16_t    head; /* nvme progress */
     uint16_t    tail; /* guest progress */
     uint16_t    cqid; /* completion queue id */
-    int         busy; /* queue is being processed */
     int         qpriority;
 };
 
@@ -339,7 +339,63 @@ pci_nvme_toggle_phase(uint16_t *status, int prev)
     *status |= NVME_STATUS_P;
 }
 
+/*
+ * Initialize the requested number of IO Submission and Completion Queues.
+ * Admin queues are allocated implicitly.
+ */
 static void
+pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
+{
+    uint32_t i;
+
+    /*
+     * Allocate and initialize the Submission Queues
+     */
+    if (nsq > NVME_QUEUES) {
+        WPRINTF("%s: clamping number of SQ from %u to %u",
+            __func__, nsq, NVME_QUEUES);
+        nsq = NVME_QUEUES;
+    }
+
+    sc->num_squeues = nsq;
+
+    sc->submit_queues = calloc(sc->num_squeues + 1,
+        sizeof(struct nvme_submission_queue));
+    if (sc->submit_queues == NULL) {
+        WPRINTF("%s: SQ allocation failed", __func__);
+        sc->num_squeues = 0;
+    } else {
+        struct nvme_submission_queue *sq = sc->submit_queues;
+
+        for (i = 0; i < sc->num_squeues; i++)
+            pthread_mutex_init(&sq[i].mtx, NULL);
+    }
+
+    /*
+     * Allocate and initialize the Completion Queues
+     */
+    if (ncq > NVME_QUEUES) {
+        WPRINTF("%s: clamping number of CQ from %u to %u",
+            __func__, ncq, NVME_QUEUES);
+        ncq = NVME_QUEUES;
+    }
+
+    sc->num_cqueues = ncq;
+
+    sc->compl_queues = calloc(sc->num_cqueues + 1,
+        sizeof(struct nvme_completion_queue));
+    if (sc->compl_queues == NULL) {
+        WPRINTF("%s: CQ allocation failed", __func__);
+        sc->num_cqueues = 0;
+    } else {
+        struct nvme_completion_queue *cq = sc->compl_queues;
+
+        for (i = 0; i < sc->num_cqueues; i++)
+            pthread_mutex_init(&cq[i].mtx, NULL);
+    }
+}
+
+static void
 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
 {
     struct nvme_controller_data *cd = &sc->ctrldata;
@@ -498,6 +554,8 @@ pci_nvme_init_logpages(struct pci_nvme_softc *sc)
 static void
 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
 {
+    uint32_t i;
+
     DPRINTF("%s", __func__);
 
     sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
@@ -511,44 +569,23 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc)
     sc->regs.cc = 0;
     sc->regs.csts = 0;
 
-    sc->num_cqueues = sc->num_squeues = sc->max_queues;
-    if (sc->submit_queues != NULL) {
-        for (int i = 0; i < sc->num_squeues + 1; i++) {
-            /*
-             * The Admin Submission Queue is at index 0.
-             * It must not be changed at reset otherwise the
-             * emulation will be out of sync with the guest.
-             */
-            if (i != 0) {
-                sc->submit_queues[i].qbase = NULL;
-                sc->submit_queues[i].size = 0;
-                sc->submit_queues[i].cqid = 0;
-            }
-            sc->submit_queues[i].tail = 0;
-            sc->submit_queues[i].head = 0;
-            sc->submit_queues[i].busy = 0;
-        }
-    } else
-        sc->submit_queues = calloc(sc->num_squeues + 1,
-            sizeof(struct nvme_submission_queue));
+    assert(sc->submit_queues != NULL);
 
-    if (sc->compl_queues != NULL) {
-        for (int i = 0; i < sc->num_cqueues + 1; i++) {
-            /* See Admin Submission Queue note above */
-            if (i != 0) {
-                sc->compl_queues[i].qbase = NULL;
-                sc->compl_queues[i].size = 0;
-            }
+    for (i = 0; i < sc->num_squeues + 1; i++) {
+        sc->submit_queues[i].qbase = NULL;
+        sc->submit_queues[i].size = 0;
+        sc->submit_queues[i].cqid = 0;
+        sc->submit_queues[i].tail = 0;
+        sc->submit_queues[i].head = 0;
+    }
 
-            sc->compl_queues[i].tail = 0;
-            sc->compl_queues[i].head = 0;
-        }
-    } else {
-        sc->compl_queues = calloc(sc->num_cqueues + 1,
-            sizeof(struct nvme_completion_queue));
+    assert(sc->compl_queues != NULL);
 
-        for (int i = 0; i < sc->num_cqueues + 1; i++)
-            pthread_mutex_init(&sc->compl_queues[i].mtx, NULL);
+    for (i = 0; i < sc->num_cqueues + 1; i++) {
+        sc->compl_queues[i].qbase = NULL;
+        sc->compl_queues[i].size = 0;
+        sc->compl_queues[i].tail = 0;
+        sc->compl_queues[i].head = 0;
     }
 }
 
@@ -1092,14 +1129,9 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
     sq = &sc->submit_queues[0];
     cq = &sc->compl_queues[0];
 
-    sqhead = atomic_load_acq_short(&sq->head);
+    pthread_mutex_lock(&sq->mtx);
 
-    if (atomic_testandset_int(&sq->busy, 1)) {
-        DPRINTF("%s SQ busy, head %u, tail %u",
-            __func__, sqhead, sq->tail);
-        return;
-    }
-
+    sqhead = sq->head;
     DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
 
     while (sqhead != atomic_load_acq_short(&sq->tail)) {
@@ -1162,6 +1194,8 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
             struct nvme_completion *cp;
             int phase;
 
+            pthread_mutex_lock(&cq->mtx);
+
             cp = &(cq->qbase)[cq->tail];
             cp->cdw0 = compl.cdw0;
             cp->sqid = 0;
@@ -1173,16 +1207,18 @@ pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, u
             pci_nvme_toggle_phase(&cp->status, phase);
 
             cq->tail = (cq->tail + 1) % cq->size;
+
+            pthread_mutex_unlock(&cq->mtx);
         }
     }
 
     DPRINTF("setting sqhead %u", sqhead);
-    atomic_store_short(&sq->head, sqhead);
-    atomic_store_int(&sq->busy, 0);
+    sq->head = sqhead;
 
     if (cq->head != cq->tail)
         pci_generate_msix(sc->nsc_pi, 0);
 
+    pthread_mutex_unlock(&sq->mtx);
 }
 
 static int
@@ -1272,7 +1308,7 @@ pci_nvme_append_iov_req(struct pci_nvme_softc *sc, str
 static void
 pci_nvme_set_completion(struct pci_nvme_softc *sc,
     struct nvme_submission_queue *sq, int sqid, uint16_t cid,
-    uint32_t cdw0, uint16_t status, int ignore_busy)
+    uint32_t cdw0, uint16_t status)
 {
     struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
     struct nvme_completion *compl;
@@ -1290,7 +1326,7 @@ pci_nvme_set_completion(struct pci_nvme_softc *sc,
 
     compl->cdw0 = cdw0;
     compl->sqid = sqid;
-    compl->sqhd = atomic_load_acq_short(&sq->head);
+    compl->sqhd = sq->head;
     compl->cid = cid;
 
     // toggle phase
@@ -1375,7 +1411,7 @@ pci_nvme_io_done(struct blockif_req *br, int err)
     code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
     pci_nvme_status_genc(&status, code);
 
-    pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0);
+    pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
     pci_nvme_release_ioreq(req->sc, req);
 }
 
@@ -1575,7 +1611,7 @@ pci_nvme_dealloc_sm(struct blockif_req *br, int err)
 
     if (done) {
         pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
-            req->cid, 0, status, 0);
+            req->cid, 0, status);
         pci_nvme_release_ioreq(sc, req);
     }
 }
@@ -1677,13 +1713,9 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint
     /* handle all submissions up to sq->tail index */
     sq = &sc->submit_queues[idx];
 
-    if (atomic_testandset_int(&sq->busy, 1)) {
-        DPRINTF("%s sqid %u busy", __func__, idx);
-        return;
-    }
+    pthread_mutex_lock(&sq->mtx);
 
-    sqhead = atomic_load_acq_short(&sq->head);
-
+    sqhead = sq->head;
     DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
         idx, sqhead, sq->tail, sq->qbase);
 
@@ -1750,14 +1782,15 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint
 complete:
         if (!pending) {
             pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
-                status, 1);
+                status);
             if (req != NULL)
                 pci_nvme_release_ioreq(sc, req);
         }
     }
 
-    atomic_store_short(&sq->head, sqhead);
-    atomic_store_int(&sq->busy, 0);
+    sq->head = sqhead;
+
+    pthread_mutex_unlock(&sq->mtx);
 }
 
 static void
@@ -1768,6 +1801,13 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci
         idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
 
     if (is_sq) {
+        if (idx > sc->num_squeues) {
+            WPRINTF("%s queue index %lu overflow from "
+                "guest (max %u)",
+                __func__, idx, sc->num_squeues);
+            return;
+        }
+
         atomic_store_short(&sc->submit_queues[idx].tail,
             (uint16_t)value);
 
@@ -1791,7 +1831,8 @@ pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci
             return;
         }
 
-        sc->compl_queues[idx].head = (uint16_t)value;
+        atomic_store_short(&sc->compl_queues[idx].head,
+            (uint16_t)value);
     }
 }
 
@@ -2236,7 +2277,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p
     pthread_mutex_init(&sc->mtx, NULL);
     sem_init(&sc->iosemlock, 0, sc->ioslots);
 
-    pci_nvme_reset(sc);
+    pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
     /*
     * Controller data depends on Namespace data so initialize Namespace
     * data first.
@@ -2244,6 +2285,8 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p
     pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
     pci_nvme_init_ctrldata(sc);
     pci_nvme_init_logpages(sc);
+
+    pci_nvme_reset(sc);
 
     pci_lintr_request(pi);
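
For readers following the locking change, the pattern the diff moves to can be
reduced to a few lines: each queue carries its own pthread mutex, and the
doorbell handler drains the queue while holding that lock instead of bailing
out when an atomic "busy" flag is already set. The sketch below is a minimal,
hypothetical illustration of that idea; the toy_queue type and
toy_process_doorbell function are invented for this example and are not part
of bhyve.

/*
 * Minimal sketch (not bhyve code) of the per-queue mutex pattern.
 * With a busy-flag scheme, a second doorbell arriving while the flag
 * is set returns early and the newly posted entries may sit
 * unprocessed. With a per-queue mutex, the second caller simply waits
 * and then observes the updated tail.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct toy_queue {
    pthread_mutex_t mtx;   /* serializes queue processing */
    uint16_t head;         /* emulation progress */
    uint16_t tail;         /* guest progress (doorbell value) */
    uint32_t size;
};

/* Called whenever the guest rings the doorbell for this queue. */
static void
toy_process_doorbell(struct toy_queue *q)
{
    pthread_mutex_lock(&q->mtx);
    while (q->head != q->tail) {
        printf("processing entry %u\n", q->head);
        q->head = (q->head + 1) % q->size;
    }
    pthread_mutex_unlock(&q->mtx);
}

int
main(void)
{
    struct toy_queue q = { .size = 8 };

    pthread_mutex_init(&q.mtx, NULL);
    q.tail = 3;                 /* guest posted three commands */
    toy_process_doorbell(&q);   /* drains entries 0..2 */
    pthread_mutex_destroy(&q.mtx);
    return (0);
}

Compared with the old approach, a caller that loses the race blocks briefly
rather than returning, so a doorbell that arrives while another thread is
draining the queue is never silently dropped.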