Date: Fri, 4 Jan 2019 15:03:31 +0000 (UTC) From: Chuck Tuffli <chuck@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r342761 - head/usr.sbin/bhyve Message-ID: <201901041503.x04F3VGe063766@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: chuck Date: Fri Jan 4 15:03:30 2019 New Revision: 342761 URL: https://svnweb.freebsd.org/changeset/base/342761 Log: Fix bhyve's NVMe queue bookkeeping Many size / length parameters in NVMe are "0's based", meaning, a value of 0x0 represents 1, 0x1 represents 2, etc.. While this leads to an efficient encoding, it can lead to subtle bugs. With respect to queues, these parameters include: - Maximum number of queue entries - Maximum number of queues - Number of Completion Queues - Number of Submission Queues To be consistent, convert all 0's based values from the host to 1's based value internally. Likewise, covert internal 1's based values to 0's based values when returned to the host. This fixes an off-by-one bug when creating IO queues and simplifies some of the code. Note that this bug is masked by another bug. While in the neighborhood, - fix an erroneous queue ID check (checking CQ count when deleting SQ) - check for queue ID of 0x0 in a few places where this is illegal - clean up the Set Features, Number of Queues command and check for illegal values Reviewed by: imp, araujo Approved by: imp (mentor) MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D18702 Modified: head/usr.sbin/bhyve/pci_nvme.c Modified: head/usr.sbin/bhyve/pci_nvme.c ============================================================================== --- head/usr.sbin/bhyve/pci_nvme.c Fri Jan 4 14:42:36 2019 (r342760) +++ head/usr.sbin/bhyve/pci_nvme.c Fri Jan 4 15:03:30 2019 (r342761) @@ -93,6 +93,16 @@ static int nvme_debug = 0; /* helpers */ +/* Convert a zero-based value into a one-based value */ +#define ONE_BASED(zero) ((zero) + 1) +/* Convert a one-based value into a zero-based value */ +#define ZERO_BASED(one) ((one) - 1) + +/* Encode number of SQ's and CQ's for Set/Get Features */ +#define NVME_FEATURE_NUM_QUEUES(sc) \ + (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ + (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; + #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) enum nvme_controller_register_offsets { @@ -192,8 +202,8 @@ struct pci_nvme_softc { struct pci_nvme_blockstore nvstore; - uint16_t max_qentries; /* max entries per queue */ - uint32_t max_queues; + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; /* max number of IO SQ's or CQ's */ uint32_t num_cqueues; uint32_t num_squeues; @@ -203,7 +213,10 @@ struct pci_nvme_softc { uint32_t ioslots; sem_t iosemlock; - /* status and guest memory mapped queues */ + /* + * Memory mapped Submission and Completion queues + * Each array includes both Admin and IO queues + */ struct nvme_completion_queue *compl_queues; struct nvme_submission_queue *submit_queues; @@ -357,7 +370,7 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc) { DPRINTF(("%s\r\n", __func__)); - sc->regs.cap_lo = (sc->max_qentries & NVME_CAP_LO_REG_MQES_MASK) | + sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | (1 << NVME_CAP_LO_REG_CQR_SHIFT) | (60 << NVME_CAP_LO_REG_TO_SHIFT); @@ -370,7 +383,7 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc) sc->num_cqueues = sc->num_squeues = sc->max_queues; if (sc->submit_queues != NULL) { - for (int i = 0; i <= sc->max_queues; i++) { + for (int i = 0; i < sc->num_squeues + 1; i++) { /* * The Admin Submission Queue is at index 0. * It must not be changed at reset otherwise the @@ -380,26 +393,31 @@ pci_nvme_reset_locked(struct pci_nvme_softc *sc) sc->submit_queues[i].qbase = NULL; sc->submit_queues[i].size = 0; sc->submit_queues[i].cqid = 0; - - sc->compl_queues[i].qbase = NULL; - sc->compl_queues[i].size = 0; } sc->submit_queues[i].tail = 0; sc->submit_queues[i].head = 0; sc->submit_queues[i].busy = 0; - - sc->compl_queues[i].tail = 0; - sc->compl_queues[i].head = 0; } } else - sc->submit_queues = calloc(sc->max_queues + 1, + sc->submit_queues = calloc(sc->num_squeues + 1, sizeof(struct nvme_submission_queue)); - if (sc->compl_queues == NULL) { - sc->compl_queues = calloc(sc->max_queues + 1, + if (sc->compl_queues != NULL) { + for (int i = 0; i < sc->num_cqueues + 1; i++) { + /* See Admin Submission Queue note above */ + if (i != 0) { + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else { + sc->compl_queues = calloc(sc->num_cqueues + 1, sizeof(struct nvme_completion_queue)); - for (int i = 0; i <= sc->num_cqueues; i++) + for (int i = 0; i < sc->num_cqueues + 1; i++) pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); } } @@ -443,7 +461,7 @@ nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struc uint16_t qid = command->cdw10 & 0xffff; DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); - if (qid == 0 || qid > sc->num_cqueues) { + if (qid == 0 || qid > sc->num_squeues) { WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", __func__, qid, sc->num_squeues)); pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, @@ -464,7 +482,7 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struc uint16_t qid = command->cdw10 & 0xffff; struct nvme_submission_queue *nsq; - if (qid > sc->num_squeues) { + if ((qid == 0) || (qid > sc->num_squeues)) { WPRINTF(("%s queue index %u > num_squeues %u\r\n", __func__, qid, sc->num_squeues)); pci_nvme_status_tc(&compl->status, @@ -474,7 +492,7 @@ nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struc } nsq = &sc->submit_queues[qid]; - nsq->size = ((command->cdw10 >> 16) & 0xffff) + 1; + nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, sizeof(struct nvme_command) * (size_t)nsq->size); @@ -529,7 +547,7 @@ nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struc uint16_t qid = command->cdw10 & 0xffff; struct nvme_completion_queue *ncq; - if (qid > sc->num_cqueues) { + if ((qid == 0) || (qid > sc->num_cqueues)) { WPRINTF(("%s queue index %u > num_cqueues %u\r\n", __func__, qid, sc->num_cqueues)); pci_nvme_status_tc(&compl->status, @@ -541,7 +559,7 @@ nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struc ncq = &sc->compl_queues[qid]; ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; - ncq->size = ((command->cdw10 >> 16) & 0xffff) + 1; + ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, @@ -649,6 +667,45 @@ nvme_opc_identify(struct pci_nvme_softc* sc, struct nv } static int +nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t nqr; /* Number of Queues Requested */ + + nqr = command->cdw11 & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_squeues = ONE_BASED(nqr); + if (sc->num_squeues > sc->max_queues) { + DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, + sc->max_queues)); + sc->num_squeues = sc->max_queues; + } + + nqr = (command->cdw11 >> 16) & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_cqueues = ONE_BASED(nqr); + if (sc->num_cqueues > sc->max_queues) { + DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, + sc->max_queues)); + sc->num_cqueues = sc->max_queues; + } + + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + return (0); +} + +static int nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, struct nvme_completion* compl) { @@ -678,19 +735,7 @@ nvme_opc_set_features(struct pci_nvme_softc* sc, struc DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); break; case NVME_FEAT_NUMBER_OF_QUEUES: - sc->num_squeues = command->cdw11 & 0xFFFF; - sc->num_cqueues = (command->cdw11 >> 16) & 0xFFFF; - DPRINTF((" number of queues (submit %u, completion %u)\r\n", - sc->num_squeues, sc->num_cqueues)); - - if (sc->num_squeues == 0 || sc->num_squeues > sc->max_queues) - sc->num_squeues = sc->max_queues; - if (sc->num_cqueues == 0 || sc->num_cqueues > sc->max_queues) - sc->num_cqueues = sc->max_queues; - - compl->cdw0 = (sc->num_squeues & 0xFFFF) | - ((sc->num_cqueues & 0xFFFF) << 16); - + nvme_set_feature_queues(sc, command, compl); break; case NVME_FEAT_INTERRUPT_COALESCING: DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); @@ -706,7 +751,7 @@ nvme_opc_set_features(struct pci_nvme_softc* sc, struc DPRINTF((" interrupt vector configuration 0x%x\r\n", command->cdw11)); - for (uint32_t i = 0; i <= sc->num_cqueues; i++) { + for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { if (sc->compl_queues[i].intr_vec == iv) { if (command->cdw11 & (1 << 16)) sc->compl_queues[i].intr_en |= @@ -788,17 +833,8 @@ nvme_opc_get_features(struct pci_nvme_softc* sc, struc DPRINTF((" volatile write cache\r\n")); break; case NVME_FEAT_NUMBER_OF_QUEUES: - compl->cdw0 = 0; - if (sc->num_squeues == 0) - compl->cdw0 |= sc->max_queues & 0xFFFF; - else - compl->cdw0 |= sc->num_squeues & 0xFFFF; + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); - if (sc->num_cqueues == 0) - compl->cdw0 |= (sc->max_queues & 0xFFFF) << 16; - else - compl->cdw0 |= (sc->num_cqueues & 0xFFFF) << 16; - DPRINTF((" number of queues (submit %u, completion %u)\r\n", compl->cdw0 & 0xFFFF, (compl->cdw0 >> 16) & 0xFFFF)); @@ -1812,7 +1848,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p /* allocate size of nvme registers + doorbell space for all queues */ pci_membar_sz = sizeof(struct nvme_registers) + - 2*sizeof(uint32_t)*(sc->max_queues); + 2*sizeof(uint32_t)*(sc->max_queues + 1); DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); @@ -1822,7 +1858,7 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p goto done; } - error = pci_emul_add_msixcap(pi, sc->max_queues, NVME_MSIX_BAR); + error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); if (error) { WPRINTF(("%s pci add msixcap failed\r\n", __func__)); goto done;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201901041503.x04F3VGe063766>