From owner-svn-src-head@freebsd.org Mon Jun 29 00:31:15 2020
From: Chuck Tuffli
Date: Mon, 29 Jun 2020 00:31:14 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r362745 - head/usr.sbin/bhyve
Message-Id: <202006290031.05T0VEni045218@repo.freebsd.org>

Author: chuck
Date: Mon Jun 29 00:31:14 2020
New Revision: 362745
URL: https://svnweb.freebsd.org/changeset/base/362745

Log:
  bhyve: refactor NVMe IO command handling

  This refactors the NVMe I/O command processing function to make adding
  new commands easier. The main change is to move the command-specific
  processing (e.g. Read/Write) into a separate function for each NVMe I/O
  command and to leave the common per-command processing in the existing
  pci_nvme_handle_io_cmd() function.

  While here, add checks for some common errors (invalid Namespace ID,
  invalid opcode, LBA out of range).
  Add myself to the Copyright holders

  Reviewed by:	imp
  Tested by:	Jason Tubnor
  MFC after:	2 weeks
  Differential Revision:	https://reviews.freebsd.org/D24879

Modified:
  head/usr.sbin/bhyve/pci_nvme.c

Modified: head/usr.sbin/bhyve/pci_nvme.c
==============================================================================
--- head/usr.sbin/bhyve/pci_nvme.c	Mon Jun 29 00:31:11 2020	(r362744)
+++ head/usr.sbin/bhyve/pci_nvme.c	Mon Jun 29 00:31:14 2020	(r362745)
@@ -3,6 +3,7 @@
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
+ * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
@@ -1386,6 +1387,122 @@ pci_nvme_io_partial(struct blockif_req *br, int err)
     pthread_cond_signal(&req->cv);
 }

+static bool
+nvme_opc_write_read(struct pci_nvme_softc *sc,
+    struct nvme_command *cmd,
+    struct pci_nvme_blockstore *nvstore,
+    struct pci_nvme_ioreq *req,
+    uint16_t *status)
+{
+    uint64_t lba, nblocks, bytes;
+    size_t offset;
+    bool is_write = cmd->opc == NVME_OPC_WRITE;
+    bool pending = false;
+
+    lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+    nblocks = (cmd->cdw12 & 0xFFFF) + 1;
+
+    offset = lba * nvstore->sectsz;
+    bytes = nblocks * nvstore->sectsz;
+
+    if ((offset + bytes) > nvstore->size) {
+        WPRINTF("%s command would exceed LBA range", __func__);
+        pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
+        goto out;
+    }
+
+    req->io_req.br_offset = lba;
+
+    /* PRP bits 1:0 must be zero */
+    cmd->prp1 &= ~0x3UL;
+    cmd->prp2 &= ~0x3UL;
+
+    if (nvstore->type == NVME_STOR_RAM) {
+        uint8_t *buf = nvstore->ctx;
+        enum nvme_copy_dir dir;
+
+        if (is_write)
+            dir = NVME_COPY_TO_PRP;
+        else
+            dir = NVME_COPY_FROM_PRP;
+
+        if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
+            buf + offset, bytes, dir))
+            pci_nvme_status_genc(status,
+                NVME_SC_DATA_TRANSFER_ERROR);
+        else
+            pci_nvme_status_genc(status, NVME_SC_SUCCESS);
+    } else {
+        uint64_t size;
+        int err;
+
+        size = MIN(PAGE_SIZE - (cmd->prp1 % PAGE_SIZE), bytes);
+        if (pci_nvme_append_iov_req(sc, req, cmd->prp1,
+            size, is_write, offset)) {
+            pci_nvme_status_genc(status,
+                NVME_SC_DATA_TRANSFER_ERROR);
+            goto out;
+        }
+
+        offset += size;
+        bytes -= size;
+
+        if (bytes == 0) {
+            ;
+        } else if (bytes <= PAGE_SIZE) {
+            size = bytes;
+            if (pci_nvme_append_iov_req(sc, req, cmd->prp2,
+                size, is_write, offset)) {
+                pci_nvme_status_genc(status,
+                    NVME_SC_DATA_TRANSFER_ERROR);
+                goto out;
+            }
+        } else {
+            void *vmctx = sc->nsc_pi->pi_vmctx;
+            uint64_t *prp_list = &cmd->prp2;
+            uint64_t *last = prp_list;
+
+            /* PRP2 is pointer to a physical region page list */
+            while (bytes) {
+                /* Last entry in list points to the next list */
+                if (prp_list == last) {
+                    uint64_t prp = *prp_list;
+
+                    prp_list = paddr_guest2host(vmctx, prp,
+                        PAGE_SIZE - (prp % PAGE_SIZE));
+                    last = prp_list + (NVME_PRP2_ITEMS - 1);
+                }
+
+                size = MIN(bytes, PAGE_SIZE);
+
+                if (pci_nvme_append_iov_req(sc, req, *prp_list,
+                    size, is_write, offset)) {
+                    pci_nvme_status_genc(status,
+                        NVME_SC_DATA_TRANSFER_ERROR);
+                    goto out;
+                }
+
+                offset += size;
+                bytes -= size;
+
+                prp_list++;
+            }
+        }
+        req->io_req.br_callback = pci_nvme_io_done;
+        if (is_write)
+            err = blockif_write(nvstore->ctx, &req->io_req);
+        else
+            err = blockif_read(nvstore->ctx, &req->io_req);
+
+        if (err)
+            pci_nvme_status_genc(status, NVME_SC_DATA_TRANSFER_ERROR);
+        else
+            pending = true;
+    }
+out:
+    return (pending);
+}
+
 static void
 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
 {
@@ -1421,14 +1538,15 @@ pci_nvme_dealloc_sm(struct blockif_req *br, int err)
     }
 }

-static int
+static bool
 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
     struct nvme_command *cmd,
     struct pci_nvme_blockstore *nvstore,
     struct pci_nvme_ioreq *req,
     uint16_t *status)
 {
-    int err = -1;
+    int err;
+    bool pending = false;

     if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
         pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
@@ -1463,9 +1581,6 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

-        req->opc = cmd->opc;
-        req->cid = cmd->cid;
-        req->nsid = cmd->nsid;
         /*
          * If the request is for more than a single range, store
          * the ranges in the br_iov. Optimize for the common case
@@ -1501,11 +1616,13 @@ nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
         err = blockif_delete(nvstore->ctx, &req->io_req);
         if (err)
             pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
+        else
+            pending = true;

         free(range);
     }
 out:
-    return (err);
+    return (pending);
 }

 static void
@@ -1514,7 +1631,6 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint
     struct nvme_submission_queue *sq;
     uint16_t status;
     uint16_t sqhead;
-    int err;

     /* handle all submissions up to sq->tail index */
     sq = &sc->submit_queues[idx];
@@ -1531,189 +1647,69 @@ pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint

     while (sqhead != atomic_load_acq_short(&sq->tail)) {
         struct nvme_command *cmd;
-        struct pci_nvme_ioreq *req = NULL;
-        uint64_t lba;
-        uint64_t nblocks, bytes, size, cpsz;
+        struct pci_nvme_ioreq *req;
+        uint32_t nsid;
+        bool pending;

-        /* TODO: support scatter gather list handling */
+        pending = false;
+        req = NULL;
+        status = 0;

         cmd = &sq->qbase[sqhead];
         sqhead = (sqhead + 1) % sq->size;

-        lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
+        nsid = le32toh(cmd->nsid);
+        if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
+            pci_nvme_status_genc(&status,
+                NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
+            status |=
+                NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
+            goto complete;
+        }

-        if (cmd->opc == NVME_OPC_FLUSH) {
-            pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
-            pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
-                status, 1);
-
-            continue;
-        } else if (cmd->opc == 0x08) {
-            /* TODO: write zeroes */
-            WPRINTF("%s write zeroes lba 0x%lx blocks %u",
-                __func__, lba, cmd->cdw12 & 0xFFFF);
-            pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
-            pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
-                status, 1);
-
-            continue;
+        req = pci_nvme_get_ioreq(sc);
+        if (req == NULL) {
+            pci_nvme_status_genc(&status,
+                NVME_SC_INTERNAL_DEVICE_ERROR);
+            WPRINTF("%s: unable to allocate IO req", __func__);
+            goto complete;
         }
+        req->nvme_sq = sq;
+        req->sqid = idx;
+        req->opc = cmd->opc;
+        req->cid = cmd->cid;
+        req->nsid = cmd->nsid;

-        if (sc->nvstore.type == NVME_STOR_BLOCKIF) {
-            req = pci_nvme_get_ioreq(sc);
-            req->nvme_sq = sq;
-            req->sqid = idx;
-        }
-
-        if (cmd->opc == NVME_OPC_DATASET_MANAGEMENT) {
-            if (nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, req,
-                &status)) {
-                pci_nvme_set_completion(sc, sq, idx, cmd->cid,
-                    0, status, 1);
-                if (req)
-                    pci_nvme_release_ioreq(sc, req);
-            }
-            continue;
-        }
-
-        nblocks = (cmd->cdw12 & 0xFFFF) + 1;
-
-        bytes = nblocks * sc->nvstore.sectsz;
-
-        /*
-         * If data starts mid-page and flows into the next page, then
-         * increase page count
-         */
-
-        DPRINTF("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu "
-            "(%lu-bytes)",
-            sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size,
-            cmd->opc == NVME_OPC_WRITE ?
- "WRITE" : "READ", - lba, nblocks, bytes); - - cmd->prp1 &= ~(0x03UL); - cmd->prp2 &= ~(0x03UL); - - DPRINTF(" prp1 0x%lx prp2 0x%lx", cmd->prp1, cmd->prp2); - - size = bytes; - lba *= sc->nvstore.sectsz; - - cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); - - if (cpsz > bytes) - cpsz = bytes; - - if (req != NULL) { - req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | - cmd->cdw10; - req->opc = cmd->opc; - req->cid = cmd->cid; - req->nsid = cmd->nsid; - } - - err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, - cmd->opc == NVME_OPC_WRITE, lba); - lba += cpsz; - size -= cpsz; - - if (size == 0) - goto iodone; - - if (size <= PAGE_SIZE) { - /* prp2 is second (and final) page in transfer */ - - err = pci_nvme_append_iov_req(sc, req, cmd->prp2, - size, - cmd->opc == NVME_OPC_WRITE, - lba); - } else { - uint64_t *prp_list; - int i; - - /* prp2 is pointer to a physical region page list */ - prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, - cmd->prp2, PAGE_SIZE); - - i = 0; - while (size != 0) { - cpsz = MIN(size, PAGE_SIZE); - - /* - * Move to linked physical region page list - * in last item. - */ - if (i == (NVME_PRP2_ITEMS-1) && - size > PAGE_SIZE) { - assert((prp_list[i] & (PAGE_SIZE-1)) == 0); - prp_list = paddr_guest2host( - sc->nsc_pi->pi_vmctx, - prp_list[i], PAGE_SIZE); - i = 0; - } - if (prp_list[i] == 0) { - WPRINTF("PRP2[%d] = 0 !!!", i); - err = 1; - break; - } - - err = pci_nvme_append_iov_req(sc, req, - prp_list[i], cpsz, - cmd->opc == NVME_OPC_WRITE, lba); - if (err) - break; - - lba += cpsz; - size -= cpsz; - i++; - } - } - -iodone: - if (sc->nvstore.type == NVME_STOR_RAM) { - uint16_t code, status; - - code = err ? NVME_SC_LBA_OUT_OF_RANGE : - NVME_SC_SUCCESS; - pci_nvme_status_genc(&status, code); - - pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); - - continue; - } - - - if (err) - goto do_error; - - req->io_req.br_callback = pci_nvme_io_done; - - err = 0; switch (cmd->opc) { + case NVME_OPC_FLUSH: + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + break; + case NVME_OPC_WRITE: case NVME_OPC_READ: - err = blockif_read(sc->nvstore.ctx, &req->io_req); + pending = nvme_opc_write_read(sc, cmd, &sc->nvstore, + req, &status); break; - case NVME_OPC_WRITE: - err = blockif_write(sc->nvstore.ctx, &req->io_req); + case NVME_OPC_WRITE_ZEROES: + /* TODO: write zeroes + WPRINTF("%s write zeroes lba 0x%lx blocks %u", + __func__, lba, cmd->cdw12 & 0xFFFF); */ + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); break; - default: - WPRINTF("%s unhandled io command 0x%x", - __func__, cmd->opc); - err = 1; + case NVME_OPC_DATASET_MANAGEMENT: + pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore, + req, &status); + break; + default: + WPRINTF("%s unhandled io command 0x%x", + __func__, cmd->opc); + pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE); } - -do_error: - if (err) { - uint16_t status; - - pci_nvme_status_genc(&status, - NVME_SC_DATA_TRANSFER_ERROR); - +complete: + if (!pending) { pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, - status, 1); - pci_nvme_release_ioreq(sc, req); + status, 1); + if (req != NULL) + pci_nvme_release_ioreq(sc, req); } }