From owner-svn-src-all@freebsd.org Sun Jul 19 23:40:35 2020 Return-Path: Delivered-To: svn-src-all@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 291BE36DB0D; Sun, 19 Jul 2020 23:40:35 +0000 (UTC) (envelope-from chuck@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4B91ZC0F8Qz4CSX; Sun, 19 Jul 2020 23:40:35 +0000 (UTC) (envelope-from chuck@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id E0FD823BE5; Sun, 19 Jul 2020 23:40:34 +0000 (UTC) (envelope-from chuck@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id 06JNeYAZ065315; Sun, 19 Jul 2020 23:40:34 GMT (envelope-from chuck@FreeBSD.org) Received: (from chuck@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id 06JNeY8u065314; Sun, 19 Jul 2020 23:40:34 GMT (envelope-from chuck@FreeBSD.org) Message-Id: <202007192340.06JNeY8u065314@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: chuck set sender to chuck@FreeBSD.org using -f From: Chuck Tuffli Date: Sun, 19 Jul 2020 23:40:34 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org Subject: svn commit: r363347 - stable/12/usr.sbin/bhyve X-SVN-Group: stable-12 X-SVN-Commit-Author: chuck X-SVN-Commit-Paths: stable/12/usr.sbin/bhyve X-SVN-Commit-Revision: 363347 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.33 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 19 Jul 2020 23:40:35 -0000 Author: chuck Date: Sun Jul 19 23:40:34 2020 New Revision: 363347 URL: https://svnweb.freebsd.org/changeset/base/363347 Log: MFC r362757 bhyve: base pci_nvme_ioreq size on advertised MDTS Modified: stable/12/usr.sbin/bhyve/pci_nvme.c Directory Properties: stable/12/ (props changed) Modified: stable/12/usr.sbin/bhyve/pci_nvme.c ============================================================================== --- stable/12/usr.sbin/bhyve/pci_nvme.c Sun Jul 19 23:37:19 2020 (r363346) +++ stable/12/usr.sbin/bhyve/pci_nvme.c Sun Jul 19 23:40:34 2020 (r363347) @@ -99,9 +99,16 @@ static int nvme_debug = 0; #define NVME_QUEUES 16 #define NVME_MAX_QENTRIES 2048 +/* Memory Page size Minimum reported in CAP register */ +#define NVME_MPSMIN 0 +/* MPSMIN converted to bytes */ +#define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN)) #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) -#define NVME_MAX_BLOCKIOVS 512 +#define NVME_MDTS 9 +/* Note the + 1 allows for the initial descriptor to not be page aligned */ +#define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1) +#define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES) /* This is a synthetic status code to indicate there is no status */ #define NVME_NO_STATUS 0xffff @@ -186,6 +193,18 @@ struct pci_nvme_blockstore { uint32_t deallocate:1; }; +/* + * Calculate the number of additional page descriptors for guest IO requests + * based on the advertised Max Data Transfer (MDTS) and given the number of + * default iovec's in a struct blockif_req. + * + * Note the + 1 allows for the initial descriptor to not be page aligned. + */ +#define MDTS_PAD_SIZE \ + NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \ + NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \ + 0 + struct pci_nvme_ioreq { struct pci_nvme_softc *sc; STAILQ_ENTRY(pci_nvme_ioreq) link; @@ -200,17 +219,9 @@ struct pci_nvme_ioreq { uint64_t prev_gpaddr; size_t prev_size; - /* - * lock if all iovs consumed (big IO); - * complete transaction before continuing - */ - pthread_mutex_t mtx; - pthread_cond_t cv; - struct blockif_req io_req; - /* pad to fit up to 512 page descriptors from guest IO request */ - struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; + struct iovec iovpadding[MDTS_PAD_SIZE]; }; enum nvme_dsm_type { @@ -279,7 +290,6 @@ struct pci_nvme_softc { }; -static void pci_nvme_io_partial(struct blockif_req *br, int err); static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *); static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *); static void pci_nvme_io_done(struct blockif_req *, int); @@ -433,7 +443,7 @@ pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) cd->mic = 0; - cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */ cd->ver = 0x00010300; @@ -1460,81 +1470,46 @@ pci_nvme_append_iov_req(struct pci_nvme_softc *sc, str { int iovidx; - if (req != NULL) { - /* concatenate contig block-iovs to minimize number of iovs */ - if ((req->prev_gpaddr + req->prev_size) == gpaddr) { - iovidx = req->io_req.br_iovcnt - 1; + if (req == NULL) + return (-1); - req->io_req.br_iov[iovidx].iov_base = - paddr_guest2host(req->sc->nsc_pi->pi_vmctx, - req->prev_gpaddr, size); + if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) { + return (-1); + } - req->prev_size += size; - req->io_req.br_resid += size; + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; - req->io_req.br_iov[iovidx].iov_len = req->prev_size; - } else { - pthread_mutex_lock(&req->mtx); + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); - iovidx = req->io_req.br_iovcnt; - if (iovidx == NVME_MAX_BLOCKIOVS) { - int err = 0; + req->prev_size += size; + req->io_req.br_resid += size; - DPRINTF("large I/O, doing partial req"); + req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + iovidx = req->io_req.br_iovcnt; + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } - iovidx = 0; - req->io_req.br_iovcnt = 0; + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); - req->io_req.br_callback = pci_nvme_io_partial; + req->io_req.br_iov[iovidx].iov_len = size; - if (!do_write) - err = blockif_read(sc->nvstore.ctx, - &req->io_req); - else - err = blockif_write(sc->nvstore.ctx, - &req->io_req); + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; - /* wait until req completes before cont */ - if (err == 0) - pthread_cond_wait(&req->cv, &req->mtx); - } - if (iovidx == 0) { - req->io_req.br_offset = lba; - req->io_req.br_resid = 0; - req->io_req.br_param = req; - } - - req->io_req.br_iov[iovidx].iov_base = - paddr_guest2host(req->sc->nsc_pi->pi_vmctx, - gpaddr, size); - - req->io_req.br_iov[iovidx].iov_len = size; - - req->prev_gpaddr = gpaddr; - req->prev_size = size; - req->io_req.br_resid += size; - - req->io_req.br_iovcnt++; - - pthread_mutex_unlock(&req->mtx); - } - } else { - /* RAM buffer: read/write directly */ - void *p = sc->nvstore.ctx; - void *gptr; - - if ((lba + size) > sc->nvstore.size) { - WPRINTF("%s write would overflow RAM", __func__); - return (-1); - } - - p = (void *)((uintptr_t)p + (uintptr_t)lba); - gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); - if (do_write) - memcpy(p, gptr, size); - else - memcpy(gptr, p, size); + req->io_req.br_iovcnt++; } + return (0); } @@ -1632,16 +1607,6 @@ pci_nvme_io_done(struct blockif_req *br, int err) pci_nvme_release_ioreq(req->sc, req); } -static void -pci_nvme_io_partial(struct blockif_req *br, int err) -{ - struct pci_nvme_ioreq *req = br->br_param; - - DPRINTF("%s error %d %s", __func__, err, strerror(err)); - - pthread_cond_signal(&req->cv); -} - /* * Implements the Flush command. The specification states: * If a volatile write cache is not present, Flush commands complete @@ -1799,9 +1764,14 @@ nvme_opc_write_read(struct pci_nvme_softc *sc, lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; nblocks = (cmd->cdw12 & 0xFFFF) + 1; - offset = lba * nvstore->sectsz; bytes = nblocks * nvstore->sectsz; + if (bytes > NVME_MAX_DATA_SIZE) { + WPRINTF("%s command would exceed MDTS", __func__); + pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD); + goto out; + } + offset = lba * nvstore->sectsz; if ((offset + bytes) > nvstore->size) { WPRINTF("%s command would exceed LBA range", __func__); pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE); @@ -2478,8 +2448,6 @@ pci_nvme_init(struct vmctx *ctx, struct pci_devinst *p sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); for (int i = 0; i < sc->ioslots; i++) { STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link); - pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); - pthread_cond_init(&sc->ioreqs[i].cv, NULL); } pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);