From owner-svn-src-all@freebsd.org Wed Jul 8 19:56:37 2020
Delivered-To: svn-src-all@mailman.nyi.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 8D07E36D798; Wed, 8 Jul 2020 19:56:37 +0000 (UTC) (envelope-from gordon@FreeBSD.org)
Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4B296r75fHz3ZNx; Wed, 8 Jul 2020 19:56:36 +0000 (UTC) (envelope-from gordon@FreeBSD.org)
Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id DCDB1258A9; Wed, 8 Jul 2020 19:56:35 +0000 (UTC) (envelope-from gordon@FreeBSD.org)
Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id 068JuZq7018075; Wed, 8 Jul 2020 19:56:35 GMT (envelope-from gordon@FreeBSD.org)
Received: (from gordon@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id 068JuYAQ018071; Wed, 8 Jul 2020 19:56:34 GMT (envelope-from gordon@FreeBSD.org)
Message-Id: <202007081956.068JuYAQ018071@repo.freebsd.org>
X-Authentication-Warning: repo.freebsd.org: gordon set sender to gordon@FreeBSD.org using -f
From: Gordon Tetlow
Date: Wed, 8 Jul 2020 19:56:34 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-releng@freebsd.org
Subject: svn commit: r363022 - in releng/12.1: sys/amd64/vmm/intel usr.sbin/bhyve
X-SVN-Group: releng
X-SVN-Commit-Author: gordon
X-SVN-Commit-Paths: in releng/12.1: sys/amd64/vmm/intel usr.sbin/bhyve
X-SVN-Commit-Revision: 363022
X-SVN-Commit-Repository: base
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-src-all@freebsd.org
X-Mailman-Version: 2.1.33
Precedence: list
List-Id: "SVN commit messages for the entire src tree \(except for "user" and "projects"\)"
X-List-Received-Date: Wed, 08 Jul 2020 19:56:38 -0000

Author: gordon
Date: Wed Jul 8 19:56:34 2020
New Revision: 363022
URL: https://svnweb.freebsd.org/changeset/base/363022

Log:
  Fix host crash in bhyve with PCI device passthrough.

  Approved by:	so
  Security:	FreeBSD-EN-20:13.bhyve

Modified:
  releng/12.1/sys/amd64/vmm/intel/vtd.c
  releng/12.1/usr.sbin/bhyve/pci_emul.c
  releng/12.1/usr.sbin/bhyve/pci_emul.h
  releng/12.1/usr.sbin/bhyve/pci_passthru.c

Modified: releng/12.1/sys/amd64/vmm/intel/vtd.c
==============================================================================
--- releng/12.1/sys/amd64/vmm/intel/vtd.c	Wed Jul 8 19:14:44 2020	(r363021)
+++ releng/12.1/sys/amd64/vmm/intel/vtd.c	Wed Jul 8 19:56:34 2020	(r363022)
@@ -51,6 +51,8 @@ __FBSDID("$FreeBSD$");
  * Architecture Spec, September 2008.
  */

+#define VTD_DRHD_INCLUDE_PCI_ALL(Flags)  (((Flags) >> 0) & 0x1)
+
 /* Section 10.4 "Register Descriptions" */
 struct vtdmap {
 	volatile uint32_t	version;
@@ -116,10 +118,11 @@ struct domain {
 static SLIST_HEAD(, domain) domhead;

 #define	DRHD_MAX_UNITS	8
-static int			drhd_num;
-static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
-static int			max_domains;
-typedef int			(*drhd_ident_func_t)(void);
+static ACPI_DMAR_HARDWARE_UNIT	*drhds[DRHD_MAX_UNITS];
+static int			drhd_num;
+static struct vtdmap		*vtdmaps[DRHD_MAX_UNITS];
+static int			max_domains;
+typedef int			(*drhd_ident_func_t)(void);

 static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
 static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096);
@@ -175,6 +178,69 @@ domain_id(void)
 	return (id);
 }

+static struct vtdmap *
+vtd_device_scope(uint16_t rid)
+{
+	int i, remaining, pathremaining;
+	char *end, *pathend;
+	struct vtdmap *vtdmap;
+	ACPI_DMAR_HARDWARE_UNIT *drhd;
+	ACPI_DMAR_DEVICE_SCOPE *device_scope;
+	ACPI_DMAR_PCI_PATH *path;
+
+	for (i = 0; i < drhd_num; i++) {
+		drhd = drhds[i];
+
+		if (VTD_DRHD_INCLUDE_PCI_ALL(drhd->Flags)) {
+			/*
+			 * From Intel VT-d arch spec, version 3.0:
+			 * If a DRHD structure with INCLUDE_PCI_ALL flag Set is reported
+			 * for a Segment, it must be enumerated by BIOS after all other
+			 * DRHD structures for the same Segment.
+			 */
+			vtdmap = vtdmaps[i];
+			return(vtdmap);
+		}
+
+		end = (char *)drhd + drhd->Header.Length;
+		remaining = drhd->Header.Length - sizeof(ACPI_DMAR_HARDWARE_UNIT);
+		while (remaining > sizeof(ACPI_DMAR_DEVICE_SCOPE)) {
+			device_scope = (ACPI_DMAR_DEVICE_SCOPE *)(end - remaining);
+			remaining -= device_scope->Length;
+
+			switch (device_scope->EntryType){
+				/* 0x01 and 0x02 are PCI device entries */
+				case 0x01:
+				case 0x02:
+					break;
+				default:
+					continue;
+			}
+
+			if (PCI_RID2BUS(rid) != device_scope->Bus)
+				continue;
+
+			pathend = (char *)device_scope + device_scope->Length;
+			pathremaining = device_scope->Length - sizeof(ACPI_DMAR_DEVICE_SCOPE);
+			while (pathremaining >= sizeof(ACPI_DMAR_PCI_PATH)) {
+				path = (ACPI_DMAR_PCI_PATH *)(pathend - pathremaining);
+				pathremaining -= sizeof(ACPI_DMAR_PCI_PATH);
+
+				if (PCI_RID2SLOT(rid) != path->Device)
+					continue;
+				if (PCI_RID2FUNC(rid) != path->Function)
+					continue;
+
+				vtdmap = vtdmaps[i];
+				return (vtdmap);
+			}
+		}
+	}
+
+	/* No matching scope */
+	return (NULL);
+}
+
 static void
 vtd_wbflush(struct vtdmap *vtdmap)
 {
@@ -240,7 +306,7 @@ vtd_translation_disable(struct vtdmap *vtdmap)
 static int
 vtd_init(void)
 {
-	int i, units, remaining;
+	int i, units, remaining, tmp;
 	struct vtdmap *vtdmap;
 	vm_paddr_t ctx_paddr;
 	char *end, envname[32];
@@ -291,8 +357,9 @@ vtd_init(void)
 			break;

 		drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
-		vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
-		if (units >= DRHD_MAX_UNITS)
+		drhds[units] = drhd;
+		vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
+		if (++units >= DRHD_MAX_UNITS)
 			break;
 		remaining -= hdr->Length;
 	}
@@ -302,13 +369,19 @@

 skip_dmar:
 	drhd_num = units;
-	vtdmap = vtdmaps[0];

-	if (VTD_CAP_CM(vtdmap->cap) != 0)
-		panic("vtd_init: invalid caching mode");
+	max_domains = 64 * 1024; /* maximum valid value */
+	for (i = 0; i < drhd_num; i++){
+		vtdmap = vtdmaps[i];

-	max_domains = vtd_max_domains(vtdmap);
+		if (VTD_CAP_CM(vtdmap->cap) != 0)
+			panic("vtd_init: invalid caching mode");

+		/* take most compatible (minimum) value */
+		if ((tmp = vtd_max_domains(vtdmap)) < max_domains)
+			max_domains = tmp;
+	}
+
 	/*
 	 * Set up the root-table to point to the context-entry tables
 	 */
@@ -373,7 +446,6 @@ vtd_add_device(void *arg, uint16_t rid)
 	struct vtdmap *vtdmap;
 	uint8_t bus;

-	vtdmap = vtdmaps[0];
 	bus = PCI_RID2BUS(rid);
 	ctxp = ctx_tables[bus];
 	pt_paddr = vtophys(dom->ptp);
@@ -385,6 +457,10 @@ vtd_add_device(void *arg, uint16_t rid)
 		    (uint16_t)(ctxp[idx + 1] >> 8));
 	}

+	if ((vtdmap = vtd_device_scope(rid)) == NULL)
+		panic("vtd_add_device: device %x is not in scope for "
+		    "any DMA remapping unit", rid);
+
 	/*
 	 * Order is important. The 'present' bit is set only after all fields
 	 * of the context pointer are initialized.
@@ -568,8 +644,6 @@ vtd_create_domain(vm_paddr_t maxaddr)
 	if (drhd_num <= 0)
 		panic("vtd_create_domain: no dma remapping hardware available");

-	vtdmap = vtdmaps[0];
-
 	/*
 	 * Calculate AGAW.
 	 * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec.
@@ -594,7 +668,14 @@ vtd_create_domain(vm_paddr_t maxaddr)
 	pt_levels = 2;
 	sagaw = 30;
 	addrwidth = 0;
-	tmp = VTD_CAP_SAGAW(vtdmap->cap);
+
+	tmp = ~0;
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		/* take most compatible value */
+		tmp &= VTD_CAP_SAGAW(vtdmap->cap);
+	}
+
 	for (i = 0; i < 5; i++) {
 		if ((tmp & (1 << i)) != 0 && sagaw >= agaw)
 			break;
@@ -606,8 +687,8 @@
 	}

 	if (i >= 5) {
-		panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d",
-		    VTD_CAP_SAGAW(vtdmap->cap), agaw);
+		panic("vtd_create_domain: SAGAW 0x%x does not support AGAW %d",
+		    tmp, agaw);
 	}

 	dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK);
@@ -634,7 +715,12 @@
 	 * There is not any code to deal with the demotion at the moment
 	 * so we disable superpage mappings altogether.
 	 */
-	dom->spsmask = VTD_CAP_SPS(vtdmap->cap);
+	dom->spsmask = ~0;
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		/* take most compatible value */
+		dom->spsmask &= VTD_CAP_SPS(vtdmap->cap);
+	}
 #endif

 	SLIST_INSERT_HEAD(&domhead, dom, next);

Modified: releng/12.1/usr.sbin/bhyve/pci_emul.c
==============================================================================
--- releng/12.1/usr.sbin/bhyve/pci_emul.c	Wed Jul 8 19:14:44 2020	(r363021)
+++ releng/12.1/usr.sbin/bhyve/pci_emul.c	Wed Jul 8 19:56:34 2020	(r363022)
@@ -868,7 +868,7 @@ pci_emul_add_msixcap(struct pci_devinst *pi, int msgnu
 	    sizeof(msixcap)));
 }

-void
+static void
 msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
     int bytes, uint32_t val)
 {
@@ -892,7 +892,7 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, i
 	CFGWRITE(pi, offset, val, bytes);
 }

-void
+static void
 msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
     int bytes, uint32_t val)
 {
@@ -971,30 +971,34 @@ pci_emul_add_pciecap(struct pci_devinst *pi, int type)

 /*
  * This function assumes that 'coff' is in the capabilities region of the
- * config space.
+ * config space. A capoff parameter of zero will force a search for the
+ * offset and type.
  */
-static void
-pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val,
+    uint8_t capoff, int capid)
 {
-	int capid;
-	uint8_t capoff, nextoff;
+	uint8_t nextoff;

 	/* Do not allow un-aligned writes */
 	if ((offset & (bytes - 1)) != 0)
 		return;

-	/* Find the capability that we want to update */
-	capoff = CAP_START_OFFSET;
-	while (1) {
-		nextoff = pci_get_cfgdata8(pi, capoff + 1);
-		if (nextoff == 0)
-			break;
-		if (offset >= capoff && offset < nextoff)
-			break;
+	if (capoff == 0) {
+		/* Find the capability that we want to update */
+		capoff = CAP_START_OFFSET;
+		while (1) {
+			nextoff = pci_get_cfgdata8(pi, capoff + 1);
+			if (nextoff == 0)
+				break;
+			if (offset >= capoff && offset < nextoff)
+				break;

-		capoff = nextoff;
+			capoff = nextoff;
+		}
+		assert(offset >= capoff);
+		capid = pci_get_cfgdata8(pi, capoff);
 	}
-	assert(offset >= capoff);

 	/*
 	 * Capability ID and Next Capability Pointer are readonly.
@@ -1011,7 +1015,6 @@ pci_emul_capwrite(struct pci_devinst *pi, int offset,
 		return;
 	}

-	capid = pci_get_cfgdata8(pi, capoff);
 	switch (capid) {
 	case PCIY_MSI:
 		msicap_cfgwrite(pi, capoff, offset, bytes, val);
@@ -1878,7 +1881,7 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus
 			pci_set_cfgdata32(pi, coff, bar);

 		} else if (pci_emul_iscap(pi, coff)) {
-			pci_emul_capwrite(pi, coff, bytes, *eax);
+			pci_emul_capwrite(pi, coff, bytes, *eax, 0, 0);
 		} else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) {
 			pci_emul_cmdsts_write(pi, coff, *eax, bytes);
 		} else {

Modified: releng/12.1/usr.sbin/bhyve/pci_emul.h
==============================================================================
--- releng/12.1/usr.sbin/bhyve/pci_emul.h	Wed Jul 8 19:14:44 2020	(r363021)
+++ releng/12.1/usr.sbin/bhyve/pci_emul.h	Wed Jul 8 19:56:34 2020	(r363022)
@@ -212,10 +212,6 @@ typedef void (*pci_lintr_cb)(int b, int s, int pin, in
 		    int ioapic_irq, void *arg);

 int	init_pci(struct vmctx *ctx);
-void	msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
-	    int bytes, uint32_t val);
-void	msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
-	    int bytes, uint32_t val);
 void	pci_callback(void);
 int	pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
 	    enum pcibar_type type, uint64_t size);
@@ -223,6 +219,8 @@ int pci_emul_alloc_pbar(struct pci_devinst *pdi, int i
 	    uint64_t hostbase, enum pcibar_type type, uint64_t size);
 int	pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
 int	pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
+void	pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes,
+	    uint32_t val, uint8_t capoff, int capid);
 void	pci_generate_msi(struct pci_devinst *pi, int msgnum);
 void	pci_generate_msix(struct pci_devinst *pi, int msgnum);
 void	pci_lintr_assert(struct pci_devinst *pi);

Modified: releng/12.1/usr.sbin/bhyve/pci_passthru.c
==============================================================================
--- releng/12.1/usr.sbin/bhyve/pci_passthru.c	Wed Jul 8 19:14:44 2020	(r363021)
+++ releng/12.1/usr.sbin/bhyve/pci_passthru.c	Wed Jul 8 19:56:34 2020	(r363022)
@@ -828,8 +828,8 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct
 	 * MSI capability is emulated
 	 */
 	if (msicap_access(sc, coff)) {
-		msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val);
-
+		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msi.capoff,
+		    PCIY_MSI);
 		error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus,
 			sc->psc_sel.pc_dev, sc->psc_sel.pc_func,
 			pi->pi_msi.addr, pi->pi_msi.msg_data,
@@ -840,7 +840,8 @@ passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct
 	}

 	if (msixcap_access(sc, coff)) {
-		msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val);
+		pci_emul_capwrite(pi, coff, bytes, val, sc->psc_msix.capoff,
+		    PCIY_MSIX);
 		if (pi->pi_msix.enabled) {
 			msix_table_entries = pi->pi_msix.table_count;
 			for (i = 0; i < msix_table_entries; i++) {