From owner-svn-src-head@freebsd.org Mon Sep 2 21:57:58 2019 Return-Path: Delivered-To: svn-src-head@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 58684E8D32; Mon, 2 Sep 2019 21:57:58 +0000 (UTC) (envelope-from markj@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) server-signature RSA-PSS (4096 bits) client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 46MkTy1WbQz4M58; Mon, 2 Sep 2019 21:57:58 +0000 (UTC) (envelope-from markj@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 134F720985; Mon, 2 Sep 2019 21:57:58 +0000 (UTC) (envelope-from markj@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id x82Lvv1m070734; Mon, 2 Sep 2019 21:57:57 GMT (envelope-from markj@FreeBSD.org) Received: (from markj@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id x82LvvUx070733; Mon, 2 Sep 2019 21:57:57 GMT (envelope-from markj@FreeBSD.org) Message-Id: <201909022157.x82LvvUx070733@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: markj set sender to markj@FreeBSD.org using -f From: Mark Johnston Date: Mon, 2 Sep 2019 21:57:57 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r351728 - head/sys/amd64/amd64 X-SVN-Group: head X-SVN-Commit-Author: markj X-SVN-Commit-Paths: head/sys/amd64/amd64 X-SVN-Commit-Revision: 351728 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 02 Sep 2019 21:57:58 -0000 Author: markj Date: Mon Sep 2 21:57:57 2019 New Revision: 351728 URL: https://svnweb.freebsd.org/changeset/base/351728 Log: Add a sysctl to dump kernel mappings and their properties on amd64. The sysctl is called vm.pmap.kernel_maps. It dumps address ranges and their corresponding protection and mapping mode, as well as counts of 2MB and 1GB pages in the range. Reviewed by: kib MFC after: 2 weeks Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21380 Modified: head/sys/amd64/amd64/pmap.c Modified: head/sys/amd64/amd64/pmap.c ============================================================================== --- head/sys/amd64/amd64/pmap.c Mon Sep 2 21:54:08 2019 (r351727) +++ head/sys/amd64/amd64/pmap.c Mon Sep 2 21:57:57 2019 (r351728) @@ -124,6 +124,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -2112,6 +2113,41 @@ pmap_cache_mask(pmap_t pmap, boolean_t is_pde) return (mask); } +static int +pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde) +{ + int pat_flag, pat_idx; + + pat_idx = 0; + switch (pmap->pm_type) { + case PT_X86: + case PT_RVI: + /* The PAT bit is different for PTE's and PDE's. */ + pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT; + + if ((pte & pat_flag) != 0) + pat_idx |= 0x4; + if ((pte & PG_NC_PCD) != 0) + pat_idx |= 0x2; + if ((pte & PG_NC_PWT) != 0) + pat_idx |= 0x1; + break; + case PT_EPT: + if ((pte & EPT_PG_IGNORE_PAT) != 0) + panic("EPT PTE %#lx has no PAT memory type", pte); + pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3; + break; + } + + /* See pmap_init_pat(). */ + if (pat_idx == 4) + pat_idx = 0; + if (pat_idx == 7) + pat_idx = 3; + + return (pat_idx); +} + bool pmap_ps_enabled(pmap_t pmap) { @@ -9980,6 +10016,268 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offse } return (error); } + +/* + * Track a range of the kernel's virtual address space that is contiguous + * in various mapping attributes. + */ +struct pmap_kernel_map_range { + vm_offset_t sva; + pt_entry_t attrs; + int ptes; + int pdes; + int pdpes; +}; + +static void +sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range, + vm_offset_t eva) +{ + const char *mode; + int i, pat_idx; + + if (eva <= range->sva) + return; + + pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true); + for (i = 0; i < PAT_INDEX_SIZE; i++) + if (pat_index[i] == pat_idx) + break; + + switch (i) { + case PAT_WRITE_BACK: + mode = "WB"; + break; + case PAT_WRITE_THROUGH: + mode = "WT"; + break; + case PAT_UNCACHEABLE: + mode = "UC"; + break; + case PAT_WRITE_PROTECTED: + mode = "WP"; + break; + case PAT_WRITE_COMBINING: + mode = "WC"; + break; + default: + printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n", + __func__, i, range->sva, eva); + mode = "??"; + break; + } + + sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n", + range->sva, eva, + (range->attrs & X86_PG_RW) != 0 ? 'w' : '-', + (range->attrs & pg_nx) != 0 ? '-' : 'x', + (range->attrs & X86_PG_U) != 0 ? 'u' : 's', + (range->attrs & X86_PG_G) != 0 ? 'g' : '-', + mode, range->pdpes, range->pdes, range->ptes); + + /* Reset to sentinel value. */ + range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); +} + +/* + * Determine whether the attributes specified by a page table entry match those + * being tracked by the current range. This is not quite as simple as a direct + * flag comparison since some PAT modes have multiple representations. + */ +static bool +sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs) +{ + pt_entry_t diff, mask; + + mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx; + diff = (range->attrs ^ attrs) & mask; + if (diff == 0) + return (true); + if ((diff & ~X86_PG_PDE_PAT) == 0 && + pmap_pat_index(kernel_pmap, range->attrs, true) == + pmap_pat_index(kernel_pmap, attrs, true)) + return (true); + return (false); +} + +static void +sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va, + pt_entry_t attrs) +{ + + memset(range, 0, sizeof(*range)); + range->sva = va; + range->attrs = attrs; +} + +/* + * Given a leaf PTE, derive the mapping's attributes. If they do not match + * those of the current run, dump the address range and its attributes, and + * begin a new run. + */ +static void +sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range, + vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde, + pt_entry_t pte) +{ + pt_entry_t attrs; + + attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx); + + attrs |= pdpe & pg_nx; + attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U)); + if ((pdpe & PG_PS) != 0) { + attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE); + } else if (pde != 0) { + attrs |= pde & pg_nx; + attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U)); + } + if ((pde & PG_PS) != 0) { + attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE); + } else if (pte != 0) { + attrs |= pte & pg_nx; + attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U)); + attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE); + + /* Canonicalize by always using the PDE PAT bit. */ + if ((attrs & X86_PG_PTE_PAT) != 0) + attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT; + } + + if (range->sva > va || !sysctl_kmaps_match(range, attrs)) { + sysctl_kmaps_dump(sb, range, va); + sysctl_kmaps_reinit(range, va, attrs); + } +} + +static int +sysctl_kmaps(SYSCTL_HANDLER_ARGS) +{ + struct pmap_kernel_map_range range; + struct sbuf sbuf, *sb; + pml4_entry_t pml4e; + pdp_entry_t *pdp, pdpe; + pd_entry_t *pd, pde; + pt_entry_t *pt, pte; + vm_offset_t sva; + vm_paddr_t pa; + int error, i, j, k, l; + + error = sysctl_wire_old_buffer(req, 0); + if (error != 0) + return (error); + sb = &sbuf; + sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req); + + /* Sentinel value. */ + range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1); + + /* + * Iterate over the kernel page tables without holding the kernel pmap + * lock. Outside of the large map, kernel page table pages are never + * freed, so at worst we will observe inconsistencies in the output. + * Within the large map, ensure that PDP and PD page addresses are + * valid before descending. + */ + for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) { + switch (i) { + case PML4PML4I: + sbuf_printf(sb, "\nRecursive map:\n"); + break; + case DMPML4I: + sbuf_printf(sb, "\nDirect map:\n"); + break; + case KPML4BASE: + sbuf_printf(sb, "\nKernel map:\n"); + break; + case LMSPML4I: + sbuf_printf(sb, "\nLarge map:\n"); + break; + } + + /* Convert to canonical form. */ + if (sva == 1ul << 47) + sva |= -1ul << 48; + +restart: + pml4e = kernel_pmap->pm_pml4[i]; + if ((pml4e & X86_PG_V) == 0) { + sva = rounddown2(sva, NBPML4); + sysctl_kmaps_dump(sb, &range, sva); + sva += NBPML4; + continue; + } + pa = pml4e & PG_FRAME; + pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa); + + for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) { + pdpe = pdp[j]; + if ((pdpe & X86_PG_V) == 0) { + sva = rounddown2(sva, NBPDP); + sysctl_kmaps_dump(sb, &range, sva); + sva += NBPDP; + continue; + } + pa = pdpe & PG_FRAME; + if (PMAP_ADDRESS_IN_LARGEMAP(sva) && + vm_phys_paddr_to_vm_page(pa) == NULL) + goto restart; + if ((pdpe & PG_PS) != 0) { + sva = rounddown2(sva, NBPDP); + sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe, + 0, 0); + range.pdpes++; + sva += NBPDP; + continue; + } + pd = (pd_entry_t *)PHYS_TO_DMAP(pa); + + for (k = pmap_pde_index(sva); k < NPDEPG; k++) { + pde = pd[k]; + if ((pde & X86_PG_V) == 0) { + sva = rounddown2(sva, NBPDR); + sysctl_kmaps_dump(sb, &range, sva); + sva += NBPDR; + continue; + } + pa = pde & PG_FRAME; + if (PMAP_ADDRESS_IN_LARGEMAP(sva) && + vm_phys_paddr_to_vm_page(pa) == NULL) + goto restart; + if ((pde & PG_PS) != 0) { + sva = rounddown2(sva, NBPDR); + sysctl_kmaps_check(sb, &range, sva, + pml4e, pdpe, pde, 0); + range.pdes++; + sva += NBPDR; + continue; + } + pt = (pt_entry_t *)PHYS_TO_DMAP(pa); + + for (l = pmap_pte_index(sva); l < NPTEPG; l++, + sva += PAGE_SIZE) { + pte = pt[l]; + if ((pte & X86_PG_V) == 0) { + sysctl_kmaps_dump(sb, &range, + sva); + continue; + } + sysctl_kmaps_check(sb, &range, sva, + pml4e, pdpe, pde, pte); + range.ptes++; + } + } + } + } + + error = sbuf_finish(sb); + sbuf_delete(sb); + return (error); +} +SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, + NULL, 0, sysctl_kmaps, "A", + "Dump kernel address layout"); #ifdef DDB DB_SHOW_COMMAND(pte, pmap_print_pte)