Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 2 Sep 2019 21:57:57 +0000 (UTC)
From:      Mark Johnston <markj@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r351728 - head/sys/amd64/amd64
Message-ID:  <201909022157.x82LvvUx070733@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: markj
Date: Mon Sep  2 21:57:57 2019
New Revision: 351728
URL: https://svnweb.freebsd.org/changeset/base/351728

Log:
  Add a sysctl to dump kernel mappings and their properties on amd64.
  
  The sysctl is called vm.pmap.kernel_maps.  It dumps address ranges
  and their corresponding protection and mapping mode, as well as
  counts of 2MB and 1GB pages in the range.
  
  Reviewed by:	kib
  MFC after:	2 weeks
  Sponsored by:	Netflix
  Differential Revision:	https://reviews.freebsd.org/D21380

Modified:
  head/sys/amd64/amd64/pmap.c

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Mon Sep  2 21:54:08 2019	(r351727)
+++ head/sys/amd64/amd64/pmap.c	Mon Sep  2 21:57:57 2019	(r351728)
@@ -124,6 +124,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rangeset.h>
 #include <sys/rwlock.h>
+#include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/vmem.h>
@@ -2112,6 +2113,41 @@ pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 	return (mask);
 }
 
+static int
+pmap_pat_index(pmap_t pmap, pt_entry_t pte, bool is_pde)
+{
+	int pat_flag, pat_idx;
+
+	pat_idx = 0;
+	switch (pmap->pm_type) {
+	case PT_X86:
+	case PT_RVI:
+		/* The PAT bit is different for PTE's and PDE's. */
+		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
+
+		if ((pte & pat_flag) != 0)
+			pat_idx |= 0x4;
+		if ((pte & PG_NC_PCD) != 0)
+			pat_idx |= 0x2;
+		if ((pte & PG_NC_PWT) != 0)
+			pat_idx |= 0x1;
+		break;
+	case PT_EPT:
+		if ((pte & EPT_PG_IGNORE_PAT) != 0)
+			panic("EPT PTE %#lx has no PAT memory type", pte);
+		pat_idx = (pte & EPT_PG_MEMORY_TYPE(0x7)) >> 3;
+		break;
+	}
+
+	/* See pmap_init_pat(). */
+	if (pat_idx == 4)
+		pat_idx = 0;
+	if (pat_idx == 7)
+		pat_idx = 3;
+
+	return (pat_idx);
+}
+
 bool
 pmap_ps_enabled(pmap_t pmap)
 {
@@ -9980,6 +10016,268 @@ pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offse
 	}
 	return (error);
 }
+
+/*
+ * Track a range of the kernel's virtual address space that is contiguous
+ * in various mapping attributes.
+ */
+struct pmap_kernel_map_range {
+	vm_offset_t sva;
+	pt_entry_t attrs;
+	int ptes;
+	int pdes;
+	int pdpes;
+};
+
+static void
+sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
+    vm_offset_t eva)
+{
+	const char *mode;
+	int i, pat_idx;
+
+	if (eva <= range->sva)
+		return;
+
+	pat_idx = pmap_pat_index(kernel_pmap, range->attrs, true);
+	for (i = 0; i < PAT_INDEX_SIZE; i++)
+		if (pat_index[i] == pat_idx)
+			break;
+
+	switch (i) {
+	case PAT_WRITE_BACK:
+		mode = "WB";
+		break;
+	case PAT_WRITE_THROUGH:
+		mode = "WT";
+		break;
+	case PAT_UNCACHEABLE:
+		mode = "UC";
+		break;
+	case PAT_WRITE_PROTECTED:
+		mode = "WP";
+		break;
+	case PAT_WRITE_COMBINING:
+		mode = "WC";
+		break;
+	default:
+		printf("%s: unknown PAT mode %#x for range %#016lx-%#016lx\n",
+		    __func__, i, range->sva, eva);
+		mode = "??";
+		break;
+	}
+
+	sbuf_printf(sb, "%#016lx-%#016lx r%c%c%c%c %s %d %d %d\n",
+	    range->sva, eva,
+	    (range->attrs & X86_PG_RW) != 0 ? 'w' : '-',
+	    (range->attrs & pg_nx) != 0 ? '-' : 'x',
+	    (range->attrs & X86_PG_U) != 0 ? 'u' : 's',
+	    (range->attrs & X86_PG_G) != 0 ? 'g' : '-',
+	    mode, range->pdpes, range->pdes, range->ptes);
+
+	/* Reset to sentinel value. */
+	range->sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+}
+
+/*
+ * Determine whether the attributes specified by a page table entry match those
+ * being tracked by the current range.  This is not quite as simple as a direct
+ * flag comparison since some PAT modes have multiple representations.
+ */
+static bool
+sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
+{
+	pt_entry_t diff, mask;
+
+	mask = X86_PG_G | X86_PG_RW | X86_PG_U | X86_PG_PDE_CACHE | pg_nx;
+	diff = (range->attrs ^ attrs) & mask;
+	if (diff == 0)
+		return (true);
+	if ((diff & ~X86_PG_PDE_PAT) == 0 &&
+	    pmap_pat_index(kernel_pmap, range->attrs, true) ==
+	    pmap_pat_index(kernel_pmap, attrs, true))
+		return (true);
+	return (false);
+}
+
+static void
+sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
+    pt_entry_t attrs)
+{
+
+	memset(range, 0, sizeof(*range));
+	range->sva = va;
+	range->attrs = attrs;
+}
+
+/*
+ * Given a leaf PTE, derive the mapping's attributes.  If they do not match
+ * those of the current run, dump the address range and its attributes, and
+ * begin a new run.
+ */
+static void
+sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
+    vm_offset_t va, pml4_entry_t pml4e, pdp_entry_t pdpe, pd_entry_t pde,
+    pt_entry_t pte)
+{
+	pt_entry_t attrs;
+
+	attrs = pml4e & (X86_PG_RW | X86_PG_U | pg_nx);
+
+	attrs |= pdpe & pg_nx;
+	attrs &= pg_nx | (pdpe & (X86_PG_RW | X86_PG_U));
+	if ((pdpe & PG_PS) != 0) {
+		attrs |= pdpe & (X86_PG_G | X86_PG_PDE_CACHE);
+	} else if (pde != 0) {
+		attrs |= pde & pg_nx;
+		attrs &= pg_nx | (pde & (X86_PG_RW | X86_PG_U));
+	}
+	if ((pde & PG_PS) != 0) {
+		attrs |= pde & (X86_PG_G | X86_PG_PDE_CACHE);
+	} else if (pte != 0) {
+		attrs |= pte & pg_nx;
+		attrs &= pg_nx | (pte & (X86_PG_RW | X86_PG_U));
+		attrs |= pte & (X86_PG_G | X86_PG_PTE_CACHE);
+
+		/* Canonicalize by always using the PDE PAT bit. */
+		if ((attrs & X86_PG_PTE_PAT) != 0)
+			attrs ^= X86_PG_PDE_PAT | X86_PG_PTE_PAT;
+	}
+
+	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
+		sysctl_kmaps_dump(sb, range, va);
+		sysctl_kmaps_reinit(range, va, attrs);
+	}
+}
+
+static int
+sysctl_kmaps(SYSCTL_HANDLER_ARGS)
+{
+	struct pmap_kernel_map_range range;
+	struct sbuf sbuf, *sb;
+	pml4_entry_t pml4e;
+	pdp_entry_t *pdp, pdpe;
+	pd_entry_t *pd, pde;
+	pt_entry_t *pt, pte;
+	vm_offset_t sva;
+	vm_paddr_t pa;
+	int error, i, j, k, l;
+
+	error = sysctl_wire_old_buffer(req, 0);
+	if (error != 0)
+		return (error);
+	sb = &sbuf;
+	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
+
+	/* Sentinel value. */
+	range.sva = KVADDR(NPML4EPG - 1, NPDPEPG - 1, NPDEPG - 1, NPTEPG - 1);
+
+	/*
+	 * Iterate over the kernel page tables without holding the kernel pmap
+	 * lock.  Outside of the large map, kernel page table pages are never
+	 * freed, so at worst we will observe inconsistencies in the output.
+	 * Within the large map, ensure that PDP and PD page addresses are
+	 * valid before descending.
+	 */
+	for (sva = 0, i = pmap_pml4e_index(sva); i < NPML4EPG; i++) {
+		switch (i) {
+		case PML4PML4I:
+			sbuf_printf(sb, "\nRecursive map:\n");
+			break;
+		case DMPML4I:
+			sbuf_printf(sb, "\nDirect map:\n");
+			break;
+		case KPML4BASE:
+			sbuf_printf(sb, "\nKernel map:\n");
+			break;
+		case LMSPML4I:
+			sbuf_printf(sb, "\nLarge map:\n");
+			break;
+		}
+
+		/* Convert to canonical form. */
+		if (sva == 1ul << 47)
+			sva |= -1ul << 48;
+
+restart:
+		pml4e = kernel_pmap->pm_pml4[i];
+		if ((pml4e & X86_PG_V) == 0) {
+			sva = rounddown2(sva, NBPML4);
+			sysctl_kmaps_dump(sb, &range, sva);
+			sva += NBPML4;
+			continue;
+		}
+		pa = pml4e & PG_FRAME;
+		pdp = (pdp_entry_t *)PHYS_TO_DMAP(pa);
+
+		for (j = pmap_pdpe_index(sva); j < NPDPEPG; j++) {
+			pdpe = pdp[j];
+			if ((pdpe & X86_PG_V) == 0) {
+				sva = rounddown2(sva, NBPDP);
+				sysctl_kmaps_dump(sb, &range, sva);
+				sva += NBPDP;
+				continue;
+			}
+			pa = pdpe & PG_FRAME;
+			if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
+			    vm_phys_paddr_to_vm_page(pa) == NULL)
+				goto restart;
+			if ((pdpe & PG_PS) != 0) {
+				sva = rounddown2(sva, NBPDP);
+				sysctl_kmaps_check(sb, &range, sva, pml4e, pdpe,
+				    0, 0);
+				range.pdpes++;
+				sva += NBPDP;
+				continue;
+			}
+			pd = (pd_entry_t *)PHYS_TO_DMAP(pa);
+
+			for (k = pmap_pde_index(sva); k < NPDEPG; k++) {
+				pde = pd[k];
+				if ((pde & X86_PG_V) == 0) {
+					sva = rounddown2(sva, NBPDR);
+					sysctl_kmaps_dump(sb, &range, sva);
+					sva += NBPDR;
+					continue;
+				}
+				pa = pde & PG_FRAME;
+				if (PMAP_ADDRESS_IN_LARGEMAP(sva) &&
+				    vm_phys_paddr_to_vm_page(pa) == NULL)
+					goto restart;
+				if ((pde & PG_PS) != 0) {
+					sva = rounddown2(sva, NBPDR);
+					sysctl_kmaps_check(sb, &range, sva,
+					    pml4e, pdpe, pde, 0);
+					range.pdes++;
+					sva += NBPDR;
+					continue;
+				}
+				pt = (pt_entry_t *)PHYS_TO_DMAP(pa);
+
+				for (l = pmap_pte_index(sva); l < NPTEPG; l++,
+				    sva += PAGE_SIZE) {
+					pte = pt[l];
+					if ((pte & X86_PG_V) == 0) {
+						sysctl_kmaps_dump(sb, &range,
+						    sva);
+						continue;
+					}
+					sysctl_kmaps_check(sb, &range, sva,
+					    pml4e, pdpe, pde, pte);
+					range.ptes++;
+				}
+			}
+		}
+	}
+
+	error = sbuf_finish(sb);
+	sbuf_delete(sb);
+	return (error);
+}
+SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_kmaps, "A",
+    "Dump kernel address layout");
 
 #ifdef DDB
 DB_SHOW_COMMAND(pte, pmap_print_pte)



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201909022157.x82LvvUx070733>