Date:      Fri, 3 Dec 2021 14:53:41 GMT
From:      Mitchell Horne <mhorne@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-branches@FreeBSD.org
Subject:   git: f61e6927a41b - stable/13 - minidump: reduce the number of direct accesses to page tables
Message-ID:  <202112031453.1B3ErfI4081049@gitrepo.freebsd.org>

The branch stable/13 has been updated by mhorne:

URL: https://cgit.FreeBSD.org/src/commit/?id=f61e6927a41b5227f42db8fef9ae63b63fe4a85d

commit f61e6927a41b5227f42db8fef9ae63b63fe4a85d
Author:     Mitchell Horne <mhorne@FreeBSD.org>
AuthorDate: 2021-11-17 15:30:43 +0000
Commit:     Mitchell Horne <mhorne@FreeBSD.org>
CommitDate: 2021-12-03 14:02:03 +0000

    minidump: reduce the number of direct accesses to page tables
    
    During a live dump, we may race with updates to the kernel page tables.
    This is generally okay; we accept that the state of the system while
    dumping may be somewhat inconsistent with its state when the dump was
    invoked. However, when walking the kernel page tables, it is important
    that we load each PDE/PTE only once while operating on it. Otherwise, it
    is possible for the relevant PTE to change underneath us, for example
    after we check the valid bit but before we read the physical address.
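
    As an illustrative sketch (not part of the patch; pte_p stands in
    for a pointer to a live page table entry), the racy and race-free
    patterns look roughly like:

        uint64_t pte, pa;

        /* Racy: the entry may change between the two loads. */
        if ((*pte_p & PG_V) != 0)
                pa = *pte_p & PG_FRAME; /* may see a different entry */

        /* Safe: load the entry once, then operate on the local copy. */
        pte = atomic_load_64(pte_p);
        if ((pte & PG_V) != 0)
                pa = pte & PG_FRAME;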
    
    Convert the loads to atomics, and add some validation around the
    physical addresses, to ensure that we do not try to dump a non-existent
    or non-canonical physical address.
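
    For instance, with the amd64 macro names used in the patch below, a
    PTE's frame address is now validated before the page is marked for
    dumping:

        pa = pte & PG_FRAME;
        /* Skip addresses outside the direct map or not dumpable. */
        if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
                dump_add_page(pa);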
    
    Similarly, don't read kernel_vm_end more than once, on the off chance
    that pmap_growkernel() is called between the two page table walks.
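
    Roughly (illustrative only; the real loops step by NBPDR, NBPDP, or
    L2_SIZE depending on the architecture), both walks now share a
    single snapshot of the bound:

        vm_offset_t va, kva_end;

        kva_end = kernel_vm_end;        /* read the global only once */
        for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += PAGE_SIZE)
                ;       /* first walk: populate the dump bitmap */
        for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += PAGE_SIZE)
                ;       /* second walk: write out the page tables */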
    
    Reviewed by:    kib, markj
    MFC after:      2 weeks
    Sponsored by:   Juniper Networks, Inc.
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D31990
    
    (cherry picked from commit 681bd71047f184282d10d5ec9c1770882d525eb8)
---
 sys/amd64/amd64/minidump_machdep.c    | 72 +++++++++++++++++++++--------------
 sys/arm/arm/minidump_machdep.c        | 14 +++++--
 sys/arm64/arm64/minidump_machdep.c    | 68 +++++++++++++++++++++++----------
 sys/i386/i386/minidump_machdep_base.c | 45 +++++++++++++---------
 sys/riscv/riscv/minidump_machdep.c    | 58 +++++++++++++++++++---------
 5 files changed, 170 insertions(+), 87 deletions(-)

diff --git a/sys/amd64/amd64/minidump_machdep.c b/sys/amd64/amd64/minidump_machdep.c
index 68333088ed5a..975ae038cfdf 100644
--- a/sys/amd64/amd64/minidump_machdep.c
+++ b/sys/amd64/amd64/minidump_machdep.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <machine/elf.h>
 #include <machine/md_var.h>
 #include <machine/minidump.h>
+#include <machine/vmparam.h>
 
 CTASSERT(sizeof(struct kerneldumpheader) == 512);
 
@@ -163,10 +164,11 @@ int
 cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 {
 	uint32_t pmapsize;
-	vm_offset_t va;
+	vm_offset_t va, kva_end;
 	int error;
 	uint64_t *pml4, *pdp, *pd, *pt, pa;
-	int i, ii, j, k, n;
+	uint64_t pdpe, pde, pte;
+	int ii, j, k, n;
 	int retry_count;
 	struct minidumphdr mdhdr;
 
@@ -174,10 +176,18 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
  retry:
 	retry_count++;
 
-	/* Walk page table pages, set bits in vm_page_dump */
+	/* Snapshot the KVA upper bound in case it grows. */
+	kva_end = MAX(KERNBASE + nkpt * NBPDR, kernel_vm_end);
+
+	/*
+	 * Walk the kernel page table pages, setting the active entries in the
+	 * dump bitmap.
+	 *
+	 * NB: for a live dump, we may be racing with updates to the page
+	 * tables, so care must be taken to read each entry only once.
+	 */
 	pmapsize = 0;
-	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
-	    kernel_vm_end); ) {
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; ) {
 		/*
 		 * We always write a page, even if it is zero. Each
 		 * page written corresponds to 1GB of space
@@ -186,8 +196,8 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 		ii = pmap_pml4e_index(va);
 		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
 		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-		i = pmap_pdpe_index(va);
-		if ((pdp[i] & PG_V) == 0) {
+		pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]);
+		if ((pdpe & PG_V) == 0) {
 			va += NBPDP;
 			continue;
 		}
@@ -195,9 +205,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 		/*
 		 * 1GB page is represented as 512 2MB pages in a dump.
 		 */
-		if ((pdp[i] & PG_PS) != 0) {
+		if ((pdpe & PG_PS) != 0) {
 			va += NBPDP;
-			pa = pdp[i] & PG_PS_FRAME;
+			pa = pdpe & PG_PS_FRAME;
 			for (n = 0; n < NPDEPG * NPTEPG; n++) {
 				if (vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
@@ -206,16 +216,16 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 			continue;
 		}
 
-		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
+		pd = (uint64_t *)PHYS_TO_DMAP(pdpe & PG_FRAME);
 		for (n = 0; n < NPDEPG; n++, va += NBPDR) {
-			j = pmap_pde_index(va);
+			pde = atomic_load_64(&pd[pmap_pde_index(va)]);
 
-			if ((pd[j] & PG_V) == 0)
+			if ((pde & PG_V) == 0)
 				continue;
 
-			if ((pd[j] & PG_PS) != 0) {
+			if ((pde & PG_PS) != 0) {
 				/* This is an entire 2M page. */
-				pa = pd[j] & PG_PS_FRAME;
+				pa = pde & PG_PS_FRAME;
 				for (k = 0; k < NPTEPG; k++) {
 					if (vm_phys_is_dumpable(pa))
 						dump_add_page(pa);
@@ -224,17 +234,18 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 				continue;
 			}
 
-			pa = pd[j] & PG_FRAME;
+			pa = pde & PG_FRAME;
 			/* set bit for this PTE page */
 			if (vm_phys_is_dumpable(pa))
 				dump_add_page(pa);
 			/* and for each valid page in this 2MB block */
-			pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
+			pt = (uint64_t *)PHYS_TO_DMAP(pde & PG_FRAME);
 			for (k = 0; k < NPTEPG; k++) {
-				if ((pt[k] & PG_V) == 0)
+				pte = atomic_load_64(&pt[k]);
+				if ((pte & PG_V) == 0)
 					continue;
-				pa = pt[k] & PG_FRAME;
-				if (vm_phys_is_dumpable(pa))
+				pa = pte & PG_FRAME;
+				if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			}
 		}
@@ -247,7 +258,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages));
 	VM_PAGE_DUMP_FOREACH(pa) {
 		/* Clear out undumpable pages now if needed */
-		if (vm_phys_is_dumpable(pa)) {
+		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) {
 			dumpsize += PAGE_SIZE;
 		} else {
 			dump_drop_page(pa);
@@ -309,15 +320,14 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 
 	/* Dump kernel page directory pages */
 	bzero(fakepd, sizeof(fakepd));
-	for (va = VM_MIN_KERNEL_ADDRESS; va < MAX(KERNBASE + nkpt * NBPDR,
-	    kernel_vm_end); va += NBPDP) {
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += NBPDP) {
 		ii = pmap_pml4e_index(va);
 		pml4 = (uint64_t *)PHYS_TO_DMAP(KPML4phys) + ii;
 		pdp = (uint64_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
-		i = pmap_pdpe_index(va);
+		pdpe = atomic_load_64(&pdp[pmap_pdpe_index(va)]);
 
 		/* We always write a page, even if it is zero */
-		if ((pdp[i] & PG_V) == 0) {
+		if ((pdpe & PG_V) == 0) {
 			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
 			if (error)
 				goto fail;
@@ -329,9 +339,9 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 		}
 
 		/* 1GB page is represented as 512 2MB pages in a dump */
-		if ((pdp[i] & PG_PS) != 0) {
+		if ((pdpe & PG_PS) != 0) {
 			/* PDPE and PDP have identical layout in this case */
-			fakepd[0] = pdp[i];
+			fakepd[0] = pdpe;
 			for (j = 1; j < NPDEPG; j++)
 				fakepd[j] = fakepd[j - 1] + NBPDR;
 			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
@@ -345,8 +355,14 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 			continue;
 		}
 
-		pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
-		error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
+		pa = pdpe & PG_FRAME;
+		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa)) {
+			pd = (uint64_t *)PHYS_TO_DMAP(pa);
+			error = blk_write(di, (char *)pd, 0, PAGE_SIZE);
+		} else {
+			/* Malformed pa, write the zeroed fakepd. */
+			error = blk_write(di, (char *)&fakepd, 0, PAGE_SIZE);
+		}
 		if (error)
 			goto fail;
 		error = blk_flush(di);
diff --git a/sys/arm/arm/minidump_machdep.c b/sys/arm/arm/minidump_machdep.c
index 6d199d1894c3..e3bf37599d4a 100644
--- a/sys/arm/arm/minidump_machdep.c
+++ b/sys/arm/arm/minidump_machdep.c
@@ -159,7 +159,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	uint64_t dumpsize, *dump_avail_buf;
 	uint32_t ptesize;
 	uint32_t pa, prev_pa = 0, count = 0;
-	vm_offset_t va;
+	vm_offset_t va, kva_end;
 	int error, i;
 	char *addr;
 
@@ -174,9 +174,15 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	 */
 	dcache_wbinv_poc_all();
 
-	/* Walk page table pages, set bits in vm_page_dump */
+	/* Snapshot the KVA upper bound in case it grows. */
+	kva_end = kernel_vm_end;
+
+	/*
+	 * Walk the kernel page table pages, setting the active entries in the
+	 * dump bitmap.
+	 */
 	ptesize = 0;
-	for (va = KERNBASE; va < kernel_vm_end; va += PAGE_SIZE) {
+	for (va = KERNBASE; va < kva_end; va += PAGE_SIZE) {
 		pa = pmap_dump_kextract(va, NULL);
 		if (pa != 0 && vm_phys_is_dumpable(pa))
 			dump_add_page(pa);
@@ -255,7 +261,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 
 	/* Dump kernel page table pages */
 	addr = dumpbuf;
-	for (va = KERNBASE; va < kernel_vm_end; va += PAGE_SIZE) {
+	for (va = KERNBASE; va < kva_end; va += PAGE_SIZE) {
 		pmap_dump_kextract(va, (pt2_entry_t *)addr);
 		addr += sizeof(pt2_entry_t);
 		if (addr == dumpbuf + sizeof(dumpbuf)) {
diff --git a/sys/arm64/arm64/minidump_machdep.c b/sys/arm64/arm64/minidump_machdep.c
index 7558e612153c..5814cce632a9 100644
--- a/sys/arm64/arm64/minidump_machdep.c
+++ b/sys/arm64/arm64/minidump_machdep.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_dumpset.h>
 #include <vm/pmap.h>
 
+#include <machine/atomic.h>
 #include <machine/md_var.h>
 #include <machine/pte.h>
 #include <machine/minidump.h>
@@ -150,9 +151,9 @@ int
 cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 {
 	struct minidumphdr mdhdr;
-	pd_entry_t *l0, *l1, *l2;
-	pt_entry_t *l3;
-	vm_offset_t va;
+	pd_entry_t *l0, *l1, l1e, *l2, l2e;
+	pt_entry_t *l3, l3e;
+	vm_offset_t va, kva_end;
 	vm_paddr_t pa;
 	uint32_t pmapsize;
 	int error, i, j, retry_count;
@@ -162,31 +163,45 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	retry_count++;
 	error = 0;
 	pmapsize = 0;
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) {
+
+	/* Snapshot the KVA upper bound in case it grows. */
+	kva_end = kernel_vm_end;
+
+	/*
+	 * Walk the kernel page table pages, setting the active entries in the
+	 * dump bitmap.
+	 *
+	 * NB: for a live dump, we may be racing with updates to the page
+	 * tables, so care must be taken to read each entry only once.
+	 */
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += L2_SIZE) {
 		pmapsize += PAGE_SIZE;
 		if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3))
 			continue;
 
-		if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
-			pa = *l1 & ~ATTR_MASK;
+		l1e = atomic_load_64(l1);
+		l2e = atomic_load_64(l2);
+		if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
+			pa = l1e & ~ATTR_MASK;
 			for (i = 0; i < Ln_ENTRIES * Ln_ENTRIES;
 			    i++, pa += PAGE_SIZE)
 				if (vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			pmapsize += (Ln_ENTRIES - 1) * PAGE_SIZE;
 			va += L1_SIZE - L2_SIZE;
-		} else if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
-			pa = *l2 & ~ATTR_MASK;
+		} else if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
+			pa = l2e & ~ATTR_MASK;
 			for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) {
 				if (vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			}
-		} else if ((*l2 & ATTR_DESCR_MASK) == L2_TABLE) {
+		} else if ((l2e & ATTR_DESCR_MASK) == L2_TABLE) {
 			for (i = 0; i < Ln_ENTRIES; i++) {
-				if ((l3[i] & ATTR_DESCR_MASK) != L3_PAGE)
+				l3e = atomic_load_64(&l3[i]);
+				if ((l3e & ATTR_DESCR_MASK) != L3_PAGE)
 					continue;
-				pa = l3[i] & ~ATTR_MASK;
-				if (vm_phys_is_dumpable(pa))
+				pa = l3e & ~ATTR_MASK;
+				if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			}
 		}
@@ -198,7 +213,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 	dumpsize += round_page(sizeof(dump_avail));
 	dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages));
 	VM_PAGE_DUMP_FOREACH(pa) {
-		if (vm_phys_is_dumpable(pa))
+		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
 			dumpsize += PAGE_SIZE;
 		else
 			dump_drop_page(pa);
@@ -260,7 +275,7 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 
 	/* Dump kernel page directory pages */
 	bzero(&tmpbuffer, sizeof(tmpbuffer));
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) {
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_end; va += L2_SIZE) {
 		if (!pmap_get_tables(pmap_kernel(), va, &l0, &l1, &l2, &l3)) {
 			/* We always write a page, even if it is zero */
 			error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE);
@@ -270,12 +285,17 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 			error = blk_flush(di);
 			if (error)
 				goto fail;
-		} else if ((*l1 & ATTR_DESCR_MASK) == L1_BLOCK) {
+			continue;
+		}
+
+		l1e = atomic_load_64(l1);
+		l2e = atomic_load_64(l2);
+		if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
 			/*
 			 * Handle a 1GB block mapping: write out 512 fake L2
 			 * pages.
 			 */
-			pa = (*l1 & ~ATTR_MASK) | (va & L1_OFFSET);
+			pa = (l1e & ~ATTR_MASK) | (va & L1_OFFSET);
 
 			for (i = 0; i < Ln_ENTRIES; i++) {
 				for (j = 0; j < Ln_ENTRIES; j++) {
@@ -294,8 +314,8 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 				goto fail;
 			bzero(&tmpbuffer, sizeof(tmpbuffer));
 			va += L1_SIZE - L2_SIZE;
-		} else if ((*l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
-			pa = (*l2 & ~ATTR_MASK) | (va & L2_OFFSET);
+		} else if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
+			pa = (l2e & ~ATTR_MASK) | (va & L2_OFFSET);
 
 			/* Generate fake l3 entries based upon the l1 entry */
 			for (i = 0; i < Ln_ENTRIES; i++) {
@@ -312,9 +332,17 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 			bzero(&tmpbuffer, sizeof(tmpbuffer));
 			continue;
 		} else {
-			pa = *l2 & ~ATTR_MASK;
+			pa = l2e & ~ATTR_MASK;
 
-			error = blk_write(di, NULL, pa, PAGE_SIZE);
+			/*
+			 * We always write a page, even if it is zero. If pa
+			 * is malformed, write the zeroed tmpbuffer.
+			 */
+			if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
+				error = blk_write(di, NULL, pa, PAGE_SIZE);
+			else
+				error = blk_write(di, (char *)&tmpbuffer, 0,
+				    PAGE_SIZE);
 			if (error)
 				goto fail;
 		}
diff --git a/sys/i386/i386/minidump_machdep_base.c b/sys/i386/i386/minidump_machdep_base.c
index 8984ab4dd083..9b036f0fd700 100644
--- a/sys/i386/i386/minidump_machdep_base.c
+++ b/sys/i386/i386/minidump_machdep_base.c
@@ -157,17 +157,26 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 {
 	uint64_t dumpsize;
 	uint32_t ptesize;
-	vm_offset_t va;
+	vm_offset_t va, kva_end;
 	int error;
 	uint64_t pa;
-	pd_entry_t *pd;
-	pt_entry_t *pt;
+	pd_entry_t *pd, pde;
+	pt_entry_t *pt, pte;
 	int j, k;
 	struct minidumphdr mdhdr;
 
-	/* Walk page table pages, set bits in vm_page_dump */
+	/* Snapshot the KVA upper bound in case it grows. */
+	kva_end = kernel_vm_end;
+
+	/*
+	 * Walk the kernel page table pages, setting the active entries in the
+	 * dump bitmap.
+	 *
+	 * NB: for a live dump, we may be racing with updates to the page
+	 * tables, so care must be taken to read each entry only once.
+	 */
 	ptesize = 0;
-	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+	for (va = KERNBASE; va < kva_end; va += NBPDR) {
 		/*
 		 * We always write a page, even if it is zero. Each
 		 * page written corresponds to 2MB of space
@@ -175,9 +184,10 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 		ptesize += PAGE_SIZE;
 		pd = IdlePTD;	/* always mapped! */
 		j = va >> PDRSHIFT;
-		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+		pde = pte_load(&pd[va >> PDRSHIFT]);
+		if ((pde & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
 			/* This is an entire 2M page. */
-			pa = pd[j] & PG_PS_FRAME;
+			pa = pde & PG_PS_FRAME;
 			for (k = 0; k < NPTEPG; k++) {
 				if (vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
@@ -185,12 +195,13 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 			}
 			continue;
 		}
-		if ((pd[j] & PG_V) == PG_V) {
+		if ((pde & PG_V) == PG_V) {
 			/* set bit for each valid page in this 2MB block */
-			pt = pmap_kenter_temporary(pd[j] & PG_FRAME, 0);
+			pt = pmap_kenter_temporary(pde & PG_FRAME, 0);
 			for (k = 0; k < NPTEPG; k++) {
-				if ((pt[k] & PG_V) == PG_V) {
-					pa = pt[k] & PG_FRAME;
+				pte = pte_load(&pt[k]);
+				if ((pte & PG_V) == PG_V) {
+					pa = pte & PG_FRAME;
 					if (vm_phys_is_dumpable(pa))
 						dump_add_page(pa);
 				}
@@ -266,13 +277,13 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 		goto fail;
 
 	/* Dump kernel page table pages */
-	for (va = KERNBASE; va < kernel_vm_end; va += NBPDR) {
+	for (va = KERNBASE; va < kva_end; va += NBPDR) {
 		/* We always write a page, even if it is zero */
 		pd = IdlePTD;	/* always mapped! */
-		j = va >> PDRSHIFT;
-		if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+		pde = pte_load(&pd[va >> PDRSHIFT]);
+		if ((pde & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
 			/* This is a single 2M block. Generate a fake PTP */
-			pa = pd[j] & PG_PS_FRAME;
+			pa = pde & PG_PS_FRAME;
 			for (k = 0; k < NPTEPG; k++) {
 				fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
 			}
@@ -285,8 +296,8 @@ cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 				goto fail;
 			continue;
 		}
-		if ((pd[j] & PG_V) == PG_V) {
-			pa = pd[j] & PG_FRAME;
+		if ((pde & PG_V) == PG_V) {
+			pa = pde & PG_FRAME;
 			error = blk_write(di, 0, pa, PAGE_SIZE);
 			if (error)
 				goto fail;
diff --git a/sys/riscv/riscv/minidump_machdep.c b/sys/riscv/riscv/minidump_machdep.c
index d71dde8f6da5..8f5a4a4d1289 100644
--- a/sys/riscv/riscv/minidump_machdep.c
+++ b/sys/riscv/riscv/minidump_machdep.c
@@ -155,11 +155,11 @@ blk_write(struct dumperinfo *di, char *ptr, vm_paddr_t pa, size_t sz)
 int
 cpu_minidumpsys(struct dumperinfo *di, const struct minidumpstate *state)
 {
-	pd_entry_t *l1, *l2;
-	pt_entry_t *l3;
+	pd_entry_t *l1, *l2, l2e;
+	pt_entry_t *l3, l3e;
 	struct minidumphdr mdhdr;
 	uint32_t pmapsize;
-	vm_offset_t va;
+	vm_offset_t va, kva_max;
 	vm_paddr_t pa;
 	int error;
 	int i;
@@ -171,8 +171,17 @@ retry:
 	error = 0;
 	pmapsize = 0;
 
-	/* Build set of dumpable pages from kernel pmap */
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) {
+	/* Snapshot the KVA upper bound in case it grows. */
+	kva_max = kernel_vm_end;
+
+	/*
+	 * Walk the kernel page table pages, setting the active entries in the
+	 * dump bitmap.
+	 *
+	 * NB: for a live dump, we may be racing with updates to the page
+	 * tables, so care must be taken to read each entry only once.
+	 */
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_max; va += L2_SIZE) {
 		pmapsize += PAGE_SIZE;
 		if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3))
 			continue;
@@ -182,18 +191,20 @@ retry:
 			continue;
 
 		/* l2 may be a superpage */
-		if ((*l2 & PTE_RWX) != 0) {
-			pa = (*l2 >> PTE_PPN1_S) << L2_SHIFT;
+		l2e = atomic_load_64(l2);
+		if ((l2e & PTE_RWX) != 0) {
+			pa = (l2e >> PTE_PPN1_S) << L2_SHIFT;
 			for (i = 0; i < Ln_ENTRIES; i++, pa += PAGE_SIZE) {
 				if (vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			}
 		} else {
 			for (i = 0; i < Ln_ENTRIES; i++) {
-				if ((l3[i] & PTE_V) == 0)
+				l3e = atomic_load_64(&l3[i]);
+				if ((l3e & PTE_V) == 0)
 					continue;
-				pa = (l3[i] >> PTE_PPN0_S) * PAGE_SIZE;
-				if (vm_phys_is_dumpable(pa))
+				pa = (l3e >> PTE_PPN0_S) * PAGE_SIZE;
+				if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
 					dump_add_page(pa);
 			}
 		}
@@ -206,7 +217,7 @@ retry:
 	dumpsize += round_page(BITSET_SIZE(vm_page_dump_pages));
 	VM_PAGE_DUMP_FOREACH(pa) {
 		/* Clear out undumpable pages now if needed */
-		if (vm_phys_is_dumpable(pa))
+		if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
 			dumpsize += PAGE_SIZE;
 		else
 			dump_drop_page(pa);
@@ -268,7 +279,7 @@ retry:
 
 	/* Dump kernel page directory pages */
 	bzero(&tmpbuffer, sizeof(tmpbuffer));
-	for (va = VM_MIN_KERNEL_ADDRESS; va < kernel_vm_end; va += L2_SIZE) {
+	for (va = VM_MIN_KERNEL_ADDRESS; va < kva_max; va += L2_SIZE) {
 		if (!pmap_get_tables(pmap_kernel(), va, &l1, &l2, &l3)) {
 			/* We always write a page, even if it is zero */
 			error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE);
@@ -278,10 +289,14 @@ retry:
 			error = blk_flush(di);
 			if (error)
 				goto fail;
-		} else if ((*l2 & PTE_RWX) != 0) {
+			continue;
+		}
+
+		l2e = atomic_load_64(l2);
+		if ((l2e & PTE_RWX) != 0) {
 			/* Generate fake l3 entries based on the l2 superpage */
 			for (i = 0; i < Ln_ENTRIES; i++) {
-				tmpbuffer[i] = (*l2 | (i << PTE_PPN0_S));
+				tmpbuffer[i] = (l2e | (i << PTE_PPN0_S));
 			}
 			/* We always write a page, even if it is zero */
 			error = blk_write(di, (char *)&tmpbuffer, 0, PAGE_SIZE);
@@ -293,10 +308,17 @@ retry:
 				goto fail;
 			bzero(&tmpbuffer, sizeof(tmpbuffer));
 		} else {
-			pa = (*l2 >> PTE_PPN0_S) * PAGE_SIZE;
-
-			/* We always write a page, even if it is zero */
-			error = blk_write(di, NULL, pa, PAGE_SIZE);
+			pa = (l2e >> PTE_PPN0_S) * PAGE_SIZE;
+
+			/*
+			 * We always write a page, even if it is zero. If pa
+			 * is malformed, write the zeroed tmpbuffer.
+			 */
+			if (PHYS_IN_DMAP(pa) && vm_phys_is_dumpable(pa))
+				error = blk_write(di, NULL, pa, PAGE_SIZE);
+			else
+				error = blk_write(di, (char *)&tmpbuffer, 0,
+				    PAGE_SIZE);
 			if (error)
 				goto fail;
 		}


