Date:      Sat, 31 Dec 2022 22:10:39 GMT
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: cde70e312c3f - main - amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG
Message-ID:  <202212312210.2BVMAdgL010642@gitrepo.freebsd.org>

The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=cde70e312c3fde5b37a29be1dacb7fde9a45b94a

commit cde70e312c3fde5b37a29be1dacb7fde9a45b94a
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2022-10-10 23:08:55 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2022-12-31 22:09:45 +0000

    amd64: for small cores, use (big hammer) INVPCID_CTXGLOB instead of INVLPG
    
    It seems that a CPU bug makes invalidation of global PTEs with
    INVLPG unreliable when PCID is enabled.  Since neither the scope of
    the issue nor the right fix is known, apply the workaround to all
    CPUs with small cores.
    
    Reviewed by:    alc (previous version)
    Discussed with: emaste, markj
    Tested by:      karels
    PR:     261169, 266145
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D37770
---
 sys/amd64/amd64/initcpu.c    |  5 +++++
 sys/amd64/amd64/mp_machdep.c | 16 +++++++++++-----
 sys/amd64/amd64/pmap.c       | 36 +++++++++++++++++++++++++++++-------
 sys/amd64/include/pcpu.h     |  3 ++-
 sys/amd64/include/pmap.h     | 20 ++++++++++++++++++++
 5 files changed, 67 insertions(+), 13 deletions(-)
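
Background on the mechanism: INVPCID takes a 16-byte descriptor plus a
type operand.  Type 2 (INVPCID_CTXGLOB) ignores the descriptor's PCID
and address fields (its reserved bits must still be zero, hence the
"= { 0 }" initializers in the hunks below) and invalidates all TLB
entries for all PCIDs, including global translations, which is the
"big hammer" of the subject line.  A minimal sketch of the primitive,
modeled on the inline wrapper in sys/amd64/include/cpufunc.h; the
header is authoritative and this is only an illustration:

	/* Descriptor for INVPCID; only consulted for types 0 and 1. */
	struct invpcid_descr {
		uint64_t	pcid:12;	/* target PCID (types 0, 1) */
		uint64_t	pad:52;		/* reserved, must be zero */
		uint64_t	addr;		/* linear address (type 0) */
	};

	#define	INVPCID_ADDR	0	/* one address, one PCID, non-global */
	#define	INVPCID_CTX	1	/* one PCID, non-global entries */
	#define	INVPCID_CTXGLOB	2	/* all PCIDs, global entries too */
	#define	INVPCID_ALLCTX	3	/* all PCIDs, non-global entries */

	static __inline void
	invpcid(struct invpcid_descr *d, int type)
	{
		__asm __volatile("invpcid (%0),%1"
		    : : "r" (d), "r" ((u_long)type) : "memory");
	}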

diff --git a/sys/amd64/amd64/initcpu.c b/sys/amd64/amd64/initcpu.c
index 1b731821889e..08385d3095d0 100644
--- a/sys/amd64/amd64/initcpu.c
+++ b/sys/amd64/amd64/initcpu.c
@@ -324,6 +324,11 @@ initializecpu(void)
 		if ((r[0] & CPUID_HYBRID_CORE_MASK) ==
 		    CPUID_HYBRID_SMALL_CORE) {
 			PCPU_SET(small_core, 1);
+			if (pmap_pcid_enabled &&
+			    pmap_pcid_invlpg_workaround_uena) {
+				PCPU_SET(pcid_invlpg_workaround, 1);
+				pmap_pcid_invlpg_workaround = 1;
+			}
 		}
 	}
 }
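
Background on the gating: the small-core flag set here comes from the
hybrid-CPU detection immediately above this hunk (CPUID leaf 0x1A
reports the native core type in EAX[31:24]; 0x20 denotes a small,
Atom-class core), and the per-CPU workaround bit is derived from it
only when PCID is active and the boot-time veto fetched in pmap_init()
below has not been set.  A sketch of that detection, with the guard
assumed rather than quoted; the committed initcpu.c is authoritative:

	/*
	 * Assumed shape of the detection surrounding the hunk above.
	 * On hybrid parts, CPUID leaf 0x1A returns the native core
	 * type of the executing CPU in EAX[31:24].
	 */
	u_int r[4];

	if (cpu_high >= 0x1a) {
		cpuid_count(0x1a, 0, r);
		if ((r[0] & CPUID_HYBRID_CORE_MASK) ==
		    CPUID_HYBRID_SMALL_CORE)
			PCPU_SET(small_core, 1);
	}
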
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index f41e8dafcc86..5c60d301c1e7 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -861,7 +861,7 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1)
 	(*ipi_invlpg_counts[PCPU_GET(cpuid)])++;
 #endif /* COUNT_IPIS */
 
-	invlpg(smp_tlb_addr1);
+	pmap_invlpg(smp_tlb_pmap, smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
@@ -931,10 +931,16 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_t smp_tlb_addr1,
 #endif /* COUNT_IPIS */
 
 	addr = smp_tlb_addr1;
-	do {
-		invlpg(addr);
-		addr += PAGE_SIZE;
-	} while (addr < smp_tlb_addr2);
+	if (smp_tlb_pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		do {
+			invlpg(addr);
+			addr += PAGE_SIZE;
+		} while (addr < smp_tlb_addr2);
+	}
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
 	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
 	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index a44993efb409..07a00963004b 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -529,6 +529,12 @@ SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
+int pmap_pcid_invlpg_workaround = 0;
+SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_invlpg_workaround,
+    CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
+    &pmap_pcid_invlpg_workaround, 0,
+    "Enable small core PCID/INVLPG workaround");
+int pmap_pcid_invlpg_workaround_uena = 1;
 
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
@@ -2560,6 +2566,9 @@ pmap_init(void)
 			    VM_PAGE_TO_PHYS(m);
 		}
 	}
+
+	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
+	    &pmap_pcid_invlpg_workaround_uena);
 }
 
 SYSCTL_UINT(_vm_pmap, OID_AUTO, large_map_pml4_entries,
@@ -2791,7 +2800,7 @@ pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 
 	if ((newpde & PG_PS) == 0)
 		/* Demotion: flush a specific 2MB page mapping. */
-		invlpg(va);
+		pmap_invlpg(pmap, va);
 	else if ((newpde & PG_G) == 0)
 		/*
 		 * Promotion: flush every 4KB page mapping from the TLB
@@ -3130,7 +3139,7 @@ pmap_invalidate_page_curcpu_cb(pmap_t pmap, vm_offset_t va,
     vm_offset_t addr2 __unused)
 {
 	if (pmap == kernel_pmap) {
-		invlpg(va);
+		pmap_invlpg(kernel_pmap, va);
 	} else if (pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
 		pmap_invalidate_page_cb(pmap, va);
@@ -3221,8 +3230,14 @@ pmap_invalidate_range_curcpu_cb(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 	vm_offset_t addr;
 
 	if (pmap == kernel_pmap) {
-		for (addr = sva; addr < eva; addr += PAGE_SIZE)
-			invlpg(addr);
+		if (PCPU_GET(pcid_invlpg_workaround)) {
+			struct invpcid_descr d = { 0 };
+
+			invpcid(&d, INVPCID_CTXGLOB);
+		} else {
+			for (addr = sva; addr < eva; addr += PAGE_SIZE)
+				invlpg(addr);
+		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
@@ -3760,7 +3775,7 @@ pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 	for (; spa < epa; spa += PAGE_SIZE) {
 		sched_pin();
 		pte_store(pte, spa | pte_bits);
-		invlpg(vaddr);
+		pmap_invlpg(kernel_pmap, vaddr);
 		/* XXXKIB atomic inside flush_cache_range are excessive */
 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 		sched_unpin();
@@ -7668,7 +7683,7 @@ pmap_kenter_temporary(vm_paddr_t pa, int i)
 
 	va = (vm_offset_t)crashdumpmap + (i * PAGE_SIZE);
 	pmap_kenter(va, pa);
-	invlpg(va);
+	pmap_invlpg(kernel_pmap, va);
 	return ((void *)crashdumpmap);
 }
 
@@ -10371,7 +10386,7 @@ pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
 				    page[i]->md.pat_mode, 0);
 				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 				    cache_bits);
-				invlpg(vaddr[i]);
+				pmap_invlpg(kernel_pmap, vaddr[i]);
 			}
 		}
 	}
@@ -10420,7 +10435,14 @@ pmap_quick_remove_page(vm_offset_t addr)
 	if (addr != qframe)
 		return;
 	pte_store(vtopte(qframe), 0);
+
+	/*
+	 * Since qframe is exclusively mapped by
+	 * pmap_quick_enter_page() and that function doesn't set PG_G,
+	 * we can use INVLPG here.
+	 */
 	invlpg(qframe);
+
 	mtx_unlock_spin(&qframe_mtx);
 }
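
Usage note: the knob is a boot-time tunable, latched by the
TUNABLE_INT_FETCH above into pmap_pcid_invlpg_workaround_uena, while
the read-only sysctl reports the effective state.  Standard
loader-tunable mechanics apply (an illustration, not part of the
commit):

	# Effective state after boot (read-only sysctl):
	sysctl vm.pmap.pcid_invlpg_workaround
	# Veto the workaround for the next boot:
	echo 'vm.pmap.pcid_invlpg_workaround="0"' >> /boot/loader.conf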
 
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index 70f008fe835a..c0c35f4419e8 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -100,7 +100,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x cache line");
 	u_int	pc_smp_tlb_op;						\
 	uint64_t pc_ucr3_load_mask;					\
 	u_int	pc_small_core;						\
-	char	__pad[2912]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	u_int	pc_pcid_invlpg_workaround;				\
+	char	__pad[2908]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1
diff --git a/sys/amd64/include/pmap.h b/sys/amd64/include/pmap.h
index a55a14f94ed7..e7497c2f8b4b 100644
--- a/sys/amd64/include/pmap.h
+++ b/sys/amd64/include/pmap.h
@@ -431,6 +431,8 @@ extern vm_offset_t virtual_end;
 extern vm_paddr_t dmaplimit;
 extern int pmap_pcid_enabled;
 extern int invpcid_works;
+extern int pmap_pcid_invlpg_workaround;
+extern int pmap_pcid_invlpg_workaround_uena;
 
 #define	pmap_page_get_memattr(m)	((vm_memattr_t)(m)->md.pat_mode)
 #define	pmap_page_is_write_mapped(m)	(((m)->a.flags & PGA_WRITEABLE) != 0)
@@ -514,6 +516,24 @@ pmap_invalidate_cpu_mask(pmap_t pmap)
 	return (&pmap->pm_active);
 }
 
+/*
+ * It seems that AlderLake+ small cores have some microarchitectural
+ * bug, which results in the INVLPG instruction failing to flush all
+ * global TLB entries when PCID is enabled.  Work around it for now,
+ * by doing global invalidation on small cores instead of INVLPG.
+ */
+static __inline void
+pmap_invlpg(pmap_t pmap, vm_offset_t va)
+{
+	if (pmap == kernel_pmap && PCPU_GET(pcid_invlpg_workaround)) {
+		struct invpcid_descr d = { 0 };
+
+		invpcid(&d, INVPCID_CTXGLOB);
+	} else {
+		invlpg(va);
+	}
+}
+
 #endif /* _KERNEL */
 
 /* Return various clipped indexes for a given VA */
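
Usage note on the new helper: passing the pmap lets pmap_invlpg()
apply the big hammer only for kernel mappings, which are the only ones
that can carry PG_G; user pmaps keep the cheap INVLPG.  A hypothetical
call site, with illustrative names not taken from the commit:

	/*
	 * After rewriting a kernel PTE, flush the stale translation.
	 * On a small core with the workaround armed, this becomes a
	 * full INVPCID_CTXGLOB flush instead of a single INVLPG.
	 */
	pte_store(pte, newpte);
	pmap_invlpg(kernel_pmap, va);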


