Date:      Sat, 18 Jul 2020 18:19:57 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r363311 - in head/sys/amd64: amd64 include
Message-ID:  <202007181819.06IIJvHi076860@repo.freebsd.org>

Author: kib
Date: Sat Jul 18 18:19:57 2020
New Revision: 363311
URL: https://svnweb.freebsd.org/changeset/base/363311

Log:
  amd64 pmap: microoptimize local shootdowns for PCID PTI configurations
  
  When pmap operates in PTI mode, we must reload %cr3 on return to
  userspace.  In non-PCID mode the reload always flushes all non-global
  TLB entries, and we take advantage of this by invalidating only the
  KPT TLB entries (there are no cached UPT entries at all).
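  
  As a minimal C sketch of what this means (illustrative only: the
  real entry path is assembly, but load_cr3(), PCPU_GET() and pc_kcr3
  are the actual kernel primitives; the wrapper function is made up):
  
      /*
       * PTI, no PCID: any write to %cr3 flushes all non-global TLB
       * entries.  Entering the kernel loads pc_kcr3, which drops any
       * cached UPT translations as a side effect, so the shootdown
       * handlers only need to invalidate KPT entries.
       */
      static void
      pti_nopcid_kernel_entry_sketch(void)
      {
              load_cr3(PCPU_GET(kcr3));       /* implicit full flush */
      }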
  
  In PCID mode, we flush both the KPT and UPT TLB explicitly, but we
  can take advantage of the fact that the PCID-mode command to reload
  %cr3 includes a flag to flush or not flush the target TLB.  In
  particular, we can avoid the flush for the UPT, and instead record
  that the load of pc_ucr3 into %cr3 on return to usermode should
  flush.  This is done by providing either all-1s or ~CR3_PCID_SAVE in
  pc_ucr3_load_mask.  The mask is automatically reset to all-1s on
  return to usermode.
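  
  In C terms, the return-to-usermode sequence added to exception.S
  below is equivalent to this sketch (the real code is assembly; the
  local variable is illustrative):
  
      uint64_t ucr3;
  
      /*
       * CR3_PCID_SAVE is bit 63 of %cr3.  When it is set in the value
       * being loaded, the CPU preserves the TLB entries tagged with
       * the target PCID; when it is clear, they are flushed.
       */
      ucr3 = PCPU_GET(ucr3);                  /* has CR3_PCID_SAVE set */
      ucr3 &= PCPU_GET(ucr3_load_mask);       /* ~CR3_PCID_SAVE clears it */
      load_cr3(ucr3);                         /* flushes UPT iff bit clear */
      PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK); /* re-arm: no flush */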
  
  Similarly, we can avoid flushing the UPT TLB on context switch,
  replacing the flush by setting pc_ucr3_load_mask.  This unifies the
  INVPCID and non-INVPCID PTI ifuncs, leaving only 4 cases instead of
  6.  The same trick applies to the TLB shootdown IPI handlers, since
  the handlers interrupt the target thread.
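  
  In the handlers this reduces to the following pattern (a sketch
  condensed from the mp_machdep.c hunks below):
  
      /*
       * Instead of invalidating the UPT PCID from the IPI handler,
       * defer the flush: mark the next load of pc_ucr3 on return to
       * usermode as flushing.  This is only meaningful when the
       * target pmap is current on this CPU, hence the curpmap check.
       */
      if (smp_tlb_pmap == PCPU_GET(curpmap) &&
          smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
              PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);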
  
  But then we need to check pc_curpmap in the handlers, and this would
  reopen, for INVPCID machines, the same race that r306350 fixed for
  non-INVPCID machines.  To avoid reintroducing that bug,
  unconditionally do spinlock_enter() in pmap_activate().
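  
  The resulting pmap_activate() appears in the pmap.c hunk below; the
  shape of the fix is simply:
  
      spinlock_enter();       /* disable interrupts, not just preemption */
      pmap_activate_sw(td);   /* %cr3 load + curpmap update, now atomic
                                 with respect to the shootdown IPIs */
      spinlock_exit();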
  
  Reviewed by:	alc, markj
  Tested by:	pho
  Sponsored by:	The FreeBSD Foundation
  MFC after:	3 weeks
  Differential revision:	https://reviews.freebsd.org/D25483

Modified:
  head/sys/amd64/amd64/exception.S
  head/sys/amd64/amd64/genassym.c
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/include/pcpu.h
  head/sys/amd64/include/pmap.h

Modified: head/sys/amd64/amd64/exception.S
==============================================================================
--- head/sys/amd64/amd64/exception.S	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/amd64/exception.S	Sat Jul 18 18:19:57 2020	(r363311)
@@ -47,6 +47,7 @@
 #include <machine/asmacros.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
+#include <machine/pmap.h>
 
 #ifdef KDTRACE_HOOKS
 	.bss
@@ -607,8 +608,10 @@ fast_syscall_common:
 	cmpq	$~0,PCPU(UCR3)
 	je	2f
 	movq	PCPU(UCR3),%r9
+	andq	PCPU(UCR3_LOAD_MASK),%r9
 	movq	%r9,%cr3
 2:	xorl	%r9d,%r9d
+	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
 	swapgs
 	sysretq
 
@@ -1262,6 +1265,8 @@ ld_regs:
 	movq	TF_SS(%rsp),%rax
 	movq	%rax,PTI_SS(%rdx)
 	movq	PCPU(UCR3),%rax
+	andq	PCPU(UCR3_LOAD_MASK),%rax
+	movq	$PMAP_UCR3_NOMASK,PCPU(UCR3_LOAD_MASK)
 	swapgs
 	movq	%rdx,%rsp
 	movq	%rax,%cr3

Modified: head/sys/amd64/amd64/genassym.c
==============================================================================
--- head/sys/amd64/amd64/genassym.c	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/amd64/genassym.c	Sat Jul 18 18:19:57 2020	(r363311)
@@ -230,6 +230,7 @@ ASSYM(PC_TSS, offsetof(struct pcpu, pc_tss));
 ASSYM(PC_PM_SAVE_CNT, offsetof(struct pcpu, pc_pm_save_cnt));
 ASSYM(PC_KCR3, offsetof(struct pcpu, pc_kcr3));
 ASSYM(PC_UCR3, offsetof(struct pcpu, pc_ucr3));
+ASSYM(PC_UCR3_LOAD_MASK, offsetof(struct pcpu, pc_ucr3_load_mask));
 ASSYM(PC_SAVED_UCR3, offsetof(struct pcpu, pc_saved_ucr3));
 ASSYM(PC_PTI_STACK, offsetof(struct pcpu, pc_pti_stack));
 ASSYM(PC_PTI_STACK_SZ, PC_PTI_STACK_SZ);

Modified: head/sys/amd64/amd64/machdep.c
==============================================================================
--- head/sys/amd64/amd64/machdep.c	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/amd64/machdep.c	Sat Jul 18 18:19:57 2020	(r363311)
@@ -1562,6 +1562,7 @@ amd64_bsp_pcpu_init1(struct pcpu *pc)
 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
+	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
 	PCPU_SET(smp_tlb_gen, 1);
 }
 

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/amd64/mp_machdep.c	Sat Jul 18 18:19:57 2020	(r363311)
@@ -283,6 +283,7 @@ init_secondary(void)
 	pc->pc_fs32p = &gdt[GUFS32_SEL];
 	pc->pc_gs32p = &gdt[GUGS32_SEL];
 	pc->pc_ldt = (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL];
+	pc->pc_ucr3_load_mask = PMAP_UCR3_NOMASK;
 	/* See comment in pmap_bootstrap(). */
 	pc->pc_pcid_next = PMAP_PCID_KERN + 2;
 	pc->pc_pcid_gen = 1;
@@ -821,15 +822,14 @@ invltlb_invpcid_pti_handler(pmap_t smp_tlb_pmap)
 		invpcid(&d, INVPCID_CTXGLOB);
 	} else {
 		invpcid(&d, INVPCID_CTX);
-		d.pcid |= PMAP_PCID_USER_PT;
-		invpcid(&d, INVPCID_CTX);
+		if (smp_tlb_pmap == PCPU_GET(curpmap))
+			PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 	}
 }
 
 static void
 invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 {
-	uint64_t kcr3, ucr3;
 	uint32_t pcid;
   
 #ifdef COUNT_XINVLTLB_HITS
@@ -849,15 +849,11 @@ invltlb_pcid_handler(pmap_t smp_tlb_pmap)
 		 * invalidation when switching to the pmap on this
 		 * CPU.
 		 */
-		if (PCPU_GET(curpmap) == smp_tlb_pmap) {
+		if (smp_tlb_pmap == PCPU_GET(curpmap)) {
 			pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
-			kcr3 = smp_tlb_pmap->pm_cr3 | pcid;
-			ucr3 = smp_tlb_pmap->pm_ucr3;
-			if (ucr3 != PMAP_NO_CR3) {
-				ucr3 |= PMAP_PCID_USER_PT | pcid;
-				pmap_pti_pcid_invalidate(ucr3, kcr3);
-			} else
-				load_cr3(kcr3);
+			load_cr3(smp_tlb_pmap->pm_cr3 | pcid);
+			if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3)
+				PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 		}
 	}
 }
@@ -888,7 +884,9 @@ invlpg_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset_
 #endif /* COUNT_IPIS */
 
 	invlpg(smp_tlb_addr1);
-	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
@@ -912,7 +910,8 @@ invlpg_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t s
 
 	invlpg(smp_tlb_addr1);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
-	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
@@ -960,7 +959,9 @@ invlrng_invpcid_handler(pmap_t smp_tlb_pmap, vm_offset
 		invlpg(addr);
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
-	if (smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3) {
+	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
+	    smp_tlb_pmap->pm_ucr3 != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		d.pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid |
 		    PMAP_PCID_USER_PT;
 		d.pad = 0;
@@ -994,7 +995,8 @@ invlrng_pcid_handler(pmap_t smp_tlb_pmap, vm_offset_t 
 		addr += PAGE_SIZE;
 	} while (addr < addr2);
 	if (smp_tlb_pmap == PCPU_GET(curpmap) &&
-	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3) {
+	    (ucr3 = smp_tlb_pmap->pm_ucr3) != PMAP_NO_CR3 &&
+	    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 		pcid = smp_tlb_pmap->pm_pcids[PCPU_GET(cpuid)].pm_pcid;
 		kcr3 = smp_tlb_pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 		ucr3 |= pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/amd64/pmap.c	Sat Jul 18 18:19:57 2020	(r363311)
@@ -2520,7 +2520,16 @@ pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+		if (pmap->pm_ucr3 != PMAP_NO_CR3 &&
+		    /*
+		     * If we context-switched right after
+		     * PCPU_GET(ucr3_load_mask), we could read the
+		     * ~CR3_PCID_SAVE mask, which causes us to skip
+		     * the code below to invalidate user pages.  This
+		     * is handled in pmap_activate_sw_pcid_pti() by
+		     * clearing pm_gen if ucr3_load_mask is ~CR3_PCID_SAVE.
+		     */
+		    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 			/*
 			 * Because pm_pcid is recalculated on a
 			 * context switch, we must disable switching.
@@ -2635,7 +2644,8 @@ pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sv
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
-		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
+		if (pmap->pm_ucr3 != PMAP_NO_CR3 &&
+		    PCPU_GET(ucr3_load_mask) == PMAP_UCR3_NOMASK) {
 			critical_enter();
 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
 			if (invpcid_works1) {
@@ -2736,7 +2746,7 @@ static inline void
 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
-	uint64_t kcr3, ucr3;
+	uint64_t kcr3;
 	uint32_t pcid;
 	u_int cpuid, i;
 
@@ -2757,20 +2767,12 @@ pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_wor
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
-				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
-					d.pcid |= PMAP_PCID_USER_PT;
-					invpcid(&d, INVPCID_CTX);
-				}
 			} else {
 				kcr3 = pmap->pm_cr3 | pcid;
-				ucr3 = pmap->pm_ucr3;
-				if (ucr3 != PMAP_NO_CR3) {
-					ucr3 |= pcid | PMAP_PCID_USER_PT;
-					pmap_pti_pcid_invalidate(ucr3, kcr3);
-				} else {
-					load_cr3(kcr3);
-				}
+				load_cr3(kcr3);
 			}
+			if (pmap->pm_ucr3 != PMAP_NO_CR3)
+				PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 			critical_exit();
 		} else
 			pmap->pm_pcids[cpuid].pm_gen = 0;
@@ -8816,12 +8818,23 @@ pmap_activate_sw_pti_post(struct thread *td, pmap_t pm
 	    PCPU_GET(pti_rsp0) : (uintptr_t)td->td_md.md_stack_base;
 }
 
-static void inline
-pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
+static void
+pmap_activate_sw_pcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
 {
-	struct invpcid_descr d;
+	pmap_t old_pmap;
 	uint64_t cached, cr3, kcr3, ucr3;
 
+	KASSERT((read_rflags() & PSL_I) == 0,
+	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
+
+	/* See the comment in pmap_invalidate_page_pcid(). */
+	if (PCPU_GET(ucr3_load_mask) != PMAP_UCR3_NOMASK) {
+		PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
+		old_pmap = PCPU_GET(curpmap);
+		MPASS(old_pmap->pm_ucr3 != PMAP_NO_CR3);
+		old_pmap->pm_pcids[cpuid].pm_gen = 0;
+	}
+
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
@@ -8831,77 +8844,26 @@ pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, co
 	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 	    PMAP_PCID_USER_PT;
 
-	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
-		/*
-		 * Explicitly invalidate translations cached from the
-		 * user page table.  They are not automatically
-		 * flushed by reload of cr3 with the kernel page table
-		 * pointer above.
-		 *
-		 * Note that the if() condition is resolved statically
-		 * by using the function argument instead of
-		 * runtime-evaluated invpcid_works value.
-		 */
-		if (invpcid_works1) {
-			d.pcid = PMAP_PCID_USER_PT |
-			    pmap->pm_pcids[cpuid].pm_pcid;
-			d.pad = 0;
-			d.addr = 0;
-			invpcid(&d, INVPCID_CTX);
-		} else {
-			pmap_pti_pcid_invalidate(ucr3, kcr3);
-		}
-	}
+	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3)
+		PCPU_SET(ucr3_load_mask, ~CR3_PCID_SAVE);
 
 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 	if (cached)
 		PCPU_INC(pm_save_cnt);
-}
 
-static void
-pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
-{
-
-	pmap_activate_sw_pcid_pti(pmap, cpuid, true);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 static void
-pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap,
-    u_int cpuid)
-{
-	register_t rflags;
-
-	/*
-	 * If the INVPCID instruction is not available,
-	 * invltlb_pcid_handler() is used to handle an invalidate_all
-	 * IPI, which checks for curpmap == smp_tlb_pmap.  The below
-	 * sequence of operations has a window where %CR3 is loaded
-	 * with the new pmap's PML4 address, but the curpmap value has
-	 * not yet been updated.  This causes the invltlb IPI handler,
-	 * which is called between the updates, to execute as a NOP,
-	 * which leaves stale TLB entries.
-	 *
-	 * Note that the most typical use of pmap_activate_sw(), from
-	 * the context switch, is immune to this race, because
-	 * interrupts are disabled (while the thread lock is owned),
-	 * and the IPI happens after curpmap is updated.  Protect
-	 * other callers in a similar way, by disabling interrupts
-	 * around the %cr3 register reload and curpmap assignment.
-	 */
-	rflags = intr_disable();
-	pmap_activate_sw_pcid_pti(pmap, cpuid, false);
-	intr_restore(rflags);
-	pmap_activate_sw_pti_post(td, pmap);
-}
-
-static void
 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid)
 {
 	uint64_t cached, cr3;
 
+	KASSERT((read_rflags() & PSL_I) == 0,
+	    ("PCID needs interrupts disabled in pmap_activate_sw()"));
+
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
@@ -8913,17 +8875,6 @@ pmap_activate_sw_pcid_nopti(struct thread *td __unused
 }
 
 static void
-pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap,
-    u_int cpuid)
-{
-	register_t rflags;
-
-	rflags = intr_disable();
-	pmap_activate_sw_pcid_nopti(td, pmap, cpuid);
-	intr_restore(rflags);
-}
-
-static void
 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid __unused)
 {
@@ -8947,14 +8898,10 @@ DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (str
     u_int))
 {
 
-	if (pmap_pcid_enabled && pti && invpcid_works)
-		return (pmap_activate_sw_pcid_invpcid_pti);
-	else if (pmap_pcid_enabled && pti && !invpcid_works)
-		return (pmap_activate_sw_pcid_noinvpcid_pti);
-	else if (pmap_pcid_enabled && !pti && invpcid_works)
+	if (pmap_pcid_enabled && pti)
+		return (pmap_activate_sw_pcid_pti);
+	else if (pmap_pcid_enabled && !pti)
 		return (pmap_activate_sw_pcid_nopti);
-	else if (pmap_pcid_enabled && !pti && !invpcid_works)
-		return (pmap_activate_sw_pcid_noinvpcid_nopti);
 	else if (!pmap_pcid_enabled && pti)
 		return (pmap_activate_sw_nopcid_pti);
 	else /* if (!pmap_pcid_enabled && !pti) */
@@ -8991,10 +8938,26 @@ pmap_activate_sw(struct thread *td)
 void
 pmap_activate(struct thread *td)
 {
-
-	critical_enter();
+	/*
+	 * invltlb_{invpcid,}_pcid_handler() is used to handle an
+	 * invalidate_all IPI, which checks for curpmap ==
+	 * smp_tlb_pmap.  The below sequence of operations has a
+	 * window where %CR3 is loaded with the new pmap's PML4
+	 * address, but the curpmap value has not yet been updated.
+	 * This causes the invltlb IPI handler, which is called
+	 * between the updates, to execute as a NOP, which leaves
+	 * stale TLB entries.
+	 *
+	 * Note that the most common use of pmap_activate_sw(), from
+	 * a context switch, is immune to this race, because
+	 * interrupts are disabled (while the thread lock is owned),
+	 * so the IPI is delayed until after curpmap is updated.  Protect
+	 * other callers in a similar way, by disabling interrupts
+	 * around the %cr3 register reload and curpmap assignment.
+	 */
+	spinlock_enter();
 	pmap_activate_sw(td);
-	critical_exit();
+	spinlock_exit();
 }
 
 void

Modified: head/sys/amd64/include/pcpu.h
==============================================================================
--- head/sys/amd64/include/pcpu.h	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/include/pcpu.h	Sat Jul 18 18:19:57 2020	(r363311)
@@ -99,7 +99,8 @@ _Static_assert(sizeof(struct monitorbuf) == 128, "2x c
 	uint64_t pc_smp_tlb_addr2;					\
 	uint32_t pc_smp_tlb_gen;					\
 	u_int	pc_smp_tlb_op;						\
-	char	__pad[2924]		/* pad to UMA_PCPU_ALLOC_SIZE */
+	uint64_t pc_ucr3_load_mask;					\
+	char	__pad[2916]		/* pad to UMA_PCPU_ALLOC_SIZE */
 
 #define	PC_DBREG_CMD_NONE	0
 #define	PC_DBREG_CMD_LOAD	1

Modified: head/sys/amd64/include/pmap.h
==============================================================================
--- head/sys/amd64/include/pmap.h	Sat Jul 18 13:10:31 2020	(r363310)
+++ head/sys/amd64/include/pmap.h	Sat Jul 18 18:19:57 2020	(r363311)
@@ -241,6 +241,7 @@
 #define	PMAP_PCID_USER_PT	0x800
 
 #define	PMAP_NO_CR3		(~0UL)
+#define	PMAP_UCR3_NOMASK	(~0UL)
 
 #ifndef LOCORE
 


