Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 19 Aug 2018 18:47:17 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r338068 - in head/sys/amd64: amd64 include vmm/intel
Message-ID:  <201808191847.w7JIlHof088422@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Sun Aug 19 18:47:16 2018
New Revision: 338068
URL: https://svnweb.freebsd.org/changeset/base/338068

Log:
  Update L1TF workaround to sustain L1D pollution from NMI.
  
  Current mitigation for L1TF in bhyve flushes L1D either by an explicit
  WRMSR command, or by software reading enough uninteresting data to
  fully populate all lines of L1D.  If NMI occurs after either of
  methods is completed, but before VM entry, L1D becomes polluted with
  the cache lines touched by NMI handlers.  There is no interesting data
  which NMI accesses, but something sensitive might be co-located on the
  same cache line, and then L1TF exposes that to a rogue guest.
  
  Use VM entry MSR load list to ensure atomicity of L1D cache and VM
  entry if updated microcode was loaded.  If only software flush method
  is available, try to help the bhyve sw flusher by also flushing L1D on
  NMI exit to kernel mode.
  
  Suggested by and discussed with: Andrew Cooper <andrew.cooper3@citrix.com>
  Reviewed by:	jhb
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
  Differential revision:	https://reviews.freebsd.org/D16790

Modified:
  head/sys/amd64/amd64/exception.S
  head/sys/amd64/amd64/support.S
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/include/md_var.h
  head/sys/amd64/vmm/intel/vmx.c
  head/sys/amd64/vmm/intel/vmx_support.S

Modified: head/sys/amd64/amd64/exception.S
==============================================================================
--- head/sys/amd64/amd64/exception.S	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/amd64/exception.S	Sun Aug 19 18:47:16 2018	(r338068)
@@ -864,7 +864,10 @@ nocallchain:
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
-	movq	%r13,%cr3
+	cmpb	$0, nmi_flush_l1d_sw(%rip)
+	je	2f
+	call	flush_l1d_sw		/* bhyve L1TF assist */
+2:	movq	%r13,%cr3
 	RESTORE_REGS
 	addq	$TF_RIP,%rsp
 	jmp	doreti_iret

Modified: head/sys/amd64/amd64/support.S
==============================================================================
--- head/sys/amd64/amd64/support.S	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/amd64/support.S	Sun Aug 19 18:47:16 2018	(r338068)
@@ -1225,3 +1225,36 @@ ENTRY(handle_ibrs_exit_rs)
 END(handle_ibrs_exit_rs)
 
 	.noaltmacro
+
+/*
+ * Flush L1D cache.  Load enough of the data from the kernel text
+ * to flush existing L1D content.
+ *
+ * N.B. The function does not follow ABI calling conventions: the
+ * CPUID executed below corrupts %rbx (callee-saved).  Callers must
+ * expect %rax, %rbx, %rcx, %rdx, %r9, and %rflags to be clobbered.
+ */
+ENTRY(flush_l1d_sw)
+#define	L1D_FLUSH_SIZE	(64 * 1024)
+	movq	$KERNBASE, %r9
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/*
+	 * pass 1: Preload TLB.
+	 * Kernel text is mapped using superpages.  TLB preload is
+	 * done for the benefit of older CPUs which split 2M page
+	 * into 4k TLB entries.
+	 */
+1:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$PAGE_SIZE, %rcx
+	jne	1b
+	xorl	%eax, %eax
+	cpuid
+	movq	$-L1D_FLUSH_SIZE, %rcx
+	/* pass 2: Read each cache line. */
+2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
+	addq	$64, %rcx
+	jne	2b
+	lfence
+	ret
+#undef	L1D_FLUSH_SIZE
+END(flush_l1d_sw)

Modified: head/sys/amd64/amd64/trap.c
==============================================================================
--- head/sys/amd64/amd64/trap.c	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/amd64/trap.c	Sun Aug 19 18:47:16 2018	(r338068)
@@ -161,6 +161,20 @@ SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG
     "Print debugging information on trap signal to ctty");
 
 /*
+ * Control L1D flush on return from NMI.
+ *
+ * Tunable can be set to the following values:
+ * 0 - only enable flush on return from NMI if required by vmm.ko (default)
+ * >1 - always flush on return from NMI.
+ *
+ * Post-boot, the sysctl indicates if flushing is currently enabled.
+ */
+int nmi_flush_l1d_sw;
+SYSCTL_INT(_machdep, OID_AUTO, nmi_flush_l1d_sw, CTLFLAG_RWTUN,
+    &nmi_flush_l1d_sw, 0,
+    "Flush L1 Data Cache on NMI exit, software bhyve L1TF mitigation assist");
+
+/*
  * Exception, fault, and trap interface to the FreeBSD kernel.
  * This common code is called from assembly language IDT gate entry
  * routines that prepare a suitable stack frame, and restore this

Modified: head/sys/amd64/include/md_var.h
==============================================================================
--- head/sys/amd64/include/md_var.h	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/include/md_var.h	Sun Aug 19 18:47:16 2018	(r338068)
@@ -40,6 +40,7 @@ extern uint64_t	*vm_page_dump;
 extern int	hw_lower_amd64_sharedpage;
 extern int	hw_ibrs_disable;
 extern int	hw_ssb_disable;
+extern int	nmi_flush_l1d_sw;
 
 /*
  * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its

Modified: head/sys/amd64/vmm/intel/vmx.c
==============================================================================
--- head/sys/amd64/vmm/intel/vmx.c	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/vmm/intel/vmx.c	Sun Aug 19 18:47:16 2018	(r338068)
@@ -191,8 +191,11 @@ SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, 
 static int guest_l1d_flush;
 SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD,
     &guest_l1d_flush, 0, NULL);
+static int guest_l1d_flush_sw;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD,
+    &guest_l1d_flush_sw, 0, NULL);
 
-uint64_t vmx_msr_flush_cmd;
+static struct msr_entry msr_load_list[1] __aligned(16);
 
 /*
  * The definitions of SDT probes for VMX.
@@ -579,6 +582,9 @@ vmx_cleanup(void)
 		vpid_unr = NULL;
 	}
 
+	if (nmi_flush_l1d_sw == 1)
+		nmi_flush_l1d_sw = 0;
+
 	smp_rendezvous(NULL, vmx_disable, NULL, NULL);
 
 	return (0);
@@ -807,11 +813,30 @@ vmx_init(int ipinum)
 
 	guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) == 0;
 	TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush);
-	if (guest_l1d_flush &&
-	    (cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) != 0)
-		vmx_msr_flush_cmd = IA32_FLUSH_CMD_L1D;
 
 	/*
+	 * L1D cache flush is enabled.  Use IA32_FLUSH_CMD MSR when
+	 * available.  Otherwise fall back to the software flush
+	 * method which loads enough data from the kernel text to
+	 * flush existing L1D content, both on VMX entry and on NMI
+	 * return.
+	 */
+	if (guest_l1d_flush) {
+		if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) {
+			guest_l1d_flush_sw = 1;
+			TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw",
+			    &guest_l1d_flush_sw);
+		}
+		if (guest_l1d_flush_sw) {
+			if (nmi_flush_l1d_sw <= 1)
+				nmi_flush_l1d_sw = 1;
+		} else {
+			msr_load_list[0].index = MSR_IA32_FLUSH_CMD;
+			msr_load_list[0].val = IA32_FLUSH_CMD_L1D;
+		}
+	}
+
+	/*
 	 * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
 	 */
 	fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
@@ -999,6 +1024,15 @@ vmx_vminit(struct vm *vm, pmap_t pmap)
 		error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
 		error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap));
 		error += vmwrite(VMCS_VPID, vpid[i]);
+
+		if (guest_l1d_flush && !guest_l1d_flush_sw) {
+			vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract(
+			    (vm_offset_t)&msr_load_list[0]));
+			vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT,
+			    nitems(msr_load_list));
+			vmcs_write(VMCS_EXIT_MSR_STORE, 0);
+			vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0);
+		}
 
 		/* exception bitmap */
 		if (vcpu_trace_exceptions(vm, i))

Modified: head/sys/amd64/vmm/intel/vmx_support.S
==============================================================================
--- head/sys/amd64/vmm/intel/vmx_support.S	Sun Aug 19 18:43:10 2018	(r338067)
+++ head/sys/amd64/vmm/intel/vmx_support.S	Sun Aug 19 18:47:16 2018	(r338068)
@@ -176,44 +176,10 @@ ENTRY(vmx_enter_guest)
 	jbe	invept_error		/* Check invept instruction error */
 
 guest_restore:
-
-	/*
-	 * Flush L1D cache if requested.  Use IA32_FLUSH_CMD MSR if available,
-	 * otherwise load enough of the data from the zero_region to flush
-	 * existing L1D content.
-	 */
-#define	L1D_FLUSH_SIZE	(64 * 1024)
 	movl	%edx, %r8d
-	cmpb	$0, guest_l1d_flush(%rip)
+	cmpb	$0, guest_l1d_flush_sw(%rip)
 	je	after_l1d
-	movq	vmx_msr_flush_cmd(%rip), %rax
-	testq	%rax, %rax
-	jz	1f
-	movq	%rax, %rdx
-	shrq	$32, %rdx
-	movl	$MSR_IA32_FLUSH_CMD, %ecx
-	wrmsr
-	jmp	after_l1d
-1:	movq	$KERNBASE, %r9
-	movq	$-L1D_FLUSH_SIZE, %rcx
-	/*
-	 * pass 1: Preload TLB.
-	 * Kernel text is mapped using superpages.  TLB preload is
-	 * done for the benefit of older CPUs which split 2M page
-	 * into 4k TLB entries.
-	 */
-2:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
-	addq	$PAGE_SIZE, %rcx
-	jne	2b
-	xorl	%eax, %eax
-	cpuid
-	movq	$-L1D_FLUSH_SIZE, %rcx
-	/* pass 2: Read each cache line */
-3:	movb	L1D_FLUSH_SIZE(%r9, %rcx), %al
-	addq	$64, %rcx
-	jne	3b
-	lfence
-#undef	L1D_FLUSH_SIZE
+	call	flush_l1d_sw
 after_l1d:
 	cmpl	$0, %r8d
 	je	do_launch



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808191847.w7JIlHof088422>