Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 18 Jan 2018 23:50:21 +0000 (UTC)
From:      John Baldwin <jhb@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r328157 - in head/sys/amd64: amd64 include
Message-ID:  <201801182350.w0INoLTx065886@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: jhb
Date: Thu Jan 18 23:50:21 2018
New Revision: 328157
URL: https://svnweb.freebsd.org/changeset/base/328157

Log:
  Use a dedicated per-CPU stack for machine check exceptions.
  
  Similar to NMIs, machine check exceptions can fire at any time and are
  not masked by IF.  This means that machine checks can fire when the
  kstack is too deep to hold a trap frame, or at critical sections in
  trap handlers when a user %gs is used with a kernel %cs.  Use the same
  strategy used for NMIs of using a dedicated per-CPU stack configured
  in IST 3.  Store the CPU's pcpu pointer at the top of the stack so
  that the machine check handler can reliably find the proper value for
  %gs (also borrowed from NMIs).
  
  This should also fix a similar issue with PTI with a MC# occurring
  while the CPU is executing on the trampoline stack.
  
  While here, bypass trap() entirely and just call mca_intr().  This
  avoids a bogus call to kdb_reenter() (there's no reason to try to
  reenter kdb if a MC# is raised).
  
  Reviewed by:	kib
  Tested by:	avg (on AMD without PTI)
  Differential Revision:	https://reviews.freebsd.org/D13962

Modified:
  head/sys/amd64/amd64/db_trace.c
  head/sys/amd64/amd64/exception.S
  head/sys/amd64/amd64/machdep.c
  head/sys/amd64/amd64/mp_machdep.c
  head/sys/amd64/amd64/pmap.c
  head/sys/amd64/amd64/trap.c
  head/sys/amd64/include/intr_machdep.h

Modified: head/sys/amd64/amd64/db_trace.c
==============================================================================
--- head/sys/amd64/amd64/db_trace.c	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/db_trace.c	Thu Jan 18 23:50:21 2018	(r328157)
@@ -200,6 +200,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, s
 	if (name != NULL) {
 		if (strcmp(name, "calltrap") == 0 ||
 		    strcmp(name, "fork_trampoline") == 0 ||
+		    strcmp(name, "mchk_calltrap") == 0 ||
 		    strcmp(name, "nmi_calltrap") == 0 ||
 		    strcmp(name, "Xdblfault") == 0)
 			frame_type = TRAP;

Modified: head/sys/amd64/amd64/exception.S
==============================================================================
--- head/sys/amd64/amd64/exception.S	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/exception.S	Thu Jan 18 23:50:21 2018	(r328157)
@@ -141,7 +141,6 @@ X\l:
 	TRAP	ill, T_PRIVINFLT
 	TRAP	dna, T_DNA
 	TRAP	fpusegm, T_FPOPFLT
-	TRAP	mchk, T_MCHK
 	TRAP	rsvd, T_RESERVED
 	TRAP	fpu, T_ARITHTRAP
 	TRAP	xmm, T_XMMFLT
@@ -663,6 +662,103 @@ outofnmi:
 	cli
 nocallchain:
 #endif
+	testl	%ebx,%ebx	/* %ebx == 0 => return to userland */
+	jnz	doreti_exit
+	/*
+	 * Put back the preserved MSR_GSBASE value.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%r13,%cr3
+	RESTORE_REGS
+	addq	$TF_RIP,%rsp
+	jmp	doreti_iret
+
+/*
+ * MC# handling is similar to NMI.
+ *
+ * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
+ * can occur at any time with a GS.base value that does not correspond
+ * to the privilege level in CS.
+ *
+ * Machine checks are not unblocked by iretq, but it is best to run
+ * the handler with interrupts disabled since the exception may have
+ * interrupted a critical section.
+ *
+ * The MC# handler runs on its own stack (tss_ist3).  The canonical
+ * GS.base value for the processor is stored just above the bottom of
+ * its MC# stack.  For exceptions taken from kernel mode, the current
+ * value in the processor's GS.base is saved at entry to C-preserved
+ * register %r12, the canonical value for GS.base is then loaded into
+ * the processor, and the saved value is restored at exit time.  For
+ * exceptions taken from user mode, the cheaper 'SWAPGS' instructions
+ * are used for swapping GS.base.
+ */
+
+IDTVEC(mchk)
+	subq	$TF_RIP,%rsp
+	movl	$(T_MCHK),TF_TRAPNO(%rsp)
+	movq	$0,TF_ADDR(%rsp)
+	movq	$0,TF_ERR(%rsp)
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rsi,TF_RSI(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	movq	%r8,TF_R8(%rsp)
+	movq	%r9,TF_R9(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rbx,TF_RBX(%rsp)
+	movq	%rbp,TF_RBP(%rsp)
+	movq	%r10,TF_R10(%rsp)
+	movq	%r11,TF_R11(%rsp)
+	movq	%r12,TF_R12(%rsp)
+	movq	%r13,TF_R13(%rsp)
+	movq	%r14,TF_R14(%rsp)
+	movq	%r15,TF_R15(%rsp)
+	SAVE_SEGS
+	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
+	cld
+	xorl	%ebx,%ebx
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jnz	mchk_fromuserspace
+	/*
+	 * We've interrupted the kernel.  Preserve GS.base in %r12
+	 * and %cr3 in %r13.
+	 */
+	movl	$MSR_GSBASE,%ecx
+	rdmsr
+	movq	%rax,%r12
+	shlq	$32,%rdx
+	orq	%rdx,%r12
+	/* Retrieve and load the canonical value for GS.base. */
+	movq	TF_SIZE(%rsp),%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	mchk_calltrap
+	movq	%rax,%cr3
+	jmp	mchk_calltrap
+mchk_fromuserspace:
+	incl	%ebx
+	swapgs
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:
+/* Note: this label is also used by ddb and gdb: */
+mchk_calltrap:
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp,%rdi
+	call	mca_intr
+	MEXITCOUNT
 	testl	%ebx,%ebx	/* %ebx == 0 => return to userland */
 	jnz	doreti_exit
 	/*

Modified: head/sys/amd64/amd64/machdep.c
==============================================================================
--- head/sys/amd64/amd64/machdep.c	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/machdep.c	Thu Jan 18 23:50:21 2018	(r328157)
@@ -662,7 +662,7 @@ static struct gate_descriptor idt0[NIDT];
 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
 
 static char dblfault_stack[PAGE_SIZE] __aligned(16);
-
+static char mce0_stack[PAGE_SIZE] __aligned(16);
 static char nmi0_stack[PAGE_SIZE] __aligned(16);
 CTASSERT(sizeof(struct nmi_pcpu) == 16);
 
@@ -819,7 +819,7 @@ extern inthand_t
 	IDTVEC(div_pti), IDTVEC(dbg_pti), IDTVEC(bpt_pti),
 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
-	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti), IDTVEC(mchk_pti),
+	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
 	IDTVEC(xmm_pti),
 #ifdef KDTRACE_HOOKS
@@ -1658,8 +1658,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	    SEL_KPL, 0);
 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
 	    SEL_KPL, 0);
-	setidt(IDT_MC, pti ? &IDTVEC(mchk_pti) : &IDTVEC(mchk), SDT_SYSIGT,
-	    SEL_KPL, 0);
+	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
 	    SEL_KPL, 0);
 #ifdef KDTRACE_HOOKS
@@ -1704,6 +1703,14 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	np->np_pcpu = (register_t) pc;
 	common_tss[0].tss_ist2 = (long) np;
 
+	/*
+	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
+	 * above the start of the ist3 stack.
+	 */
+	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
+	np->np_pcpu = (register_t) pc;
+	common_tss[0].tss_ist3 = (long) np;
+	
 	/* Set the IO permission bitmap (empty due to tss seg limit) */
 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
 

Modified: head/sys/amd64/amd64/mp_machdep.c
==============================================================================
--- head/sys/amd64/amd64/mp_machdep.c	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/mp_machdep.c	Thu Jan 18 23:50:21 2018	(r328157)
@@ -87,6 +87,7 @@ extern	struct pcpu __pcpu[];
 
 /* Temporary variables for init_secondary()  */
 char *doublefault_stack;
+char *mce_stack;
 char *nmi_stack;
 
 /*
@@ -212,6 +213,10 @@ init_secondary(void)
 	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	common_tss[cpu].tss_ist2 = (long) np;
 
+	/* The MC# stack runs on IST3. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	common_tss[cpu].tss_ist3 = (long) np;
+
 	/* Prepare private GDT */
 	gdt_segs[GPROC0_SEL].ssd_base = (long) &common_tss[cpu];
 	for (x = 0; x < NGDT; x++) {
@@ -250,8 +255,13 @@ init_secondary(void)
 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful : 0;
 
 	/* Save the per-cpu pointer for use by the NMI handler. */
+	np = ((struct nmi_pcpu *) &nmi_stack[PAGE_SIZE]) - 1;
 	np->np_pcpu = (register_t) pc;
 
+	/* Save the per-cpu pointer for use by the MC# handler. */
+	np = ((struct nmi_pcpu *) &mce_stack[PAGE_SIZE]) - 1;
+	np->np_pcpu = (register_t) pc;
+
 	wrmsr(MSR_FSBASE, 0);		/* User value */
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, (u_int64_t)pc);	/* XXX User value while we're in the kernel */
@@ -346,6 +356,8 @@ native_start_all_aps(void)
 		    kstack_pages * PAGE_SIZE, M_WAITOK | M_ZERO);
 		doublefault_stack = (char *)kmem_malloc(kernel_arena,
 		    PAGE_SIZE, M_WAITOK | M_ZERO);
+		mce_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
+		    M_WAITOK | M_ZERO);
 		nmi_stack = (char *)kmem_malloc(kernel_arena, PAGE_SIZE,
 		    M_WAITOK | M_ZERO);
 		dpcpu = (void *)kmem_malloc(kernel_arena, DPCPU_SIZE,

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/pmap.c	Thu Jan 18 23:50:21 2018	(r328157)
@@ -7596,6 +7596,9 @@ pmap_pti_init(void)
 		/* NMI stack IST 2 */
 		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
+		/* MC# stack IST 3 */
+		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
+		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 	}
 	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 	    (vm_offset_t)etext, true);

Modified: head/sys/amd64/amd64/trap.c
==============================================================================
--- head/sys/amd64/amd64/trap.c	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/amd64/trap.c	Thu Jan 18 23:50:21 2018	(r328157)
@@ -220,11 +220,6 @@ trap(struct trapframe *frame)
 #endif
 	}
 
-	if (type == T_MCHK) {
-		mca_intr();
-		return;
-	}
-
 	if ((frame->tf_rflags & PSL_I) == 0) {
 		/*
 		 * Buggy application or kernel code has disabled

Modified: head/sys/amd64/include/intr_machdep.h
==============================================================================
--- head/sys/amd64/include/intr_machdep.h	Thu Jan 18 22:46:47 2018	(r328156)
+++ head/sys/amd64/include/intr_machdep.h	Thu Jan 18 23:50:21 2018	(r328157)
@@ -139,7 +139,7 @@ struct trapframe;
 
 /*
  * The following data structure holds per-cpu data, and is placed just
- * above the top of the space used for the NMI stack.
+ * above the top of the space used for the NMI and MC# stacks.
  */
 struct nmi_pcpu {
 	register_t	np_pcpu;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201801182350.w0INoLTx065886>