Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 17 Feb 2018 18:00:01 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r329462 - in stable/11/sys: amd64/amd64 amd64/ia32 amd64/include amd64/vmm amd64/vmm/intel dev/cpuctl dev/hyperv/vmbus dev/hyperv/vmbus/amd64 dev/hyperv/vmbus/i386 i386/i386 x86/include...
Message-ID:  <201802171800.w1HI01KX064024@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Sat Feb 17 18:00:01 2018
New Revision: 329462
URL: https://svnweb.freebsd.org/changeset/base/329462

Log:
  MFC r328083,328096,328116,328119,328120,328128,328135,328153,328157,
  328166,328177,328199,328202,328205,328468,328470,328624,328625,328627,
  328628,329214,329297,329365:
  
  Meltdown mitigation by PTI, PCID optimization of PTI, and kernel use of IBRS
  for some mitigations of Spectre.
  
  Tested by:      emaste, Arshan Khanifar <arshankhanifar@gmail.com>
  Discussed with:	jkim
  Sponsored by:	The FreeBSD Foundation

Modified:
  stable/11/sys/amd64/amd64/apic_vector.S
  stable/11/sys/amd64/amd64/atpic_vector.S
  stable/11/sys/amd64/amd64/cpu_switch.S
  stable/11/sys/amd64/amd64/db_trace.c
  stable/11/sys/amd64/amd64/exception.S
  stable/11/sys/amd64/amd64/genassym.c
  stable/11/sys/amd64/amd64/initcpu.c
  stable/11/sys/amd64/amd64/machdep.c
  stable/11/sys/amd64/amd64/mp_machdep.c
  stable/11/sys/amd64/amd64/pmap.c
  stable/11/sys/amd64/amd64/support.S
  stable/11/sys/amd64/amd64/sys_machdep.c
  stable/11/sys/amd64/amd64/trap.c
  stable/11/sys/amd64/amd64/vm_machdep.c
  stable/11/sys/amd64/ia32/ia32_exception.S
  stable/11/sys/amd64/ia32/ia32_syscall.c
  stable/11/sys/amd64/include/asmacros.h
  stable/11/sys/amd64/include/frame.h
  stable/11/sys/amd64/include/intr_machdep.h
  stable/11/sys/amd64/include/md_var.h
  stable/11/sys/amd64/include/pcb.h
  stable/11/sys/amd64/include/pcpu.h
  stable/11/sys/amd64/include/pmap.h
  stable/11/sys/amd64/include/smp.h
  stable/11/sys/amd64/vmm/intel/vmx.c
  stable/11/sys/amd64/vmm/vmm.c
  stable/11/sys/dev/cpuctl/cpuctl.c
  stable/11/sys/dev/hyperv/vmbus/amd64/vmbus_vector.S
  stable/11/sys/dev/hyperv/vmbus/i386/vmbus_vector.S
  stable/11/sys/dev/hyperv/vmbus/vmbus.c
  stable/11/sys/i386/i386/apic_vector.s
  stable/11/sys/i386/i386/atpic_vector.s
  stable/11/sys/i386/i386/exception.s
  stable/11/sys/i386/i386/machdep.c
  stable/11/sys/i386/i386/pmap.c
  stable/11/sys/i386/i386/support.s
  stable/11/sys/i386/i386/vm_machdep.c
  stable/11/sys/x86/include/apicvar.h
  stable/11/sys/x86/include/specialreg.h
  stable/11/sys/x86/include/x86_smp.h
  stable/11/sys/x86/include/x86_var.h
  stable/11/sys/x86/isa/atpic.c
  stable/11/sys/x86/x86/cpu_machdep.c
  stable/11/sys/x86/x86/identcpu.c
  stable/11/sys/x86/x86/local_apic.c
  stable/11/sys/x86/x86/mp_x86.c
  stable/11/sys/x86/xen/pv.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/amd64/amd64/apic_vector.S
==============================================================================
--- stable/11/sys/amd64/amd64/apic_vector.S	Sat Feb 17 17:23:43 2018	(r329461)
+++ stable/11/sys/amd64/amd64/apic_vector.S	Sat Feb 17 18:00:01 2018	(r329462)
@@ -2,7 +2,13 @@
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
  * All rights reserved.
+ * Copyright (c) 2014-2018 The FreeBSD Foundation
+ * All rights reserved.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,12 +44,12 @@
 
 #include "opt_smp.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/specialreg.h>
 #include <x86/apicreg.h>
 
-#include "assym.s"
-
 #ifdef SMP
 #define LK	lock ;
 #else
@@ -73,30 +79,28 @@ as_lapic_eoi:
  * translates that into a vector, and passes the vector to the
  * lapic_handle_intr() function.
  */
-#define	ISR_VEC(index, vec_name)					\
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	cmpl	$0,x2apic_mode ;					\
-	je	1f ;							\
-	movl	$(MSR_APIC_ISR0 + index),%ecx ;				\
-	rdmsr ;								\
-	jmp	2f ;							\
-1: ;									\
-	movq	lapic_map, %rdx ;	/* pointer to local APIC */	\
-	movl	LA_ISR + 16 * (index)(%rdx), %eax ;	/* load ISR */	\
-2: ;									\
-	bsrl	%eax, %eax ;	/* index of highest set bit in ISR */	\
-	jz	3f ;							\
-	addl	$(32 * index),%eax ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	%eax, %edi ;	/* pass the IRQ */			\
-	call	lapic_handle_intr ;					\
-3: ;									\
-	MEXITCOUNT ;							\
+	.macro	ISR_VEC	index, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	cmpl	$0,x2apic_mode
+	je	1f
+	movl	$(MSR_APIC_ISR0 + \index),%ecx
+	rdmsr
+	jmp	2f
+1:
+	movq	lapic_map, %rdx		/* pointer to local APIC */
+	movl	LA_ISR + 16 * (\index)(%rdx), %eax	/* load ISR */
+2:
+	bsrl	%eax, %eax	/* index of highest set bit in ISR */
+	jz	3f
+	addl	$(32 * \index),%eax
+	movq	%rsp, %rsi
+	movl	%eax, %edi	/* pass the IRQ */
+	call	lapic_handle_intr
+3:
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
 /*
  * Handle "spurious INTerrupts".
@@ -108,26 +112,21 @@ IDTVEC(vec_name) ;							\
 	.text
 	SUPERALIGN_TEXT
 IDTVEC(spuriousint)
-
 	/* No EOI cycle used here */
-
 	jmp	doreti_iret
 
-	ISR_VEC(1, apic_isr1)
-	ISR_VEC(2, apic_isr2)
-	ISR_VEC(3, apic_isr3)
-	ISR_VEC(4, apic_isr4)
-	ISR_VEC(5, apic_isr5)
-	ISR_VEC(6, apic_isr6)
-	ISR_VEC(7, apic_isr7)
+	ISR_VEC	1, apic_isr1
+	ISR_VEC	2, apic_isr2
+	ISR_VEC	3, apic_isr3
+	ISR_VEC	4, apic_isr4
+	ISR_VEC	5, apic_isr5
+	ISR_VEC	6, apic_isr6
+	ISR_VEC	7, apic_isr7
 
 /*
  * Local APIC periodic timer handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(timerint)
-	PUSH_FRAME
+	INTR_HANDLER	timerint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	lapic_handle_timer
@@ -137,10 +136,7 @@ IDTVEC(timerint)
 /*
  * Local APIC CMCI handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cmcint)
-	PUSH_FRAME
+	INTR_HANDLER cmcint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_cmc
 	MEXITCOUNT
@@ -149,10 +145,7 @@ IDTVEC(cmcint)
 /*
  * Local APIC error interrupt handler.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(errorint)
-	PUSH_FRAME
+	INTR_HANDLER errorint
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	call	lapic_handle_error
 	MEXITCOUNT
@@ -163,10 +156,7 @@ IDTVEC(errorint)
  * Xen event channel upcall interrupt handler.
  * Only used when the hypervisor supports direct vector callbacks.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(xen_intr_upcall)
-	PUSH_FRAME
+	INTR_HANDLER xen_intr_upcall
 	FAKE_MCOUNT(TF_RIP(%rsp))
 	movq	%rsp, %rdi
 	call	xen_intr_handle_upcall
@@ -183,74 +173,68 @@ IDTVEC(xen_intr_upcall)
 	SUPERALIGN_TEXT
 invltlb_ret:
 	call	as_lapic_eoi
-	POP_FRAME
-	jmp	doreti_iret
+	jmp	ld_regs
 
 	SUPERALIGN_TEXT
-IDTVEC(invltlb)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb
 	call	invltlb_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_pcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_pcid
 	call	invltlb_pcid_handler
 	jmp	invltlb_ret
 
-IDTVEC(invltlb_invpcid)
-	PUSH_FRAME
-
+	INTR_HANDLER invltlb_invpcid_nopti
 	call	invltlb_invpcid_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invltlb_invpcid_pti
+	call	invltlb_invpcid_pti_handler
+	jmp	invltlb_ret
+
 /*
  * Single page TLB shootdown
  */
-	.text
+	INTR_HANDLER invlpg
+	call	invlpg_handler
+	jmp	invltlb_ret
 
-	SUPERALIGN_TEXT
-IDTVEC(invlpg)
-	PUSH_FRAME
+	INTR_HANDLER invlpg_invpcid
+	call	invlpg_invpcid_handler
+	jmp	invltlb_ret
 
-	call	invlpg_handler
+	INTR_HANDLER invlpg_pcid
+	call	invlpg_pcid_handler
 	jmp	invltlb_ret
 
 /*
  * Page range TLB shootdown.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlrng)
-	PUSH_FRAME
-
+	INTR_HANDLER invlrng
 	call	invlrng_handler
 	jmp	invltlb_ret
 
+	INTR_HANDLER invlrng_invpcid
+	call	invlrng_invpcid_handler
+	jmp	invltlb_ret
+
+	INTR_HANDLER invlrng_pcid
+	call	invlrng_pcid_handler
+	jmp	invltlb_ret
+
 /*
  * Invalidate cache.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(invlcache)
-	PUSH_FRAME
-
+	INTR_HANDLER invlcache
 	call	invlcache_handler
 	jmp	invltlb_ret
 
 /*
  * Handler for IPIs sent via the per-cpu IPI bitmap.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(ipi_intr_bitmap_handler)		
-	PUSH_FRAME
-
+	INTR_HANDLER ipi_intr_bitmap_handler
 	call	as_lapic_eoi
-	
 	FAKE_MCOUNT(TF_RIP(%rsp))
-
 	call	ipi_bitmap_handler
 	MEXITCOUNT
 	jmp	doreti
@@ -258,24 +242,15 @@ IDTVEC(ipi_intr_bitmap_handler)		
 /*
  * Executed by a CPU when it receives an IPI_STOP from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpustop)
-	PUSH_FRAME
-
+	INTR_HANDLER cpustop
 	call	as_lapic_eoi
-
 	call	cpustop_handler
 	jmp	doreti
 
 /*
  * Executed by a CPU when it receives an IPI_SUSPEND from another CPU.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(cpususpend)
-	PUSH_FRAME
-
+	INTR_HANDLER cpususpend
 	call	cpususpend_handler
 	call	as_lapic_eoi
 	jmp	doreti
@@ -285,10 +260,7 @@ IDTVEC(cpususpend)
  *
  * - Calls the generic rendezvous action function.
  */
-	.text
-	SUPERALIGN_TEXT
-IDTVEC(rendezvous)
-	PUSH_FRAME
+	INTR_HANDLER rendezvous
 #ifdef COUNT_IPIS
 	movl	PCPU(CPUID), %eax
 	movq	ipi_rendezvous_counts(,%rax,8), %rax
@@ -327,5 +299,9 @@ IDTVEC(justreturn)
 	popq	%rcx
 	popq	%rax
 	jmp	doreti_iret
+
+	INTR_HANDLER	justreturn1
+	call	as_lapic_eoi
+	jmp	doreti
 
 #endif /* SMP */

Modified: stable/11/sys/amd64/amd64/atpic_vector.S
==============================================================================
--- stable/11/sys/amd64/amd64/atpic_vector.S	Sat Feb 17 17:23:43 2018	(r329461)
+++ stable/11/sys/amd64/amd64/atpic_vector.S	Sat Feb 17 18:00:01 2018	(r329462)
@@ -36,38 +36,35 @@
  * master and slave interrupt controllers.
  */
 
+#include "assym.s"
 #include <machine/asmacros.h>
 
-#include "assym.s"
-
 /*
  * Macros for interrupt entry, call to handler, and exit.
  */
-#define	INTR(irq_num, vec_name) \
-	.text ;								\
-	SUPERALIGN_TEXT ;						\
-IDTVEC(vec_name) ;							\
-	PUSH_FRAME ;							\
-	FAKE_MCOUNT(TF_RIP(%rsp)) ;					\
-	movq	%rsp, %rsi	;                                       \
-	movl	$irq_num, %edi; 	/* pass the IRQ */		\
-	call	atpic_handle_intr ;					\
-	MEXITCOUNT ;							\
+	.macro	INTR	irq_num, vec_name
+	INTR_HANDLER	\vec_name
+	FAKE_MCOUNT(TF_RIP(%rsp))
+	movq	%rsp, %rsi
+	movl	$\irq_num, %edi	 	/* pass the IRQ */
+	call	atpic_handle_intr
+	MEXITCOUNT
 	jmp	doreti
+	.endm
 
-	INTR(0, atpic_intr0)
-	INTR(1, atpic_intr1)
-	INTR(2, atpic_intr2)
-	INTR(3, atpic_intr3)
-	INTR(4, atpic_intr4)
-	INTR(5, atpic_intr5)
-	INTR(6, atpic_intr6)
-	INTR(7, atpic_intr7)
-	INTR(8, atpic_intr8)
-	INTR(9, atpic_intr9)
-	INTR(10, atpic_intr10)
-	INTR(11, atpic_intr11)
-	INTR(12, atpic_intr12)
-	INTR(13, atpic_intr13)
-	INTR(14, atpic_intr14)
-	INTR(15, atpic_intr15)
+	INTR	0, atpic_intr0
+	INTR	1, atpic_intr1
+	INTR	2, atpic_intr2
+	INTR	3, atpic_intr3
+	INTR	4, atpic_intr4
+	INTR	5, atpic_intr5
+	INTR	6, atpic_intr6
+	INTR	7, atpic_intr7
+	INTR	8, atpic_intr8
+	INTR	9, atpic_intr9
+	INTR	10, atpic_intr10
+	INTR	11, atpic_intr11
+	INTR	12, atpic_intr12
+	INTR	13, atpic_intr13
+	INTR	14, atpic_intr14
+	INTR	15, atpic_intr15

Modified: stable/11/sys/amd64/amd64/cpu_switch.S
==============================================================================
--- stable/11/sys/amd64/amd64/cpu_switch.S	Sat Feb 17 17:23:43 2018	(r329461)
+++ stable/11/sys/amd64/amd64/cpu_switch.S	Sat Feb 17 18:00:01 2018	(r329462)
@@ -215,8 +215,10 @@ done_tss:
 	movq	%r8,PCPU(RSP0)
 	movq	%r8,PCPU(CURPCB)
 	/* Update the TSS_RSP0 pointer for the next interrupt */
+	cmpb	$0,pti(%rip)
+	jne	1f
 	movq	%r8,TSS_RSP0(%rdx)
-	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
+1:	movq	%r12,PCPU(CURTHREAD)		/* into next thread */
 
 	/* Test if debug registers should be restored. */
 	testl	$PCB_DBREGS,PCB_FLAGS(%r8)
@@ -293,7 +295,12 @@ do_tss:	movq	%rdx,PCPU(TSSP)
 	shrq	$8,%rcx
 	movl	%ecx,8(%rax)
 	movb	$0x89,5(%rax)	/* unset busy */
-	movl	$TSSSEL,%eax
+	cmpb	$0,pti(%rip)
+	je	1f
+	movq	PCPU(PRVSPACE),%rax
+	addq	$PC_PTI_STACK+PC_PTI_STACK_SZ*8,%rax
+	movq	%rax,TSS_RSP0(%rdx)
+1:	movl	$TSSSEL,%eax
 	ltr	%ax
 	jmp	done_tss
 

Modified: stable/11/sys/amd64/amd64/db_trace.c
==============================================================================
--- stable/11/sys/amd64/amd64/db_trace.c	Sat Feb 17 17:23:43 2018	(r329461)
+++ stable/11/sys/amd64/amd64/db_trace.c	Sat Feb 17 18:00:01 2018	(r329462)
@@ -200,6 +200,7 @@ db_nextframe(struct amd64_frame **fp, db_addr_t *ip, s
 	if (name != NULL) {
 		if (strcmp(name, "calltrap") == 0 ||
 		    strcmp(name, "fork_trampoline") == 0 ||
+		    strcmp(name, "mchk_calltrap") == 0 ||
 		    strcmp(name, "nmi_calltrap") == 0 ||
 		    strcmp(name, "Xdblfault") == 0)
 			frame_type = TRAP;

Modified: stable/11/sys/amd64/amd64/exception.S
==============================================================================
--- stable/11/sys/amd64/amd64/exception.S	Sat Feb 17 17:23:43 2018	(r329461)
+++ stable/11/sys/amd64/amd64/exception.S	Sat Feb 17 18:00:01 2018	(r329462)
@@ -1,12 +1,16 @@
 /*-
  * Copyright (c) 1989, 1990 William F. Jolitz.
  * Copyright (c) 1990 The Regents of the University of California.
- * Copyright (c) 2007 The FreeBSD Foundation
+ * Copyright (c) 2007-2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by A. Joseph Koshy under
  * sponsorship from the FreeBSD Foundation and Google, Inc.
  *
+ * Portions of this software were developed by
+ * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
+ * the FreeBSD Foundation.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -38,13 +42,13 @@
 #include "opt_compat.h"
 #include "opt_hwpmc_hooks.h"
 
+#include "assym.s"
+
 #include <machine/asmacros.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
 #include <machine/specialreg.h>
 
-#include "assym.s"
-
 #ifdef KDTRACE_HOOKS
 	.bss
 	.globl	dtrace_invop_jump_addr
@@ -100,69 +104,62 @@ dtrace_invop_calltrap_addr:
 MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
-/* Traps that we leave interrupts disabled for.. */
-#define	TRAP_NOEN(a)	\
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+/* Traps that we leave interrupts disabled for. */
+	.macro	TRAP_NOEN	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps_noen
-IDTVEC(dbg)
-	TRAP_NOEN(T_TRCTRAP)
-IDTVEC(bpt)
-	TRAP_NOEN(T_BPTFLT)
+	.endm
+
+	TRAP_NOEN	dbg, T_TRCTRAP
+	TRAP_NOEN	bpt, T_BPTFLT
 #ifdef KDTRACE_HOOKS
-IDTVEC(dtrace_ret)
-	TRAP_NOEN(T_DTRACE_RET)
+	TRAP_NOEN	dtrace_ret, T_DTRACE_RET
 #endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
-#define	TRAP(a)	 \
-	subq $TF_RIP,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
-	movq $0,TF_ERR(%rsp) ; \
+	.macro	TRAP	l, trapno
+	PTI_ENTRY	\l,X\l
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_RIP,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
+	movq $0,TF_ERR(%rsp)
 	jmp alltraps
-IDTVEC(div)
-	TRAP(T_DIVIDE)
-IDTVEC(ofl)
-	TRAP(T_OFLOW)
-IDTVEC(bnd)
-	TRAP(T_BOUND)
-IDTVEC(ill)
-	TRAP(T_PRIVINFLT)
-IDTVEC(dna)
-	TRAP(T_DNA)
-IDTVEC(fpusegm)
-	TRAP(T_FPOPFLT)
-IDTVEC(mchk)
-	TRAP(T_MCHK)
-IDTVEC(rsvd)
-	TRAP(T_RESERVED)
-IDTVEC(fpu)
-	TRAP(T_ARITHTRAP)
-IDTVEC(xmm)
-	TRAP(T_XMMFLT)
+	.endm
 
-/* This group of traps have tf_err already pushed by the cpu */
-#define	TRAP_ERR(a)	\
-	subq $TF_ERR,%rsp; \
-	movl $(a),TF_TRAPNO(%rsp) ; \
-	movq $0,TF_ADDR(%rsp) ; \
+	TRAP	div, T_DIVIDE
+	TRAP	ofl, T_OFLOW
+	TRAP	bnd, T_BOUND
+	TRAP	ill, T_PRIVINFLT
+	TRAP	dna, T_DNA
+	TRAP	fpusegm, T_FPOPFLT
+	TRAP	rsvd, T_RESERVED
+	TRAP	fpu, T_ARITHTRAP
+	TRAP	xmm, T_XMMFLT
+
+/* This group of traps have tf_err already pushed by the cpu. */
+	.macro	TRAP_ERR	l, trapno
+	PTI_ENTRY	\l,X\l,has_err=1
+	.globl	X\l
+	.type	X\l,@function
+X\l:
+	subq $TF_ERR,%rsp
+	movl $\trapno,TF_TRAPNO(%rsp)
+	movq $0,TF_ADDR(%rsp)
 	jmp alltraps
-IDTVEC(tss)
-	TRAP_ERR(T_TSSFLT)
-IDTVEC(missing)
-	subq	$TF_ERR,%rsp
-	movl	$T_SEGNPFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(stk)
-	subq	$TF_ERR,%rsp
-	movl	$T_STKFLT,TF_TRAPNO(%rsp)
-	jmp	prot_addrf
-IDTVEC(align)
-	TRAP_ERR(T_ALIGNFLT)
+	.endm
 
+	TRAP_ERR	tss, T_TSSFLT
+	TRAP_ERR	align, T_ALIGNFLT
+
 	/*
 	 * alltraps entry point.  Use swapgs if this is the first time in the
 	 * kernel from userland.  Reenable interrupts if they were enabled
@@ -174,24 +171,22 @@ IDTVEC(align)
 alltraps:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	alltraps_testi		/* already running with kernel GS.base */
+	jz	1f		/* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-alltraps_testi:
-	testl	$PSL_I,TF_RFLAGS(%rsp)
-	jz	alltraps_pushregs_no_rdi
-	sti
-alltraps_pushregs_no_rdi:
+1:	SAVE_SEGS
 	movq	%rdx,TF_RDX(%rsp)
 	movq	%rax,TF_RAX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jz	2f
+	call	handle_ibrs_entry
+2:	testl	$PSL_I,TF_RFLAGS(%rsp)
+	jz	alltraps_pushregs_no_rax
+	sti
 alltraps_pushregs_no_rax:
 	movq	%rsi,TF_RSI(%rsp)
-	movq	%rcx,TF_RCX(%rsp)
 	movq	%r8,TF_R8(%rsp)
 	movq	%r9,TF_R9(%rsp)
 	movq	%rbx,TF_RBX(%rsp)
@@ -249,15 +244,18 @@ calltrap:
 alltraps_noen:
 	movq	%rdi,TF_RDI(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f	/* already running with kernel GS.base */
+	jz	1f /* already running with kernel GS.base */
 	swapgs
 	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
-	jmp	alltraps_pushregs_no_rdi
+1:	SAVE_SEGS
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	testb	$SEL_RPL_MASK,TF_CS(%rsp)
+	jz	alltraps_pushregs_no_rax
+	call	handle_ibrs_entry
+	jmp	alltraps_pushregs_no_rax
 
 IDTVEC(dblfault)
 	subq	$TF_ERR,%rsp
@@ -279,56 +277,110 @@ IDTVEC(dblfault)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
 	jz	1f			/* already running with kernel GS.base */
 	swapgs
 1:
-	movq	%rsp,%rdi
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	2f
+	movq	%rax,%cr3
+2:	movq	%rsp,%rdi
 	call	dblfault_handler
-2:
-	hlt
-	jmp	2b
+3:	hlt
+	jmp	3b
 
+	ALIGN_TEXT
+IDTVEC(page_pti)
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp)
+	jz	Xpage
+	swapgs
+	pushq	%rax
+	pushq	%rdx
+	movq	%cr3,%rax
+	movq	%rax,PCPU(SAVED_UCR3)
+	PTI_UUENTRY has_err=1
+	subq	$TF_ERR,%rsp
+	movq	%rdi,TF_RDI(%rsp)
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
+	jmp	page_u
 IDTVEC(page)
 	subq	$TF_ERR,%rsp
-	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
-	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
+	movq	%rdi,TF_RDI(%rsp)	/* free up GP registers */
+	movq	%rax,TF_RAX(%rsp)
+	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
 	testb	$SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */
-	jz	1f			/* already running with kernel GS.base */
+	jz	page_cr2		/* already running with kernel GS.base */
 	swapgs
-	movq	PCPU(CURPCB),%rdi
+page_u:	movq	PCPU(CURPCB),%rdi
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%rdi)
-1:	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
+	movq	PCPU(SAVED_UCR3),%rax
+	movq	%rax,PCB_SAVED_UCR3(%rdi)
+	call	handle_ibrs_entry
+page_cr2:
+	movq	%cr2,%rdi		/* preserve %cr2 before ..  */
 	movq	%rdi,TF_ADDR(%rsp)	/* enabling interrupts. */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
+	movl	$T_PAGEFLT,TF_TRAPNO(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
-	jz	alltraps_pushregs_no_rdi
+	jz	alltraps_pushregs_no_rax
 	sti
-	jmp	alltraps_pushregs_no_rdi
+	jmp	alltraps_pushregs_no_rax
 
 	/*
 	 * We have to special-case this one.  If we get a trap in doreti() at
 	 * the iretq stage, we'll reenter with the wrong gs state.  We'll have
 	 * to do a special the swapgs in this case even coming from the kernel.
 	 * XXX linux has a trap handler for their equivalent of load_gs().
+	 *
+	 * On the stack, we have the hardware interrupt frame to return
+	 * to usermode (faulted) and another frame with error code, for
+	 * fault.  For PTI, copy both frames to the main thread stack.
 	 */
-IDTVEC(prot)
+	.macro PROTF_ENTRY name,trapno
+\name\()_pti_doreti:
+	pushq	%rax
+	pushq	%rdx
+	swapgs
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	movq	PCPU(RSP0),%rax
+	subq	$2*PTI_SIZE-3*8,%rax /* no err, %rax, %rdx in faulted frame */
+	MOVE_STACKS	(PTI_SIZE / 4 - 3)
+	movq	%rax,%rsp
+	popq	%rdx
+	popq	%rax
+	swapgs
+	jmp	X\name
+IDTVEC(\name\()_pti)
+	cmpq	$doreti_iret,PTI_RIP-2*8(%rsp)
+	je	\name\()_pti_doreti
+	testb	$SEL_RPL_MASK,PTI_CS-2*8(%rsp) /* %rax, %rdx not yet pushed */
+	jz	X\name
+	PTI_UENTRY has_err=1
+	swapgs
+IDTVEC(\name)
 	subq	$TF_ERR,%rsp
-	movl	$T_PROTFLT,TF_TRAPNO(%rsp)
+	movl	$\trapno,TF_TRAPNO(%rsp)
+	jmp	prot_addrf
+	.endm
+
+	PROTF_ENTRY	missing, T_SEGNPFLT
+	PROTF_ENTRY	stk, T_STKFLT
+	PROTF_ENTRY	prot, T_PROTFLT
+
 prot_addrf:
 	movq	$0,TF_ADDR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* free up a GP register */
 	movq	%rax,TF_RAX(%rsp)
 	movq	%rdx,TF_RDX(%rsp)
+	movq	%rcx,TF_RCX(%rsp)
 	movw	%fs,TF_FS(%rsp)
 	movw	%gs,TF_GS(%rsp)
 	leaq	doreti_iret(%rip),%rdi
@@ -354,7 +406,8 @@ prot_addrf:
 3:	cmpw	$KUG32SEL,TF_GS(%rsp)
 	jne	4f
 	movq	%rdx,PCB_GSBASE(%rdi)
-4:	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* always full iret from GPF */
+4:	call	handle_ibrs_entry
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)	/* always full iret from GPF */
 	movw	%es,TF_ES(%rsp)
 	movw	%ds,TF_DS(%rsp)
 	testl	$PSL_I,TF_RFLAGS(%rsp)
@@ -375,8 +428,18 @@ prot_addrf:
  * We do not support invoking this from a custom segment registers,
  * esp. %cs, %ss, %fs, %gs, e.g. using entries from an LDT.
  */
+	SUPERALIGN_TEXT
+IDTVEC(fast_syscall_pti)
+	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+	movq	PCPU(KCR3),%rax
+	movq	%rax,%cr3
+	jmp	fast_syscall_common
+	SUPERALIGN_TEXT
 IDTVEC(fast_syscall)
 	swapgs
+	movq	%rax,PCPU(SCRATCH_RAX)
+fast_syscall_common:
 	movq	%rsp,PCPU(SCRATCH_RSP)
 	movq	PCPU(RSP0),%rsp
 	/* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
@@ -386,10 +449,11 @@ IDTVEC(fast_syscall)
 	movq	%rcx,TF_RIP(%rsp)	/* %rcx original value is in %r10 */
 	movq	PCPU(SCRATCH_RSP),%r11	/* %r11 already saved */
 	movq	%r11,TF_RSP(%rsp)	/* user stack pointer */
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	movq	PCPU(SCRATCH_RAX),%rax
+	movq	%rax,TF_RAX(%rsp)	/* syscall number */
+	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
+	SAVE_SEGS
+	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%r11
 	andl	$~PCB_FULL_IRET,PCB_FLAGS(%r11)
 	sti
@@ -398,11 +462,9 @@ IDTVEC(fast_syscall)
 	movq	$2,TF_ERR(%rsp)
 	movq	%rdi,TF_RDI(%rsp)	/* arg 1 */
 	movq	%rsi,TF_RSI(%rsp)	/* arg 2 */
-	movq	%rdx,TF_RDX(%rsp)	/* arg 3 */
 	movq	%r10,TF_RCX(%rsp)	/* arg 4 */
 	movq	%r8,TF_R8(%rsp)		/* arg 5 */
 	movq	%r9,TF_R9(%rsp)		/* arg 6 */
-	movq	%rax,TF_RAX(%rsp)	/* syscall number */
 	movq	%rbx,TF_RBX(%rsp)	/* C preserved */
 	movq	%rbp,TF_RBP(%rsp)	/* C preserved */
 	movq	%r12,TF_R12(%rsp)	/* C preserved */
@@ -420,11 +482,12 @@ IDTVEC(fast_syscall)
 	/* Disable interrupts before testing PCB_FULL_IRET. */
 	cli
 	testl	$PCB_FULL_IRET,PCB_FLAGS(%rax)
-	jnz	3f
+	jnz	4f
 	/* Check for and handle AST's on return to userland. */
 	movq	PCPU(CURTHREAD),%rax
 	testl	$TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax)
-	jne	2f
+	jne	3f
+	call	handle_ibrs_exit
 	/* Restore preserved registers. */
 	MEXITCOUNT
 	movq	TF_RDI(%rsp),%rdi	/* bonus; preserve arg 1 */
@@ -434,16 +497,21 @@ IDTVEC(fast_syscall)
 	movq	TF_RFLAGS(%rsp),%r11	/* original %rflags */
 	movq	TF_RIP(%rsp),%rcx	/* original %rip */
 	movq	TF_RSP(%rsp),%rsp	/* user stack pointer */
-	swapgs
+	cmpb	$0,pti
+	je	2f
+	movq	PCPU(UCR3),%r9
+	movq	%r9,%cr3
+	xorl	%r9d,%r9d
+2:	swapgs
 	sysretq
 
-2:	/* AST scheduled. */
+3:	/* AST scheduled. */
 	sti
 	movq	%rsp,%rdi
 	call	ast
 	jmp	1b
 
-3:	/* Requested full context restore, use doreti for that. */
+4:	/* Requested full context restore, use doreti for that. */
 	MEXITCOUNT
 	jmp	doreti
 
@@ -499,17 +567,15 @@ IDTVEC(nmi)
 	movq	%r13,TF_R13(%rsp)
 	movq	%r14,TF_R14(%rsp)
 	movq	%r15,TF_R15(%rsp)
-	movw	%fs,TF_FS(%rsp)
-	movw	%gs,TF_GS(%rsp)
-	movw	%es,TF_ES(%rsp)
-	movw	%ds,TF_DS(%rsp)
+	SAVE_SEGS
 	movl	$TF_HASSEGS,TF_FLAGS(%rsp)
 	cld
 	xorl	%ebx,%ebx
 	testb	$SEL_RPL_MASK,TF_CS(%rsp)
 	jnz	nmi_fromuserspace
 	/*
-	 * We've interrupted the kernel.  Preserve GS.base in %r12.
+	 * We've interrupted the kernel.  Preserve GS.base in %r12,
+	 * %cr3 in %r13, and possibly lower half of MSR_IA32_SPEC_CTRL in %r14d.
 	 */
 	movl	$MSR_GSBASE,%ecx
 	rdmsr
@@ -521,27 +587,45 @@ IDTVEC(nmi)
 	movl	%edx,%eax
 	shrq	$32,%rdx
 	wrmsr
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	nmi_calltrap
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	rdmsr
+	movl	%eax,%r14d
+	call	handle_ibrs_entry
 	jmp	nmi_calltrap
 nmi_fromuserspace:
 	incl	%ebx
 	swapgs
-	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
-	jz	2f
+	movq	%cr3,%r13
+	movq	PCPU(KCR3),%rax
+	cmpq	$~0,%rax
+	je	1f
+	movq	%rax,%cr3
+1:	call	handle_ibrs_entry
 	movq	PCPU(CURPCB),%rdi
 	testq	%rdi,%rdi
-	jz	2f
+	jz	3f
+	orl	$PCB_FULL_IRET,PCB_FLAGS(%rdi)
+	testb	$CPUID_STDEXT_FSGSBASE,cpu_stdext_feature(%rip)
+	jz	3f
 	cmpw	$KUF32SEL,TF_FS(%rsp)
-	jne	1f
+	jne	2f
 	rdfsbase %rax
 	movq	%rax,PCB_FSBASE(%rdi)
-1:	cmpw	$KUG32SEL,TF_GS(%rsp)
-	jne	2f
+2:	cmpw	$KUG32SEL,TF_GS(%rsp)
+	jne	3f
 	movl	$MSR_KGSBASE,%ecx
 	rdmsr
 	shlq	$32,%rdx
 	orq	%rdx,%rax
 	movq	%rax,PCB_GSBASE(%rdi)
-2:
+3:
 /* Note: this label is also used by ddb and gdb: */
 nmi_calltrap:
 	FAKE_MCOUNT(TF_RIP(%rsp))
@@ -564,26 +648,29 @@ nmi_calltrap:
 	movq	PCPU(CURTHREAD),%rax
 	orq	%rax,%rax	/* curthread present? */
 	jz	nocallchain
-	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
-	jz	nocallchain
 	/*
-	 * A user callchain is to be captured, so:
-	 * - Move execution to the regular kernel stack, to allow for
-	 *   nested NMI interrupts.
-	 * - Take the processor out of "NMI" mode by faking an "iret".
-	 * - Enable interrupts, so that copyin() can work.
+	 * Move execution to the regular kernel stack, because we
+	 * committed to return through doreti.
 	 */
 	movq	%rsp,%rsi	/* source stack pointer */
 	movq	$TF_SIZE,%rcx
 	movq	PCPU(RSP0),%rdx
 	subq	%rcx,%rdx
 	movq	%rdx,%rdi	/* destination stack pointer */
-
 	shrq	$3,%rcx		/* trap frame size in long words */
 	cld
 	rep
 	movsq			/* copy trapframe */
+	movq	%rdx,%rsp	/* we are on the regular kstack */
 
+	testl	$TDP_CALLCHAIN,TD_PFLAGS(%rax) /* flagged for capture? */
+	jz	nocallchain
+	/*
+	 * A user callchain is to be captured, so:
+	 * - Take the processor out of "NMI" mode by faking an "iret",
+	 *   to allow for nested NMI interrupts.
+	 * - Enable interrupts, so that copyin() can work.
+	 */
 	movl	%ss,%eax
 	pushq	%rax		/* tf_ss */
 	pushq	%rdx		/* tf_rsp (on kernel stack) */
@@ -613,33 +700,139 @@ outofnmi:
 	cli
 nocallchain:
 #endif
-	testl	%ebx,%ebx
+	testl	%ebx,%ebx	/* %ebx != 0 => return to userland */
 	jnz	doreti_exit
-nmi_kernelexit:
 	/*
+	 * Restore speculation control MSR, if preserved.
+	 */
+	testl	$CPUID_STDEXT3_IBPB,cpu_stdext_feature3(%rip)
+	je	1f
+	movl	%r14d,%eax
+	xorl	%edx,%edx
+	movl	$MSR_IA32_SPEC_CTRL,%ecx
+	wrmsr
+	/*
 	 * Put back the preserved MSR_GSBASE value.
 	 */
+1:	movl	$MSR_GSBASE,%ecx
+	movq	%r12,%rdx
+	movl	%edx,%eax
+	shrq	$32,%rdx
+	wrmsr
+	movq	%r13,%cr3
+	RESTORE_REGS
+	addq	$TF_RIP,%rsp
+	jmp	doreti_iret
+
+/*
+ * MC# handling is similar to NMI.
+ *
+ * As with NMIs, machine check exceptions do not respect RFLAGS.IF and
+ * can occur at any time with a GS.base value that does not correspond
+ * to the privilege level in CS.
+ *
+ * Machine checks are not unblocked by iretq, but it is best to run

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802171800.w1HI01KX064024>