Date: Sun, 11 May 2003 18:18:29 -0700 (PDT) From: Peter Wemm <peter@FreeBSD.org> To: Perforce Change Reviews <perforce@freebsd.org> Subject: PERFORCE change 30984 for review Message-ID: <200305120118.h4C1IT5N035838@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=30984 Change 30984 by peter@peter_hammer on 2003/05/11 18:17:36 Use swapgs. Ouch; this is hairy. We have to avoid doing it a second time when trapping from kernel context, so check the frame's TF_CS to see if we're coming from kernel context. This means converting *all* the trap gates to interrupt gates so that we can do the swapgs without the risk of an intermediate interrupt firing after entering supervisor mode but before swapgs. This means that we have to undo the effects of the interrupt gate when we really want the trap gate. Ugh. The other option is to have the regular entry points use the rdmsr/wrmsr stuff to save/restore the %GS.base etc *in the trap handlers*! and load the kernel %gs values and leave swapgs for the fast syscall stuff. I'll do a time comparison later to see if this is in fact faster. Update comments. Affected files ... .. //depot/projects/hammer/sys/amd64/amd64/exception.S#6 edit .. //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 edit .. //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 edit Differences ... ==== //depot/projects/hammer/sys/amd64/amd64/exception.S#6 (text+ko) ==== @@ -51,16 +51,16 @@ /* * Trap and fault vector routines. * - * Most traps are 'trap gates', SDT_SYS386TGT. A trap gate pushes state on - * the stack that mostly looks like an interrupt, but does not disable - * interrupts. A few of the traps we are use are interrupt gates, - * SDT_SYS386IGT, which are nearly the same thing except interrupts are - * disabled on entry. + * All traps are 'interrupt gates', SDT_SYSIGT. An interrupt gate pushes + * state on the stack but also disables interrupts. This is important for + * us for the use of the swapgs instruction. We cannot be interrupted + * until the GS.base value is correct. For most traps, we automatically + * then enable interrupts if the interrupted context had them enabled. + * This is equivalent to the i386 port's use of SDT_SYS386TGT. 
* * The cpu will push a certain amount of state onto the kernel stack for - * the current process. The amount of state depends on the type of trap - * and whether the trap crossed rings or not. See i386/include/frame.h. - * At the very least the current EFLAGS (status register, which includes + * the current process. See amd64/include/frame.h. + * This includes the current RFLAGS (status register, which includes * the interrupt disable state prior to the trap), the code segment register, * and the return instruction pointer are pushed by the cpu. The cpu * will also push an 'error' code for certain traps. We push a dummy @@ -75,6 +75,7 @@ #define IDTVEC(name) ALIGN_TEXT; .globl __CONCAT(X,name); \ .type __CONCAT(X,name),@function; __CONCAT(X,name): #define TRAP(a) pushq $(a) ; jmp alltraps +#define TRAP_NOEN(a) pushq $(a) ; jmp alltraps_noen MCOUNT_LABEL(user) MCOUNT_LABEL(btrap) @@ -82,11 +83,11 @@ IDTVEC(div) pushq $0; TRAP(T_DIVIDE) IDTVEC(dbg) - pushq $0; TRAP(T_TRCTRAP) + pushq $0; TRAP_NOEN(T_TRCTRAP) IDTVEC(nmi) pushq $0; TRAP(T_NMI) IDTVEC(bpt) - pushq $0; TRAP(T_BPTFLT) + pushq $0; TRAP_NOEN(T_BPTFLT) IDTVEC(ofl) pushq $0; TRAP(T_OFLOW) IDTVEC(bnd) @@ -106,7 +107,7 @@ IDTVEC(prot) TRAP(T_PROTFLT) IDTVEC(page) - TRAP(T_PAGEFLT) + TRAP_NOEN(T_PAGEFLT) IDTVEC(mchk) pushq $0; TRAP(T_MCHK) IDTVEC(rsvd) @@ -119,10 +120,9 @@ pushq $0; TRAP(T_XMMFLT) /* - * alltraps entry point. Interrupts are enabled if this was a trap - * gate (TGT), else disabled if this was an interrupt gate (IGT). - * Note that int0x80_syscall is a trap gate. Only page faults - * use an interrupt gate. + * alltraps entry point. Use swapgs if this is the first time in the + * kernel from userland. Reenable interrupts if they were enabled + * before the trap. This approximates SDT_SYS386TGT on the i386 port. 
*/ SUPERALIGN_TEXT @@ -130,6 +130,14 @@ .type alltraps,@function alltraps: subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */ + testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ + jz alltraps_testi /* already running with kernel GS.base */ + swapgs +alltraps_testi: + testl $PSL_I,TF_RFLAGS(%rsp) + jz alltraps_pushregs + sti +alltraps_pushregs: movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) @@ -153,22 +161,43 @@ MEXITCOUNT jmp doreti /* Handle any pending ASTs */ + /* + * alltraps_noen entry point. Unlike alltraps above, we want to + * leave the interrupts disabled. This corresponds to + * SDT_SYS386IGT on the i386 port. + */ + SUPERALIGN_TEXT + .globl alltraps_noen + .type alltraps_noen,@function +alltraps_noen: + subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */ + testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ + jz alltraps_pushregs /* already running with kernel GS.base */ + swapgs + jmp alltraps_pushregs + +IDTVEC(dblfault) + pushq $T_DOUBLEFLT + subq $TF_TRAPNO,%rsp /* tf_err and tf_trapno already pushed */ + testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ + jz 1f /* already running with kernel GS.base */ + swapgs +1: call dblfault_handler +2: hlt + jmp 2b + /* * Call gate entry for FreeBSD ELF and Linux/NetBSD syscall (int 0x80) * - * Even though the name says 'int0x80', this is actually a TGT (trap gate) - * rather then an IGT (interrupt gate). Thus interrupts are enabled on - * entry just as they are for a normal syscall. - * - * This leaves a place to put eflags so that the call frame can be - * converted to a trap frame. Note that the eflags is (semi-)bogusly - * pushed into (what will be) tf_err and then copied later into the - * final spot. It has to be done this way because esp can't be just - * temporarily altered for the pushfl - an interrupt might come in - * and clobber the saved cs/eip. 
+ * This is a SDT_SYSIDT entry point (unlike the i386 port) so that we + * can do a swapgs before enabling interrupts. This is critical because + * if we took an interrupt before swapgs, the interrupt code would see + * that it originated in supervisor mode and skip the swapgs. */ SUPERALIGN_TEXT IDTVEC(int0x80_syscall) + swapgs + sti pushq $2 /* sizeof "int 0x80" */ subq $TF_ERR,%rsp /* skip over tf_trapno */ movq %rdi,TF_RDI(%rsp) @@ -196,19 +225,21 @@ * and the new privilige level. We are still running on the old user stack * pointer. We have to juggle a few things around to find our stack etc. * swapgs gives us access to our PCPU space only. - * XXX The PCPU stuff is stubbed out right now... */ IDTVEC(fast_syscall) - /* XXX swapgs */ + swapgs movq %rsp,PCPU(SCRATCH_RSP) movq common_tss+COMMON_TSS_RSP0,%rsp /* Now emulate a trapframe. Ugh. */ subq $TF_SIZE,%rsp - movq $KUDSEL,TF_SS(%rsp) /* defer TF_RSP till we have a spare register */ movq %r11,TF_RFLAGS(%rsp) + movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ + movq PCPU(SCRATCH_RSP),%r11 /* %r11 already saved */ + movq %r11,TF_RSP(%rsp) /* user stack pointer */ + sti + movq $KUDSEL,TF_SS(%rsp) movq $KUCSEL,TF_CS(%rsp) - movq %rcx,TF_RIP(%rsp) /* %rcx original value is in %r10 */ movq $2,TF_ERR(%rsp) movq %rdi,TF_RDI(%rsp) /* arg 1 */ movq %rsi,TF_RSI(%rsp) /* arg 2 */ @@ -223,14 +254,10 @@ movq %r13,TF_R13(%rsp) /* C preserved */ movq %r14,TF_R14(%rsp) /* C preserved */ movq %r15,TF_R15(%rsp) /* C preserved */ - movq PCPU(SCRATCH_RSP),%r12 /* %r12 already saved */ - movq %r12,TF_RSP(%rsp) /* user stack pointer */ - sti call syscall movq PCPU(CURPCB),%rax testq $PCB_FULLCTX,PCB_FLAGS(%rax) jne 3f - /* simplified from doreti */ 1: /* Check for and handle AST's on return to userland */ cli movq PCPU(CURTHREAD),%rax @@ -255,7 +282,7 @@ movq TF_RIP(%rsp),%rcx /* original %rip */ movq TF_RSP(%rsp),%r9 /* user stack pointer */ movq %r9,%rsp /* original %rsp */ - /* XXX swapgs */ + swapgs sysretq 3: /* 
Requested full context restore, use doreti for that */ andq $~PCB_FULLCTX,PCB_FLAGS(%rax) @@ -344,12 +371,16 @@ movq TF_R13(%rsp),%r13 movq TF_R14(%rsp),%r14 movq TF_R15(%rsp),%r15 - addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ + testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ + jz 1f /* keep running with kernel GS.base */ + cli + swapgs +1: addq $TF_RIP,%rsp /* skip over tf_err, tf_trapno */ .globl doreti_iret doreti_iret: iretq - /* + /* * doreti_iret_fault and friends. Alternative return code for * the case where we get a fault in the doreti_exit code * above. trap() (i386/i386/trap.c) catches this specific @@ -360,7 +391,13 @@ .globl doreti_iret_fault doreti_iret_fault: subq $TF_RIP,%rsp /* space including tf_err, tf_trapno */ - movq %rdi,TF_RDI(%rsp) + testb $SEL_RPL_MASK,TF_CS(%rsp) /* Did we come from kernel? */ + jz 1f /* already running with kernel GS.base */ + swapgs +1: testl $PSL_I,TF_RFLAGS(%rsp) + jz 2f + sti +2: movq %rdi,TF_RDI(%rsp) movq %rsi,TF_RSI(%rsp) movq %rdx,TF_RDX(%rsp) movq %rcx,TF_RCX(%rsp) ==== //depot/projects/hammer/sys/amd64/amd64/machdep.c#22 (text+ko) ==== @@ -654,7 +654,7 @@ IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm), IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot), IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align), - IDTVEC(xmm), IDTVEC(int0x80_syscall), + IDTVEC(xmm), IDTVEC(dblfault), IDTVEC(int0x80_syscall), IDTVEC(fast_syscall), IDTVEC(fast_syscall32); void @@ -1182,9 +1182,9 @@ lgdt(&r_gdt); pc = &__pcpu; - wrmsr(MSR_FSBASE, (u_int64_t)pc); + wrmsr(MSR_FSBASE, 0); /* User value */ wrmsr(MSR_GSBASE, (u_int64_t)pc); - wrmsr(MSR_KGSBASE, (u_int64_t)pc); + wrmsr(MSR_KGSBASE, 0); /* User value while we're in the kernel */ pcpu_init(pc, 0, sizeof(struct pcpu)); PCPU_SET(prvspace, pc); @@ -1204,28 +1204,28 @@ /* exceptions */ for (x = 0; x < NIDT; x++) - setidt(x, &IDTVEC(rsvd), SDT_SYSTGT, SEL_KPL, 0); - setidt(0, &IDTVEC(div), SDT_SYSTGT, SEL_KPL, 0); + setidt(x, 
&IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); + setidt(0, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0); setidt(1, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0); - setidt(2, &IDTVEC(nmi), SDT_SYSTGT, SEL_KPL, 0); + setidt(2, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 0); setidt(3, &IDTVEC(bpt), SDT_SYSIGT, SEL_KPL, 0); - setidt(4, &IDTVEC(ofl), SDT_SYSTGT, SEL_KPL, 0); - setidt(5, &IDTVEC(bnd), SDT_SYSTGT, SEL_KPL, 0); - setidt(6, &IDTVEC(ill), SDT_SYSTGT, SEL_KPL, 0); - setidt(7, &IDTVEC(dna), SDT_SYSTGT, SEL_KPL, 0); - setidt(8, (inthand_t *)dblfault_handler, SDT_SYSIGT, SEL_KPL, 1); - setidt(9, &IDTVEC(fpusegm), SDT_SYSTGT, SEL_KPL, 0); - setidt(10, &IDTVEC(tss), SDT_SYSTGT, SEL_KPL, 0); - setidt(11, &IDTVEC(missing), SDT_SYSTGT, SEL_KPL, 0); - setidt(12, &IDTVEC(stk), SDT_SYSTGT, SEL_KPL, 0); - setidt(13, &IDTVEC(prot), SDT_SYSTGT, SEL_KPL, 0); + setidt(4, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0); + setidt(5, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0); + setidt(6, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0); + setidt(7, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0); + setidt(8, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1); + setidt(9, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0); + setidt(10, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0); + setidt(11, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0); + setidt(12, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0); + setidt(13, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0); setidt(14, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0); - setidt(15, &IDTVEC(rsvd), SDT_SYSTGT, SEL_KPL, 0); - setidt(16, &IDTVEC(fpu), SDT_SYSTGT, SEL_KPL, 0); - setidt(17, &IDTVEC(align), SDT_SYSTGT, SEL_KPL, 0); - setidt(18, &IDTVEC(mchk), SDT_SYSTGT, SEL_KPL, 0); - setidt(19, &IDTVEC(xmm), SDT_SYSTGT, SEL_KPL, 0); - setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSTGT, SEL_UPL, 0); + setidt(15, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0); + setidt(16, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0); + setidt(17, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0); + setidt(18, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0); + setidt(19, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 
0); + setidt(0x80, &IDTVEC(int0x80_syscall), SDT_SYSIGT, SEL_UPL, 0); r_idt.rd_limit = sizeof(idt0) - 1; r_idt.rd_base = (long) idt; @@ -1251,8 +1251,6 @@ /* make an initial tss so cpu can get interrupt stack on syscall! */ common_tss.tss_rsp0 = thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb); - /* XXX we need to update tss_rsp0 in cpu_switch */ - /* XXX maybe not yet, everything is still running in supervisor mode */ /* doublefault stack space, runs on ist1 */ common_tss.tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)]; ==== //depot/projects/hammer/sys/amd64/isa/icu_vector.S#2 (text+ko) ==== @@ -25,7 +25,10 @@ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \ - movq %rdi,TF_RDI(%rsp) ; \ + testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ + jz 1f ; /* Yes, dont swapgs again */ \ + swapgs ; \ +1: movq %rdi,TF_RDI(%rsp) ; \ movq %rsi,TF_RSI(%rsp) ; \ movq %rdx,TF_RDX(%rsp) ; \ movq %rcx,TF_RCX(%rsp) ; \ @@ -69,7 +72,10 @@ SUPERALIGN_TEXT ; \ IDTVEC(vec_name) ; \ subq $TF_RIP,%rsp ; /* skip dummy tf_err and tf_trapno */ \ - movq %rdi,TF_RDI(%rsp) ; \ + testb $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */ \ + jz 1f ; /* Yes, dont swapgs again */ \ + swapgs ; \ +1: movq %rdi,TF_RDI(%rsp) ; \ movq %rsi,TF_RSI(%rsp) ; \ movq %rdx,TF_RDX(%rsp) ; \ movq %rcx,TF_RCX(%rsp) ; \
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200305120118.h4C1IT5N035838>