From owner-freebsd-current Wed May 15 03:12:12 1996 Return-Path: owner-current Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id DAA26724 for current-outgoing; Wed, 15 May 1996 03:12:12 -0700 (PDT) Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA26579 for ; Wed, 15 May 1996 03:12:00 -0700 (PDT) Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id DAA15029; Wed, 15 May 1996 03:11:17 -0700 (PDT) Date: Wed, 15 May 1996 03:11:17 -0700 (PDT) Message-Id: <199605151011.DAA15029@silvia.HIP.Berkeley.EDU> To: bde@zeta.org.au CC: current@freebsd.org, ccd@stampede.cs.berkeley.edu In-reply-to: <199605110914.TAA21036@godzilla.zeta.org.au> (message from Bruce Evans on Sat, 11 May 1996 19:14:07 +1000) Subject: Re: some more on fast bcopy From: asami@cs.berkeley.edu (Satoshi Asami) Sender: owner-current@freebsd.org X-Loop: FreeBSD.org Precedence: bulk * It isn't even necessary to save the FP registers, at least in * non-interrupt handlers, if they aren't already in use. In particular, * copyin() and copyout() are never called from interrupt handlers, so * it isn't necessary to preserve the kernel FP registers (they are * guaranteed to not be in use). Only the user FP context needs to be * preserved. This optimization is closely related to fixing the bug. * It should start out something like: * * if (intr_nesting_level > 0) { * /* Save reentrantly the same as now. */ * } else { * if (npxproc != NULL) { * assert(npxproc == curproc); * fnsave(&curpcb->pcb_savefpu); * npxproc = NULL; * } * /* Now we own the FPU. */ * * /* * * The process' FP state is saved in the pcb, but if we get * * switched, the cpu_switch() will store our FP state in the * * pcb. It should be possible to avoid all the copying for * * this, e.g., by setting a flag to tell cpu_switch() to * * save the state somewhere else. 
* */ * tmp = curpcb->pcb_savefpu; * * stop_emulating(); * npxproc = curproc; * } * ... * if (intr_nesting_level > 0) * /* Restore reentrantly the same as now. */ * } else { * curpcb->pcb_savefpu = tmp; * start_emulating(); * npxproc = NULL; * } Okay, I implemented this (I think) with the help of gcc -S, but I still get "FPU not available" panics intermittently (often during "make install"s). Here's the diff, can you please check and see if something's wrong? (I know 128 is too small and 256 is too large, I have them this way to test it "aggressively". :) === Index: support.s =================================================================== RCS file: /usr/cvs/src/sys/i386/i386/support.s,v retrieving revision 1.35 diff -u -r1.35 support.s --- support.s 1996/05/03 21:01:00 1.35 +++ support.s 1996/05/15 09:31:42 @@ -453,6 +453,16 @@ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyout + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +510,16 @@ cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyin + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +530,10 @@ rep movsb +#ifdef I586_FAST_BCOPY + ALIGN_TEXT +done_copyin: +#endif /* I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -525,6 +549,252 @@ movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret + +#ifdef I586_FAST_BCOPY +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. 
storage + */ +/* +LC0: + .ascii "npxproc == curproc\0" +LC1: + .ascii "support.s" + */ + ALIGN_TEXT +fastmove: + cmpl $255,%ecx + jbe fastmove_tail + + testl $7,%esi /* check if src addr is multiple of 8 */ + jnz fastmove_tail + + testl $7,%edi /* check if dst addr is multiple of 8 */ + jnz fastmove_tail + + pushl %ebp + movl %esp,%ebp + subl $176,%esp + +/* if (intr_nesting_level > 0) */ + cmpb $0,_intr_nesting_level + je L6 +/* save reentrantly */ + movl %cr0,%edx + clts + fnsave -176(%ebp) + jmp L7 + +/* else { */ + ALIGN_TEXT +L6: +/* if (npxproc != NULL) { */ + cmpl $0,_npxproc + je L8 +/* assert(npxproc == curproc); */ +/* movl _npxproc,%eax + cmpl %eax,_curproc + je L6b + pushl LC0 + pushl $599 + pushl LC1 + call ___assert + addl $12,%esp +L6b: */ +/* fnsave(&curpcb->pcb_savefpu); -- pcb_savefpu is at pcb offset 112, the same offset the tmp copies use */ + movl _curpcb,%eax + fnsave 112(%eax) +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +L8: +/* now we own the FPU. */ + +/* + * The process' FP state is saved in the pcb, but if we get + * switched, the cpu_switch() will store our FP state in the + * pcb. It should be possible to avoid all the copying for + * this, e.g., by setting a flag to tell cpu_switch() to + * save the state somewhere else. 
+ */ +/* tmp = curpcb->pcb_savefpu; */ + pushl %edi + pushl %esi + pushl %ecx + leal -176(%ebp),%edi + movl _curpcb,%esi + addl $112,%esi + cld + movl $44,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi +/* stop_emulating(); */ + clts +/* npxproc = curproc; */ + movl _curproc,%eax + movl %eax,_npxproc +/* } */ +L7: + ALIGN_TEXT +fastmove_loop: + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + + cmpl $259,%ecx + jbe fastmove_tmp + movl 256(%esi),%eax + + ALIGN_TEXT +fastmove_tmp: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fxch %st(7) + fistpq 0(%edi) + fxch %st(5) + fistpq 8(%edi) + fxch %st(3) + fistpq 16(%edi) + fxch %st(1) + fistpq 24(%edi) + fistpq 32(%edi) + fistpq 40(%edi) + fistpq 48(%edi) + fistpq 56(%edi) + fildq 64(%esi) + fildq 72(%esi) + fildq 80(%esi) + fildq 88(%esi) + fildq 96(%esi) + fildq 104(%esi) + fildq 112(%esi) + fildq 120(%esi) + fxch %st(7) + fistpq 64(%edi) + fxch %st(5) + fistpq 72(%edi) + fxch %st(3) + fistpq 80(%edi) + fxch %st(1) + fistpq 88(%edi) + fistpq 96(%edi) + fistpq 104(%edi) + fistpq 112(%edi) + fistpq 120(%edi) + fildq 128(%esi) + fildq 136(%esi) + fildq 144(%esi) + fildq 152(%esi) + fildq 160(%esi) + fildq 168(%esi) + fildq 176(%esi) + fildq 184(%esi) + fxch %st(7) + fistpq 128(%edi) + fxch %st(5) + fistpq 136(%edi) + fxch %st(3) + fistpq 144(%edi) + fxch %st(1) + fistpq 152(%edi) + fistpq 160(%edi) + fistpq 168(%edi) + fistpq 176(%edi) + fistpq 184(%edi) + fildq 192(%esi) + fildq 200(%esi) + fildq 208(%esi) + fildq 216(%esi) + fildq 224(%esi) + fildq 232(%esi) + fildq 240(%esi) + fildq 248(%esi) + fxch %st(7) + fistpq 192(%edi) + fxch %st(5) + fistpq 200(%edi) + fxch %st(3) + fistpq 208(%edi) + fxch %st(1) + fistpq 216(%edi) + fistpq 224(%edi) + fistpq 232(%edi) + fistpq 240(%edi) + fistpq 248(%edi) + addl $-256,%ecx + addl 
$256,%esi + addl $256,%edi + cmpl $255,%ecx + ja fastmove_loop + +/* if (intr_nesting_level > 0) */ + + cmpb $0,_intr_nesting_level + je L9 + +/* Restore reentrantly. */ + frstor -176(%ebp) + movl %edx,%cr0 + jmp L10 + +/* else { */ + ALIGN_TEXT +L9: +/* curpcb->pcb_savefpu = tmp; */ + pushl %edi + pushl %esi + pushl %ecx + movl _curpcb,%edi + addl $112,%edi + leal -176(%ebp),%esi + cld + movl $44,%ecx + rep + movsl + popl %ecx + popl %esi + popl %edi + +/* start_emulating(); */ + smsw %ax + orb $8,%al + lmsw %ax +/* npxproc = NULL; */ + movl $0,_npxproc +/* } */ +L10: + movl %ebp,%esp + popl %ebp + + ALIGN_TEXT +fastmove_tail: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_FAST_BCOPY */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory === I couldn't find __assert() in the kernel so I commented out that part. Satoshi