Date: Sat, 14 Sep 1996 16:49:21 +1000 From: Bruce Evans <bde@zeta.org.au> To: current@freebsd.org Subject: pentium-optimized bzero and bcopy Message-ID: <199609140649.QAA06910@godzilla.zeta.org.au>
next in thread | raw e-mail | index | archive | help
I haven't been doing anything with this for too long. I'm not happy with a few of the details: 1) The function pointer interface to bzero and bcopy is too i*86-specific, and doesn't gain much speed even on an i*86. It is convenient for testing and for dynamic configuration, but this can be implemented at a lower level at a cost of one jmp or call+ret. 2) The FPU is assumed to work on 586's. I don't want ifdefs for it. The ifdefs for I586_FAST_BCOPY (which are for the i586-optimized copyin() and copyout(), not for bcopy()) should go away soon. Bruce diff -c2 src/sys/i386/i386/identcpu.c~ src/sys/i386/i386/identcpu.c *** src/sys/i386/i386/identcpu.c~ Sat Sep 7 23:08:26 1996 --- src/sys/i386/i386/identcpu.c Mon Sep 9 09:16:36 1996 *************** *** 41,47 **** /* XXX - should be in header file */ ! extern void i486_bzero __P((void *, size_t)); ! extern void i586_bzero __P((void *, size_t)); ! extern void i686_bzero __P((void *, size_t)); void identifycpu(void); /* XXX should be in different header file */ --- 41,57 ---- /* XXX - should be in header file */ ! extern void i586_bcopy __P((const void *from, void *to, size_t len)); ! extern void i486_bzero __P((void *buf, size_t len)); ! extern void i586_bzero __P((void *buf, size_t len)); void identifycpu(void); /* XXX should be in different header file */ *************** *** 174,177 **** --- 169,175 ---- ((i586_ctr_freq + 4999) / 10000) % 100); printf("586"); + bcopy = i586_bcopy; + bzero = i586_bzero; + ovbcopy = i586_bcopy; break; #endif diff -c2 src/sys/i386/i386/support.s~ src/sys/i386/i386/support.s *** src/sys/i386/i386/support.s~ Wed Sep 11 03:32:22 1996 --- src/sys/i386/i386/support.s Wed Sep 11 03:32:32 1996 *************** *** 45,52 **** #define IDXSHIFT 10 - .data .globl _bzero ! _bzero: .long _generic_bzero .text --- 43,59 ---- #define IDXSHIFT 10 .data + .globl _bcopy + _bcopy: + .long _generic_bcopy .globl _bzero ! _bzero: ! .long _generic_bzero ! .globl _ovbcopy ! _ovbcopy: ! .long _generic_bcopy ! 
kernel_fpu_lock: ! .byte 0xfe ! .space 3 .text *************** *** 173,236 **** #endif - #if 0 /* Actually lowers performance in real-world cases */ #if defined(I586_CPU) || defined(I686_CPU) ! ALTENTRY(i586_bzero) ! ENTRY(i686_bzero) ! pushl %edi ! movl 8(%esp),%edi /* destination pointer */ ! movl 12(%esp),%edx /* size (in 8-bit words) */ ! xorl %eax,%eax /* store data */ ! cld ! /* If less than 100 bytes to write, skip tricky code. */ ! cmpl $100,%edx ! movl %edx,%ecx /* needed when branch is taken! */ ! jl 2f ! ! /* First write 0-3 bytes to make the pointer 32-bit aligned. */ ! movl %edi,%ecx /* Copy ptr to ecx... */ ! negl %ecx /* ...and negate that and... */ ! andl $3,%ecx /* ...mask to get byte count. */ ! subl %ecx,%edx /* adjust global byte count */ ! rep ! stosb ! subl $32,%edx /* offset count for unrolled loop */ ! movl (%edi),%ecx /* Fetch destination cache line */ ! .align 2,0x90 /* supply 0x90 for broken assemblers */ ! 1: ! movl 28(%edi),%ecx /* allocate cache line for destination */ ! subl $32,%edx /* decr loop count */ ! movl %eax,0(%edi) /* store words pairwise */ ! movl %eax,4(%edi) ! movl %eax,8(%edi) ! movl %eax,12(%edi) ! movl %eax,16(%edi) ! movl %eax,20(%edi) ! movl %eax,24(%edi) ! movl %eax,28(%edi) ! ! leal 32(%edi),%edi /* update destination pointer */ ! jge 1b ! leal 32(%edx),%ecx ! /* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ ! 2: shrl $2,%ecx rep stosl ! ! /* Finally write the last 0-3 bytes. */ ! movl %edx,%ecx andl $3,%ecx rep stosb - popl %edi ret ! #endif ! #endif /* fillw(pat, base, cnt) */ --- 182,326 ---- #endif #if defined(I586_CPU) || defined(I686_CPU) ! ENTRY(i586_bzero) ! movl 4(%esp),%edx ! movl 8(%esp),%ecx ! /* ! * The FPU register method is twice as fast as the integer register ! * method unless the target is in the L1 cache and we pre-allocate a ! * cache line for it (then the integer register method is 4-5 times ! * faster). However, we never pre-allocate cache lines, since that ! 
* would make the integer method 25% or more slower for the common ! * case when the target isn't in either the L1 cache or the L2 cache. ! * Thus we normally use the FPU register method unless the overhead ! * would be too large. ! */ ! cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */ ! jb intreg_i586_bzero ! /* ! * The FPU registers may belong to an application or to fastmove() ! * or to another invocation of bcopy() or ourself in a higher level ! * interrupt or trap handler. Preserving the registers is ! * complicated since we avoid it if possible at all levels. We ! * want to localize the complications even when that increases them. ! * Here the extra work involves preserving CR0_TS in TS. ! * `npxproc != NULL' is supposed to be the condition that all the ! * FPU resources belong to an application, but npxproc and CR0_TS ! * aren't set atomically enough for this condition to work in ! * interrupt handlers. ! * ! * Case 1: FPU registers belong to the application: we must preserve ! * the registers if we use them, so we only use the FPU register ! * method if the target size is large enough to amortize the extra ! * overhead for preserving them. CR0_TS must be preserved although ! * it is very likely to end up as set. ! * ! * Case 2: FPU registers belong to fastmove(): fastmove() currently ! * makes the registers look like they belong to an application so ! * that cpu_switch() and savectx() don't have to know about it, so ! * this case reduces to case 1. ! * ! * Case 3: FPU registers belong to the kernel: don't use the FPU ! * register method. This case is unlikely, and supporting it would ! * be more complicated and might take too much stack. ! * ! * Case 4: FPU registers don't belong to anyone: the FPU registers ! * don't need to be preserved, so we always use the FPU register ! * method. CR0_TS must be preserved although it is very likely to ! * always end up as clear. ! */ ! cmpl $0,_npxproc ! je i586_bz1 ! 
cmpl $256+184,%ecx /* empirical; not quite 2*108 more */ ! jb intreg_i586_bzero ! sarb $1,kernel_fpu_lock ! jc intreg_i586_bzero ! smsw %ax ! clts ! subl $108,%esp ! fnsave 0(%esp) ! jmp i586_bz2 ! ! i586_bz1: ! sarb $1,kernel_fpu_lock ! jc intreg_i586_bzero ! smsw %ax ! clts ! fninit /* XXX should avoid needing this */ ! i586_bz2: ! fldz ! /* ! * Align to an 8 byte boundary (misalignment in the main loop would ! * cost a factor of >= 2). Avoid jumps (at little cost if it is ! * already aligned) by always zeroing 8 bytes and using the part up ! * to the _next_ alignment position. ! */ ! fstl 0(%edx) ! addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */ ! addl $8,%edx ! andl $~7,%edx ! subl %edx,%ecx ! /* ! * Similarly align `len' to a multiple of 8. ! */ ! fstl -8(%edx,%ecx) ! decl %ecx ! andl $~7,%ecx ! /* ! * This wouldn't be any faster if it were unrolled, since the loop ! * control instructions are much faster than the fstl and/or done ! * in parallel with it so their overhead is insignificant. ! */ ! fpureg_i586_bzero_loop: ! fstl 0(%edx) ! addl $8,%edx ! subl $8,%ecx ! cmpl $8,%ecx ! jae fpureg_i586_bzero_loop ! ! cmpl $0,_npxproc ! je i586_bz3 ! frstor 0(%esp) ! addl $108,%esp ! lmsw %ax ! movb $0xfe,kernel_fpu_lock ! ret ! ! i586_bz3: ! fstpl %st(0) ! lmsw %ax ! movb $0xfe,kernel_fpu_lock ! ret ! ! intreg_i586_bzero: ! /* ! * `rep stos' seems to be the best method in practice for small ! * counts. Fancy methods usually take too long to start up due ! * to cache and BTB misses. ! */ ! pushl %edi ! movl %edx,%edi ! xorl %eax,%eax shrl $2,%ecx + cld rep stosl ! movl 12(%esp),%ecx andl $3,%ecx + jne 1f + popl %edi + ret + + 1: rep stosb popl %edi ret ! #endif /* I586_CPU || I686_CPU */ /* fillw(pat, base, cnt) */ *************** *** 279,288 **** /* ! * (ov)bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ! ALTENTRY(ovbcopy) ! ENTRY(bcopy) ! bcopy: pushl %esi pushl %edi --- 369,376 ---- /* ! 
* generic_bcopy(src, dst, cnt) * ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800 */ ! ENTRY(generic_bcopy) pushl %esi pushl %edi *************** *** 295,298 **** --- 383,387 ---- cmpl %ecx,%eax /* overlapping? */ jb 1f + shrl $2,%ecx /* copy by 32-bit words */ cld /* nope, copy forwards */ *************** *** 328,331 **** --- 417,555 ---- ret + ENTRY(i586_bcopy) + pushl %esi + pushl %edi + movl 12(%esp),%esi + movl 16(%esp),%edi + movl 20(%esp),%ecx + + movl %edi,%eax + subl %esi,%eax + cmpl %ecx,%eax /* overlapping? */ + jb 1f + + cmpl $1024,%ecx + jb small_i586_bcopy + + sarb $1,kernel_fpu_lock + jc small_i586_bcopy + cmpl $0,_npxproc + je i586_bc1 + smsw %dx + clts + subl $108,%esp + fnsave 0(%esp) + jmp 4f + + i586_bc1: + smsw %dx + clts + fninit /* XXX should avoid needing this */ + + ALIGN_TEXT + 4: + pushl %ecx + #define DCACHE_SIZE 8192 + cmpl $(DCACHE_SIZE-512)/2,%ecx + jbe 2f + movl $(DCACHE_SIZE-512)/2,%ecx + 2: + subl %ecx,0(%esp) + cmpl $256,%ecx + jb 5f /* XXX should prefetch if %ecx >= 32 */ + pushl %esi + pushl %ecx + ALIGN_TEXT + 3: + movl 0(%esi),%eax + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + addl $256,%esi + subl $256,%ecx + cmpl $256,%ecx + jae 3b + popl %ecx + popl %esi + 5: + ALIGN_TEXT + large_i586_bcopy_loop: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fistpq 56(%edi) + fistpq 48(%edi) + fistpq 40(%edi) + fistpq 32(%edi) + fistpq 24(%edi) + fistpq 16(%edi) + fistpq 8(%edi) + fistpq 0(%edi) + addl $64,%esi + addl $64,%edi + subl $64,%ecx + cmpl $64,%ecx + jae large_i586_bcopy_loop + popl %eax + addl %eax,%ecx + cmpl $64,%ecx + jae 4b + + cmpl $0,_npxproc + je i586_bc2 + frstor 0(%esp) + addl $108,%esp + i586_bc2: + lmsw %dx + movb $0xfe,kernel_fpu_lock + + /* + * This is a duplicate of the main part of generic_bcopy. 
See the comments + * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and + * would mess up high resolution profiling. + */ + ALIGN_TEXT + small_i586_bcopy: + shrl $2,%ecx + cld + rep + movsl + movl 20(%esp),%ecx + andl $3,%ecx + rep + movsb + popl %edi + popl %esi + ret + + ALIGN_TEXT + 1: + addl %ecx,%edi + addl %ecx,%esi + decl %edi + decl %esi + andl $3,%ecx + std + rep + movsb + movl 20(%esp),%ecx + shrl $2,%ecx + subl $3,%esi + subl $3,%edi + rep + movsl + popl %edi + popl %esi + cld + ret /* diff -c2 src/sys/i386/i386/swtch.s~ src/sys/i386/i386/swtch.s *** src/sys/i386/i386/swtch.s~ Thu Aug 1 04:43:17 1996 --- src/sys/i386/i386/swtch.s Tue Sep 10 07:06:50 1996 *************** *** 493,497 **** pushl %ecx pushl %eax ! call _bcopy addl $12,%esp #endif /* NNPX > 0 */ --- 516,520 ---- pushl %ecx pushl %eax ! call _generic_bcopy addl $12,%esp #endif /* NNPX > 0 */ diff -c2 src/sys/sys/systm.h~ src/sys/sys/systm.h *** src/sys/sys/systm.h~ Sat Sep 14 07:28:06 1996 --- src/sys/sys/systm.h Tue Sep 3 03:24:04 1996 *************** *** 129,135 **** void ttyprintf __P((struct tty *, const char *, ...)); ! void bcopy __P((const void *from, void *to, size_t len)); ! void ovbcopy __P((const void *from, void *to, size_t len)); extern void (*bzero) __P((void *buf, size_t len)); void *memcpy __P((void *to, const void *from, size_t len)); --- 126,132 ---- void ttyprintf __P((struct tty *, const char *, ...)); ! extern void (*bcopy) __P((const void *from, void *to, size_t len)); extern void (*bzero) __P((void *buf, size_t len)); + extern void (*ovbcopy) __P((const void *from, void *to, size_t len)); void *memcpy __P((void *to, const void *from, size_t len));
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199609140649.QAA06910>