Date: Sat, 14 Sep 1996 16:49:21 +1000
From: Bruce Evans <bde@zeta.org.au>
To: current@freebsd.org
Subject: pentium-optimized bzero and bcopy
Message-ID: <199609140649.QAA06910@godzilla.zeta.org.au>
I haven't done anything with this for too long.  I'm not happy
with a few of the details:
1) The function pointer interface to bzero and bcopy is too i*86-
   specific, and doesn't gain much speed even on an i*86.  It is
   convenient for testing and for dynamic configuration, but it
   could be implemented at a lower level at a cost of one jmp or
   call+ret (see the C sketch after this list).
2) The FPU is assumed to work on 586's. I don't want ifdefs for
it. The ifdefs for I586_FAST_BCOPY (which are for the i586-
optimized copyin() and copyout(), not for bcopy()) should go
away soon.
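
For concreteness, the dispatch pattern reduces to roughly the following
C sketch.  The pointer declaration mirrors the systm.h hunk below;
select_cpu_funcs() is a made-up stand-in for the CPU-class switch in
identifycpu(), and the two implementations are the assembler entries in
support.s:

	#include <stddef.h>

	/* assembler implementations in support.s */
	extern void generic_bzero(void *buf, size_t len);
	extern void i586_bzero(void *buf, size_t len);

	/* one pointer per primitive, defaulting to the generic version
	 * (the patch does this with `.long _generic_bzero' in .data) */
	void (*bzero)(void *buf, size_t len) = generic_bzero;

	static void
	select_cpu_funcs(int cpu_class)		/* hypothetical helper */
	{
		if (cpu_class == 5)		/* 586-class CPU */
			bzero = i586_bzero;
		/* callers now pay one indirect call per bzero() */
	}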
Bruce
diff -c2 src/sys/i386/i386/identcpu.c~ src/sys/i386/i386/identcpu.c
*** src/sys/i386/i386/identcpu.c~ Sat Sep 7 23:08:26 1996
--- src/sys/i386/i386/identcpu.c Mon Sep 9 09:16:36 1996
***************
*** 41,47 ****
/* XXX - should be in header file */
! extern void i486_bzero __P((void *, size_t));
! extern void i586_bzero __P((void *, size_t));
! extern void i686_bzero __P((void *, size_t));
void identifycpu(void); /* XXX should be in different header file */
--- 41,57 ----
/* XXX - should be in header file */
! extern void i586_bcopy __P((const void *from, void *to, size_t len));
! extern void i486_bzero __P((void *buf, size_t len));
! extern void i586_bzero __P((void *buf, size_t len));
void identifycpu(void); /* XXX should be in different header file */
***************
*** 174,177 ****
--- 169,175 ----
((i586_ctr_freq + 4999) / 10000) % 100);
printf("586");
+ bcopy = i586_bcopy;
+ bzero = i586_bzero;
+ ovbcopy = i586_bcopy;
break;
#endif
diff -c2 src/sys/i386/i386/support.s~ src/sys/i386/i386/support.s
*** src/sys/i386/i386/support.s~ Wed Sep 11 03:32:22 1996
--- src/sys/i386/i386/support.s Wed Sep 11 03:32:32 1996
***************
*** 45,52 ****
#define IDXSHIFT 10
-
.data
.globl _bzero
! _bzero: .long _generic_bzero
.text
--- 43,59 ----
#define IDXSHIFT 10
.data
+ .globl _bcopy
+ _bcopy:
+ .long _generic_bcopy
.globl _bzero
! _bzero:
! .long _generic_bzero
! .globl _ovbcopy
! _ovbcopy:
! .long _generic_bcopy
! kernel_fpu_lock:
! .byte 0xfe
! .space 3
.text
***************
*** 173,236 ****
#endif
- #if 0 /* Actually lowers performance in real-world cases */
#if defined(I586_CPU) || defined(I686_CPU)
! ALTENTRY(i586_bzero)
! ENTRY(i686_bzero)
! pushl %edi
! movl 8(%esp),%edi /* destination pointer */
! movl 12(%esp),%edx /* size (in 8-bit words) */
! xorl %eax,%eax /* store data */
! cld
! /* If less than 100 bytes to write, skip tricky code. */
! cmpl $100,%edx
! movl %edx,%ecx /* needed when branch is taken! */
! jl 2f
!
! /* First write 0-3 bytes to make the pointer 32-bit aligned. */
! movl %edi,%ecx /* Copy ptr to ecx... */
! negl %ecx /* ...and negate that and... */
! andl $3,%ecx /* ...mask to get byte count. */
! subl %ecx,%edx /* adjust global byte count */
! rep
! stosb
! subl $32,%edx /* offset count for unrolled loop */
! movl (%edi),%ecx /* Fetch destination cache line */
! .align 2,0x90 /* supply 0x90 for broken assemblers */
! 1:
! movl 28(%edi),%ecx /* allocate cache line for destination */
! subl $32,%edx /* decr loop count */
! movl %eax,0(%edi) /* store words pairwise */
! movl %eax,4(%edi)
! movl %eax,8(%edi)
! movl %eax,12(%edi)
! movl %eax,16(%edi)
! movl %eax,20(%edi)
! movl %eax,24(%edi)
! movl %eax,28(%edi)
!
! leal 32(%edi),%edi /* update destination pointer */
! jge 1b
! leal 32(%edx),%ecx
! /* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */
! 2:
shrl $2,%ecx
rep
stosl
!
! /* Finally write the last 0-3 bytes. */
! movl %edx,%ecx
andl $3,%ecx
rep
stosb
-
popl %edi
ret
! #endif
! #endif
/* fillw(pat, base, cnt) */
--- 182,326 ----
#endif
#if defined(I586_CPU) || defined(I686_CPU)
! ENTRY(i586_bzero)
! movl 4(%esp),%edx
! movl 8(%esp),%ecx
! /*
! * The FPU register method is twice as fast as the integer register
! * method unless the target is in the L1 cache and we pre-allocate a
! * cache line for it (then the integer register method is 4-5 times
! * faster). However, we never pre-allocate cache lines, since that
! * would make the integer method 25% or more slower for the common
! * case when the target isn't in either the L1 cache or the L2 cache.
! * Thus we normally use the FPU register method unless the overhead
! * would be too large.
! */
! cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
! jb intreg_i586_bzero
! /*
! * The FPU registers may belong to an application or to fastmove()
! * or to another invocation of bcopy() or ourself in a higher level
! * interrupt or trap handler. Preserving the registers is
! * complicated since we avoid it if possible at all levels. We
! * want to localize the complications even when that increases them.
! * Here the extra work involves preserving CR0_TS in TS.
! * `npxproc != NULL' is supposed to be the condition that all the
! * FPU resources belong to an application, but npxproc and CR0_TS
! * aren't set atomically enough for this condition to work in
! * interrupt handlers.
! *
! * Case 1: FPU registers belong to the application: we must preserve
! * the registers if we use them, so we only use the FPU register
! * method if the target size is large enough to amortize the extra
! * overhead for preserving them. CR0_TS must be preserved although
! * it is very likely to end up as set.
! *
! * Case 2: FPU registers belong to fastmove(): fastmove() currently
! * makes the registers look like they belong to an application so
! * that cpu_switch() and savectx() don't have to know about it, so
! * this case reduces to case 1.
! *
! * Case 3: FPU registers belong to the kernel: don't use the FPU
! * register method. This case is unlikely, and supporting it would
! * be more complicated and might take too much stack.
! *
! * Case 4: FPU registers don't belong to anyone: the FPU registers
! * don't need to be preserved, so we always use the FPU register
! * method. CR0_TS must be preserved although it is very likely to
! * always end up as clear.
! */
! cmpl $0,_npxproc
! je i586_bz1
! cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
! jb intreg_i586_bzero
! sarb $1,kernel_fpu_lock
! jc intreg_i586_bzero
! smsw %ax
! clts
! subl $108,%esp
! fnsave 0(%esp)
! jmp i586_bz2
!
! i586_bz1:
! sarb $1,kernel_fpu_lock
! jc intreg_i586_bzero
! smsw %ax
! clts
! fninit /* XXX should avoid needing this */
! i586_bz2:
! fldz
! /*
! * Align to an 8 byte boundary (misalignment in the main loop would
! * cost a factor of >= 2). Avoid jumps (at little cost if it is
! * already aligned) by always zeroing 8 bytes and using the part up
! * to the _next_ alignment position.
! */
! fstl 0(%edx)
! addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
! addl $8,%edx
! andl $~7,%edx
! subl %edx,%ecx
! /*
! * Similarly align `len' to a multiple of 8.
! */
! fstl -8(%edx,%ecx)
! decl %ecx
! andl $~7,%ecx
! /*
! * This wouldn't be any faster if it were unrolled, since the loop
! * control instructions are much faster than the fstl and/or done
! * in parallel with it so their overhead is insignificant.
! */
! fpureg_i586_bzero_loop:
! fstl 0(%edx)
! addl $8,%edx
! subl $8,%ecx
! cmpl $8,%ecx
! jae fpureg_i586_bzero_loop
!
! cmpl $0,_npxproc
! je i586_bz3
! frstor 0(%esp)
! addl $108,%esp
! lmsw %ax
! movb $0xfe,kernel_fpu_lock
! ret
!
! i586_bz3:
! fstpl %st(0)
! lmsw %ax
! movb $0xfe,kernel_fpu_lock
! ret
!
! intreg_i586_bzero:
! /*
! * `rep stos' seems to be the best method in practice for small
! * counts. Fancy methods usually take too long to start up due
! * to cache and BTB misses.
! */
! pushl %edi
! movl %edx,%edi
! xorl %eax,%eax
shrl $2,%ecx
+ cld
rep
stosl
! movl 12(%esp),%ecx
andl $3,%ecx
+ jne 1f
+ popl %edi
+ ret
+
+ 1:
rep
stosb
popl %edi
ret
! #endif /* I586_CPU || I686_CPU */
/* fillw(pat, base, cnt) */
***************
*** 279,288 ****
/*
! * (ov)bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
! ALTENTRY(ovbcopy)
! ENTRY(bcopy)
! bcopy:
pushl %esi
pushl %edi
--- 369,376 ----
/*
! * generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
! ENTRY(generic_bcopy)
pushl %esi
pushl %edi
***************
*** 295,298 ****
--- 383,387 ----
cmpl %ecx,%eax /* overlapping? */
jb 1f
+
shrl $2,%ecx /* copy by 32-bit words */
cld /* nope, copy forwards */
***************
*** 328,331 ****
--- 417,555 ----
ret
+ ENTRY(i586_bcopy)
+ pushl %esi
+ pushl %edi
+ movl 12(%esp),%esi
+ movl 16(%esp),%edi
+ movl 20(%esp),%ecx
+
+ movl %edi,%eax
+ subl %esi,%eax
+ cmpl %ecx,%eax /* overlapping? */
+ jb 1f
+
+ cmpl $1024,%ecx
+ jb small_i586_bcopy
+
+ sarb $1,kernel_fpu_lock
+ jc small_i586_bcopy
+ cmpl $0,_npxproc
+ je i586_bc1
+ smsw %dx
+ clts
+ subl $108,%esp
+ fnsave 0(%esp)
+ jmp 4f
+
+ i586_bc1:
+ smsw %dx
+ clts
+ fninit /* XXX should avoid needing this */
+
+ ALIGN_TEXT
+ 4:
+ pushl %ecx
+ #define DCACHE_SIZE 8192
+ cmpl $(DCACHE_SIZE-512)/2,%ecx
+ jbe 2f
+ movl $(DCACHE_SIZE-512)/2,%ecx
+ 2:
+ subl %ecx,0(%esp)
+ cmpl $256,%ecx
+ jb 5f /* XXX should prefetch if %ecx >= 32 */
+ pushl %esi
+ pushl %ecx
+ ALIGN_TEXT
+ 3:
+ movl 0(%esi),%eax
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+ addl $256,%esi
+ subl $256,%ecx
+ cmpl $256,%ecx
+ jae 3b
+ popl %ecx
+ popl %esi
+ 5:
+ ALIGN_TEXT
+ large_i586_bcopy_loop:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fistpq 56(%edi)
+ fistpq 48(%edi)
+ fistpq 40(%edi)
+ fistpq 32(%edi)
+ fistpq 24(%edi)
+ fistpq 16(%edi)
+ fistpq 8(%edi)
+ fistpq 0(%edi)
+ addl $64,%esi
+ addl $64,%edi
+ subl $64,%ecx
+ cmpl $64,%ecx
+ jae large_i586_bcopy_loop
+ popl %eax
+ addl %eax,%ecx
+ cmpl $64,%ecx
+ jae 4b
+
+ cmpl $0,_npxproc
+ je i586_bc2
+ frstor 0(%esp)
+ addl $108,%esp
+ i586_bc2:
+ lmsw %dx
+ movb $0xfe,kernel_fpu_lock
+
+ /*
+ * This is a duplicate of the main part of generic_bcopy. See the comments
+ * there. Jumping into generic_bcopy would cost a whole 0-1 cycles and
+ * would mess up high resolution profiling.
+ */
+ ALIGN_TEXT
+ small_i586_bcopy:
+ shrl $2,%ecx
+ cld
+ rep
+ movsl
+ movl 20(%esp),%ecx
+ andl $3,%ecx
+ rep
+ movsb
+ popl %edi
+ popl %esi
+ ret
+
+ ALIGN_TEXT
+ 1:
+ addl %ecx,%edi
+ addl %ecx,%esi
+ decl %edi
+ decl %esi
+ andl $3,%ecx
+ std
+ rep
+ movsb
+ movl 20(%esp),%ecx
+ shrl $2,%ecx
+ subl $3,%esi
+ subl $3,%edi
+ rep
+ movsl
+ popl %edi
+ popl %esi
+ cld
+ ret
/*
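
A note on the kernel_fpu_lock byte above: it is initialized to 0xfe, and
a single `sarb $1' both tests and takes the lock, with the bit shifted
out into the carry flag telling the caller whether it won.  The assumed
semantics look like this in C (the asm relies on the one-instruction
test-and-set being uninterruptible on a uniprocessor; plain C gives no
such guarantee, so this is only a model):

	static unsigned char kernel_fpu_lock = 0xfe;	/* 11111110: free */

	static int
	try_lock_fpu(void)
	{
		unsigned char old = kernel_fpu_lock;

		/* sarb $1: arithmetic shift right, sign bit replicated */
		kernel_fpu_lock = (old >> 1) | (old & 0x80);
		return ((old & 1) == 0);	/* CF clear <=> was free */
	}

	static void
	unlock_fpu(void)
	{
		kernel_fpu_lock = 0xfe;	/* movb $0xfe,kernel_fpu_lock */
	}

While free, the byte is 0xfe; the first sarb turns it into 0xff with
carry clear (lock taken), and any further sarb leaves 0xff with carry
set, so a nested caller falls back to the integer method instead of
spinning.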
diff -c2 src/sys/i386/i386/swtch.s~ src/sys/i386/i386/swtch.s
*** src/sys/i386/i386/swtch.s~ Thu Aug 1 04:43:17 1996
--- src/sys/i386/i386/swtch.s Tue Sep 10 07:06:50 1996
***************
*** 493,497 ****
pushl %ecx
pushl %eax
! call _bcopy
addl $12,%esp
#endif /* NNPX > 0 */
--- 516,520 ----
pushl %ecx
pushl %eax
! call _generic_bcopy
addl $12,%esp
#endif /* NNPX > 0 */
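
The swtch.s hunk above switches the NPX save path from the bcopy pointer
to a direct call, presumably because that copy moves the 108-byte fnsave
image around and must not itself land in i586_bcopy, which borrows the
very FPU registers whose state is being handled.  A sketch of the idea,
with made-up names:

	#include <stddef.h>

	extern void (*bcopy)(const void *, void *, size_t); /* may be i586_bcopy */
	extern void generic_bcopy(const void *, void *, size_t); /* integer only */

	struct save87 { unsigned char pad[108]; };	/* fnsave image */

	static void
	copy_npx_state(struct save87 *dst, const struct save87 *src)
	{
		/* must not touch the FPU here, so bypass the pointer */
		generic_bcopy(src, dst, sizeof(*dst));
	}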
diff -c2 src/sys/sys/systm.h~ src/sys/sys/systm.h
*** src/sys/sys/systm.h~ Sat Sep 14 07:28:06 1996
--- src/sys/sys/systm.h Tue Sep 3 03:24:04 1996
***************
*** 129,135 ****
void ttyprintf __P((struct tty *, const char *, ...));
! void bcopy __P((const void *from, void *to, size_t len));
! void ovbcopy __P((const void *from, void *to, size_t len));
extern void (*bzero) __P((void *buf, size_t len));
void *memcpy __P((void *to, const void *from, size_t len));
--- 126,132 ----
void ttyprintf __P((struct tty *, const char *, ...));
! extern void (*bcopy) __P((const void *from, void *to, size_t len));
extern void (*bzero) __P((void *buf, size_t len));
+ extern void (*ovbcopy) __P((const void *from, void *to, size_t len));
void *memcpy __P((void *to, const void *from, size_t len));
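
Since bcopy and ovbcopy become pointers with the same type as the old
functions, callers compile unchanged; an indirect call is spelled the
same as a direct one.  A trivial sketch (example() is a made-up caller):

	#include <stddef.h>

	extern void (*bcopy)(const void *from, void *to, size_t len);

	static void
	example(char *dst, const char *src, size_t n)
	{
		bcopy(src, dst, n);	/* same syntax as the old direct call */
	}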
