Date:      Sat, 14 Sep 1996 16:49:21 +1000
From:      Bruce Evans <bde@zeta.org.au>
To:        current@freebsd.org
Subject:   pentium-optimized bzero and bcopy
Message-ID:  <199609140649.QAA06910@godzilla.zeta.org.au>

I've been sitting on this for too long without doing anything with it.  I'm not happy
with a few of the details:

1) The function pointer interface to bzero and bcopy is too i*86-
   specific, and doesn't gain much speed even on an i*86.  It is
   convenient for testing and for dynamic configuration, but this
   can be implemented at a lower level at a cost of one jmp or
   call+ret.  (A sketch of the dispatch follows this list.)

2) The FPU is assumed to work on 586's.  I don't want ifdefs for
   it.   The ifdefs for I586_FAST_BCOPY (which are for the i586-
   optimized copyin() and copyout(), not for bcopy()) should go
   away soon.
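
For reference, the dispatch in 1) amounts to roughly this in C (a
sketch only; generic_bzero/i586_bzero are the functions in the patch,
while select_bzero() and the cpu_class test are just illustrations of
what identifycpu() does):

	extern void generic_bzero __P((void *buf, size_t len));
	extern void i586_bzero __P((void *buf, size_t len));

	/* Callers go through this pointer; it starts at the generic version. */
	void (*bzero) __P((void *buf, size_t len)) = generic_bzero;

	static void
	select_bzero(void)		/* hypothetical helper, for illustration */
	{
		if (cpu_class == CPUCLASS_586)	/* illustrative test */
			bzero = i586_bzero;
	}

Doing the same thing at a lower level would hide the indirection behind
a plain bzero entry point, which is where the extra jmp or call+ret
comes from.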

Bruce

diff -c2 src/sys/i386/i386/identcpu.c~ src/sys/i386/i386/identcpu.c
*** src/sys/i386/i386/identcpu.c~	Sat Sep  7 23:08:26 1996
--- src/sys/i386/i386/identcpu.c	Mon Sep  9 09:16:36 1996
***************
*** 41,47 ****
  
  /* XXX - should be in header file */
! extern void i486_bzero	__P((void *, size_t));
! extern void i586_bzero	__P((void *, size_t));
! extern void i686_bzero	__P((void *, size_t));
  
  void identifycpu(void);		/* XXX should be in different header file */
--- 41,57 ----
  
  /* XXX - should be in header file */
! extern void i586_bcopy __P((const void *from, void *to, size_t len));
! extern void i486_bzero __P((void *buf, size_t len));
! extern void i586_bzero __P((void *buf, size_t len));
  
  void identifycpu(void);		/* XXX should be in different header file */
***************
*** 174,177 ****
--- 169,175 ----
  		       ((i586_ctr_freq + 4999) / 10000) % 100);
  		printf("586");
+ 		bcopy = i586_bcopy;
+ 		bzero = i586_bzero;
+ 		ovbcopy = i586_bcopy;
  		break;
  #endif
diff -c2 src/sys/i386/i386/support.s~ src/sys/i386/i386/support.s
*** src/sys/i386/i386/support.s~	Wed Sep 11 03:32:22 1996
--- src/sys/i386/i386/support.s	Wed Sep 11 03:32:32 1996
***************
*** 45,52 ****
  #define IDXSHIFT	10
  
- 
  	.data
  	.globl	_bzero
! _bzero:	.long	_generic_bzero
  
  	.text
--- 43,59 ----
  #define IDXSHIFT	10
  
  	.data
+ 	.globl	_bcopy
+ _bcopy:
+ 	.long	_generic_bcopy
  	.globl	_bzero
! _bzero:
! 	.long	_generic_bzero
! 	.globl	_ovbcopy
! _ovbcopy:
! 	.long	_generic_bcopy
! kernel_fpu_lock:
! 	.byte	0xfe
! 	.space	3
  
  	.text
***************
*** 173,236 ****
  #endif
  
- #if 0	/* Actually lowers performance in real-world cases */
  #if defined(I586_CPU) || defined(I686_CPU)
! ALTENTRY(i586_bzero)
! ENTRY(i686_bzero)
! 	pushl	%edi
! 	movl	8(%esp),%edi	/* destination pointer */
! 	movl	12(%esp),%edx	/* size (in 8-bit words) */
  
! 	xorl	%eax,%eax	/* store data */
! 	cld
  
! /* If less than 100 bytes to write, skip tricky code.  */
! 	cmpl	$100,%edx
! 	movl	%edx,%ecx	/* needed when branch is taken! */
! 	jl	2f
! 
! /* First write 0-3 bytes to make the pointer 32-bit aligned.  */
! 	movl	%edi,%ecx	/* Copy ptr to ecx... */
! 	negl	%ecx		/* ...and negate that and... */
! 	andl	$3,%ecx		/* ...mask to get byte count.  */
! 	subl	%ecx,%edx	/* adjust global byte count */
! 	rep
! 	stosb
  
! 	subl	$32,%edx	/* offset count for unrolled loop */
! 	movl	(%edi),%ecx	/* Fetch destination cache line */
  
! 	.align	2,0x90		/* supply 0x90 for broken assemblers */
! 1:
! 	movl	28(%edi),%ecx	/* allocate cache line for destination */
! 	subl	$32,%edx	/* decr loop count */
! 	movl	%eax,0(%edi)	/* store words pairwise */
! 	movl	%eax,4(%edi)
! 	movl	%eax,8(%edi)
! 	movl	%eax,12(%edi)
! 	movl	%eax,16(%edi)
! 	movl	%eax,20(%edi)
! 	movl	%eax,24(%edi)
! 	movl	%eax,28(%edi)
! 
! 	leal	32(%edi),%edi	/* update destination pointer */
! 	jge	1b
! 	leal	32(%edx),%ecx
  
! /* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped).  */
! 2:
  	shrl	$2,%ecx
  	rep
  	stosl
! 
! /* Finally write the last 0-3 bytes.  */
! 	movl	%edx,%ecx
  	andl	$3,%ecx
  	rep
  	stosb
- 
  	popl	%edi
  	ret
! #endif
! #endif
  
  /* fillw(pat, base, cnt) */
--- 182,326 ----
  #endif
  
  #if defined(I586_CPU) || defined(I686_CPU)
! ENTRY(i586_bzero)
! 	movl	4(%esp),%edx
! 	movl	8(%esp),%ecx
  
! 	/*
! 	 * The FPU register method is twice as fast as the integer register
! 	 * method unless the target is in the L1 cache and we pre-allocate a
! 	 * cache line for it (then the integer register method is 4-5 times
! 	 * faster).  However, we never pre-allocate cache lines, since that
! 	 * would make the integer method 25% or more slower for the common
! 	 * case when the target isn't in either the L1 cache or the L2 cache.
! 	 * Thus we normally use the FPU register method unless the overhead
! 	 * would be too large.
! 	 */
! 	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
! 	jb	intreg_i586_bzero
  
! 	/*
! 	 * The FPU registers may belong to an application or to fastmove()
! 	 * or to another invocation of bcopy() or ourself in a higher level
! 	 * interrupt or trap handler.  Preserving the registers is
! 	 * complicated since we avoid it if possible at all levels.  We
! 	 * want to localize the complications even when that increases them.
! 	 * Here the extra work involves preserving CR0_TS in TS.
! 	 * `npxproc != NULL' is supposed to be the condition that all the
! 	 * FPU resources belong to an application, but npxproc and CR0_TS
! 	 * aren't set atomically enough for this condition to work in
! 	 * interrupt handlers.
! 	 *
! 	 * Case 1: FPU registers belong to the application: we must preserve
! 	 * the registers if we use them, so we only use the FPU register
! 	 * method if the target size is large enough to amortize the extra
! 	 * overhead for preserving them.  CR0_TS must be preserved although
! 	 * it is very likely to end up as set.
! 	 *
! 	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
! 	 * makes the registers look like they belong to an application so
! 	 * that cpu_switch() and savectx() don't have to know about it, so
! 	 * this case reduces to case 1.
! 	 *
! 	 * Case 3: FPU registers belong to the kernel: don't use the FPU
! 	 * register method.  This case is unlikely, and supporting it would
! 	 * be more complicated and might take too much stack.
! 	 *
! 	 * Case 4: FPU registers don't belong to anyone: the FPU registers
! 	 * don't need to be preserved, so we always use the FPU register
! 	 * method.  CR0_TS must be preserved although it is very likely to
! 	 * always end up as clear.
! 	 */
! 	cmpl	$0,_npxproc
! 	je	i586_bz1
! 	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
! 	jb	intreg_i586_bzero
! 	sarb	$1,kernel_fpu_lock
! 	jc	intreg_i586_bzero
! 	smsw	%ax
! 	clts
! 	subl	$108,%esp
! 	fnsave	0(%esp)
! 	jmp	i586_bz2
! 
! i586_bz1:
! 	sarb	$1,kernel_fpu_lock
! 	jc	intreg_i586_bzero
! 	smsw	%ax
! 	clts
! 	fninit				/* XXX should avoid needing this */
! i586_bz2:
! 	fldz
  
! 	/*
! 	 * Align to an 8 byte boundary (misalignment in the main loop would
! 	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
! 	 * already aligned) by always zeroing 8 bytes and using the part up
! 	 * to the _next_ alignment position.
! 	 */
! 	fstl	0(%edx)
! 	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
! 	addl	$8,%edx
! 	andl	$~7,%edx
! 	subl	%edx,%ecx
  
! 	/*
! 	 * Similarly align `len' to a multiple of 8.
! 	 */
! 	fstl	-8(%edx,%ecx)
! 	decl	%ecx
! 	andl	$~7,%ecx
  
! 	/*
! 	 * This wouldn't be any faster if it were unrolled, since the loop
! 	 * control instructions are much faster than the fstl and/or done
! 	 * in parallel with it so their overhead is insignificant.
! 	 */
! fpureg_i586_bzero_loop:
! 	fstl	0(%edx)
! 	addl	$8,%edx
! 	subl	$8,%ecx
! 	cmpl	$8,%ecx
! 	jae	fpureg_i586_bzero_loop
! 
! 	cmpl	$0,_npxproc
! 	je	i586_bz3
! 	frstor	0(%esp)
! 	addl	$108,%esp
! 	lmsw	%ax
! 	movb	$0xfe,kernel_fpu_lock
! 	ret
! 
! i586_bz3:
! 	fstpl	%st(0)
! 	lmsw	%ax
! 	movb	$0xfe,kernel_fpu_lock
! 	ret
! 
! intreg_i586_bzero:
! 	/*
! 	 * `rep stos' seems to be the best method in practice for small
! 	 * counts.  Fancy methods usually take too long to start up due
! 	 * to cache and BTB misses.
! 	 */
! 	pushl	%edi
! 	movl	%edx,%edi
! 	xorl	%eax,%eax
  	shrl	$2,%ecx
+ 	cld
  	rep
  	stosl
! 	movl	12(%esp),%ecx
  	andl	$3,%ecx
+ 	jne	1f
+ 	popl	%edi
+ 	ret
+ 
+ 1:
  	rep
  	stosb
  	popl	%edi
  	ret
! #endif /* I586_CPU || I686_CPU */
  
  /* fillw(pat, base, cnt) */
***************
*** 279,288 ****
  
  /*
!  * (ov)bcopy(src, dst, cnt)
   *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
   */
! ALTENTRY(ovbcopy)
! ENTRY(bcopy)
! bcopy:
  	pushl	%esi
  	pushl	%edi
--- 369,376 ----
  
  /*
!  * generic_bcopy(src, dst, cnt)
   *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
   */
! ENTRY(generic_bcopy)
  	pushl	%esi
  	pushl	%edi
***************
*** 295,298 ****
--- 383,387 ----
  	cmpl	%ecx,%eax			/* overlapping? */
  	jb	1f
+ 
  	shrl	$2,%ecx				/* copy by 32-bit words */
  	cld					/* nope, copy forwards */
***************
*** 328,331 ****
--- 417,555 ----
  	ret
  
+ ENTRY(i586_bcopy)
+ 	pushl	%esi
+ 	pushl	%edi
+ 	movl	12(%esp),%esi
+ 	movl	16(%esp),%edi
+ 	movl	20(%esp),%ecx
+ 
+ 	movl	%edi,%eax
+ 	subl	%esi,%eax
+ 	cmpl	%ecx,%eax			/* overlapping? */
+ 	jb	1f
+ 
+ 	cmpl	$1024,%ecx
+ 	jb	small_i586_bcopy
+ 
+ 	sarb	$1,kernel_fpu_lock
+ 	jc	small_i586_bcopy
+ 	cmpl	$0,_npxproc
+ 	je	i586_bc1
+ 	smsw	%dx
+ 	clts
+ 	subl	$108,%esp
+ 	fnsave	0(%esp)
+ 	jmp	4f
+ 
+ i586_bc1:
+ 	smsw	%dx
+ 	clts
+ 	fninit				/* XXX should avoid needing this */
+ 
+ 	ALIGN_TEXT
+ 4:
+ 	pushl	%ecx
+ #define	DCACHE_SIZE	8192
+ 	cmpl	$(DCACHE_SIZE-512)/2,%ecx
+ 	jbe	2f
+ 	movl	$(DCACHE_SIZE-512)/2,%ecx
+ 2:
+ 	subl	%ecx,0(%esp)
+ 	cmpl	$256,%ecx
+ 	jb	5f			/* XXX should prefetch if %ecx >= 32 */
+ 	pushl	%esi
+ 	pushl	%ecx
+ 	ALIGN_TEXT
+ 3:
+ 	movl	0(%esi),%eax
+ 	movl	32(%esi),%eax
+ 	movl	64(%esi),%eax
+ 	movl	96(%esi),%eax
+ 	movl	128(%esi),%eax
+ 	movl	160(%esi),%eax
+ 	movl	192(%esi),%eax
+ 	movl	224(%esi),%eax
+ 	addl	$256,%esi
+ 	subl	$256,%ecx
+ 	cmpl	$256,%ecx
+ 	jae	3b
+ 	popl	%ecx
+ 	popl	%esi
+ 5:
+ 	ALIGN_TEXT
+ large_i586_bcopy_loop:
+ 	fildq	0(%esi)
+ 	fildq	8(%esi)
+ 	fildq	16(%esi)
+ 	fildq	24(%esi)
+ 	fildq	32(%esi)
+ 	fildq	40(%esi)
+ 	fildq	48(%esi)
+ 	fildq	56(%esi)
+ 	fistpq	56(%edi)
+ 	fistpq	48(%edi)
+ 	fistpq	40(%edi)
+ 	fistpq	32(%edi)
+ 	fistpq	24(%edi)
+ 	fistpq	16(%edi)
+ 	fistpq	8(%edi)
+ 	fistpq	0(%edi)
+ 	addl	$64,%esi
+ 	addl	$64,%edi
+ 	subl	$64,%ecx
+ 	cmpl	$64,%ecx
+ 	jae	large_i586_bcopy_loop
+ 	popl	%eax
+ 	addl	%eax,%ecx
+ 	cmpl	$64,%ecx
+ 	jae	4b
+ 
+ 	cmpl	$0,_npxproc
+ 	je	i586_bc2
+ 	frstor	0(%esp)
+ 	addl	$108,%esp
+ i586_bc2:
+ 	lmsw	%dx
+ 	movb	$0xfe,kernel_fpu_lock
+ 
+ /*
+  * This is a duplicate of the main part of generic_bcopy.  See the comments
+  * there.  Jumping into generic_bcopy would cost a whole 0-1 cycles and
+  * would mess up high resolution profiling.
+  */
+ 	ALIGN_TEXT
+ small_i586_bcopy:
+ 	shrl	$2,%ecx
+ 	cld
+ 	rep
+ 	movsl
+ 	movl	20(%esp),%ecx
+ 	andl	$3,%ecx
+ 	rep
+ 	movsb
+ 	popl	%edi
+ 	popl	%esi
+ 	ret
+ 
+ 	ALIGN_TEXT
+ 1:
+ 	addl	%ecx,%edi
+ 	addl	%ecx,%esi
+ 	decl	%edi
+ 	decl	%esi
+ 	andl	$3,%ecx
+ 	std
+ 	rep
+ 	movsb
+ 	movl	20(%esp),%ecx
+ 	shrl	$2,%ecx
+ 	subl	$3,%esi
+ 	subl	$3,%edi
+ 	rep
+ 	movsl
+ 	popl	%edi
+ 	popl	%esi
+ 	cld
+ 	ret
  
  /*
diff -c2 src/sys/i386/i386/swtch.s~ src/sys/i386/i386/swtch.s
*** src/sys/i386/i386/swtch.s~	Thu Aug  1 04:43:17 1996
--- src/sys/i386/i386/swtch.s	Tue Sep 10 07:06:50 1996
***************
*** 493,497 ****
  	pushl	%ecx
  	pushl	%eax
! 	call	_bcopy
  	addl	$12,%esp
  #endif	/* NNPX > 0 */
--- 516,520 ----
  	pushl	%ecx
  	pushl	%eax
! 	call	_generic_bcopy
  	addl	$12,%esp
  #endif	/* NNPX > 0 */
diff -c2 src/sys/sys/systm.h~ src/sys/sys/systm.h
*** src/sys/sys/systm.h~	Sat Sep 14 07:28:06 1996
--- src/sys/sys/systm.h	Tue Sep  3 03:24:04 1996
***************
*** 129,135 ****
  void	ttyprintf __P((struct tty *, const char *, ...));
  
! void	bcopy __P((const void *from, void *to, size_t len));
! void	ovbcopy __P((const void *from, void *to, size_t len));
  extern void	(*bzero) __P((void *buf, size_t len));
  
  void	*memcpy __P((void *to, const void *from, size_t len));
--- 126,132 ----
  void	ttyprintf __P((struct tty *, const char *, ...));
  
! extern void	(*bcopy) __P((const void *from, void *to, size_t len));
  extern void	(*bzero) __P((void *buf, size_t len));
+ extern void	(*ovbcopy) __P((const void *from, void *to, size_t len));
  
  void	*memcpy __P((void *to, const void *from, size_t len));
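
For anyone skimming the asm: the method selection at the top of
i586_bzero reduces to roughly the following C (a sketch only, not part
of the patch; npxproc and the thresholds are the ones in the asm, and
try_lock() is just a stand-in for the sarb on kernel_fpu_lock):

	static int
	want_fpu_bzero(size_t len)
	{
		if (len < 256)			/* clts, fninit, smsw cost a lot */
			return (0);
		if (npxproc != NULL && len < 256 + 184)
			return (0);		/* fnsave/frstor not amortized */
		if (!try_lock(&kernel_fpu_lock))
			return (0);		/* FPU already in use in the kernel */
		return (1);			/* fldz, then fstl 8 bytes per store */
	}

i586_bcopy makes the same sort of choice with a 1024 byte cutoff before
falling back to the rep movs code in small_i586_bcopy.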


