Skip site navigation (1) | Skip section navigation (2)
Date:      Sun, 24 Dec 1995 12:48:06 +0100
From:      Torbjorn Granlund <tege@matematik.su.se>
To:        freebsd-hackers@freebsd.org
Subject:   Better bzero
Message-ID:  <199512241148.MAA28761@insanus.matematik.su.se>

next in thread | raw e-mail | index | archive | help
Here is a bzero/memset that uses the same trick as the previously posted
memcpy.

noisy> gcc mzero.c zero.S src/lib/cputime.c -O -DSIZE=1000
noisy> a.out
zero 1005
memset 8436
bzero 8424

noisy> gcc mzero.c zero.S src/lib/cputime.c -O -DSIZE=10000
noisy> a.out
zero 5605
memset 8341
bzero 8349

Note that the code wins significantly even when the L1 cache is too small
for the data set.  The mere 5x (actually 4.89x) improvement measured on my
system has been improved to a more decent 8x.  :-)


	.text
/*
 * _zero -- store 32-bit zero words into a buffer.
 *
 * Syntax: AT&T / GAS, 32-bit x86, arguments passed on the stack.
 * Called from C in this message as  zero (d, SIZE)  with an int array and
 * an element count (NOTE(review): exact C prototype is not shown; confirm).
 *
 * In:    4(%esp) on entry = destination pointer
 *        8(%esp) on entry = size in 32-bit words
 *        (read below at 8(%esp) and 12(%esp) because %edi has been pushed)
 * Out:   no result is computed; %eax happens to be zero on return.
 * Clobb: %eax, %ecx, %edx, flags; the direction flag is cleared by cld.
 *        %edi is saved on entry and restored before returning.
 *
 * Structure: a loop that stores 8 words (32 bytes) per iteration, run
 * size/8 times, followed by "rep stosl" for the remaining size%8 words.
 * Each iteration first LOADS from the 32-byte chunk it is about to
 * overwrite; the loaded value is discarded.  Per the author's comments the
 * load is there to bring the destination cache line into the cache before
 * the stores.  The instruction order inside the loop is intentional (see
 * the pairing note), so do not reorder it without re-measuring.
 */
	.align 4
	.globl	_zero
_zero:	pushl	%edi		/* %edi is used as the store pointer; saved here */

	movl	8(%esp),%edi	/* destination pointer (arg 1, +4 for the push) */
	movl	12(%esp),%ecx	/* size (in 32-bit words) (arg 2, +4 for the push) */

	xorl	%eax,%eax	/* value to store; for memset, we'd copy byte to all of eax */
	shrl	$3,%ecx		/* count for unrolled loop = size / 8 */
	jz	Lend		/* if zero, skip unrolled loop (fewer than 8 words) */

	movl	(%edi),%edx	/* Fetch destination cache line; value in %edx is unused */

	.align	2,0x90		/* supply 0x90 for broken assemblers */
/* Invariant at Loop: %ecx = 32-byte chunks still to store (> 0),
   %edi = address of the next chunk, %eax = 0. */
Loop:	movl	28(%edi),%edx	/* allocate cache line for destination (read last word of this chunk) */
	nop			/* we want these two insn to pair! */
	movl	%eax,(%edi)	/* store words pairwise */
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	movl	%eax,24(%edi)
	movl	%eax,28(%edi)

	addl	$32,%edi	/* update destination pointer past the 8 words just stored */
	decl	%ecx		/* decr loop count */
	jnz	Loop

/* Zero the last 0-7 words: reload the original size, keep size % 8, and
   let rep stosl store %eax (= 0) at %edi that many times. */
Lend:	movl	12(%esp),%ecx
	andl	$7,%ecx
	cld			/* make stosl advance %edi upwards */
	rep
	stosl

	popl	%edi		/* restore caller's %edi */
	ret

mzero.c:

/* Number of ints in the buffer that each benchmarked call clears.
   Override on the command line, e.g. -DSIZE=10000.  */
#ifndef SIZE
#define SIZE 1000
#endif

/* Number of calls timed per routine, chosen so that about 100000000 ints
   are cleared in total regardless of SIZE.  Fully parenthesized so the
   macro expands safely inside larger expressions and when SIZE is itself
   an expression (e.g. -DSIZE=500+500).  */
#ifndef TIMES
#define TIMES (100000000 / (SIZE))
#endif

#include <stdio.h>	/* printf */
#include <stdlib.h>	/* exit */
#include <string.h>	/* memset */
#include <strings.h>	/* bzero */

/* Timer supplied by src/lib/cputime.c on the build line; only differences
   between two readings are used here.  */
long cputime (void);

/* Assembly routine from zero.S: stores NWORDS 32-bit zero words at DST.  */
void zero (void *dst, unsigned int nwords);

/* Benchmark driver: clears the same SIZE-int buffer TIMES times with each
   of zero, memset and bzero, and prints the elapsed cputime() difference
   for each.  */
int
main (void)
{
  int d[SIZE];
  int i;
  long t0;

  /* zero() takes a count of 32-bit words.  */
  t0 = cputime ();
  for (i = 0; i < TIMES; i++)
    zero (d, SIZE);
  printf ("zero %ld\n", cputime () - t0);

  /* memset() and bzero() take a count of bytes.  */
  t0 = cputime ();
  for (i = 0; i < TIMES; i++)
    memset (d, 0, SIZE * sizeof (int));
  printf ("memset %ld\n", cputime () - t0);

  t0 = cputime ();
  for (i = 0; i < TIMES; i++)
    bzero (d, SIZE * sizeof (int));
  printf ("bzero %ld\n", cputime () - t0);

  exit (0);
}



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199512241148.MAA28761>