From owner-freebsd-hackers Sun Dec 24 03:48:11 1995 Return-Path: owner-hackers Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id DAA24278 for hackers-outgoing; Sun, 24 Dec 1995 03:48:11 -0800 (PST) Received: from insanus.matematik.su.se (insanus.matematik.su.se [130.237.198.12]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA24273 for ; Sun, 24 Dec 1995 03:48:08 -0800 (PST) Received: from localhost (prudens.matematik.su.se [130.237.198.5]) by insanus.matematik.su.se (8.7.1/8.6.9) with ESMTP id MAA28761 for ; Sun, 24 Dec 1995 12:48:07 +0100 (MET) Message-Id: <199512241148.MAA28761@insanus.matematik.su.se> X-Address: Department of Mathematics, Stockholm University S-106 91 Stockholm SWEDEN X-Phone: int+46 8 162000 X-Fax: int+46 8 6126717 X-Url: http://www.matematik.su.se To: freebsd-hackers@freebsd.org Subject: Better bzero Date: Sun, 24 Dec 1995 12:48:06 +0100 From: Torbjorn Granlund Sender: owner-hackers@freebsd.org Precedence: bulk Here is a bzero/memset that uses the same trick as the previously posted memcpy. noisy> gcc mzero.c zero.S src/lib/cputime.c -O -DSIZE=1000 noisy> a.out zero 1005 memset 8436 bzero 8424 noisy> gcc mzero.c zero.S src/lib/cputime.c -O -DSIZE=10000 noisy> a.out zero 5605 memset 8341 bzero 8349 Note that the code wins significantly even when the L1 cache is too small for the data set. The mere 5x (actually 4.89x) improvement measured on my system has been improved to a more decent 8x. :-) .text .align 4 .globl _zero _zero: pushl %edi movl 8(%esp),%edi /* destination pointer */ movl 12(%esp),%ecx /* size (in 32-bit words) */ xorl %eax,%eax /* for memset, we'd copy byte to all of eax */ shrl $3,%ecx /* count for unrolled loop */ jz Lend /* if zero, skip unrolled loop */ movl (%edi),%edx /* Fetch destination cache line */ .align 2,0x90 /* supply 0x90 for broken assemblers */ Loop: movl 28(%edi),%edx /* allocate cache line for destination */ nop /* we want these two insn to pair! */ movl %eax,(%edi) /* store words pairwise */ movl %eax,4(%edi) movl %eax,8(%edi) movl %eax,12(%edi) movl %eax,16(%edi) movl %eax,20(%edi) movl %eax,24(%edi) movl %eax,28(%edi) addl $32,%edi /* update destination pointer */ decl %ecx /* decr loop count */ jnz Loop /* Copy last 0-7 words */ Lend: movl 12(%esp),%ecx andl $7,%ecx cld rep stosl popl %edi ret mzero.c: #ifndef SIZE #define SIZE 1000 #endif #ifndef TIMES #define TIMES 100000000/SIZE #endif long cputime (); main () { int d[SIZE]; int i; long t0; t0 = cputime (); for (i = 0; i < TIMES; i++) zero (d, SIZE); printf ("zero %ld\n", cputime () - t0); t0 = cputime (); for (i = 0; i < TIMES; i++) memset (d, 0, SIZE * sizeof (int)); printf ("memset %ld\n", cputime () - t0); t0 = cputime (); for (i = 0; i < TIMES; i++) bzero (d, SIZE * sizeof (int)); printf ("bzero %ld\n", cputime () - t0); exit (0); }