From owner-freebsd-current  Fri Apr  5 03:19:25 1996
Return-Path: owner-current
Received: (from root@localhost)
          by freefall.freebsd.org (8.7.3/8.7.3) id DAA11838
          for current-outgoing; Fri, 5 Apr 1996 03:19:25 -0800 (PST)
Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181])
          by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA11833
          Fri, 5 Apr 1996 03:19:20 -0800 (PST)
Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id DAA24816; Fri, 5 Apr 1996 03:16:38 -0800 (PST)
Date: Fri, 5 Apr 1996 03:16:38 -0800 (PST)
Message-Id: <199604051116.DAA24816@silvia.HIP.Berkeley.EDU>
To: davidg@root.com
CC: current@freebsd.org, nisha@cs.berkeley.edu, tege@matematik.su.se,
        hasty@rah.star-gate.com, dyson@freebsd.org
In-reply-to: <199604051021.CAA00222@Root.COM> (message from David Greenman on Fri, 05 Apr 1996 02:21:48 -0800)
Subject: Re: fast memory copy for large data sizes
From: asami@cs.berkeley.edu (Satoshi Asami)
Sender: owner-current@freebsd.org
X-Loop: FreeBSD.org
Precedence: bulk

 > I have that mail, tried what was in there, but it wasn't as fast as FP 
 > copies.  Maybe I screwed up something, I'll try again tomorrow.

It wasn't much trouble so I tried it again.  Here's what I got on the
133MHz Pentium:

    size     libc             ours
      32      N/A         30.517578 MB/s
      64  61.035156 MB/s  30.517578 MB/s
     128  40.690104 MB/s  40.690104 MB/s
     256  40.690104 MB/s  34.877232 MB/s
     512  40.690104 MB/s  34.877232 MB/s
    1024  40.690104 MB/s  33.674569 MB/s
    2048  39.859694 MB/s  34.265351 MB/s
    4096  39.859694 MB/s  34.265351 MB/s
    8192  39.657360 MB/s  34.115721 MB/s
   16384  39.556962 MB/s  34.115721 MB/s
   32768  39.506953 MB/s  34.153005 MB/s
   65536  39.531942 MB/s  34.227820 MB/s
  131072  39.345294 MB/s  34.125034 MB/s
  262144  39.227993 MB/s  34.227820 MB/s
  524288  38.735668 MB/s  34.218451 MB/s
 1048576  38.224839 MB/s  34.263003 MB/s
 2097152  37.799323 MB/s  34.270635 MB/s
 4194304  37.700283 MB/s  34.283265 MB/s

Hmm.  I can't even get it to be faster than libc now.  I think I've
seen 40MB/s for large copies before, but I don't remember exactly what
I did.

Satoshi

P.S. Here's the "unrolled" routine, pretty much stolen from Torbjorn's
mail to -hackers:

	.align 2
.globl _unrolled
	.type	 _unrolled,@function
/*
 * _unrolled: copy a block of memory, 32 bytes per loop iteration.
 *
 * Syntax is AT&T (source operand first); the code is 32-bit x86 and
 * takes all of its arguments from the stack, relative to %ebp:
 *	 8(%ebp)	source address		(loaded into %esi)
 *	12(%ebp)	destination address	(loaded into %edi)
 *	16(%ebp)	number of bytes to copy
 * NOTE(review): this looks like a C-callable routine along the lines of
 * void unrolled(const void *src, void *dst, unsigned len) -- confirm
 * against the caller, which is not shown here.
 *
 * The copy runs in ascending address order and makes no check for
 * overlapping source and destination.
 *
 * %ebp, %esi and %edi are saved and restored.  %eax, %ecx, %edx and the
 * flags are overwritten, the direction flag is left clear, and nothing
 * is deliberately placed in %eax as a return value.
 */
_unrolled:
	pushl %ebp
	movl %esp,%ebp
	pushl %edi		/* saved at -4(%ebp) */
	pushl %esi		/* saved at -8(%ebp) */
	movl 8(%ebp),%esi	/* %esi = source */
	movl 12(%ebp),%edi	/* %edi = destination */
	movl 16(%ebp),%ecx	/* count is in bytes */

	shrl $5,%ecx		/* %ecx = number of whole 32-byte blocks */
	jz L54			/* fewer than 32 bytes: only the tail copy runs */
	
	movl (%edi),%eax	/* fetch destination cache line */

	/*
	 * Main loop: one 32-byte block per iteration, %ecx blocks in all.
	 * Each iteration first reads the last longword of the destination
	 * block (the value read is not used for the copy) and then moves
	 * eight longwords through %eax/%edx, two loads followed by two
	 * stores at a time.
	 * NOTE(review): the destination reads are presumably there because
	 * the target CPU does not bring a line into its cache on a write
	 * miss -- confirm; the code itself only shows that the loaded value
	 * is thrown away.
	 */
	.align 2,0x90
L55:	movl	28(%edi),%eax	/* fetch destination cache line */
	orl	%eax,%eax	/* to make things go in pairs */

	movl	(%esi),%eax	/* load pairwise */
	movl	4(%esi),%edx
	movl	%eax,(%edi)	/* and store pairwise */
	movl	%edx,4(%edi)

	movl	8(%esi),%eax
	movl	12(%esi),%edx
	movl	%eax,8(%edi)
	movl	%edx,12(%edi)

	movl	16(%esi),%eax
	movl	20(%esi),%edx
	movl	%eax,16(%edi)
	movl	%edx,20(%edi)

	movl	24(%esi),%eax
	movl	28(%esi),%edx
	movl	%eax,24(%edi)
	movl	%edx,28(%edi)

	addl	$32,%esi	/* update source pointer */
	addl	$32,%edi	/* update destination pointer */
	decl	%ecx		/* decr loop count */
	jnz	L55

L54:
	/*
	 * Tail: copy the remaining (count % 32) bytes, starting at the
	 * current %esi/%edi, with the string-move instructions.
	 */
	movl 16(%ebp),%ecx	/* reload the original byte count */
	andl $31,%ecx		/* bytes left over after the 32-byte blocks */
	movl %ecx,%edx		/* keep a copy for the byte pass below */
	shrl $2,%ecx		/* first copy as much as we can in words */
	cld			/* string moves must step upwards */
	rep
	movsl
	movl %edx,%ecx
	andl $3,%ecx		/* and then up to 3 bytes */
	rep
	movsb

	leal -8(%ebp),%esp	/* point %esp at the saved %esi */
	popl %esi
	popl %edi
	leave
	ret
Lfe6:
	/* Lfe6 marks the end of the code, so .size is the routine's length. */
	.size	 _unrolled,Lfe6-_unrolled