From owner-freebsd-current Fri May 10 03:18:08 1996 Return-Path: owner-current Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id DAA08002 for current-outgoing; Fri, 10 May 1996 03:18:08 -0700 (PDT) Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA07994 for ; Fri, 10 May 1996 03:18:00 -0700 (PDT) Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id DAA00972; Fri, 10 May 1996 03:17:39 -0700 (PDT) Date: Fri, 10 May 1996 03:17:39 -0700 (PDT) Message-Id: <199605101017.DAA00972@silvia.HIP.Berkeley.EDU> To: current@freebsd.org CC: nisha@cs.berkeley.edu Subject: some more on fast bcopy From: asami@cs.berkeley.edu (Satoshi Asami) Sender: owner-current@freebsd.org X-Loop: FreeBSD.org Precedence: bulk Just to let you know that I haven't forgotten the bcopy project amid the excitement of all the new disks, here's the current patch we are using: === Index: support.s =================================================================== RCS file: /usr/cvs/src/sys/i386/i386/support.s,v retrieving revision 1.35 diff -u -r1.35 support.s --- support.s 1996/05/03 21:01:00 1.35 +++ support.s 1996/05/10 09:59:57 @@ -453,6 +453,16 @@ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyout + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +510,16 @@ cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyin + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +530,10 @@ rep movsb +#ifdef I586_FAST_BCOPY + ALIGN_TEXT +done_copyin: +#endif /* I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -525,6 +549,152 @@ movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret + +#ifdef 
I586_FAST_BCOPY +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. storage + */ + ALIGN_TEXT +fastmove: + cmpl $255,%ecx + jbe fastmove_tail + + movl %esi,%eax + andl $7,%eax /* check if src addr is multiple of 8 */ + jnz fastmove_tail + + movl %edi,%eax + andl $7,%eax /* check if dst addr is multiple of 8 */ + jnz fastmove_tail + + subl $108,%esp + movl %cr0,%edx + clts + fnsave (%esp) + + ALIGN_TEXT +fastmove_loop: + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + + cmpl $259,%ecx + jbe fastmove_tmp + movl 256(%esi),%eax + + ALIGN_TEXT +fastmove_tmp: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fxch %st(7) + fistpq 0(%edi) + fxch %st(5) + fistpq 8(%edi) + fxch %st(3) + fistpq 16(%edi) + fxch %st(1) + fistpq 24(%edi) + fistpq 32(%edi) + fistpq 40(%edi) + fistpq 48(%edi) + fistpq 56(%edi) + fildq 64(%esi) + fildq 72(%esi) + fildq 80(%esi) + fildq 88(%esi) + fildq 96(%esi) + fildq 104(%esi) + fildq 112(%esi) + fildq 120(%esi) + fxch %st(7) + fistpq 64(%edi) + fxch %st(5) + fistpq 72(%edi) + fxch %st(3) + fistpq 80(%edi) + fxch %st(1) + fistpq 88(%edi) + fistpq 96(%edi) + fistpq 104(%edi) + fistpq 112(%edi) + fistpq 120(%edi) + fildq 128(%esi) + fildq 136(%esi) + fildq 144(%esi) + fildq 152(%esi) + fildq 160(%esi) + fildq 168(%esi) + fildq 176(%esi) + fildq 184(%esi) + fxch %st(7) + fistpq 128(%edi) + fxch %st(5) + fistpq 136(%edi) + fxch %st(3) + fistpq 144(%edi) + fxch %st(1) + fistpq 152(%edi) + fistpq 160(%edi) + fistpq 168(%edi) + fistpq 176(%edi) + fistpq 184(%edi) + fildq 192(%esi) + fildq 200(%esi) + fildq 208(%esi) + fildq 216(%esi) + fildq 224(%esi) + fildq 232(%esi) + fildq 240(%esi) + fildq 248(%esi) + fxch %st(7) + fistpq 192(%edi) + fxch %st(5) + fistpq 200(%edi) + fxch %st(3) + fistpq 208(%edi) + fxch %st(1) + 
fistpq 216(%edi) + fistpq 224(%edi) + fistpq 232(%edi) + fistpq 240(%edi) + fistpq 248(%edi) + addl $-256,%ecx + addl $256,%esi + addl $256,%edi + cmpl $255,%ecx + ja fastmove_loop + + frstor (%esp) + movl %edx,%cr0 + addl $108,%esp + + ALIGN_TEXT +fastmove_tail: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_FAST_BCOPY */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory === As you can see, everything is conditionalized on options "I586_FAST_BCOPY" (quotes essential) in your kernel config file. Bruce said we shouldn't try to unroll it too much but it's less than 500 bytes and there was quite a large drop between 256 and 128 on our system so I tried a little aggressively. (The latest summary is on "http://stampede.cs.berkeley.edu/Td/bcopy.html", in case you have been hibernating the past few days.) The net speedup is pretty impressive for large sequential I/O, as repeated reads from the disk cache went up from 50MB/s to 80MB/s, and the disk array can now deliver 27MB/s instead of 21MB/s. (This is on the system which gave us 40MB/s (libc) and 80MB/s (ours) on the user-level test program.) We have been running systems with this for a few days, on both -current and -stable (although the patch doesn't apply as is to -stable -- minor tweaks necessary), and have done a few make worlds. The system sometimes crashes under heavy load for no good reason (page fault in kernel and stuff) but I don't know if it's due to the general instability or the bcopy enhancements. I'll back it out from one of the systems and try stressing it a bit. At least we aren't seeing any file corruptions (yet). If someone out there has an EXPENDABLE (insert Satoshi's serious face here) Pentium system and wants to contribute to this grand project of trying to outrun P6's with el-cheapo Pentium systems, please give it a try. Satoshi