From owner-freebsd-current Fri May 10 03:18:08 1996 Return-Path: owner-current Received: (from root@localhost) by freefall.freebsd.org (8.7.3/8.7.3) id DAA08002 for current-outgoing; Fri, 10 May 1996 03:18:08 -0700 (PDT) Received: from silvia.HIP.Berkeley.EDU (silvia.HIP.Berkeley.EDU [136.152.64.181]) by freefall.freebsd.org (8.7.3/8.7.3) with ESMTP id DAA07994 for ; Fri, 10 May 1996 03:18:00 -0700 (PDT) Received: (from asami@localhost) by silvia.HIP.Berkeley.EDU (8.7.5/8.6.9) id DAA00972; Fri, 10 May 1996 03:17:39 -0700 (PDT) Date: Fri, 10 May 1996 03:17:39 -0700 (PDT) Message-Id: <199605101017.DAA00972@silvia.HIP.Berkeley.EDU> To: current@freebsd.org CC: nisha@cs.berkeley.edu Subject: some more on fast bcopy From: asami@cs.berkeley.edu (Satoshi Asami) Sender: owner-current@freebsd.org X-Loop: FreeBSD.org Precedence: bulk Just to let you know that I haven't forgotten the bcopy project amid the excitement of all the new disks, here's the current patch we are using: === Index: support.s =================================================================== RCS file: /usr/cvs/src/sys/i386/i386/support.s,v retrieving revision 1.35 diff -u -r1.35 support.s --- support.s 1996/05/03 21:01:00 1.35 +++ support.s 1996/05/10 09:59:57 @@ -453,6 +453,16 @@ /* bcopy(%esi, %edi, %ebx) */ 3: movl %ebx,%ecx +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyout + + call fastmove + jmp done_copyout + + ALIGN_TEXT +slow_copyout: +#endif /* I586_FAST_BCOPY */ shrl $2,%ecx cld rep @@ -500,6 +510,16 @@ cmpl $VM_MAXUSER_ADDRESS,%edx ja copyin_fault +#ifdef I586_FAST_BCOPY + cmpl $128,%ecx + jbe slow_copyin + + call fastmove + jmp done_copyin + + ALIGN_TEXT +slow_copyin: +#endif /* I586_FAST_BCOPY */ movb %cl,%al shrl $2,%ecx /* copy longword-wise */ cld @@ -510,6 +530,10 @@ rep movsb +#ifdef I586_FAST_BCOPY + ALIGN_TEXT +done_copyin: +#endif /* I586_FAST_BCOPY */ popl %edi popl %esi xorl %eax,%eax @@ -525,6 +549,152 @@ movl $0,PCB_ONFAULT(%edx) movl $EFAULT,%eax ret + +#ifdef 
I586_FAST_BCOPY +/* fastmove(src, dst, len) + src in %esi + dst in %edi + len in %ecx + uses %eax and %edx for tmp. storage + */ + ALIGN_TEXT +fastmove: + cmpl $255,%ecx + jbe fastmove_tail + + movl %esi,%eax + andl $7,%eax /* check if src addr is multiple of 8 */ + jnz fastmove_tail + + movl %edi,%eax + andl $7,%eax /* check if dst addr is multiple of 8 */ + jnz fastmove_tail + + subl $108,%esp + movl %cr0,%edx + clts + fnsave (%esp) + + ALIGN_TEXT +fastmove_loop: + movl 32(%esi),%eax + movl 64(%esi),%eax + movl 96(%esi),%eax + movl 128(%esi),%eax + movl 160(%esi),%eax + movl 192(%esi),%eax + movl 224(%esi),%eax + + cmpl $259,%ecx + jbe fastmove_tmp + movl 256(%esi),%eax + + ALIGN_TEXT +fastmove_tmp: + fildq 0(%esi) + fildq 8(%esi) + fildq 16(%esi) + fildq 24(%esi) + fildq 32(%esi) + fildq 40(%esi) + fildq 48(%esi) + fildq 56(%esi) + fxch %st(7) + fistpq 0(%edi) + fxch %st(5) + fistpq 8(%edi) + fxch %st(3) + fistpq 16(%edi) + fxch %st(1) + fistpq 24(%edi) + fistpq 32(%edi) + fistpq 40(%edi) + fistpq 48(%edi) + fistpq 56(%edi) + fildq 64(%esi) + fildq 72(%esi) + fildq 80(%esi) + fildq 88(%esi) + fildq 96(%esi) + fildq 104(%esi) + fildq 112(%esi) + fildq 120(%esi) + fxch %st(7) + fistpq 64(%edi) + fxch %st(5) + fistpq 72(%edi) + fxch %st(3) + fistpq 80(%edi) + fxch %st(1) + fistpq 88(%edi) + fistpq 96(%edi) + fistpq 104(%edi) + fistpq 112(%edi) + fistpq 120(%edi) + fildq 128(%esi) + fildq 136(%esi) + fildq 144(%esi) + fildq 152(%esi) + fildq 160(%esi) + fildq 168(%esi) + fildq 176(%esi) + fildq 184(%esi) + fxch %st(7) + fistpq 128(%edi) + fxch %st(5) + fistpq 136(%edi) + fxch %st(3) + fistpq 144(%edi) + fxch %st(1) + fistpq 152(%edi) + fistpq 160(%edi) + fistpq 168(%edi) + fistpq 176(%edi) + fistpq 184(%edi) + fildq 192(%esi) + fildq 200(%esi) + fildq 208(%esi) + fildq 216(%esi) + fildq 224(%esi) + fildq 232(%esi) + fildq 240(%esi) + fildq 248(%esi) + fxch %st(7) + fistpq 192(%edi) + fxch %st(5) + fistpq 200(%edi) + fxch %st(3) + fistpq 208(%edi) + fxch %st(1) + 
fistpq 216(%edi) + fistpq 224(%edi) + fistpq 232(%edi) + fistpq 240(%edi) + fistpq 248(%edi) + addl $-256,%ecx + addl $256,%esi + addl $256,%edi + cmpl $255,%ecx + ja fastmove_loop + + frstor (%esp) + movl %edx,%cr0 + addl $108,%esp + + ALIGN_TEXT +fastmove_tail: + movb %cl,%al + shrl $2,%ecx /* copy longword-wise */ + cld + rep + movsl + movb %al,%cl + andb $3,%cl /* copy remaining bytes */ + rep + movsb + + ret +#endif /* I586_FAST_BCOPY */ /* * fu{byte,sword,word} : fetch a byte (sword, word) from user memory === As you can see, everything is conditionalized on options "I586_FAST_BCOPY" (quotes essential) in your kernel config file. Bruce said we shouldn't try to unroll it too much but it's less than 500 bytes and there was quite a large drop between 256 and 128 on our system so I tried a little aggressively. (The latest summary is on "http://stampede.cs.berkeley.edu/Td/bcopy.html", in case you have been hibernating the past few days.) The net speedup is pretty impressive for large sequential I/O, as repeated reads from the disk cache went up from 50MB/s to 80MB/s, and the disk array can now deliver 27MB/s instead of 21MB/s. (This is on the system which gave us 40MB/s (libc) and 80MB/s (ours) on the user-level test program.) We have been running systems with this for a few days, on both -current and -stable (although the patch doesn't apply as is to -stable -- minor tweaks necessary), and have done a few make worlds. The system sometimes crashes under heavy load for no good reason (page fault in kernel and stuff) but I don't know if it's due to the general instability or the bcopy enhancements. I'll back it out from one of the systems and try stressing it a bit. At least we aren't seeing any file corruptions (yet). If someone out there has an EXPENDABLE (insert Satoshi's serious face here) Pentium system and wants to contribute to this grand project of trying to outrun P6's with el-cheapo Pentium systems, please give it a try. Satoshi