Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 24 May 1996 16:57:56 -0700
From:      asami@cs.berkeley.edu (Satoshi Asami)
To:        bde@zeta.org.au
Cc:        current@freebsd.org, ccd@stampede.cs.berkeley.edu
Subject:   Re: P6 memory copy speed
Message-ID:  <199605242357.QAA24589@sunrise.cs.berkeley.edu>
In-Reply-To: <199605211949.FAA22504@godzilla.zeta.org.au> (message from Bruce Evans on Wed, 22 May 1996 05:49:23 +1000)

next in thread | previous in thread | raw e-mail | index | archive | help
 * >http://stampede.cs.berkeley.edu/~asami/Td/bcopy.html
 * 
 * Please update it with these changes.

Thanks, updated.  You can get to the new tarfile from the same link.

 * I figured out why unrolling helps so much.  You may remember that I said
 :
 * It does help for reading, but this doesn't have much to do with floating
 * point - prefetching a few K using integer registers works just as well.
 * I implemented this.

Thanks.

 * Other changes:
 * - removed fsave/frstor.  These aren't necessary in user mode and require
 *   additional complications in kernel mode.  Now the FP version is competitive
 *   for all sizes and much better for len >= size or 2*size, provided the data
 *   (src or dst) isn't cached (fully cached case: integer speed 400MB/s, FP
 *   speed 140MB/s :-().

 ;)

 * - avoided fxch's.  These didn't cost anything but they made the code
 *   larger.

Ok.

 * *** unroll.c~	Tue May  7 20:12:21 1996
 * --- unroll.c	Wed May 22 04:44:08 1996

This change has been applied.  I also cleaned up the C program for
testing (why was that program soooooooo long to do something so
simple? ;).

You are right, now the FP unroll-64 version is just as fast as much
larger unroll sizes.  Given that, this is the updated patch to
/sys/i386/i386/support.s and trap.c:

===
Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.35
diff -u -r1.35 support.s
--- 1.35	1996/05/03 21:01:00
+++ support.s	1996/05/24 22:06:19
@@ -453,6 +453,16 @@
 	/* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
+#endif /* I586_FAST_BCOPY */
 	shrl	$2,%ecx
 	cld
 	rep
@@ -500,6 +510,16 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
+#endif /* I586_FAST_BCOPY */
 	movb	%cl,%al
 	shrl	$2,%ecx				/* copy longword-wise */
 	cld
@@ -510,6 +530,10 @@
 	rep
 	movsb
 
+#ifdef I586_FAST_BCOPY
+	ALIGN_TEXT
+done_copyin:
+#endif /* I586_FAST_BCOPY */
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -525,6 +549,206 @@
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
 	ret
+
+#ifdef I586_FAST_BCOPY
+/* fastmove(src, dst, len)
+	src in %esi
+	dst in %edi
+	len in %ecx
+	uses %eax and %edx for tmp. storage
+ */
+/* 
+LC0:
+	.ascii	"npxproc == curproc\0"
+LC1:
+	.ascii	"support.s"
+ */
+	ALIGN_TEXT
+fastmove:
+	cmpl	$63,%ecx
+	jbe	fastmove_tail
+
+	testl	$7,%esi	/* check if src addr is multiple of 8 */
+	jnz	fastmove_tail
+
+	testl	$7,%edi	/* check if dst addr is multiple of 8 */
+	jnz	fastmove_tail
+
+	pushl	%ebp
+	movl	%esp,%ebp
+	subl	$176,%esp
+
+/* if (intr_nesting_level > 0) */
+	cmpb	$0,_intr_nesting_level
+	je	L6
+/* save reentrantly */
+	movl	%cr0,%edx
+	clts
+	fnsave	-176(%ebp)
+	jmp L7
+
+/* else { */
+	ALIGN_TEXT
+L6:
+/* if (npxproc != NULL) { */
+	cmpl	$0,_npxproc
+	je	L8
+/*    assert(npxproc == curproc); */
+/*	movl	_npxproc,%eax
+	cmpl	%eax,_curproc
+	je	L6b
+	pushl	LC0
+	pushl	$599
+	pushl	LC1
+	call	___assert
+	addl	$12,%esp
+L6b: */
+/*    fnsave(&curpcb->pcb_savefpu); */
+	movl	_curpcb,%eax
+	fnsave	112(%eax)
+/*   npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L8:
+/* now we own the FPU. */
+
+/*
+ * The process' FP state is saved in the pcb, but if we get
+ * switched, the cpu_switch() will store our FP state in the
+ * pcb.  It should be possible to avoid all the copying for
+ * this, e.g., by setting a flag to tell cpu_switch() to
+ * save the state somewhere else.
+ */
+/* tmp = curpcb->pcb_savefpu; */
+	pushl	%edi
+	pushl	%esi
+	pushl	%ecx
+	leal	-176(%ebp),%edi
+	movl	_curpcb,%esi
+	addl	$112,%esi
+	cld
+	movl	$44,%ecx
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+/* stop_emulating(); */
+	clts
+/* npxproc = curproc; */
+	movl	_curproc,%eax
+	movl	%eax,_npxproc
+/* } */
+L7:
+4:
+	pushl %ecx
+	cmpl $1792,%ecx
+	jbe 2f
+	movl $1792,%ecx
+2:
+	subl %ecx,0(%esp)
+	cmpl $256,%ecx
+	jb 5f
+	pushl %esi
+	pushl %ecx
+	.align 4,0x90
+3:
+	movl 0(%esi),%eax
+	movl 32(%esi),%eax
+	movl 64(%esi),%eax
+	movl 96(%esi),%eax
+	movl 128(%esi),%eax
+	movl 160(%esi),%eax
+	movl 192(%esi),%eax
+	movl 224(%esi),%eax
+	addl $256,%esi
+	subl $256,%ecx
+	cmpl $256,%ecx
+	jae 3b
+	popl %ecx
+	popl %esi
+5:
+	ALIGN_TEXT
+fastmove_loop:
+	fildq	0(%esi)
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fistpq 56(%edi)
+	fistpq 48(%edi)
+	fistpq 40(%edi)
+	fistpq 32(%edi)
+	fistpq 24(%edi)
+	fistpq 16(%edi)
+	fistpq 8(%edi)
+	fistpq 0(%edi)
+	addl $-64,%ecx
+	addl $64,%esi
+	addl $64,%edi
+	cmpl $63,%ecx
+	ja fastmove_loop
+	popl %eax
+	addl %eax,%ecx
+	cmpl $64,%ecx
+	jae 4b
+	
+/* if (intr_nesting_level > 0) */
+
+	cmpb	$0,_intr_nesting_level
+	je	L9
+	
+/* Restore reentrantly. */
+	frstor	-176(%ebp)
+	movl	%edx,%cr0
+	jmp	L10
+
+/* else { */
+	ALIGN_TEXT
+L9:
+/* curpcb->pcb_savefpu = tmp; */
+	pushl	%edi
+	pushl	%esi
+	pushl	%ecx
+	movl	_curpcb,%edi
+	addl	$112,%edi
+	leal	-176(%ebp),%esi
+	cld
+	movl	$44,%ecx
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+
+/* start_emulating(); */
+	smsw	%ax
+	orb	$8,%al
+	lmsw	%ax
+/* npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L10:
+	movl	%ebp,%esp
+	popl	%ebp
+	
+	ALIGN_TEXT
+fastmove_tail:
+	movb	%cl,%al
+	shrl	$2,%ecx				/* copy longword-wise */
+	cld
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl				/* copy remaining bytes */
+	rep
+	movsb
+
+	ret
+#endif /* I586_FAST_BCOPY */
 
 /*
  * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
Index: trap.c
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/trap.c,v
retrieving revision 1.75
diff -u -r1.75 trap.c
--- 1.75	1996/03/28 05:40:57
+++ trap.c	1996/05/24 20:32:12
@@ -319,6 +319,14 @@
 			(void) trap_pfault(&frame, FALSE);
 			return;
 
+		case T_DNA:
+#if NNPX > 0
+			/* if a transparent fault (due to context switch "late") */
+			if (npxdna())
+				return;
+#endif	/* NNPX > 0 */
+			break;
+
 		case T_PROTFLT:		/* general protection fault */
 		case T_SEGNPFLT:	/* segment not present fault */
 			/*
===

I stuck it in our kernel, and the "read-lseek" loop test showed it's
just as fast as the old unroll-256 version.

 * This is for some FP tests on an ASUS P133.  I've never seen >= 80MB/s
 * except when there were bugs in the tests.  size = 1024 is worse than

That's probably because we have EDO memory.  EDO doesn't seem to make
much difference in real life but for a benchmark that pushes the
memory (and memory only) to the limit, it really shines. ;>

Satoshi



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199605242357.QAA24589>