Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 15 May 1996 03:11:17 -0700 (PDT)
From:      asami@cs.berkeley.edu (Satoshi Asami)
To:        bde@zeta.org.au
Cc:        current@freebsd.org, ccd@stampede.cs.berkeley.edu
Subject:   Re: some more on fast bcopy
Message-ID:  <199605151011.DAA15029@silvia.HIP.Berkeley.EDU>
In-Reply-To: <199605110914.TAA21036@godzilla.zeta.org.au> (message from Bruce Evans on Sat, 11 May 1996 19:14:07 +1000)

next in thread | previous in thread | raw e-mail | index | archive | help
 * It isn't even necessary to save the FP registers, at least in
 * non-interrupt handlers, if they aren't already in use.  In particular,
 * copyin() and copyout() are never called from interrupt handlers, so
 * it isn't necessary to preserve the kernel FP registers (they are
 * guaranteed to not be in use).  Only the user FP context needs to be
 * preserved.  This optimization is closely related to fixing the bug.
 * It should start out something like:
 * 
 * 	if (intr_nesting_level > 0) {
 * 		/* Save reentrantly the same as now. */
 * 	} else {
 * 		if (npxproc != NULL) {
 * 			assert(npxproc == curproc);
 * 			fnsave(&curpcb->pcb_savefpu);
 * 			npxproc = NULL;
 * 		}
 * 		/* Now we own the FPU. */
 * 
 * 		/*
 * 		 * The process' FP state is saved in the pcb, but if we get
 * 		 * switched, the cpu_switch() will store our FP state in the
 * 		 * pcb.  It should be possible to avoid all the copying for
 * 		 * this, e.g., by setting a flag to tell cpu_switch() to
 * 		 * save the state somewhere else.
 * 		 */
 * 		tmp = curpcb->pcb_savefpu;
 * 
 * 		stop_emulating();
 * 		npxproc = curproc;
 * 	}
 * 	...
 * 	if (intr_nesting_level > 0) {
 * 		/* Restore reentrantly the same as now. */
 * 	} else {
 * 		curpcb->pcb_savefpu = tmp;
 * 		start_emulating();
 * 		npxproc = NULL;
 * 	}

Okay, I implemented this (I think) with the help of gcc -S, but I
still get "FPU not available" panics intermittently (often during
"make install"s).  Here's the diff, can you please check and see if
something's wrong?

(I know 128 is too small and 256 is too large, I have them this way to 
test it "aggressively". :)

===
Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.35
diff -u -r1.35 support.s
--- support.s	1996/05/03 21:01:00	1.35
+++ support.s	1996/05/15 09:31:42
@@ -453,6 +453,16 @@
 	/* bcopy(%esi, %edi, %ebx) */
 3:
 	movl	%ebx,%ecx
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyout
+
+	call	fastmove
+	jmp	done_copyout
+
+	ALIGN_TEXT
+slow_copyout:
+#endif /* I586_FAST_BCOPY */
 	shrl	$2,%ecx
 	cld
 	rep
@@ -500,6 +510,16 @@
 	cmpl	$VM_MAXUSER_ADDRESS,%edx
 	ja	copyin_fault
 
+#ifdef I586_FAST_BCOPY
+	cmpl	$128,%ecx
+	jbe	slow_copyin
+
+	call	fastmove
+	jmp	done_copyin
+
+	ALIGN_TEXT
+slow_copyin:
+#endif /* I586_FAST_BCOPY */
 	movb	%cl,%al
 	shrl	$2,%ecx				/* copy longword-wise */
 	cld
@@ -510,6 +530,10 @@
 	rep
 	movsb
 
+#ifdef I586_FAST_BCOPY
+	ALIGN_TEXT
+done_copyin:
+#endif /* I586_FAST_BCOPY */
 	popl	%edi
 	popl	%esi
 	xorl	%eax,%eax
@@ -525,6 +549,252 @@
 	movl	$0,PCB_ONFAULT(%edx)
 	movl	$EFAULT,%eax
 	ret
+
+#ifdef I586_FAST_BCOPY
+/* fastmove(src, dst, len): copy len bytes from src to dst
+	src in %esi, dst in %edi, len in %ecx
+	uses %eax and %edx for tmp. storage
+	FPU path is taken only when len > 255 and both src and dst are
+	8-byte aligned; everything else goes straight to fastmove_tail
+ */
+/* strings for the commented-out assert() call further down:
+LC0:
+	.ascii	"npxproc == curproc\0"
+LC1:
+	.ascii	"support.s"
+ */
+	ALIGN_TEXT
+fastmove:
+	cmpl	$255,%ecx		/* len <= 255 (unsigned compare): plain copy only */
+	jbe	fastmove_tail
+/* both pointers must be 8-byte aligned for the FPU path */
+	testl	$7,%esi	/* check if src addr is multiple of 8 */
+	jnz	fastmove_tail
+
+	testl	$7,%edi	/* check if dst addr is multiple of 8 */
+	jnz	fastmove_tail
+/* frame: the 176 bytes at -176(%ebp) hold the fnsave image or the pcb_savefpu copy */
+	pushl	%ebp
+	movl	%esp,%ebp
+	subl	$176,%esp
+
+/* Save FPU state before the copy loop.  if (intr_nesting_level > 0) */
+	cmpb	$0,_intr_nesting_level
+	je	L6
+/* save reentrantly: %edx keeps the old %cr0 until the restore code reloads it */
+	movl	%cr0,%edx
+	clts
+	fnsave	-176(%ebp)
+	jmp L7
+
+/* else { */
+	ALIGN_TEXT
+L6:
+/* if (npxproc != NULL) { */
+	cmpl	$0,_npxproc
+	je	L8
+/*    assert(npxproc == curproc); */
+/*	movl	_npxproc,%eax
+	cmpl	%eax,_curproc
+	je	L6b
+	pushl	LC0
+	pushl	$599
+	pushl	LC1
+	call	___assert
+	addl	$12,%esp
+L6b: */
+/*    fnsave(&curpcb->pcb_savefpu); */
+	movl	_curpcb,%eax
+	fnsave	112(%eax)	/* was 122; must be the same pcb_savefpu offset as the tmp copy below */
+/*   npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L8:
+/* now we own the FPU. */
+/* NOTE(review): when npxproc was NULL no fnsave ran, so the FPU may hold stale state -- confirm */
+/*
+ * The process' FP state is saved in the pcb, but if we get
+ * switched, the cpu_switch() will store our FP state in the
+ * pcb.  It should be possible to avoid all the copying for
+ * this, e.g., by setting a flag to tell cpu_switch() to
+ * save the state somewhere else.
+ */
+/* tmp = curpcb->pcb_savefpu; (tmp is the 176-byte frame at -176(%ebp)) */
+	pushl	%edi			/* keep the caller's dst/src/len across the copy */
+	pushl	%esi
+	pushl	%ecx
+	leal	-176(%ebp),%edi
+	movl	_curpcb,%esi
+	addl	$112,%esi		/* 112 = assumed offset of pcb_savefpu in the pcb -- TODO confirm */
+	cld
+	movl	$44,%ecx		/* 44 longwords = 176 bytes */
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+/* stop_emulating(); */
+	clts
+/* npxproc = curproc; */
+	movl	_curproc,%eax
+	movl	%eax,_npxproc
+/* } */
+L7:
+	ALIGN_TEXT
+fastmove_loop:
+	movl	32(%esi),%eax		/* loads at 32-byte strides; each overwrites %eax unused */
+	movl	64(%esi),%eax		/* (presumably to pull the source block into cache -- TODO confirm) */
+	movl	96(%esi),%eax
+	movl	128(%esi),%eax
+	movl	160(%esi),%eax
+	movl	192(%esi),%eax
+	movl	224(%esi),%eax
+/* touch 256(%esi) as well, but only if len > 259 so the 4-byte read stays inside src */
+	cmpl	$259,%ecx
+	jbe	fastmove_tmp
+	movl	256(%esi),%eax
+
+	ALIGN_TEXT
+fastmove_tmp:
+	fildq	0(%esi)			/* load 8 quadwords (64 bytes) onto the FP stack */
+	fildq	8(%esi)
+	fildq	16(%esi)
+	fildq	24(%esi)
+	fildq	32(%esi)
+	fildq	40(%esi)
+	fildq	48(%esi)
+	fildq	56(%esi)
+	fxch	%st(7)			/* the fxch's bring the values to the top in load order, */
+	fistpq	0(%edi)			/* so each fistpq stores to the matching dst offset */
+	fxch	%st(5)
+	fistpq	8(%edi)
+	fxch	%st(3)
+	fistpq	16(%edi)
+	fxch	%st(1)
+	fistpq	24(%edi)
+	fistpq	32(%edi)
+	fistpq	40(%edi)
+	fistpq	48(%edi)
+	fistpq	56(%edi)
+	fildq	64(%esi)		/* second 64-byte group, same pattern */
+	fildq	72(%esi)
+	fildq	80(%esi)
+	fildq	88(%esi)
+	fildq	96(%esi)
+	fildq	104(%esi)
+	fildq	112(%esi)
+	fildq	120(%esi)
+	fxch	%st(7)
+	fistpq	64(%edi)
+	fxch	%st(5)
+	fistpq	72(%edi)
+	fxch	%st(3)
+	fistpq	80(%edi)
+	fxch	%st(1)
+	fistpq	88(%edi)
+	fistpq	96(%edi)
+	fistpq	104(%edi)
+	fistpq	112(%edi)
+	fistpq	120(%edi)
+	fildq	128(%esi)		/* third 64-byte group */
+	fildq	136(%esi)
+	fildq	144(%esi)
+	fildq	152(%esi)
+	fildq	160(%esi)
+	fildq	168(%esi)
+	fildq	176(%esi)
+	fildq	184(%esi)
+	fxch	%st(7)
+	fistpq	128(%edi)
+	fxch	%st(5)
+	fistpq	136(%edi)
+	fxch	%st(3)
+	fistpq	144(%edi)
+	fxch	%st(1)
+	fistpq	152(%edi)
+	fistpq	160(%edi)
+	fistpq	168(%edi)
+	fistpq	176(%edi)
+	fistpq	184(%edi)
+	fildq	192(%esi)		/* fourth 64-byte group */
+	fildq	200(%esi)
+	fildq	208(%esi)
+	fildq	216(%esi)
+	fildq	224(%esi)
+	fildq	232(%esi)
+	fildq	240(%esi)
+	fildq	248(%esi)
+	fxch	%st(7)
+	fistpq	192(%edi)
+	fxch	%st(5)
+	fistpq	200(%edi)
+	fxch	%st(3)
+	fistpq	208(%edi)
+	fxch	%st(1)
+	fistpq	216(%edi)
+	fistpq	224(%edi)
+	fistpq	232(%edi)
+	fistpq	240(%edi)
+	fistpq	248(%edi)
+	addl	$-256,%ecx		/* len -= 256 */
+	addl	$256,%esi		/* advance src and dst past the block just copied */
+	addl	$256,%edi
+	cmpl	$255,%ecx		/* loop while more than 255 bytes remain (unsigned) */
+	ja	fastmove_loop
+	
+/* if (intr_nesting_level > 0) */
+/* (same test as on entry selects the matching restore path) */
+	cmpb	$0,_intr_nesting_level
+	je	L9
+	
+/* Restore reentrantly. */
+	frstor	-176(%ebp)		/* reload the image written by fnsave -176(%ebp) */
+	movl	%edx,%cr0		/* put back the %cr0 value captured before the loop */
+	jmp	L10
+
+/* else { */
+	ALIGN_TEXT
+L9:
+/* curpcb->pcb_savefpu = tmp; */
+	pushl	%edi			/* keep dst/src/len for the tail copy */
+	pushl	%esi
+	pushl	%ecx
+	movl	_curpcb,%edi
+	addl	$112,%edi		/* 112 = assumed offset of pcb_savefpu in the pcb -- TODO confirm */
+	leal	-176(%ebp),%esi
+	cld
+	movl	$44,%ecx		/* 44 longwords = 176 bytes */
+	rep
+	movsl
+	popl	%ecx
+	popl	%esi
+	popl	%edi
+
+/* start_emulating(); */
+	smsw	%ax
+	orb	$8,%al			/* set bit 3 of the machine status word (TS) */
+	lmsw	%ax
+/* npxproc = NULL; */
+	movl	$0,_npxproc
+/* } */
+L10:
+	movl	%ebp,%esp		/* drop the 176-byte frame */
+	popl	%ebp
+	
+	ALIGN_TEXT
+fastmove_tail:				/* plain copy of the %ecx bytes left at %esi -> %edi */
+	movb	%cl,%al			/* keep the low byte of len for the byte remainder */
+	shrl	$2,%ecx				/* copy longword-wise */
+	cld
+	rep
+	movsl
+	movb	%al,%cl
+	andb	$3,%cl				/* copy remaining bytes */
+	rep
+	movsb
+
+	ret
+#endif /* I586_FAST_BCOPY */
 
 /*
  * fu{byte,sword,word} : fetch a byte (sword, word) from user memory
===

I couldn't find __assert() in the kernel so I commented out that part.

Satoshi



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?199605151011.DAA15029>