Date: Thu, 16 May 1996 02:39:57 -0700 (PDT)
From: asami@cs.berkeley.edu (Satoshi Asami)
To: bde@zeta.org.au
Cc: bde@zeta.org.au, ccd@stampede.cs.berkeley.edu, current@freebsd.org
Subject: Re: some more on fast bcopy
Message-ID: <199605160939.CAA00809@silvia.HIP.Berkeley.EDU>
In-Reply-To: <199605151358.XAA22996@godzilla.zeta.org.au> (message from Bruce Evans on Wed, 15 May 1996 23:58:45 +1000)
* It should work better with a kernel mode trap handler for "FPU not
* available" :-). Add a T_DNA case for kernel mode in trap() by copying
* the first 5 lines from the user mode T_DNA case (`break;' if npxdna()
* doesn't handle the panic (this "can't happen")).
OK, I implemented it. Will this work?
===
Index: trap.c
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/trap.c,v
retrieving revision 1.75
diff -u -r1.75 trap.c
--- trap.c 1996/03/28 05:40:57 1.75
+++ trap.c 1996/05/15 21:30:27
@@ -319,6 +319,14 @@
(void) trap_pfault(&frame, FALSE);
return;
+ case T_DNA:
+#if NNPX > 0
+ /* if a transparent fault (due to context switch "late") */
+ if (npxdna())
+ return;
+#endif /* NNPX > 0 */
+ break;
+
case T_PROTFLT: /* general protection fault */
case T_SEGNPFLT: /* segment not present fault */
/*
===
Also, I found a typo in my last patch. The "fnsave 122(%eax)" should
have been "fnsave 112(%eax)". (That's what you get by writing
assembly language when your hamster is scratching her cage.)
Here's the (correct) patch to support.s:
===
Index: support.s
===================================================================
RCS file: /usr/cvs/src/sys/i386/i386/support.s,v
retrieving revision 1.35
diff -u -r1.35 support.s
--- support.s 1996/05/03 21:01:00 1.35
+++ support.s 1996/05/16 08:18:59
@@ -453,6 +453,16 @@
/* bcopy(%esi, %edi, %ebx) */
3:
movl %ebx,%ecx
+#ifdef I586_FAST_BCOPY
+ cmpl $128,%ecx
+ jbe slow_copyout
+
+ call fastmove
+ jmp done_copyout
+
+ ALIGN_TEXT
+slow_copyout:
+#endif /* I586_FAST_BCOPY */
shrl $2,%ecx
cld
rep
@@ -500,6 +510,16 @@
cmpl $VM_MAXUSER_ADDRESS,%edx
ja copyin_fault
+#ifdef I586_FAST_BCOPY
+ cmpl $128,%ecx
+ jbe slow_copyin
+
+ call fastmove
+ jmp done_copyin
+
+ ALIGN_TEXT
+slow_copyin:
+#endif /* I586_FAST_BCOPY */
movb %cl,%al
shrl $2,%ecx /* copy longword-wise */
cld
@@ -510,6 +530,10 @@
rep
movsb
+#ifdef I586_FAST_BCOPY
+ ALIGN_TEXT
+done_copyin:
+#endif /* I586_FAST_BCOPY */
popl %edi
popl %esi
xorl %eax,%eax
@@ -525,6 +549,252 @@
movl $0,PCB_ONFAULT(%edx)
movl $EFAULT,%eax
ret
+
+#ifdef I586_FAST_BCOPY
+/* fastmove(src, dst, len)
+ src in %esi
+ dst in %edi
+ len in %ecx
+ uses %eax and %edx for tmp. storage
+ */
+/*
+LC0:
+ .ascii "npxproc == curproc\0"
+LC1:
+ .ascii "support.s"
+ */
+ ALIGN_TEXT
+fastmove:
+ cmpl $255,%ecx
+ jbe fastmove_tail
+
+ testl $7,%esi /* check if src addr is multiple of 8 */
+ jnz fastmove_tail
+
+ testl $7,%edi /* check if dst addr is multiple of 8 */
+ jnz fastmove_tail
+
+ pushl %ebp
+ movl %esp,%ebp
+ subl $176,%esp
+
+/* if (intr_nesting_level > 0) */
+ cmpb $0,_intr_nesting_level
+ je L6
+/* save reentrantly */
+ movl %cr0,%edx
+ clts
+ fnsave -176(%ebp)
+ jmp L7
+
+/* else { */
+ ALIGN_TEXT
+L6:
+/* if (npxproc != NULL) { */
+ cmpl $0,_npxproc
+ je L8
+/* assert(npxproc == curproc); */
+/* movl _npxproc,%eax
+ cmpl %eax,_curproc
+ je L6b
+ pushl LC0
+ pushl $599
+ pushl LC1
+ call ___assert
+ addl $12,%esp
+L6b: */
+/* fnsave(&curpcb->pcb_savefpu); */
+ movl _curpcb,%eax
+ fnsave 112(%eax)
+/* npxproc = NULL; */
+ movl $0,_npxproc
+/* } */
+L8:
+/* now we own the FPU. */
+
+/*
+ * The process' FP state is saved in the pcb, but if we get
+ * switched, the cpu_switch() will store our FP state in the
+ * pcb. It should be possible to avoid all the copying for
+ * this, e.g., by setting a flag to tell cpu_switch() to
+ * save the state somewhere else.
+ */
+/* tmp = curpcb->pcb_savefpu; */
+ pushl %edi
+ pushl %esi
+ pushl %ecx
+ leal -176(%ebp),%edi
+ movl _curpcb,%esi
+ addl $112,%esi
+ cld
+ movl $44,%ecx
+ rep
+ movsl
+ popl %ecx
+ popl %esi
+ popl %edi
+/* stop_emulating(); */
+ clts
+/* npxproc = curproc; */
+ movl _curproc,%eax
+ movl %eax,_npxproc
+/* } */
+L7:
+ ALIGN_TEXT
+fastmove_loop:
+ movl 32(%esi),%eax
+ movl 64(%esi),%eax
+ movl 96(%esi),%eax
+ movl 128(%esi),%eax
+ movl 160(%esi),%eax
+ movl 192(%esi),%eax
+ movl 224(%esi),%eax
+
+ cmpl $259,%ecx
+ jbe fastmove_tmp
+ movl 256(%esi),%eax
+
+ ALIGN_TEXT
+fastmove_tmp:
+ fildq 0(%esi)
+ fildq 8(%esi)
+ fildq 16(%esi)
+ fildq 24(%esi)
+ fildq 32(%esi)
+ fildq 40(%esi)
+ fildq 48(%esi)
+ fildq 56(%esi)
+ fxch %st(7)
+ fistpq 0(%edi)
+ fxch %st(5)
+ fistpq 8(%edi)
+ fxch %st(3)
+ fistpq 16(%edi)
+ fxch %st(1)
+ fistpq 24(%edi)
+ fistpq 32(%edi)
+ fistpq 40(%edi)
+ fistpq 48(%edi)
+ fistpq 56(%edi)
+ fildq 64(%esi)
+ fildq 72(%esi)
+ fildq 80(%esi)
+ fildq 88(%esi)
+ fildq 96(%esi)
+ fildq 104(%esi)
+ fildq 112(%esi)
+ fildq 120(%esi)
+ fxch %st(7)
+ fistpq 64(%edi)
+ fxch %st(5)
+ fistpq 72(%edi)
+ fxch %st(3)
+ fistpq 80(%edi)
+ fxch %st(1)
+ fistpq 88(%edi)
+ fistpq 96(%edi)
+ fistpq 104(%edi)
+ fistpq 112(%edi)
+ fistpq 120(%edi)
+ fildq 128(%esi)
+ fildq 136(%esi)
+ fildq 144(%esi)
+ fildq 152(%esi)
+ fildq 160(%esi)
+ fildq 168(%esi)
+ fildq 176(%esi)
+ fildq 184(%esi)
+ fxch %st(7)
+ fistpq 128(%edi)
+ fxch %st(5)
+ fistpq 136(%edi)
+ fxch %st(3)
+ fistpq 144(%edi)
+ fxch %st(1)
+ fistpq 152(%edi)
+ fistpq 160(%edi)
+ fistpq 168(%edi)
+ fistpq 176(%edi)
+ fistpq 184(%edi)
+ fildq 192(%esi)
+ fildq 200(%esi)
+ fildq 208(%esi)
+ fildq 216(%esi)
+ fildq 224(%esi)
+ fildq 232(%esi)
+ fildq 240(%esi)
+ fildq 248(%esi)
+ fxch %st(7)
+ fistpq 192(%edi)
+ fxch %st(5)
+ fistpq 200(%edi)
+ fxch %st(3)
+ fistpq 208(%edi)
+ fxch %st(1)
+ fistpq 216(%edi)
+ fistpq 224(%edi)
+ fistpq 232(%edi)
+ fistpq 240(%edi)
+ fistpq 248(%edi)
+ addl $-256,%ecx
+ addl $256,%esi
+ addl $256,%edi
+ cmpl $255,%ecx
+ ja fastmove_loop
+
+/* if (intr_nesting_level > 0) */
+
+ cmpb $0,_intr_nesting_level
+ je L9
+
+/* Restore reentrantly. */
+ frstor -176(%ebp)
+ movl %edx,%cr0
+ jmp L10
+
+/* else { */
+ ALIGN_TEXT
+L9:
+/* curpcb->pcb_savefpu = tmp; */
+ pushl %edi
+ pushl %esi
+ pushl %ecx
+ movl _curpcb,%edi
+ addl $112,%edi
+ leal -176(%ebp),%esi
+ cld
+ movl $44,%ecx
+ rep
+ movsl
+ popl %ecx
+ popl %esi
+ popl %edi
+
+/* start_emulating(); */
+ smsw %ax
+ orb $8,%al
+ lmsw %ax
+/* npxproc = NULL; */
+ movl $0,_npxproc
+/* } */
+L10:
+ movl %ebp,%esp
+ popl %ebp
+
+ ALIGN_TEXT
+fastmove_tail:
+ movb %cl,%al
+ shrl $2,%ecx /* copy longword-wise */
+ cld
+ rep
+ movsl
+ movb %al,%cl
+ andb $3,%cl /* copy remaining bytes */
+ rep
+ movsb
+
+ ret
+#endif /* I586_FAST_BCOPY */
/*
* fu{byte,sword,word} : fetch a byte (sword, word) from user memory
===
* When you get everything
* working, add some tests and flags to restore the panic if a T_DNA
* trap occurs unexpectedly (set a flag FP_KERNEL_USING_FP in pcb->pcbflags
* while the kernel is using FP in fastmove() or elsewhere, and `break;'
* for the kernel T_DNA case if this flag isn't set).
I'm not sure if I understand this, but let me try to digest it a
little more over the weekend. ;)
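If I'm reading it right, though, the idea would be roughly this
(completely untested sketch, just to check my understanding --
FP_KERNEL_USING_FP is your name for it, and I'm only assuming there
is a spare bit in pcb_flags for it):

/* somewhere in pcb.h -- pick a bit that isn't already taken */
#define	FP_KERNEL_USING_FP	0x08	/* kernel is using the FPU on purpose */

/* fastmove() brackets its FPU use with (in C terms; really just a
 * couple of instructions in support.s): */
	curpcb->pcb_flags |= FP_KERNEL_USING_FP;
	/* ... fildq/fistpq copy loop ... */
	curpcb->pcb_flags &= ~FP_KERNEL_USING_FP;

/* and the kernel mode T_DNA case in trap() only tolerates the trap
 * while that flag is set: */
	case T_DNA:
#if NNPX > 0
		if ((curpcb->pcb_flags & FP_KERNEL_USING_FP) && npxdna())
			return;
#endif /* NNPX > 0 */
		break;	/* unexpected, so fall back into the old panic */
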
* Also, the `intr_nesting_level > 0' case needs to preserve the TS bit.
* This case probably hasn't been executed yet (it will be executed when
* bcopy() uses fastmove() and an interrupt handler calls bcopy()).
Oh, really? I thought it was saved via %edx.
Anyway, I think I've got it working here. I have run several make
worlds on a few machines and they don't seem to crash. I also have it
on my home machine and am running all sorts of things from ghostscript
to top and nothing suspicious has happened so far. (I saw a few
processes die with SIGFPE before I fixed the fnsave bug.)
If some brave soul out there can test this, it would be great. Just
apply the above patch to a -current /sys/i386/i386/ and rebuild your
kernel with 'options "I586_FAST_BCOPY"'.
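In case the config(8) dance isn't familiar, it goes roughly like this
(with MYKERNEL standing in for whatever your config file is called):

	# cd /sys/i386/conf
	# (add  options "I586_FAST_BCOPY"  to MYKERNEL)
	# config MYKERNEL
	# cd ../../compile/MYKERNEL
	# make depend && make
	# (install the new kernel and reboot)
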
No guarantees, though. Just because it works here doesn't mean it
won't eat your root partition for lunch. :]
Satoshi
