From owner-freebsd-current@FreeBSD.ORG  Thu Apr 17 13:40:19 2003
Return-Path: <owner-freebsd-current@FreeBSD.ORG>
Delivered-To: freebsd-current@freebsd.org
Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125])
	by hub.freebsd.org (Postfix) with ESMTP
	id 2030F37B401; Thu, 17 Apr 2003 13:40:19 -0700 (PDT)
Received: from mailman.zeta.org.au (mailman.zeta.org.au [203.26.10.16])
	by mx1.FreeBSD.org (Postfix) with ESMTP
	id A667343FBD; Thu, 17 Apr 2003 13:40:17 -0700 (PDT)
	(envelope-from bde@zeta.org.au)
Received: from katana.zip.com.au (katana.zip.com.au [61.8.7.246])
	by mailman.zeta.org.au (8.9.3/8.8.7) with ESMTP id GAA04316;
	Fri, 18 Apr 2003 06:40:03 +1000
Date: Fri, 18 Apr 2003 06:40:02 +1000 (EST)
From: Bruce Evans <bde@zeta.org.au>
X-X-Sender: bde@gamplex.bde.org
To: Ian Dowse <iedowse@maths.tcd.ie>
In-Reply-To: <200304171722.aa13792@salmon.maths.tcd.ie>
Message-ID: <20030418061359.B9905@gamplex.bde.org>
References: <200304171722.aa13792@salmon.maths.tcd.ie>
MIME-Version: 1.0
Content-Type: TEXT/PLAIN; charset=US-ASCII
cc: Nate Lawson <nate@root.org>
cc: current@freebsd.org
cc: Andrew Gallatin <gallatin@cs.duke.edu>
Subject: Re: Your locking and rman changes to pci/if_* 
X-BeenThere: freebsd-current@freebsd.org
X-Mailman-Version: 2.1.1
Precedence: list
List-Id: Discussions about the use of FreeBSD-current
	<freebsd-current.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-current>,
	<mailto:freebsd-current-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-current>
List-Post: <mailto:freebsd-current@freebsd.org>
List-Help: <mailto:freebsd-current-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-current>,
	<mailto:freebsd-current-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Thu, 17 Apr 2003 20:40:19 -0000

On Thu, 17 Apr 2003, Ian Dowse wrote:

> It looks BTW, as if we convert some kernel page faults into witness
> panics, which is not so good... I think it is limited to cases where
> we page fault without Giant, but with a spin lock held (in this
> case callout_lock).

The i386 pagefault handler attempts to handle pagefaults on user address
in kernel mode.  This is dubious in RELENG_4 and just broken in -current
since there is a PROC_LOCK() is always attempted and is sometimes fatal.
The broken cases that I saw involved accesses to the stack gap.  The page
fault could be handled in this case, but it was fatal if enough error
checking was enabled because the process didn't know what it was doing
and held PROC_LOCK() (which is non-recursive).  Your cleanup of some
stack gaps fixed the main offenders.  A page fault while a spin lock is
held is more broken.

My version of trap_pfault() doesn't permit these accesses.  See near
"goto maygo" in the patch.  It is also supposed to not permit page
faults in critical regions, but this is a little broken (it's missing
at least a td_critnest check).  See near the start of the patch.  The
error handling for both cases works much better than panics deep in
witness, at least if DDB is configured.  ddb gets control at the
faulting instruction just like it does for null pointer accesses.

%%%
Index: trap.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/trap.c,v
retrieving revision 1.237
diff -u -2 -r1.237 trap.c
--- trap.c	7 Nov 2002 01:34:23 -0000	1.237
+++ trap.c	22 Nov 2002 17:14:31 -0000
@@ -677,6 +724,6 @@
 {
 	vm_offset_t va;
-	struct vmspace *vm = NULL;
-	vm_map_t map = 0;
+	struct vmspace *vm;
+	vm_map_t map;
 	int rv = 0;
 	vm_prot_t ftype;
@@ -684,8 +731,18 @@
 	struct proc *p = td->td_proc;

+	if (/* XXX intr proc || */ td == PCPU_GET(idlethread) ||
+	    td->td_intr_nesting_level != 0 || (frame->tf_eflags & PSL_I) == 0) {
+		Debugger("impossible pfault");
+		trap_fatal(frame, eva);
+		return (-1);
+	}
+
 	va = trunc_page(eva);
 	if (va >= KERNBASE) {
 		/*
 		 * Don't allow user-mode faults in kernel address space.
+		 */
+#if defined(I586_CPU) && !defined(NO_F00F_HACK)
+		/*
 		 * An exception:  if the faulting address is the invalid
 		 * instruction entry in the IDT, then the Intel Pentium
@@ -694,7 +751,6 @@
 		 * fault.
 		 */
-#if defined(I586_CPU) && !defined(NO_F00F_HACK)
-		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
-			return -2;
+		if (eva == (vm_offset_t)&idt[6] && has_f00f_bug)
+			return (-2);
 #endif
 		if (usermode)
@@ -705,10 +761,26 @@
 		/*
 		 * This is a fault on non-kernel virtual memory.
-		 * vm is initialized above to NULL. If curproc is NULL
-		 * or curproc->p_vmspace is NULL the fault is fatal.
+		 * Do not allow it in kernel mode unless it is for a
+		 * a recognized copying function.
 		 */
-		if (p != NULL)
-			vm = p->p_vmspace;
+		if (!usermode &&
+		    frame->tf_eip != (int)fubyte_access &&
+		    frame->tf_eip != (int)fuword16_access &&
+		    frame->tf_eip != (int)fuword_access &&
+		    frame->tf_eip != (int)subyte_access &&
+		    frame->tf_eip != (int)suword16_access &&
+		    frame->tf_eip != (int)suword_access &&
+		    PCPU_GET(curpcb)->pcb_onfault == NULL) {
+			Debugger(
+			"pagefault for kernel access to unmapped user memory");
+			goto maygo;
+			goto nogo;
+		}
+maygo:

+		/*
+		 * If curproc->p_vmspace is NULL the fault is fatal.
+		 */
+		vm = p->p_vmspace;
 		if (vm == NULL)
 			goto nogo;
@@ -733,6 +805,5 @@
 		/* Fault in the user page: */
 		rv = vm_fault(map, va, ftype,
-			      (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY
-						      : VM_FAULT_NORMAL);
+		    (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : VM_FAULT_NORMAL);

 		PROC_LOCK(p);
@@ -750,7 +821,19 @@
 nogo:
 	if (!usermode) {
-		if (td->td_intr_nesting_level == 0 &&
-		    PCPU_GET(curpcb) != NULL &&
-		    PCPU_GET(curpcb)->pcb_onfault != NULL) {
+#undef MAYBE_FUSU_FAULT
+#define	MAYBE_FUSU_FAULT(where, whereto) do {	\
+	if (frame->tf_eip == (int)where) {	\
+		breakpoint();			\
+		frame->tf_eip = (int)whereto;	\
+		return (0);			\
+	}					\
+} while (0)
+		MAYBE_FUSU_FAULT(fubyte_access, fusufault);
+		MAYBE_FUSU_FAULT(fuword16_access, fusufault);
+		MAYBE_FUSU_FAULT(fuword_access, fusufault);
+		MAYBE_FUSU_FAULT(subyte_access, fusufault);
+		MAYBE_FUSU_FAULT(suword16_access, fusufault);
+		MAYBE_FUSU_FAULT(suword_access, fusufault);
+		if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
 			frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
 			return (0);
@@ -760,8 +843,8 @@
 	}

-	/* kludge to pass faulting virtual address to sendsig */
+	/* Kludge to pass faulting virtual address to sendsig(). */
 	frame->tf_err = eva;

-	return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+	return ((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
 }

%%%