From owner-p4-projects@FreeBSD.ORG Sat Dec 1 22:32:50 2007 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id 92F0116A4DE; Sat, 1 Dec 2007 22:32:50 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 548D516A51B for ; Sat, 1 Dec 2007 22:32:50 +0000 (UTC) (envelope-from peter@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 1D65A13C4CC for ; Sat, 1 Dec 2007 22:32:50 +0000 (UTC) (envelope-from peter@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id lB1MWoZW084766 for ; Sat, 1 Dec 2007 22:32:50 GMT (envelope-from peter@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.1/8.14.1/Submit) id lB1MWn7h084762 for perforce@freebsd.org; Sat, 1 Dec 2007 22:32:49 GMT (envelope-from peter@freebsd.org) Date: Sat, 1 Dec 2007 22:32:49 GMT Message-Id: <200712012232.lB1MWn7h084762@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to peter@freebsd.org using -f From: Peter Wemm To: Perforce Change Reviews Cc: Subject: PERFORCE change 129909 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 01 Dec 2007 22:32:50 -0000 http://perforce.freebsd.org/chv.cgi?CH=129909 Change 129909 by peter@peter_daintree on 2007/12/01 22:30:49 Revert to vendor. too painful to merge, will redo. Affected files ... .. //depot/projects/bike_sched/sys/kern/sched_4bsd.c#8 integrate .. //depot/projects/bike_sched/sys/kern/sched_ule.c#6 integrate Differences ... ==== //depot/projects/bike_sched/sys/kern/sched_4bsd.c#8 (text+ko) ==== @@ -33,12 +33,10 @@ */ #include -__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.86 2006/07/02 20:53:52 maxim Exp $"); +__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.110 2007/11/14 06:21:22 julian Exp $"); #include "opt_hwpmc_hooks.h" -#define kse td_sched - #include #include #include @@ -53,6 +51,7 @@ #include #include #include +#include #include #include @@ -76,56 +75,40 @@ /* * The schedulable entity that runs a context. - * A process may have several of these. Probably one per processor - * but possibly a few more. + * This is an extension to the thread structure and is tailored to + * the requirements of this scheduler */ -struct kse { - TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ - struct thread *ke_thread; /* (*) Active associated thread. */ - fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ - u_char ke_rqindex; /* (j) Run queue index. */ - enum { - KES_THREAD = 0x0, /* slaved to thread state */ - KES_ONRUNQ - } ke_state; /* (j) KSE status. */ - int ke_cpticks; /* (j) Ticks of cpu time. */ - struct runq *ke_runq; /* runq the kse is currently on */ +struct td_sched { + TAILQ_ENTRY(td_sched) ts_procq; /* (j/z) Run queue. */ + struct thread *ts_thread; /* (*) Active associated thread. */ + fixpt_t ts_pctcpu; /* (j) %cpu during p_swtime. */ + u_char ts_rqindex; /* (j) Run queue index. */ + int ts_cpticks; /* (j) Ticks of cpu time. */ + int ts_slptime; /* (j) Seconds !RUNNING. 
*/ + struct runq *ts_runq; /* runq the thread is currently on */ }; -#define td_kse td_sched - /* flags kept in td_flags */ -#define TDF_DIDRUN TDF_SCHED0 /* KSE actually ran. */ -#define TDF_EXIT TDF_SCHED1 /* KSE is being killed. */ +#define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */ +#define TDF_EXIT TDF_SCHED1 /* thread is being killed. */ #define TDF_BOUND TDF_SCHED2 -#define ke_flags ke_thread->td_flags -#define KEF_DIDRUN TDF_DIDRUN /* KSE actually ran. */ -#define KEF_EXIT TDF_EXIT /* KSE is being killed. */ -#define KEF_BOUND TDF_BOUND /* stuck to one CPU */ +#define ts_flags ts_thread->td_flags +#define TSF_DIDRUN TDF_DIDRUN /* thread actually ran. */ +#define TSF_EXIT TDF_EXIT /* thread is being killed. */ +#define TSF_BOUND TDF_BOUND /* stuck to one CPU */ -#define SKE_RUNQ_PCPU(ke) \ - ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq) +#define SKE_RUNQ_PCPU(ts) \ + ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq) -/* - * KSE_CAN_MIGRATE macro returns true if the kse can migrate between - * cpus. - */ -#define KSE_CAN_MIGRATE(ke) \ - ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) - -static struct kse kse0; +static struct td_sched td_sched0; +struct mtx sched_lock; static int sched_tdcnt; /* Total runnable threads in the system. */ static int sched_quantum; /* Roundrobin scheduling quantum in ticks. */ #define SCHED_QUANTUM (hz / 10) /* Default sched quantum */ -static struct callout roundrobin_callout; - -static struct thread *sched_choose(void); - static void setup_runqs(void); -static void roundrobin(void *arg); static void schedcpu(void); static void schedcpu_thread(void); static void sched_priority(struct thread *td, u_char prio); @@ -236,6 +219,12 @@ "account for htt"); #endif +#if 0 +static int sched_followon = 0; +SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW, + &sched_followon, 0, + "allow threads to share a quantum"); +#endif static __inline void sched_load_add(void) @@ -258,36 +247,15 @@ maybe_resched(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority < curthread->td_priority) curthread->td_flags |= TDF_NEEDRESCHED; } /* - * Force switch among equal priority processes every 100ms. - * We don't actually need to force a context switch of the current process. - * The act of firing the event triggers a context switch to softclock() and - * then switching back out again which is equivalent to a preemption, thus - * no further work is needed on the local CPU. - */ -/* ARGSUSED */ -static void -roundrobin(void *arg) -{ - -#ifdef SMP - mtx_lock_spin(&sched_lock); - forward_roundrobin(); - mtx_unlock_spin(&sched_lock); -#endif - - callout_reset(&roundrobin_callout, sched_quantum, roundrobin, NULL); -} - -/* * Constants for digital decay and forget: * 90% of (td_estcpu) usage in 5 * loadav time - * 95% of (ke_pctcpu) usage in 60 seconds (load insensitive) + * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). 
* @@ -352,7 +320,7 @@ #define loadfactor(loadav) (2 * (loadav)) #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) -/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ +/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); @@ -381,77 +349,70 @@ register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); struct thread *td; struct proc *p; - struct kse *ke; + struct td_sched *ts; int awake, realstathz; realstathz = stathz ? stathz : hz; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { - /* - * Prevent state changes and protect run queue. - */ - mtx_lock_spin(&sched_lock); - /* - * Increment time in/out of memory. We ignore overflow; with - * 16-bit int's (remember them?) overflow takes 45 days. - */ - p->p_swtime++; + PROC_SLOCK(p); FOREACH_THREAD_IN_PROC(p, td) { awake = 0; - ke = td->td_kse; + thread_lock(td); + ts = td->td_sched; /* * Increment sleep time (if sleeping). We * ignore overflow, as above. */ /* - * The kse slptimes are not touched in wakeup - * because the thread may not HAVE a KSE. + * The td_sched slptimes are not touched in wakeup + * because the thread may not HAVE everything in + * memory? XXX I think this is out of date. */ - if (ke->ke_state == KES_ONRUNQ) { + if (TD_ON_RUNQ(td)) { awake = 1; - ke->ke_flags &= ~KEF_DIDRUN; - } else if ((ke->ke_state == KES_THREAD) && - (TD_IS_RUNNING(td))) { + ts->ts_flags &= ~TSF_DIDRUN; + } else if (TD_IS_RUNNING(td)) { awake = 1; - /* Do not clear KEF_DIDRUN */ - } else if (ke->ke_flags & KEF_DIDRUN) { + /* Do not clear TSF_DIDRUN */ + } else if (ts->ts_flags & TSF_DIDRUN) { awake = 1; - ke->ke_flags &= ~KEF_DIDRUN; + ts->ts_flags &= ~TSF_DIDRUN; } /* - * ke_pctcpu is only for ps and ttyinfo(). - * Do it per kse, and add them up at the end? + * ts_pctcpu is only for ps and ttyinfo(). + * Do it per td_sched, and add them up at the end? * XXXKSE */ - ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >> - FSHIFT; + ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT; /* - * If the kse has been idle the entire second, + * If the td_sched has been idle the entire second, * stop recalculating its priority until * it wakes up. */ - if (ke->ke_cpticks == 0) - continue; + if (ts->ts_cpticks != 0) { #if (FSHIFT >= CCPU_SHIFT) - ke->ke_pctcpu += (realstathz == 100) - ? ((fixpt_t) ke->ke_cpticks) << - (FSHIFT - CCPU_SHIFT) : - 100 * (((fixpt_t) ke->ke_cpticks) - << (FSHIFT - CCPU_SHIFT)) / realstathz; + ts->ts_pctcpu += (realstathz == 100) + ? ((fixpt_t) ts->ts_cpticks) << + (FSHIFT - CCPU_SHIFT) : + 100 * (((fixpt_t) ts->ts_cpticks) + << (FSHIFT - CCPU_SHIFT)) / realstathz; #else - ke->ke_pctcpu += ((FSCALE - ccpu) * - (ke->ke_cpticks * - FSCALE / realstathz)) >> FSHIFT; + ts->ts_pctcpu += ((FSCALE - ccpu) * + (ts->ts_cpticks * + FSCALE / realstathz)) >> FSHIFT; #endif - ke->ke_cpticks = 0; - + ts->ts_cpticks = 0; + } /* * If there are ANY running threads in this process, * then don't count it as sleeping. 
+XXX this is broken + */ if (awake) { - if (td->td_slptime > 1) { + if (ts->ts_slptime > 1) { /* * In an ideal world, this should not * happen, because whoever woke us @@ -463,16 +424,19 @@ */ updatepri(td); } - td->td_slptime = 0; + ts->ts_slptime = 0; } else - td->td_slptime++; - if (td->td_slptime > 1) + ts->ts_slptime++; + if (ts->ts_slptime > 1) { + thread_unlock(td); continue; + } td->td_estcpu = decay_cpu(loadfac, td->td_estcpu); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } /* end of thread loop */ - mtx_unlock_spin(&sched_lock); + PROC_SUNLOCK(p); } /* end of process loop */ sx_sunlock(&allproc_lock); } @@ -483,11 +447,10 @@ static void schedcpu_thread(void) { - int nowake; for (;;) { schedcpu(); - tsleep(&nowake, 0, "-", hz); + pause("-", hz); } } @@ -499,16 +462,18 @@ static void updatepri(struct thread *td) { - register fixpt_t loadfac; - register unsigned int newcpu; + struct td_sched *ts; + fixpt_t loadfac; + unsigned int newcpu; + ts = td->td_sched; loadfac = loadfactor(averunnable.ldavg[0]); - if (td->td_slptime > 5 * loadfac) + if (ts->ts_slptime > 5 * loadfac) td->td_estcpu = 0; else { newcpu = td->td_estcpu; - td->td_slptime--; /* was incremented in schedcpu() */ - while (newcpu && --td->td_slptime) + ts->ts_slptime--; /* was incremented in schedcpu() */ + while (newcpu && --ts->ts_slptime) newcpu = decay_cpu(loadfac, newcpu); td->td_estcpu = newcpu; } @@ -529,12 +494,12 @@ NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN); newpriority = min(max(newpriority, PRI_MIN_TIMESHARE), PRI_MAX_TIMESHARE); - td->td_user_pri = newpriority; + sched_user_prio(td, newpriority); } } /* - * Update the thread's priority when the associated ksegroup's user + * Update the thread's priority when the associated process's user * priority changes. */ static void @@ -562,11 +527,6 @@ sched_quantum = SCHED_QUANTUM; hogticks = 2 * sched_quantum; - callout_init(&roundrobin_callout, CALLOUT_MPSAFE); - - /* Kick off timeout driven events by calling first time. */ - roundrobin(NULL); - /* Account for thread0. */ sched_load_add(); } @@ -585,9 +545,10 @@ * Set up the scheduler specific parts of proc0. */ proc0.p_sched = NULL; /* XXX */ - thread0.td_sched = &kse0; - kse0.ke_thread = &thread0; - kse0.ke_state = KES_THREAD; + thread0.td_sched = &td_sched0; + thread0.td_lock = &sched_lock; + td_sched0.ts_thread = &thread0; + mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE); } int @@ -625,17 +586,25 @@ void sched_clock(struct thread *td) { - struct kse *ke; + struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); - ke = td->td_kse; + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; - ke->ke_cpticks++; + ts->ts_cpticks++; td->td_estcpu = ESTCPULIM(td->td_estcpu + 1); if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) { resetpriority(td); resetpriority_thread(td); } + + /* + * Force a context switch if the current thread has used up a full + * quantum (default quantum is 100ms). 
+ */ + if (!TD_IS_IDLETHREAD(td) && + ticks - PCPU_GET(switchticks) >= sched_quantum) + td->td_flags |= TDF_NEEDRESCHED; } /* @@ -644,20 +613,39 @@ void sched_exit(struct proc *p, struct thread *td) { - struct thread *parent = FIRST_THREAD_IN_PROC(p); CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d", - td, td->td_proc->p_comm, td->td_priority); + td, td->td_name, td->td_priority); + PROC_SLOCK_ASSERT(p, MA_OWNED); + sched_exit_thread(FIRST_THREAD_IN_PROC(p), td); +} + +void +sched_exit_thread(struct thread *td, struct thread *child) +{ - parent->td_estcpu = ESTCPULIM(parent->td_estcpu + td->td_estcpu); - if ((td->td_proc->p_flag & P_NOLOAD) == 0) + CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", + child, child->td_name, child->td_priority); + thread_lock(td); + td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu); + thread_unlock(td); + mtx_lock_spin(&sched_lock); + if ((child->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); + mtx_unlock_spin(&sched_lock); } void sched_fork(struct thread *td, struct thread *childtd) { + sched_fork_thread(td, childtd); +} + +void +sched_fork_thread(struct thread *td, struct thread *childtd) +{ childtd->td_estcpu = td->td_estcpu; + childtd->td_lock = &sched_lock; sched_newthread(childtd); } @@ -667,18 +655,20 @@ struct thread *td; PROC_LOCK_ASSERT(p, MA_OWNED); - mtx_assert(&sched_lock, MA_OWNED); + PROC_SLOCK_ASSERT(p, MA_OWNED); p->p_nice = nice; FOREACH_THREAD_IN_PROC(p, td) { + thread_lock(td); resetpriority(td); resetpriority_thread(td); + thread_unlock(td); } } void sched_class(struct thread *td, int class) { - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); td->td_pri_class = class; } @@ -689,16 +679,17 @@ sched_priority(struct thread *td, u_char prio) { CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, prio, curthread, - curthread->td_proc->p_comm); + td, td->td_name, td->td_priority, prio, curthread, + curthread->td_name); - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); if (td->td_priority == prio) return; - if (TD_ON_RUNQ(td)) { - adjustrunqueue(td, prio); - } else { - td->td_priority = prio; + td->td_priority = prio; + if (TD_ON_RUNQ(td) && + td->td_sched->ts_rqindex != (prio / RQ_PPQ)) { + sched_rem(td); + sched_add(td, SRQ_BORING); } } @@ -767,26 +758,78 @@ } void +sched_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_base_user_pri = prio; + if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio) + return; + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_lend_user_prio(struct thread *td, u_char prio) +{ + u_char oldprio; + + td->td_flags |= TDF_UBORROWING; + + oldprio = td->td_user_pri; + td->td_user_pri = prio; + + if (TD_ON_UPILOCK(td) && oldprio != prio) + umtx_pi_adjust(td, oldprio); +} + +void +sched_unlend_user_prio(struct thread *td, u_char prio) +{ + u_char base_pri; + + base_pri = td->td_base_user_pri; + if (prio >= base_pri) { + td->td_flags &= ~TDF_UBORROWING; + sched_user_prio(td, base_pri); + } else + sched_lend_user_prio(td, prio); +} + +void sched_sleep(struct thread *td) { - mtx_assert(&sched_lock, MA_OWNED); - td->td_slptime = 0; + THREAD_LOCK_ASSERT(td, MA_OWNED); + td->td_slptick = ticks; + td->td_sched->ts_slptime = 0; } void sched_switch(struct thread *td, struct thread *newtd, int flags) { - struct kse *ke; + struct td_sched *ts; struct proc *p; - ke = td->td_kse; + ts = 
td->td_sched; p = td->td_proc; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); + /* + * Switch to the sched lock to fix things up and pick + * a new thread. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_unlock(td); + } if ((p->p_flag & P_NOLOAD) == 0) sched_load_rem(); + if (newtd) newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED); @@ -800,12 +843,15 @@ * or stopped or any thing else similar. We never put the idle * threads on the run queue, however. */ - if (td == PCPU_GET(idlethread)) + if (td->td_flags & TDF_IDLETD) { TD_SET_CAN_RUN(td); - else { +#ifdef SMP + idle_cpus_mask &= ~PCPU_GET(cpumask); +#endif + } else { if (TD_IS_RUNNING(td)) { - /* Put us back on the run queue (kse and all). */ - setrunqueue(td, (flags & SW_PREEMPT) ? + /* Put us back on the run queue. */ + sched_add(td, (flags & SW_PREEMPT) ? SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : SRQ_OURSELF|SRQ_YIELDING); } @@ -816,45 +862,72 @@ * as if it had been added to the run queue and selected. * It came from: * * A preemption + * * An upcall * * A followon */ KASSERT((newtd->td_inhibitors == 0), - ("trying to run inhibitted thread")); - newtd->td_kse->ke_flags |= KEF_DIDRUN; + ("trying to run inhibited thread")); + newtd->td_sched->ts_flags |= TSF_DIDRUN; TD_SET_RUNNING(newtd); if ((newtd->td_proc->p_flag & P_NOLOAD) == 0) sched_load_add(); } else { newtd = choosethread(); } + MPASS(newtd->td_lock == &sched_lock); if (td != newtd) { #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); #endif - cpu_switch(td, newtd); + + /* I feel sleepy */ + cpu_switch(td, newtd, td->td_lock); + /* + * Where am I? What year is it? + * We are in the same thread that went to sleep above, + * but any amount of time may have passed. All out context + * will still be available as will local variables. + * PCPU values however may have changed as we may have + * changed CPU so don't trust cached values of them. + * New threads will go to fork_exit() instead of here + * so if you change things here you may need to change + * things there too. + * If the thread above was exiting it will never wake + * up again here, so either it has saved everything it + * needed to, or the thread_wait() or wait() will + * need to reap it. 
+ */ #ifdef HWPMC_HOOKS if (PMC_PROC_IS_USING_PMCS(td->td_proc)) PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); #endif } +#ifdef SMP + if (td->td_flags & TDF_IDLETD) + idle_cpus_mask |= PCPU_GET(cpumask); +#endif sched_lock.mtx_lock = (uintptr_t)td; td->td_oncpu = PCPU_GET(cpuid); + MPASS(td->td_lock == &sched_lock); } void sched_wakeup(struct thread *td) { + struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); - if (td->td_slptime > 1) { + THREAD_LOCK_ASSERT(td, MA_OWNED); + ts = td->td_sched; + if (ts->ts_slptime > 1) { updatepri(td); resetpriority(td); } - td->td_slptime = 0; - setrunqueue(td, SRQ_BORING); + td->td_slptick = ticks; + ts->ts_slptime = 0; + sched_add(td, SRQ_BORING); } #ifdef SMP @@ -984,45 +1057,56 @@ sched_add(struct thread *td, int flags) #ifdef SMP { - struct kse *ke; + struct td_sched *ts; int forwarded = 0; int cpu; int single_cpu = 0; - ke = td->td_kse; - mtx_assert(&sched_lock, MA_OWNED); - KASSERT(ke->ke_state != KES_ONRUNQ, - ("sched_add: kse %p (%s) already in run queue", ke, - td->td_proc->p_comm)); - KASSERT(td->td_proc->p_sflag & PS_INMEM, - ("sched_add: process swapped out")); + ts = td->td_sched; + THREAD_LOCK_ASSERT(td, MA_OWNED); + KASSERT((td->td_inhibitors == 0), + ("sched_add: trying to run inhibited thread")); + KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), + ("sched_add: bad thread state")); + KASSERT(td->td_flags & TDF_INMEM, + ("sched_add: thread swapped out")); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, curthread, - curthread->td_proc->p_comm); + td, td->td_name, td->td_priority, curthread, + curthread->td_name); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + TD_SET_RUNQ(td); if (td->td_pinned != 0) { cpu = td->td_lastcpu; - ke->ke_runq = &runq_pcpu[cpu]; + ts->ts_runq = &runq_pcpu[cpu]; single_cpu = 1; CTR3(KTR_RUNQ, - "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu); - } else if ((ke)->ke_flags & KEF_BOUND) { + "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); + } else if ((ts)->ts_flags & TSF_BOUND) { /* Find CPU from bound runq */ - KASSERT(SKE_RUNQ_PCPU(ke),("sched_add: bound kse not on cpu runq")); - cpu = ke->ke_runq - &runq_pcpu[0]; + KASSERT(SKE_RUNQ_PCPU(ts),("sched_add: bound td_sched not on cpu runq")); + cpu = ts->ts_runq - &runq_pcpu[0]; single_cpu = 1; CTR3(KTR_RUNQ, - "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu); + "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu); } else { CTR2(KTR_RUNQ, - "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td); + "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td); cpu = NOCPU; - ke->ke_runq = &runq; + ts->ts_runq = &runq; } if (single_cpu && (cpu != PCPU_GET(cpuid))) { kick_other_cpu(td->td_priority,cpu); } else { + if (!single_cpu) { cpumask_t me = PCPU_GET(cpumask); int idle = idle_cpus_mask & me; @@ -1031,6 +1115,7 @@ (idle_cpus_mask & ~(hlt_cpus_mask | me))) forwarded = forward_wakeup(cpu); } + if (!forwarded) { if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td)) return; @@ -1041,24 +1126,33 @@ if ((td->td_proc->p_flag & P_NOLOAD) == 0) sched_load_add(); - runq_add(ke->ke_runq, ke, flags); - ke->ke_state = KES_ONRUNQ; + runq_add(ts->ts_runq, ts, flags); } #else /* SMP */ { - struct kse *ke; - ke = td->td_kse; - mtx_assert(&sched_lock, MA_OWNED); - KASSERT(ke->ke_state != KES_ONRUNQ, - ("sched_add: kse %p (%s) 
already in run queue", ke, - td->td_proc->p_comm)); - KASSERT(td->td_proc->p_sflag & PS_INMEM, - ("sched_add: process swapped out")); + struct td_sched *ts; + ts = td->td_sched; + THREAD_LOCK_ASSERT(td, MA_OWNED); + KASSERT((td->td_inhibitors == 0), + ("sched_add: trying to run inhibited thread")); + KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), + ("sched_add: bad thread state")); + KASSERT(td->td_flags & TDF_INMEM, + ("sched_add: thread swapped out")); CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, curthread, - curthread->td_proc->p_comm); - CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td); - ke->ke_runq = &runq; + td, td->td_name, td->td_priority, curthread, + curthread->td_name); + /* + * Now that the thread is moving to the run-queue, set the lock + * to the scheduler's lock. + */ + if (td->td_lock != &sched_lock) { + mtx_lock_spin(&sched_lock); + thread_lock_set(td, &sched_lock); + } + TD_SET_RUNQ(td); + CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td); + ts->ts_runq = &runq; /* * If we are yielding (on the way out anyhow) @@ -1077,70 +1171,30 @@ } if ((td->td_proc->p_flag & P_NOLOAD) == 0) sched_load_add(); - runq_add(ke->ke_runq, ke, flags); - ke->ke_state = KES_ONRUNQ; + runq_add(ts->ts_runq, ts, flags); maybe_resched(td); } #endif /* SMP */ void -sched_run_ithread(struct thread *td) -{ - struct kse *ke = td->td_kse; - - /* Inline of setrunqueue */ - CTR2(KTR_RUNQ, "sched_run_ithread: td:%p pid:%d", - td, td->td_proc->p_pid); - CTR5(KTR_SCHED, "sched_run_ithread: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, ctd, - ctd->td_proc->p_comm); - mtx_assert(&sched_lock, MA_OWNED); - KASSERT((td->td_inhibitors == 0), - ("sched_run_ithread: trying to run inhibitted thread")); - KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), - ("sched_run_ithread: bad thread state")); - KASSERT(ke->ke_state != KES_ONRUNQ, - ("sched_run_ithread: kse %p (%s) already in run queue", ke, - td->td_proc->p_comm)); - KASSERT(td->td_proc->p_sflag & PS_INMEM, - ("sched_run_ithread: process swapped out")); - CTR5(KTR_SCHED, "sched_run_ithread: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, curthread, - curthread->td_proc->p_comm); - CTR2(KTR_RUNQ, "sched_run_ithread: adding kse:%p (td:%p) to runq", ke, td); - - TD_SET_RUNQ(td); - ke->ke_runq = &runq; - /* Preempt if we can. If we did, we're finished */ - if (maybe_preempt(td)) - return; - /* We didn't preempt. 
Place on runq */ - if ((td->td_proc->p_flag & P_NOLOAD) == 0) - sched_load_add(); - runq_add(ke->ke_runq, ke, SRQ_INTR); - ke->ke_state = KES_ONRUNQ; - maybe_resched(td); -} - -void sched_rem(struct thread *td) { - struct kse *ke; + struct td_sched *ts; - ke = td->td_kse; - KASSERT(td->td_proc->p_sflag & PS_INMEM, - ("sched_rem: process swapped out")); - KASSERT((ke->ke_state == KES_ONRUNQ), - ("sched_rem: KSE not on run queue")); + ts = td->td_sched; + KASSERT(td->td_flags & TDF_INMEM, + ("sched_rem: thread swapped out")); + KASSERT(TD_ON_RUNQ(td), + ("sched_rem: thread not on run queue")); mtx_assert(&sched_lock, MA_OWNED); CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", - td, td->td_proc->p_comm, td->td_priority, curthread, - curthread->td_proc->p_comm); + td, td->td_name, td->td_priority, curthread, + curthread->td_name); if ((td->td_proc->p_flag & P_NOLOAD) == 0) sched_load_rem(); - runq_remove(ke->ke_runq, ke); - ke->ke_state = KES_THREAD; + runq_remove(ts->ts_runq, ts); + TD_SET_CAN_RUN(td); } /* @@ -1150,58 +1204,83 @@ struct thread * sched_choose(void) { - struct kse *ke; + struct td_sched *ts; struct runq *rq; + mtx_assert(&sched_lock, MA_OWNED); #ifdef SMP - struct kse *kecpu; + struct td_sched *kecpu; rq = &runq; - ke = runq_choose(&runq); + ts = runq_choose(&runq); kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]); - if (ke == NULL || + if (ts == NULL || (kecpu != NULL && - kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) { - CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu, + kecpu->ts_thread->td_priority < ts->ts_thread->td_priority)) { + CTR2(KTR_RUNQ, "choosing td_sched %p from pcpu runq %d", kecpu, PCPU_GET(cpuid)); - ke = kecpu; + ts = kecpu; rq = &runq_pcpu[PCPU_GET(cpuid)]; } else { - CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke); + CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", ts); } #else rq = &runq; - ke = runq_choose(&runq); + ts = runq_choose(&runq); #endif - if (ke) { - runq_remove(rq, ke); - ke->ke_state = KES_THREAD; + if (ts) { + runq_remove(rq, ts); + ts->ts_flags |= TSF_DIDRUN; + + KASSERT(ts->ts_thread->td_flags & TDF_INMEM, + ("sched_choose: thread swapped out")); + return (ts->ts_thread); + } + return (PCPU_GET(idlethread)); +} - KASSERT(ke->ke_thread->td_proc->p_sflag & PS_INMEM, - ("sched_choose: process swapped out")); - return (ke->ke_thread); +void +sched_userret(struct thread *td) +{ + /* + * XXX we cheat slightly on the locking here to avoid locking in + * the usual case. Setting td_priority here is essentially an + * incomplete workaround for not setting it properly elsewhere. + * Now that some interrupt handlers are threads, not setting it + * properly elsewhere can clobber it in the window between setting + * it here and returning to user mode, so don't waste time setting + * it perfectly here. + */ + KASSERT((td->td_flags & TDF_BORROWING) == 0, + ("thread with borrowed priority returning to userland")); + if (td->td_priority != td->td_user_pri) { + thread_lock(td); + td->td_priority = td->td_user_pri; + td->td_base_pri = td->td_user_pri; + thread_unlock(td); } - return (NULL); } void sched_bind(struct thread *td, int cpu) { - struct kse *ke; + struct td_sched *ts; - mtx_assert(&sched_lock, MA_OWNED); + THREAD_LOCK_ASSERT(td, MA_OWNED); KASSERT(TD_IS_RUNNING(td), ("sched_bind: cannot bind non-running thread")); - ke = td->td_kse; - ke->ke_flags |= KEF_BOUND; + + ts = td->td_sched; + + ts->ts_flags |= TSF_BOUND; #ifdef SMP >>> TRUNCATED FOR MAIL (1000 lines) <<<
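For reference, the %cpu bookkeeping that the schedcpu() hunks above touch is a simple fixed-point exponential decay: once per second ts_pctcpu is multiplied by ccpu = exp(-1/20) expressed in fixed point, so after 60 seconds only exp(-3), roughly 5%, of the original value survives, which is the "decay 95% of ts_pctcpu in 60 seconds" noted in the driver comments. Below is a minimal standalone userland sketch of that arithmetic, not code from the change itself; the FSHIFT/FSCALE values are assumed to mirror the stock fixed-point scale in sys/param.h, and unsigned int stands in for the kernel's fixpt_t.

/*
 * Standalone sketch (not kernel code) of the per-second %cpu decay
 * applied by schedcpu() in sched_4bsd.c.  FSHIFT/FSCALE are assumed
 * to match the kernel's fixed-point scale; exact values illustrative.
 */
#include <stdio.h>
#include <math.h>

#define	FSHIFT	11		/* bits of fraction in the fixed-point format */
#define	FSCALE	(1 << FSHIFT)	/* 1.0 in fixed point */

typedef unsigned int fixpt_t;

int
main(void)
{
	/* ccpu = exp(-1/20) in fixed point, as in sched_4bsd.c. */
	fixpt_t ccpu = (fixpt_t)(0.95122942450071400909 * FSCALE);
	fixpt_t pctcpu = FSCALE;	/* start at 100% cpu */
	int sec;

	/*
	 * Apply ts_pctcpu = (ts_pctcpu * ccpu) >> FSHIFT once per
	 * simulated second for one minute; roughly 5% should remain,
	 * i.e. about 95% of the original value has decayed.
	 */
	for (sec = 1; sec <= 60; sec++)
		pctcpu = (pctcpu * ccpu) >> FSHIFT;

	printf("after 60s: %.1f%% of the original %%cpu remains "
	    "(exp(-3) is about %.1f%%)\n",
	    100.0 * pctcpu / FSCALE, 100.0 * exp(-3.0));
	return (0);
}

Running the sketch prints a remaining fraction just under 5%, slightly below exp(-3) because the right shift truncates on every step, which is the same behaviour the kernel's integer-only loop exhibits.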