From owner-svn-src-stable@FreeBSD.ORG  Sat Oct  6 12:25:14 2012
Return-Path: <owner-svn-src-stable@FreeBSD.ORG>
Delivered-To: svn-src-stable@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [69.147.83.52])
	by hub.freebsd.org (Postfix) with ESMTP id 5AA791065670;
	Sat,  6 Oct 2012 12:25:14 +0000 (UTC) (envelope-from mav@FreeBSD.org)
Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c])
	by mx1.freebsd.org (Postfix) with ESMTP id 420F48FC08;
	Sat,  6 Oct 2012 12:25:14 +0000 (UTC)
Received: from svn.freebsd.org (localhost [127.0.0.1])
	by svn.freebsd.org (8.14.4/8.14.4) with ESMTP id q96CPEKR062395;
	Sat, 6 Oct 2012 12:25:14 GMT (envelope-from mav@svn.freebsd.org)
Received: (from mav@localhost)
	by svn.freebsd.org (8.14.4/8.14.4/Submit) id q96CPEhs062393;
	Sat, 6 Oct 2012 12:25:14 GMT (envelope-from mav@svn.freebsd.org)
Message-Id: <201210061225.q96CPEhs062393@svn.freebsd.org>
From: Alexander Motin <mav@FreeBSD.org>
Date: Sat, 6 Oct 2012 12:25:14 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org,
	svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
X-SVN-Group: stable-8
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cc: 
Subject: svn commit: r241246 - stable/8/sys/kern
X-BeenThere: svn-src-stable@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: SVN commit messages for all the -stable branches of the src tree
	<svn-src-stable.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-stable>, 
	<mailto:svn-src-stable-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/svn-src-stable>
List-Post: <mailto:svn-src-stable@freebsd.org>
List-Help: <mailto:svn-src-stable-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/svn-src-stable>,
	<mailto:svn-src-stable-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Sat, 06 Oct 2012 12:25:14 -0000

Author: mav
Date: Sat Oct  6 12:25:13 2012
New Revision: 241246
URL: http://svn.freebsd.org/changeset/base/241246

Log:
  MFC r232207, r232454, r234066:
  Rework CPU load balancing in SCHED_ULE:
   - In sched_pickcpu() be more careful taking previous CPU on SMT systems.
  Do it only if all other logical CPUs of that physical one are idle to avoid
  extra resource sharing.
   - In sched_pickcpu() change general logic of CPU selection. First
  look for idle CPU, sharing last level cache with previously used one,
  skipping SMT CPU groups. If none found, search all CPUs for the least loaded
  one, where the thread with its priority can run now. If none found, search
  just for the least loaded CPU.
   - Make cpu_search() compare lowest/highest CPU load when comparing CPU
  groups with equal load. That allows to differentiate 1+1 and 2+0 loads.
   - Make cpu_search() to prefer specified (previous) CPU or group if load
  is equal. This improves cache affinity for more complicated topologies.
   - Randomize CPU selection if above factors are equal. Previous code tend
  to prefer CPUs with lower IDs, causing unneeded collisions.
   - Rework periodic balancer in sched_balance_group(). With cpu_search()
  more intelligent now, make balansing process flat, removing recursion
  over the topology tree. That fixes double swap problem and makes load
  distribution more even and predictable.
  
  All together this gives 10-15% performance improvement in many tests on
  CPUs with SMT, such as Core i7, for number of threads is less then number
  of logical CPUs. In some tests it also gives positive effect to systems
  without SMT.

Modified:
  stable/8/sys/kern/sched_ule.c
Directory Properties:
  stable/8/sys/   (props changed)
  stable/8/sys/kern/   (props changed)

Modified: stable/8/sys/kern/sched_ule.c
==============================================================================
--- stable/8/sys/kern/sched_ule.c	Sat Oct  6 10:02:11 2012	(r241245)
+++ stable/8/sys/kern/sched_ule.c	Sat Oct  6 12:25:13 2012	(r241246)
@@ -255,7 +255,6 @@ struct cpu_group *cpu_top;		/* CPU topol
 static int rebalance = 1;
 static int balance_interval = 128;	/* Default set in sched_initticks(). */
 static int affinity;
-static int steal_htt = 1;
 static int steal_idle = 1;
 static int steal_thresh = 2;
 
@@ -265,6 +264,7 @@ static int steal_thresh = 2;
 static struct tdq	tdq_cpu[MAXCPU];
 static struct tdq	*balance_tdq;
 static int balance_ticks;
+static DPCPU_DEFINE(uint32_t, randomval);
 
 #define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
 #define	TDQ_CPU(x)	(&tdq_cpu[(x)])
@@ -571,9 +571,11 @@ tdq_setlowpri(struct tdq *tdq, struct th
 #ifdef SMP
 struct cpu_search {
 	cpuset_t cs_mask;
-	u_int	cs_load;
-	u_int	cs_cpu;
-	int	cs_limit;	/* Min priority for low min load for high. */
+	u_int	cs_prefer;
+	int	cs_pri;		/* Min priority for low. */
+	int	cs_limit;	/* Max load for low, min load for high. */
+	int	cs_cpu;
+	int	cs_load;
 };
 
 #define	CPU_SEARCH_LOWEST	0x1
@@ -584,44 +586,14 @@ struct cpu_search {
 	for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)		\
 		if ((mask) & 1 << (cpu))
 
-static __inline int cpu_search(struct cpu_group *cg, struct cpu_search *low,
+static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match);
-int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low);
-int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high);
-int cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
+int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
+int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high);
 
 /*
- * This routine compares according to the match argument and should be
- * reduced in actual instantiations via constant propagation and dead code
- * elimination.
- */ 
-static __inline int
-cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high,
-    const int match)
-{
-	struct tdq *tdq;
-
-	tdq = TDQ_CPU(cpu);
-	if (match & CPU_SEARCH_LOWEST)
-		if (CPU_ISSET(cpu, &low->cs_mask) &&
-		    tdq->tdq_load < low->cs_load &&
-		    tdq->tdq_lowpri > low->cs_limit) {
-			low->cs_cpu = cpu;
-			low->cs_load = tdq->tdq_load;
-		}
-	if (match & CPU_SEARCH_HIGHEST)
-		if (CPU_ISSET(cpu, &high->cs_mask) &&
-		    tdq->tdq_load >= high->cs_limit && 
-		    tdq->tdq_load > high->cs_load &&
-		    tdq->tdq_transferable) {
-			high->cs_cpu = cpu;
-			high->cs_load = tdq->tdq_load;
-		}
-	return (tdq->tdq_load);
-}
-
-/*
  * Search the tree of cpu_groups for the lowest or highest loaded cpu
  * according to the match argument.  This routine actually compares the
  * load on all paths through the tree and finds the least loaded cpu on
@@ -633,33 +605,44 @@ cpu_compare(int cpu, struct cpu_search *
  * also recursive to the depth of the tree.
  */
 static __inline int
-cpu_search(struct cpu_group *cg, struct cpu_search *low,
+cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match)
 {
-	int total;
+	struct cpu_search lgroup;
+	struct cpu_search hgroup;
+	cpumask_t cpumask;
+	struct cpu_group *child;
+	struct tdq *tdq;
+	int cpu, i, hload, lload, load, total, rnd, *rndptr;
 
 	total = 0;
-	if (cg->cg_children) {
-		struct cpu_search lgroup;
-		struct cpu_search hgroup;
-		struct cpu_group *child;
-		u_int lload;
-		int hload;
-		int load;
-		int i;
-
-		lload = -1;
-		hload = -1;
-		for (i = 0; i < cg->cg_children; i++) {
-			child = &cg->cg_child[i];
-			if (match & CPU_SEARCH_LOWEST) {
-				lgroup = *low;
-				lgroup.cs_load = -1;
-			}
-			if (match & CPU_SEARCH_HIGHEST) {
-				hgroup = *high;
-				lgroup.cs_load = 0;
-			}
+	cpumask = cg->cg_mask;
+	if (match & CPU_SEARCH_LOWEST) {
+		lload = INT_MAX;
+		lgroup = *low;
+	}
+	if (match & CPU_SEARCH_HIGHEST) {
+		hload = INT_MIN;
+		hgroup = *high;
+	}
+
+	/* Iterate through the child CPU groups and then remaining CPUs. */
+	for (i = cg->cg_children, cpu = mp_maxid; i >= 0; ) {
+		if (i == 0) {
+			while (cpu >= 0 && ((1 << cpu) & cpumask) == 0)
+				cpu--;
+			if (cpu < 0)
+				break;
+			child = NULL;
+		} else
+			child = &cg->cg_child[i - 1];
+
+		if (match & CPU_SEARCH_LOWEST)
+			lgroup.cs_cpu = -1;
+		if (match & CPU_SEARCH_HIGHEST)
+			hgroup.cs_cpu = -1;
+		if (child) {			/* Handle child CPU group. */
+			cpumask &= ~child->cg_mask;
 			switch (match) {
 			case CPU_SEARCH_LOWEST:
 				load = cpu_search_lowest(child, &lgroup);
@@ -671,23 +654,56 @@ cpu_search(struct cpu_group *cg, struct 
 				load = cpu_search_both(child, &lgroup, &hgroup);
 				break;
 			}
-			total += load;
-			if (match & CPU_SEARCH_LOWEST)
-				if (load < lload || low->cs_cpu == -1) {
-					*low = lgroup;
-					lload = load;
+		} else {			/* Handle child CPU. */
+			tdq = TDQ_CPU(cpu);
+			load = tdq->tdq_load * 256;
+			rndptr = DPCPU_PTR(randomval);
+			rnd = (*rndptr = *rndptr * 69069 + 5) >> 26;
+			if (match & CPU_SEARCH_LOWEST) {
+				if (cpu == low->cs_prefer)
+					load -= 64;
+				/* If that CPU is allowed and get data. */
+				if (tdq->tdq_lowpri > lgroup.cs_pri &&
+				    tdq->tdq_load <= lgroup.cs_limit &&
+				    CPU_ISSET(cpu, &lgroup.cs_mask)) {
+					lgroup.cs_cpu = cpu;
+					lgroup.cs_load = load - rnd;
 				}
-			if (match & CPU_SEARCH_HIGHEST) 
-				if (load > hload || high->cs_cpu == -1) {
-					hload = load;
-					*high = hgroup;
+			}
+			if (match & CPU_SEARCH_HIGHEST)
+				if (tdq->tdq_load >= hgroup.cs_limit &&
+				    tdq->tdq_transferable &&
+				    CPU_ISSET(cpu, &hgroup.cs_mask)) {
+					hgroup.cs_cpu = cpu;
+					hgroup.cs_load = load - rnd;
 				}
 		}
-	} else {
-		int cpu;
+		total += load;
 
-		CPUSET_FOREACH(cpu, cg->cg_mask)
-			total += cpu_compare(cpu, low, high, match);
+		/* We have info about child item. Compare it. */
+		if (match & CPU_SEARCH_LOWEST) {
+			if (lgroup.cs_cpu >= 0 &&
+			    (load < lload ||
+			     (load == lload && lgroup.cs_load < low->cs_load))) {
+				lload = load;
+				low->cs_cpu = lgroup.cs_cpu;
+				low->cs_load = lgroup.cs_load;
+			}
+		}
+		if (match & CPU_SEARCH_HIGHEST)
+			if (hgroup.cs_cpu >= 0 &&
+			    (load > hload ||
+			     (load == hload && hgroup.cs_load > high->cs_load))) {
+				hload = load;
+				high->cs_cpu = hgroup.cs_cpu;
+				high->cs_load = hgroup.cs_load;
+			}
+		if (child) {
+			i--;
+			if (i == 0 && cpumask == 0)
+				break;
+		} else
+			cpu--;
 	}
 	return (total);
 }
@@ -697,19 +713,19 @@ cpu_search(struct cpu_group *cg, struct 
  * optimization.
  */
 int
-cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low)
+cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low)
 {
 	return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST);
 }
 
 int
-cpu_search_highest(struct cpu_group *cg, struct cpu_search *high)
+cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high)
 {
 	return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST);
 }
 
 int
-cpu_search_both(struct cpu_group *cg, struct cpu_search *low,
+cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high)
 {
 	return cpu_search(cg, low, high, CPU_SEARCH_BOTH);
@@ -721,14 +737,16 @@ cpu_search_both(struct cpu_group *cg, st
  * acceptable.
  */
 static inline int
-sched_lowest(struct cpu_group *cg, cpuset_t mask, int pri)
+sched_lowest(const struct cpu_group *cg, cpuset_t mask, int pri, int maxload,
+    int prefer)
 {
 	struct cpu_search low;
 
 	low.cs_cpu = -1;
-	low.cs_load = -1;
+	low.cs_prefer = prefer;
 	low.cs_mask = mask;
-	low.cs_limit = pri;
+	low.cs_pri = pri;
+	low.cs_limit = maxload;
 	cpu_search_lowest(cg, &low);
 	return low.cs_cpu;
 }
@@ -737,12 +755,11 @@ sched_lowest(struct cpu_group *cg, cpuse
  * Find the cpu with the highest load via the highest loaded path.
  */
 static inline int
-sched_highest(struct cpu_group *cg, cpuset_t mask, int minload)
+sched_highest(const struct cpu_group *cg, cpuset_t mask, int minload)
 {
 	struct cpu_search high;
 
 	high.cs_cpu = -1;
-	high.cs_load = 0;
 	high.cs_mask = mask;
 	high.cs_limit = minload;
 	cpu_search_highest(cg, &high);
@@ -753,17 +770,17 @@ sched_highest(struct cpu_group *cg, cpus
  * Simultaneously find the highest and lowest loaded cpu reachable via
  * cg.
  */
-static inline void 
-sched_both(struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
+static inline void
+sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
 {
 	struct cpu_search high;
 	struct cpu_search low;
 
 	low.cs_cpu = -1;
-	low.cs_limit = -1;
-	low.cs_load = -1;
+	low.cs_prefer = -1;
+	low.cs_pri = -1;
+	low.cs_limit = INT_MAX;
 	low.cs_mask = mask;
-	high.cs_load = 0;
 	high.cs_cpu = -1;
 	high.cs_limit = -1;
 	high.cs_mask = mask;
@@ -776,30 +793,45 @@ sched_both(struct cpu_group *cg, cpuset_
 static void
 sched_balance_group(struct cpu_group *cg)
 {
-	cpuset_t mask;
-	int high;
-	int low;
-	int i;
+	cpuset_t hmask, lmask;
+	int high, low, anylow;
 
-	CPU_FILL(&mask);
+	CPU_FILL(&hmask);
 	for (;;) {
-		sched_both(cg, mask, &low, &high);
-		if (low == high || low == -1 || high == -1)
+		high = sched_highest(cg, hmask, 1);
+		/* Stop if there is no more CPU with transferrable threads. */
+		if (high == -1)
 			break;
-		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low)))
+		CPU_CLR(high, &hmask);
+		CPU_COPY(&hmask, &lmask);
+		/* Stop if there is no more CPU left for low. */
+		if (CPU_EMPTY(&lmask))
 			break;
-		/*
-		 * If we failed to move any threads determine which cpu
-		 * to kick out of the set and try again.
-	 	 */
-		if (TDQ_CPU(high)->tdq_transferable == 0)
-			CPU_CLR(high, &mask);
-		else
-			CPU_CLR(low, &mask);
+		anylow = 1;
+nextlow:
+		low = sched_lowest(cg, lmask, -1,
+		    TDQ_CPU(high)->tdq_load - 1, high);
+		/* Stop if we looked well and found no less loaded CPU. */
+		if (anylow && low == -1)
+			break;
+		/* Go to next high if we found no less loaded CPU. */
+		if (low == -1)
+			continue;
+		/* Transfer thread from high to low. */
+		if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) {
+			/* CPU that got thread can no longer be a donor. */
+			CPU_CLR(low, &hmask);
+		} else {
+			/*
+			 * If failed, then there is no threads on high
+			 * that can run on this low. Drop low from low
+			 * mask and look for different one.
+			 */
+			CPU_CLR(low, &lmask);
+			anylow = 0;
+			goto nextlow;
+		}
 	}
-
-	for (i = 0; i < cg->cg_children; i++)
-		sched_balance_group(&cg->cg_child[i]);
 }
 
 static void
@@ -852,31 +884,16 @@ tdq_unlock_pair(struct tdq *one, struct 
 static int
 sched_balance_pair(struct tdq *high, struct tdq *low)
 {
-	int transferable;
-	int high_load;
-	int low_load;
 	int moved;
-	int move;
-	int diff;
-	int i;
 
 	tdq_lock_pair(high, low);
-	transferable = high->tdq_transferable;
-	high_load = high->tdq_load;
-	low_load = low->tdq_load;
 	moved = 0;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
 	 * threads we actually have to give up (transferable).
 	 */
-	if (transferable != 0) {
-		diff = high_load - low_load;
-		move = diff / 2;
-		if (diff & 0x1)
-			move++;
-		move = min(move, transferable);
-		for (i = 0; i < move; i++)
-			moved += tdq_move(high, low);
+	if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load &&
+	    (moved = tdq_move(high, low)) > 0) {
 		/*
 		 * IPI the target cpu to force it to reschedule with the new
 		 * workload.
@@ -1016,8 +1033,7 @@ runq_steal_from(struct runq *rq, int cpu
 {
 	struct rqbits *rqb;
 	struct rqhead *rqh;
-	struct thread *td;
-	int first;
+	struct thread *td, *first;
 	int bit;
 	int pri;
 	int i;
@@ -1025,7 +1041,7 @@ runq_steal_from(struct runq *rq, int cpu
 	rqb = &rq->rq_status;
 	bit = start & (RQB_BPW -1);
 	pri = 0;
-	first = 0;
+	first = NULL;
 again:
 	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
 		if (rqb->rqb_bits[i] == 0)
@@ -1044,7 +1060,7 @@ again:
 			if (first && THREAD_CAN_MIGRATE(td) &&
 			    THREAD_CAN_SCHED(td, cpu))
 				return (td);
-			first = 1;
+			first = td;
 		}
 	}
 	if (start != 0) {
@@ -1052,6 +1068,9 @@ again:
 		goto again;
 	}
 
+	if (first && THREAD_CAN_MIGRATE(first) &&
+	    THREAD_CAN_SCHED(first, cpu))
+		return (first);
 	return (NULL);
 }
 
@@ -1153,13 +1172,11 @@ SCHED_STAT_DEFINE(pickcpu_migration, "Se
 static int
 sched_pickcpu(struct thread *td, int flags)
 {
-	struct cpu_group *cg;
+	struct cpu_group *cg, *ccg;
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t mask;
-	int self;
-	int pri;
-	int cpu;
+	int cpu, pri, self;
 
 	self = PCPU_GET(cpuid);
 	ts = td->td_sched;
@@ -1174,52 +1191,77 @@ sched_pickcpu(struct thread *td, int fla
 	 * Prefer to run interrupt threads on the processors that generate
 	 * the interrupt.
 	 */
+	pri = td->td_priority;
 	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) &&
 	    curthread->td_intr_nesting_level && ts->ts_cpu != self) {
 		SCHED_STAT_INC(pickcpu_intrbind);
 		ts->ts_cpu = self;
+		if (TDQ_CPU(self)->tdq_lowpri > pri) {
+			SCHED_STAT_INC(pickcpu_affinity);
+			return (ts->ts_cpu);
+		}
 	}
 	/*
 	 * If the thread can run on the last cpu and the affinity has not
 	 * expired or it is idle run it there.
 	 */
-	pri = td->td_priority;
 	tdq = TDQ_CPU(ts->ts_cpu);
-	if (THREAD_CAN_SCHED(td, ts->ts_cpu)) {
-		if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+	cg = tdq->tdq_cg;
+	if (THREAD_CAN_SCHED(td, ts->ts_cpu) &&
+	    tdq->tdq_lowpri >= PRI_MIN_IDLE &&
+	    SCHED_AFFINITY(ts, CG_SHARE_L2)) {
+		if (cg->cg_flags & CG_FLAG_THREAD) {
+			CPUSET_FOREACH(cpu, cg->cg_mask) {
+				if (TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE)
+					break;
+			}
+		} else
+			cpu = INT_MAX;
+		if (cpu > mp_maxid) {
 			SCHED_STAT_INC(pickcpu_idle_affinity);
 			return (ts->ts_cpu);
 		}
-		if (SCHED_AFFINITY(ts, CG_SHARE_L2) && tdq->tdq_lowpri > pri) {
-			SCHED_STAT_INC(pickcpu_affinity);
-			return (ts->ts_cpu);
-		}
 	}
 	/*
-	 * Search for the highest level in the tree that still has affinity.
+	 * Search for the last level cache CPU group in the tree.
+	 * Skip caches with expired affinity time and SMT groups.
+	 * Affinity to higher level caches will be handled less aggressively.
 	 */
-	cg = NULL;
-	for (cg = tdq->tdq_cg; cg != NULL; cg = cg->cg_parent)
-		if (SCHED_AFFINITY(ts, cg->cg_level))
-			break;
+	for (ccg = NULL; cg != NULL; cg = cg->cg_parent) {
+		if (cg->cg_flags & CG_FLAG_THREAD)
+			continue;
+		if (!SCHED_AFFINITY(ts, cg->cg_level))
+			continue;
+		ccg = cg;
+	}
+	if (ccg != NULL)
+		cg = ccg;
 	cpu = -1;
+	/* Search the group for the less loaded idle CPU we can run now. */
 	mask = td->td_cpuset->cs_mask;
-	if (cg)
-		cpu = sched_lowest(cg, mask, pri);
+	if (cg != NULL && cg != cpu_top &&
+	    cg->cg_mask != cpu_top->cg_mask)
+		cpu = sched_lowest(cg, mask, max(pri, PRI_MAX_TIMESHARE),
+		    INT_MAX, ts->ts_cpu);
+	/* Search globally for the less loaded CPU we can run now. */
 	if (cpu == -1)
-		cpu = sched_lowest(cpu_top, mask, -1);
+		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+	/* Search globally for the less loaded CPU. */
+	if (cpu == -1)
+		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
+	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
 	/*
 	 * Compare the lowest loaded cpu to current cpu.
 	 */
 	if (THREAD_CAN_SCHED(td, self) && TDQ_CPU(self)->tdq_lowpri > pri &&
-	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE) {
+	    TDQ_CPU(cpu)->tdq_lowpri < PRI_MIN_IDLE &&
+	    TDQ_CPU(self)->tdq_load <= TDQ_CPU(cpu)->tdq_load + 1) {
 		SCHED_STAT_INC(pickcpu_local);
 		cpu = self;
 	} else
 		SCHED_STAT_INC(pickcpu_lowest);
 	if (cpu != ts->ts_cpu)
 		SCHED_STAT_INC(pickcpu_migration);
-	KASSERT(cpu != -1, ("sched_pickcpu: Failed to find a cpu."));
 	return (cpu);
 }
 #endif
@@ -2792,8 +2834,6 @@ SYSCTL_INT(_kern_sched, OID_AUTO, balanc
 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
     &balance_interval, 0,
     "Average frequency in stathz ticks to run the long-term balancer");
-SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
-    "Steals work from another hyper-threaded core on idle");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
     "Attempts to steal work from other cores before idling");
 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,