From owner-p4-projects@FreeBSD.ORG Fri Mar 14 01:55:20 2008
Return-Path:
Delivered-To: p4-projects@freebsd.org
Received: by hub.freebsd.org (Postfix, from userid 32767)
    id B579D1065677; Fri, 14 Mar 2008 01:55:20 +0000 (UTC)
Delivered-To: perforce@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
    by hub.freebsd.org (Postfix) with ESMTP id 74B52106566B
    for ; Fri, 14 Mar 2008 01:55:20 +0000 (UTC)
    (envelope-from peter-gmail@wemm.org)
Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29])
    by mx1.freebsd.org (Postfix) with ESMTP id 5F0208FC13
    for ; Fri, 14 Mar 2008 01:55:20 +0000 (UTC)
    (envelope-from peter-gmail@wemm.org)
Received: from repoman.freebsd.org (localhost [127.0.0.1])
    by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id m2E1tK8B071436
    for ; Fri, 14 Mar 2008 01:55:20 GMT
    (envelope-from peter-gmail@wemm.org)
Received: (from perforce@localhost)
    by repoman.freebsd.org (8.14.1/8.14.1/Submit) id m2E1tKUH071429
    for perforce@freebsd.org; Fri, 14 Mar 2008 01:55:20 GMT
    (envelope-from peter-gmail@wemm.org)
Date: Fri, 14 Mar 2008 01:55:20 GMT
Message-Id: <200803140155.m2E1tKUH071429@repoman.freebsd.org>
X-Authentication-Warning: repoman.freebsd.org: perforce set sender to peter-gmail@wemm.org using -f
From: Peter Wemm
To: Perforce Change Reviews
Cc:
Subject: PERFORCE change 137663 for review
X-BeenThere: p4-projects@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: p4 projects tree changes
List-Unsubscribe: ,
List-Archive:
List-Post:
List-Help:
List-Subscribe: ,
X-List-Received-Date: Fri, 14 Mar 2008 01:55:21 -0000

http://perforce.freebsd.org/chv.cgi?CH=137663

Change 137663 by peter@peter_overcee on 2008/03/14 01:54:48

    hack job at finishing the merge

Affected files ...

.. //depot/projects/bike_sched/sys/kern/sched_ule.c#20 integrate
.. //depot/projects/bike_sched/sys/powerpc/powerpc/machdep.c#6 delete
.. //depot/projects/bike_sched/sys/powerpc/powerpc/trap.c#4 delete
.. //depot/projects/bike_sched/sys/powerpc/powerpc/vm_machdep.c#4 delete

Differences ...

==== //depot/projects/bike_sched/sys/kern/sched_ule.c#20 (text+ko) ====

@@ -36,7 +36,7 @@
  */
 #include
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.217 2007/11/14 06:21:23 julian Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.232 2008/03/12 10:11:59 jeff Exp $");
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
@@ -59,6 +59,7 @@
 #include
 #include
 #include
+#include
 #ifdef KTRACE
 #include
 #include
@@ -85,16 +86,13 @@
     struct runq *ts_runq;   /* Run-queue we're queued on. */
     short ts_flags;         /* TSF_* flags. */
     u_char ts_cpu;          /* CPU that we have affinity for. */
+    int ts_rltick;          /* Real last tick, for affinity. */
     int ts_slice;           /* Ticks of slice remaining. */
     u_int ts_slptime;       /* Number of ticks we vol. slept */
     u_int ts_runtime;       /* Number of ticks we were running */
-    /* The following variables are only used for pctcpu calculation */
     int ts_ltick;           /* Last tick that we were running on */
     int ts_ftick;           /* First tick that we were running on */
     int ts_ticks;           /* Tick count */
-#ifdef SMP
-    int ts_rltick;          /* Real last tick, for affinity. */
-#endif
 };
 /* flags kept in ts_flags */
 #define TSF_BOUND 0x0001    /* Thread can not migrate.
*/ @@ -102,6 +100,10 @@ #define TD_TO_TS(td) ((struct td_sched *)(&(td)[1])) +#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) +#define THREAD_CAN_SCHED(td, cpu) \ + CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) + static struct { struct thread initial_thread; struct td_sched initial_sched; @@ -175,7 +177,7 @@ static int sched_interact = SCHED_INTERACT_THRESH; static int realstathz; static int tickincr; -static int sched_slice; +static int sched_slice = 1; #ifdef PREEMPTION #ifdef FULL_PREEMPTION static int preempt_thresh = PRI_MAX_IDLE; @@ -185,6 +187,7 @@ #else static int preempt_thresh = 0; #endif +static int static_boost = 1; /* * tdq - per processor runqs and statistics. All fields are protected by the @@ -192,80 +195,51 @@ * locking in sched_pickcpu(); */ struct tdq { - struct mtx *tdq_lock; /* Pointer to group lock. */ + /* Ordered to improve efficiency of cpu_search() and switch(). */ + struct mtx tdq_lock; /* run queue lock. */ + struct cpu_group *tdq_cg; /* Pointer to cpu topology. */ + int tdq_load; /* Aggregate load. */ + int tdq_sysload; /* For loadavg, !ITHD load. */ + int tdq_transferable; /* Transferable thread count. */ + u_char tdq_lowpri; /* Lowest priority thread. */ + u_char tdq_ipipending; /* IPI pending. */ + u_char tdq_idx; /* Current insert index. */ + u_char tdq_ridx; /* Current removal index. */ struct runq tdq_realtime; /* real-time run queue. */ struct runq tdq_timeshare; /* timeshare run queue. */ struct runq tdq_idle; /* Queue of IDLE threads. */ - int tdq_load; /* Aggregate load. */ - u_char tdq_idx; /* Current insert index. */ - u_char tdq_ridx; /* Current removal index. */ -#ifdef SMP - u_char tdq_lowpri; /* Lowest priority thread. */ - int tdq_transferable; /* Transferable thread count. */ - LIST_ENTRY(tdq) tdq_siblings; /* Next in tdq group. */ - struct tdq_group *tdq_group; /* Our processor group. */ -#else - int tdq_sysload; /* For loadavg, !ITHD load. */ -#endif + char tdq_name[sizeof("sched lock") + 6]; } __aligned(64); #ifdef SMP -/* - * tdq groups are groups of processors which can cheaply share threads. When - * one processor in the group goes idle it will check the runqs of the other - * processors in its group prior to halting and waiting for an interrupt. - * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. - * In a numa environment we'd want an idle bitmap per group and a two tiered - * load balancer. - */ -struct tdq_group { - struct mtx tdg_lock; /* Protects all fields below. */ - int tdg_cpus; /* Count of CPUs in this tdq group. */ - cpumask_t tdg_cpumask; /* Mask of cpus in this group. */ - cpumask_t tdg_idlemask; /* Idle cpus in this group. */ - cpumask_t tdg_mask; /* Bit mask for first cpu. */ - int tdg_load; /* Total load of this group. */ - int tdg_transferable; /* Transferable load of this group. */ - LIST_HEAD(, tdq) tdg_members; /* Linked list of all members. */ - char tdg_name[16]; /* lock name. */ -} __aligned(64); +struct cpu_group *cpu_top; -#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300)) -#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity) +#define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) +#define SCHED_AFFINITY(ts, t) ((ts)->ts_rltick > ticks - ((t) * affinity)) /* * Run-time tunables. */ static int rebalance = 1; static int balance_interval = 128; /* Default set in sched_initticks(). 
*/ -static int pick_pri = 1; static int affinity; -static int tryself = 1; static int steal_htt = 1; static int steal_idle = 1; static int steal_thresh = 2; -static int topology = 0; /* * One thread queue per processor. */ -static volatile cpumask_t tdq_idle; -static int tdg_maxid; static struct tdq tdq_cpu[MAXCPU]; -static struct tdq_group tdq_groups[MAXCPU]; static struct tdq *balance_tdq; -static int balance_group_ticks; static int balance_ticks; #define TDQ_SELF() (&tdq_cpu[PCPU_GET(cpuid)]) #define TDQ_CPU(x) (&tdq_cpu[(x)]) #define TDQ_ID(x) ((int)((x) - tdq_cpu)) -#define TDQ_GROUP(x) (&tdq_groups[(x)]) -#define TDG_ID(x) ((int)((x) - tdq_groups)) #else /* !SMP */ static struct tdq tdq_cpu; -static struct mtx tdq_lock; #define TDQ_ID(x) (0) #define TDQ_SELF() (&tdq_cpu) @@ -276,7 +250,7 @@ #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) -#define TDQ_LOCKPTR(t) ((t)->tdq_lock) +#define TDQ_LOCKPTR(t) (&(t)->tdq_lock) static void sched_priority(struct thread *); static void sched_thread_priority(struct thread *, u_char); @@ -292,26 +266,23 @@ static void tdq_load_rem(struct tdq *, struct thread *); static __inline void tdq_runq_add(struct tdq *, struct thread *, int); static __inline void tdq_runq_rem(struct tdq *, struct thread *); +static inline int sched_shouldpreempt(int, int, int); void tdq_print(int cpu); static void runq_print(struct runq *rq); static void tdq_add(struct tdq *, struct thread *, int); #ifdef SMP -static void tdq_move(struct tdq *, struct tdq *); +static int tdq_move(struct tdq *, struct tdq *); static int tdq_idled(struct tdq *); -static void tdq_notify(struct thread *); -static struct thread *tdq_steal(struct tdq *); +static void tdq_notify(struct tdq *, struct thread *); +static struct thread *tdq_steal(struct tdq *, int); static struct thread *runq_steal(struct runq *); static int sched_pickcpu(struct thread *, int); static void sched_balance(void); -static void sched_balance_groups(void); -static void sched_balance_group(struct tdq_group *); -static void sched_balance_pair(struct tdq *, struct tdq *); +static int sched_balance_pair(struct tdq *, struct tdq *); static inline struct tdq *sched_setcpu(struct thread *, int, int); static inline struct mtx *thread_block_switch(struct thread *); static inline void thread_unblock_switch(struct thread *, struct mtx *); static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int); - -#define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) #endif static void sched_setup(void *dummy); @@ -358,7 +329,8 @@ tdq = TDQ_CPU(cpu); printf("tdq %d:\n", TDQ_ID(tdq)); - printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq)); + printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); + printf("\tLock name: %s\n", tdq->tdq_name); printf("\tload: %d\n", tdq->tdq_load); printf("\ttimeshare idx: %d\n", tdq->tdq_idx); printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx); @@ -368,12 +340,41 @@ runq_print(&tdq->tdq_timeshare); printf("\tidle runq:\n"); runq_print(&tdq->tdq_idle); -#ifdef SMP printf("\tload transferable: %d\n", tdq->tdq_transferable); printf("\tlowest priority: %d\n", tdq->tdq_lowpri); - printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group)); - printf("\tLock name: %s\n", tdq->tdq_group->tdg_name); -#endif +} + +static inline int +sched_shouldpreempt(int pri, int cpri, int remote) +{ + /* + * If the new priority is not better than the current priority there is + * nothing to do. 
+ */ + if (pri >= cpri) + return (0); + /* + * Always preempt idle. + */ + if (cpri >= PRI_MIN_IDLE) + return (1); + /* + * If preemption is disabled don't preempt others. + */ + if (preempt_thresh == 0) + return (0); + /* + * Preempt if we exceed the threshold. + */ + if (pri <= preempt_thresh) + return (1); + /* + * If we're realtime or better and there is timeshare or worse running + * preempt only remote processors. + */ + if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME) + return (1); + return (0); } #define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS) @@ -385,20 +386,22 @@ static __inline void tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) { + u_char pri; struct td_sched *ts = TD_TO_TS(td); + TDQ_LOCK_ASSERT(tdq, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_OWNED); -#ifdef SMP + + TD_SET_RUNQ(ts->ts_thread); if (THREAD_CAN_MIGRATE(td)) { tdq->tdq_transferable++; - tdq->tdq_group->tdg_transferable++; ts->ts_flags |= TSF_XFERABLE; } -#endif - if (ts->ts_runq == &tdq->tdq_timeshare) { - u_char pri; - - pri = td->td_priority; + pri = td->td_priority; + if (pri <= PRI_MAX_REALTIME) { + ts->ts_runq = &tdq->tdq_realtime; + } else if (pri <= PRI_MAX_TIMESHARE) { + ts->ts_runq = &tdq->tdq_timeshare; KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE, ("Invalid priority %d on timeshare runq", pri)); /* @@ -419,8 +422,10 @@ } else pri = tdq->tdq_ridx; runq_add_pri(ts->ts_runq, ts, pri, flags); + return; } else - runq_add(ts->ts_runq, ts, flags); + ts->ts_runq = &tdq->tdq_idle; + runq_add(ts->ts_runq, ts, flags); } /* @@ -435,25 +440,15 @@ TDQ_LOCK_ASSERT(tdq, MA_OWNED); KASSERT(ts->ts_runq != NULL, ("tdq_runq_remove: thread %p null ts_runq", td)); -#ifdef SMP if (ts->ts_flags & TSF_XFERABLE) { tdq->tdq_transferable--; - tdq->tdq_group->tdg_transferable--; ts->ts_flags &= ~TSF_XFERABLE; } -#endif if (ts->ts_runq == &tdq->tdq_timeshare) { if (tdq->tdq_idx != tdq->tdq_ridx) runq_remove_idx(ts->ts_runq, td, &tdq->tdq_ridx); else runq_remove_idx(ts->ts_runq, td, NULL); - /* - * For timeshare threads we update the priority here so - * the priority reflects the time we've been sleeping. - */ - ts->ts_ltick = ticks; - sched_pctcpu_update(td); - sched_priority(td); } else runq_remove(ts->ts_runq, ts); } @@ -474,11 +469,7 @@ CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load); if (class != PRI_ITHD && (td->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load++; -#else tdq->tdq_sysload++; -#endif } /* @@ -495,11 +486,7 @@ class = PRI_BASE(td->td_pri_class); if (class != PRI_ITHD && (td->td_proc->p_flag & P_NOLOAD) == 0) -#ifdef SMP - tdq->tdq_group->tdg_load--; -#else tdq->tdq_sysload--; -#endif KASSERT(tdq->tdq_load != 0, ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); tdq->tdq_load--; @@ -507,112 +494,282 @@ TD_TO_TS(td)->ts_runq = NULL; } +/* + * Set lowpri to its exact value by searching the run-queue and + * evaluating curthread. curthread may be passed as an optimization. + */ +static void +tdq_setlowpri(struct tdq *tdq, struct thread *ctd) +{ + struct td_sched *ts; + struct thread *td; + + TDQ_LOCK_ASSERT(tdq, MA_OWNED); + if (ctd == NULL) + ctd = pcpu_find(TDQ_ID(tdq))->pc_curthread; + ts = tdq_choose(tdq); + if (ts) + td = ts->ts_thread; + if (ts == NULL || td->td_priority > ctd->td_priority) + tdq->tdq_lowpri = ctd->td_priority; + else + tdq->tdq_lowpri = td->td_priority; +} + #ifdef SMP +struct cpu_search { + cpumask_t cs_mask; /* Mask of valid cpus. 
*/ + u_int cs_load; + u_int cs_cpu; + int cs_limit; /* Min priority for low min load for high. */ +}; + +#define CPU_SEARCH_LOWEST 0x1 +#define CPU_SEARCH_HIGHEST 0x2 +#define CPU_SEARCH_BOTH (CPU_SEARCH_LOWEST|CPU_SEARCH_HIGHEST) + +#define CPUMASK_FOREACH(cpu, mask) \ + for ((cpu) = 0; (cpu) < sizeof((mask)) * 8; (cpu)++) \ + if ((mask) & 1 << (cpu)) + +__inline int cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match); +int cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low); +int cpu_search_highest(struct cpu_group *cg, struct cpu_search *high); +int cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high); + +/* + * This routine compares according to the match argument and should be + * reduced in actual instantiations via constant propagation and dead code + * elimination. + */ +static __inline int +cpu_compare(int cpu, struct cpu_search *low, struct cpu_search *high, + const int match) +{ + struct tdq *tdq; + + tdq = TDQ_CPU(cpu); + if (match & CPU_SEARCH_LOWEST) + if (low->cs_mask & (1 << cpu) && + tdq->tdq_load < low->cs_load && + tdq->tdq_lowpri > low->cs_limit) { + low->cs_cpu = cpu; + low->cs_load = tdq->tdq_load; + } + if (match & CPU_SEARCH_HIGHEST) + if (high->cs_mask & (1 << cpu) && + tdq->tdq_load >= high->cs_limit && + tdq->tdq_load > high->cs_load && + tdq->tdq_transferable) { + high->cs_cpu = cpu; + high->cs_load = tdq->tdq_load; + } + return (tdq->tdq_load); +} + /* - * sched_balance is a simple CPU load balancing algorithm. It operates by - * finding the least loaded and most loaded cpu and equalizing their load - * by migrating some processes. + * Search the tree of cpu_groups for the lowest or highest loaded cpu + * according to the match argument. This routine actually compares the + * load on all paths through the tree and finds the least loaded cpu on + * the least loaded path, which may differ from the least loaded cpu in + * the system. This balances work among caches and busses. * - * Dealing only with two CPUs at a time has two advantages. Firstly, most - * installations will only have 2 cpus. Secondly, load balancing too much at - * once can have an unpleasant effect on the system. The scheduler rarely has - * enough information to make perfect decisions. So this algorithm chooses - * simplicity and more gradual effects on load in larger systems. - * + * This inline is instantiated in three forms below using constants for the + * match argument. It is reduced to the minimum set for each case. It is + * also recursive to the depth of the tree. 
+ */ +static inline int +cpu_search(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high, const int match) +{ + int total; + + total = 0; + if (cg->cg_children) { + struct cpu_search lgroup; + struct cpu_search hgroup; + struct cpu_group *child; + u_int lload; + int hload; + int load; + int i; + + lload = -1; + hload = -1; + for (i = 0; i < cg->cg_children; i++) { + child = &cg->cg_child[i]; + if (match & CPU_SEARCH_LOWEST) { + lgroup = *low; + lgroup.cs_load = -1; + } + if (match & CPU_SEARCH_HIGHEST) { + hgroup = *high; + lgroup.cs_load = 0; + } + switch (match) { + case CPU_SEARCH_LOWEST: + load = cpu_search_lowest(child, &lgroup); + break; + case CPU_SEARCH_HIGHEST: + load = cpu_search_highest(child, &hgroup); + break; + case CPU_SEARCH_BOTH: + load = cpu_search_both(child, &lgroup, &hgroup); + break; + } + total += load; + if (match & CPU_SEARCH_LOWEST) + if (load < lload || low->cs_cpu == -1) { + *low = lgroup; + lload = load; + } + if (match & CPU_SEARCH_HIGHEST) + if (load > hload || high->cs_cpu == -1) { + hload = load; + *high = hgroup; + } + } + } else { + int cpu; + + CPUMASK_FOREACH(cpu, cg->cg_mask) + total += cpu_compare(cpu, low, high, match); + } + return (total); +} + +/* + * cpu_search instantiations must pass constants to maintain the inline + * optimization. + */ +int +cpu_search_lowest(struct cpu_group *cg, struct cpu_search *low) +{ + return cpu_search(cg, low, NULL, CPU_SEARCH_LOWEST); +} + +int +cpu_search_highest(struct cpu_group *cg, struct cpu_search *high) +{ + return cpu_search(cg, NULL, high, CPU_SEARCH_HIGHEST); +} + +int +cpu_search_both(struct cpu_group *cg, struct cpu_search *low, + struct cpu_search *high) +{ + return cpu_search(cg, low, high, CPU_SEARCH_BOTH); +} + +/* + * Find the cpu with the least load via the least loaded path that has a + * lowpri greater than pri pri. A pri of -1 indicates any priority is + * acceptable. + */ +static inline int +sched_lowest(struct cpu_group *cg, cpumask_t mask, int pri) +{ + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_load = -1; + low.cs_mask = mask; + low.cs_limit = pri; + cpu_search_lowest(cg, &low); + return low.cs_cpu; +} + +/* + * Find the cpu with the highest load via the highest loaded path. + */ +static inline int +sched_highest(struct cpu_group *cg, cpumask_t mask, int minload) +{ + struct cpu_search high; + + high.cs_cpu = -1; + high.cs_load = 0; + high.cs_mask = mask; + high.cs_limit = minload; + cpu_search_highest(cg, &high); + return high.cs_cpu; +} + +/* + * Simultaneously find the highest and lowest loaded cpu reachable via + * cg. */ +static inline void +sched_both(struct cpu_group *cg, cpumask_t mask, int *lowcpu, int *highcpu) +{ + struct cpu_search high; + struct cpu_search low; + + low.cs_cpu = -1; + low.cs_limit = -1; + low.cs_load = -1; + low.cs_mask = mask; + high.cs_load = 0; + high.cs_cpu = -1; + high.cs_limit = -1; + high.cs_mask = mask; + cpu_search_both(cg, &low, &high); + *lowcpu = low.cs_cpu; + *highcpu = high.cs_cpu; + return; +} + static void -sched_balance() +sched_balance_group(struct cpu_group *cg) { - struct tdq_group *high; - struct tdq_group *low; - struct tdq_group *tdg; - struct tdq *tdq; - int cnt; + cpumask_t mask; + int high; + int low; int i; - /* - * Select a random time between .5 * balance_interval and - * 1.5 * balance_interval. 
- */ - balance_ticks = max(balance_interval / 2, 1); - balance_ticks += random() % balance_interval; - if (smp_started == 0 || rebalance == 0) - return; - tdq = TDQ_SELF(); - TDQ_UNLOCK(tdq); - low = high = NULL; - i = random() % (tdg_maxid + 1); - for (cnt = 0; cnt <= tdg_maxid; cnt++) { - tdg = TDQ_GROUP(i); + mask = -1; + for (;;) { + sched_both(cg, mask, &low, &high); + if (low == high || low == -1 || high == -1) + break; + if (sched_balance_pair(TDQ_CPU(high), TDQ_CPU(low))) + break; /* - * Find the CPU with the highest load that has some - * threads to transfer. - */ - if ((high == NULL || tdg->tdg_load > high->tdg_load) - && tdg->tdg_transferable) - high = tdg; - if (low == NULL || tdg->tdg_load < low->tdg_load) - low = tdg; - if (++i > tdg_maxid) - i = 0; + * If we failed to move any threads determine which cpu + * to kick out of the set and try again. + */ + if (TDQ_CPU(high)->tdq_transferable == 0) + mask &= ~(1 << high); + else + mask &= ~(1 << low); } - if (low != NULL && high != NULL && high != low) - sched_balance_pair(LIST_FIRST(&high->tdg_members), - LIST_FIRST(&low->tdg_members)); - TDQ_LOCK(tdq); + + for (i = 0; i < cg->cg_children; i++) + sched_balance_group(&cg->cg_child[i]); } -/* - * Balance load between CPUs in a group. Will only migrate within the group. - */ static void -sched_balance_groups() +sched_balance() { struct tdq *tdq; - int i; /* * Select a random time between .5 * balance_interval and * 1.5 * balance_interval. */ - balance_group_ticks = max(balance_interval / 2, 1); - balance_group_ticks += random() % balance_interval; + balance_ticks = max(balance_interval / 2, 1); + balance_ticks += random() % balance_interval; if (smp_started == 0 || rebalance == 0) return; tdq = TDQ_SELF(); TDQ_UNLOCK(tdq); - for (i = 0; i <= tdg_maxid; i++) - sched_balance_group(TDQ_GROUP(i)); + sched_balance_group(cpu_top); TDQ_LOCK(tdq); } /* - * Finds the greatest imbalance between two tdqs in a group. - */ -static void -sched_balance_group(struct tdq_group *tdg) -{ - struct tdq *tdq; - struct tdq *high; - struct tdq *low; - int load; - - if (tdg->tdg_transferable == 0) - return; - low = NULL; - high = NULL; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - load = tdq->tdq_load; - if (high == NULL || load > high->tdq_load) - high = tdq; - if (low == NULL || load < low->tdq_load) - low = tdq; - } - if (high != NULL && low != NULL && high != low) - sched_balance_pair(high, low); -} - -/* * Lock two thread queues using their address to maintain lock order. */ static void @@ -640,31 +797,22 @@ /* * Transfer load between two imbalanced thread queues. */ -static void +static int sched_balance_pair(struct tdq *high, struct tdq *low) { int transferable; int high_load; int low_load; + int moved; int move; int diff; int i; tdq_lock_pair(high, low); - /* - * If we're transfering within a group we have to use this specific - * tdq's transferable count, otherwise we can steal from other members - * of the group. - */ - if (high->tdq_group == low->tdq_group) { - transferable = high->tdq_transferable; - high_load = high->tdq_load; - low_load = low->tdq_load; - } else { - transferable = high->tdq_group->tdg_transferable; - high_load = high->tdq_group->tdg_load; - low_load = low->tdq_group->tdg_load; - } + transferable = high->tdq_transferable; + high_load = high->tdq_load; + low_load = low->tdq_load; + moved = 0; /* * Determine what the imbalance is and then adjust that to how many * threads we actually have to give up (transferable). 
@@ -676,7 +824,7 @@ move++; move = min(move, transferable); for (i = 0; i < move; i++) - tdq_move(high, low); + moved += tdq_move(high, low); /* * IPI the target cpu to force it to reschedule with the new * workload. @@ -684,13 +832,13 @@ ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT); } tdq_unlock_pair(high, low); - return; + return (moved); } /* * Move a thread from one thread queue to another. */ -static void +static int tdq_move(struct tdq *from, struct tdq *to) { struct thread *td; @@ -703,22 +851,9 @@ tdq = from; cpu = TDQ_ID(to); - td = tdq_steal(tdq); - if (td == NULL) { - struct tdq_group *tdg; - - tdg = tdq->tdq_group; - LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) { - if (tdq == from || tdq->tdq_transferable == 0) - continue; - td = tdq_steal(tdq); - break; - } - if (td == NULL) - return; - } - if (tdq == to) - return; + td = tdq_steal(tdq, cpu); + if (td == NULL) + return (0); /* * Although the run queue is locked the thread may be blocked. Lock * it to clear this and acquire the run-queue lock. @@ -730,6 +865,7 @@ TD_TO_TS(ts)->ts_cpu = cpu; td->td_lock = TDQ_LOCKPTR(to); tdq_add(to, td, SRQ_YIELDING); + return (1); } /* @@ -739,116 +875,72 @@ static int tdq_idled(struct tdq *tdq) { - struct tdq_group *tdg; + struct cpu_group *cg; struct tdq *steal; - int highload; - int highcpu; + cpumask_t mask; + int thresh; int cpu; if (smp_started == 0 || steal_idle == 0) return (1); - /* We don't want to be preempted while we're iterating over tdqs */ + mask = -1; + mask &= ~PCPU_GET(cpumask); + /* We don't want to be preempted while we're iterating. */ spinlock_enter(); - tdg = tdq->tdq_group; - /* - * If we're in a cpu group, try and steal threads from another cpu in - * the group before idling. In a HTT group all cpus share the same - * run-queue lock, however, we still need a recursive lock to - * call tdq_move(). - */ - if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) { - TDQ_LOCK(tdq); - LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) { - if (steal == tdq || steal->tdq_transferable == 0) - continue; - TDQ_LOCK(steal); - goto steal; + for (cg = tdq->tdq_cg; cg != NULL; ) { + if ((cg->cg_flags & (CG_FLAG_HTT | CG_FLAG_THREAD)) == 0) + thresh = steal_thresh; + else + thresh = 1; + cpu = sched_highest(cg, mask, thresh); + if (cpu == -1) { + cg = cg->cg_parent; + continue; + } + steal = TDQ_CPU(cpu); + mask &= ~(1 << cpu); + tdq_lock_pair(tdq, steal); + if (steal->tdq_load < thresh || steal->tdq_transferable == 0) { + tdq_unlock_pair(tdq, steal); + continue; } - TDQ_UNLOCK(tdq); - } - /* - * Find the least loaded CPU with a transferable thread and attempt - * to steal it. We make a lockless pass and then verify that the - * thread is still available after locking. - */ - for (;;) { - highcpu = 0; - highload = 0; - for (cpu = 0; cpu <= mp_maxid; cpu++) { - if (CPU_ABSENT(cpu)) - continue; - steal = TDQ_CPU(cpu); - if (steal->tdq_transferable == 0) - continue; - if (steal->tdq_load < highload) - continue; - highload = steal->tdq_load; - highcpu = cpu; + /* + * If a thread was added while interrupts were disabled don't + * steal one here. If we fail to acquire one due to affinity + * restrictions loop again with this cpu removed from the + * set. 
 */
+        if (tdq->tdq_load == 0 && tdq_move(steal, tdq) == 0) {
+            tdq_unlock_pair(tdq, steal);
+            continue;
        }
-        if (highload < steal_thresh)
-            break;
-        steal = TDQ_CPU(highcpu);
-        if (steal == tdq)
-            break;
-        tdq_lock_pair(tdq, steal);
-        if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
-            goto steal;
-        tdq_unlock_pair(tdq, steal);
+        spinlock_exit();
+        TDQ_UNLOCK(steal);
+        mi_switch(SW_VOL, NULL);
+        thread_unlock(curthread);
+
+        return (0);
    }
    spinlock_exit();
    return (1);
-steal:
-    spinlock_exit();
-    tdq_move(steal, tdq);
-    TDQ_UNLOCK(steal);
-    mi_switch(SW_VOL, NULL);
-    thread_unlock(curthread);
-
-    return (0);
 }
 /*
  * Notify a remote cpu of new work. Sends an IPI if criteria are met.
  */
 static void
-tdq_notify(struct thread *td)
+tdq_notify(struct tdq *tdq, struct thread *td)
 {
-    struct thread *ctd;
-    struct pcpu *pcpu;
     int cpri;
     int pri;
     int cpu;
-    cpu = TD_TO_TS(ts)->ts_cpu;
-    pri = td->td_priority;
-    pcpu = pcpu_find(cpu);
-    ctd = pcpu->pc_curthread;
-    cpri = ctd->td_priority;
-
-    /*
-     * If our priority is not better than the current priority there is
-     * nothing to do.
-     */
-    if (pri > cpri)
+    if (tdq->tdq_ipipending)
        return;
-    /*
-     * Always IPI idle.
-     */
-    if (cpri > PRI_MIN_IDLE)
-        goto sendipi;
-    /*
-     * If we're realtime or better and there is timeshare or worse running
-     * send an IPI.
-     */
-    if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
-        goto sendipi;
-    /*
-     * Otherwise only IPI if we exceed the threshold.
-     */
-    if (pri > preempt_thresh)
+    cpri = pcpu_find(cpu)->pc_curthread->td_priority;
+    if (!sched_shouldpreempt(pri, cpri, 1))
        return;
-sendipi:
-    ctd->td_flags |= TDF_NEEDRESCHED;
+    tdq->tdq_ipipending = 1;
     ipi_selected(1 << cpu, IPI_PREEMPT);
 }
@@ -857,7 +949,7 @@
  * index.
  */
 static struct thread *
-runq_steal_from(struct runq *rq, u_char start)
+runq_steal_from(struct runq *rq, int cpu, u_char start)
 {
     struct thread *td;
     struct rqbits *rqb;
@@ -886,7 +978,8 @@
            pri += (i << RQB_L2BPW);
            rqh = &rq->rq_queues[pri];
            TAILQ_FOREACH(td, rqh, td_procq) {
-                if (first && THREAD_CAN_MIGRATE(td))
+                if (first && THREAD_CAN_MIGRATE(td) &&
+                    THREAD_CAN_SCHED(td, cpu))
                    return (td);
                first = 1;
            }
@@ -903,7 +996,7 @@
  * Steals load from a standard linear queue.

>>> TRUNCATED FOR MAIL (1000 lines) <<<
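
For reference, the sched_shouldpreempt() test that this change introduces (and that the new tdq_notify() now calls) can be read on its own. Below is a minimal standalone sketch restating the logic from the hunk earlier in the diff so it can be compiled outside the kernel; the PRI_* values and the preempt_thresh default are placeholders chosen only to make it build, not the kernel's real priority ranges.

/*
 * Standalone sketch of the sched_shouldpreempt() logic merged above.
 * PRI_MAX_REALTIME, PRI_MIN_IDLE and preempt_thresh are illustrative
 * stand-ins, not the values used by the kernel.
 */
#include <stdio.h>

#define PRI_MAX_REALTIME 159    /* placeholder top of the realtime range */
#define PRI_MIN_IDLE     224    /* placeholder bottom of the idle range */

static int preempt_thresh = 64; /* placeholder; 0 disables preemption */

static int
sched_shouldpreempt(int pri, int cpri, int remote)
{
    /* Lower numbers are better priorities; otherwise nothing to do. */
    if (pri >= cpri)
        return (0);
    /* Always preempt an idle thread. */
    if (cpri >= PRI_MIN_IDLE)
        return (1);
    /* Preemption disabled entirely. */
    if (preempt_thresh == 0)
        return (0);
    /* Preempt when the incoming priority beats the threshold. */
    if (pri <= preempt_thresh)
        return (1);
    /* Realtime arriving over timeshare: preempt remote cpus only. */
    if (remote && pri <= PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
        return (1);
    return (0);
}

int
main(void)
{
    /* Realtime-range thread vs. a timeshare-range thread on a remote cpu. */
    printf("remote: %d\n", sched_shouldpreempt(130, 180, 1));   /* prints 1 */
    /* Same priorities on the local cpu: the remote-only rule says no. */
    printf("local:  %d\n", sched_shouldpreempt(130, 180, 0));   /* prints 0 */
    return (0);
}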